Home | History | Annotate | Download | only in text
      1 /*
      2  *******************************************************************************
      3  * Copyright (C) 1996-2014, International Business Machines Corporation and    *
      4  * others. All Rights Reserved.                                                *
      5  *******************************************************************************
      6  */
      7 package com.ibm.icu.text;
      8 
      9 import java.util.ArrayList;
     10 import java.util.HashSet;
     11 import java.util.Iterator;
     12 import java.util.List;
     13 import java.util.Set;
     14 
     15 import com.ibm.icu.impl.Norm2AllModes;
     16 import com.ibm.icu.impl.Normalizer2Impl;
     17 import com.ibm.icu.impl.Utility;
     18 import com.ibm.icu.lang.UCharacter;
     19 
     20 /**
     21  * This class allows one to iterate through all the strings that are canonically equivalent to a given
     22  * string. For example, here are some sample results:
     23  * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
     24  * <pre>
     25  1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
     26  2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
     27  3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
     28  4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
     29  5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
     30  6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
     31  7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
     32  8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
     33  9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
     34 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
     35 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
     36 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
     37  *</pre>
     38  *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
     39  * since it has not been optimized for that situation.
     40  * @author M. Davis
     41  * @stable ICU 2.4
     42  */
     43 
     44 public final class CanonicalIterator {
     45     /**
     46      * Construct a CanonicalIterator object
     47      * @param source string to get results for
     48      * @stable ICU 2.4
     49      */
     50     public CanonicalIterator(String source) {
     51         Norm2AllModes allModes = Norm2AllModes.getNFCInstance();
     52         nfd = allModes.decomp;
     53         nfcImpl = allModes.impl.ensureCanonIterData();
     54         setSource(source);
     55     }
     56 
     57     /**
     58      * Gets the NFD form of the current source we are iterating over.
     59      * @return gets the source: NOTE: it is the NFD form of the source originally passed in
     60      * @stable ICU 2.4
     61      */
     62     public String getSource() {
     63       return source;
     64     }
     65 
     66     /**
     67      * Resets the iterator so that one can start again from the beginning.
     68      * @stable ICU 2.4
     69      */
     70     public void reset() {
     71         done = false;
     72         for (int i = 0; i < current.length; ++i) {
     73             current[i] = 0;
     74         }
     75     }
     76 
     77     /**
     78      * Get the next canonically equivalent string.
     79      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
     80      * @return the next string that is canonically equivalent. The value null is returned when
     81      * the iteration is done.
     82      * @stable ICU 2.4
     83      */
     84     public String next() {
     85         if (done) return null;
     86 
     87         // construct return value
     88 
     89         buffer.setLength(0); // delete old contents
     90         for (int i = 0; i < pieces.length; ++i) {
     91             buffer.append(pieces[i][current[i]]);
     92         }
     93         String result = buffer.toString();
     94 
     95         // find next value for next time
     96 
     97         for (int i = current.length - 1; ; --i) {
     98             if (i < 0) {
     99                 done = true;
    100                 break;
    101             }
    102             current[i]++;
    103             if (current[i] < pieces[i].length) break; // got sequence
    104             current[i] = 0;
    105         }
    106         return result;
    107     }
    108 
    109     /**
    110      * Set a new source for this iterator. Allows object reuse.
    111      * @param newSource the source string to iterate against. This allows the same iterator to be used
    112      * while changing the source string, saving object creation.
    113      * @stable ICU 2.4
    114      */
    115     public void setSource(String newSource) {
    116         source = nfd.normalize(newSource);
    117         done = false;
    118 
    119         // catch degenerate case
    120         if (newSource.length() == 0) {
    121             pieces = new String[1][];
    122             current = new int[1];
    123             pieces[0] = new String[]{""};
    124             return;
    125         }
    126 
    127         // find the segments
    128         List<String> segmentList = new ArrayList<String>();
    129         int cp;
    130         int start = 0;
    131 
    132         // i should be the end of the first code point
    133         // break up the string into segements
    134 
    135         int i = UTF16.findOffsetFromCodePoint(source, 1);
    136 
    137         for (; i < source.length(); i += Character.charCount(cp)) {
    138             cp = source.codePointAt(i);
    139             if (nfcImpl.isCanonSegmentStarter(cp)) {
    140                 segmentList.add(source.substring(start, i)); // add up to i
    141                 start = i;
    142             }
    143         }
    144         segmentList.add(source.substring(start, i)); // add last one
    145 
    146         // allocate the arrays, and find the strings that are CE to each segment
    147         pieces = new String[segmentList.size()][];
    148         current = new int[segmentList.size()];
    149         for (i = 0; i < pieces.length; ++i) {
    150             if (PROGRESS) System.out.println("SEGMENT");
    151             pieces[i] = getEquivalents(segmentList.get(i));
    152         }
    153     }
    154 
    155     /**
    156      * Simple implementation of permutation.
    157      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
    158      * @param source the string to find permutations for
    159      * @param skipZeros set to true to skip characters with canonical combining class zero
    160      * @param output the set to add the results to
    161      * @internal
    162      * @deprecated This API is ICU internal only.
    163      */
    164     @Deprecated
    165     public static void permute(String source, boolean skipZeros, Set<String> output) {
    166         // TODO: optimize
    167         //if (PROGRESS) System.out.println("Permute: " + source);
    168 
    169         // optimization:
    170         // if zero or one character, just return a set with it
    171         // we check for length < 2 to keep from counting code points all the time
    172         if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
    173             output.add(source);
    174             return;
    175         }
    176 
    177         // otherwise iterate through the string, and recursively permute all the other characters
    178         Set<String> subpermute = new HashSet<String>();
    179         int cp;
    180         for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
    181             cp = UTF16.charAt(source, i);
    182 
    183             // optimization:
    184             // if the character is canonical combining class zero,
    185             // don't permute it
    186             if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
    187                 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
    188                 continue;
    189             }
    190 
    191             // see what the permutations of the characters before and after this one are
    192             subpermute.clear();
    193             permute(source.substring(0,i)
    194                 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
    195 
    196             // prefix this character to all of them
    197             String chStr = UTF16.valueOf(source, i);
    198             for (String s : subpermute) {
    199                 String piece = chStr + s;
    200                 //if (PROGRESS) System.out.println("  Piece: " + piece);
    201                 output.add(piece);
    202             }
    203         }
    204     }
    205 
    206     // FOR TESTING
    207 
    208     /*
    209      *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
    210      *
    211     public static UnicodeSet getSafeStart() {
    212         return (UnicodeSet) SAFE_START.clone();
    213     }
    214     */
    215     /*
    216      *@return the set of characters whose decompositions start with the given character
    217      *
    218     public static UnicodeSet getStarts(int cp) {
    219         UnicodeSet result = AT_START.get(cp);
    220         if (result == null) result = EMPTY;
    221         return (UnicodeSet) result.clone();
    222     }
    223     */
    224 
    225     // ===================== PRIVATES ==============================
    226 
    227     // debug
    228     private static boolean PROGRESS = false; // debug progress
    229     //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
    230     private static boolean SKIP_ZEROS = true;
    231 
    232     // fields
    233     private final Normalizer2 nfd;
    234     private final Normalizer2Impl nfcImpl;
    235     private String source;
    236     private boolean done;
    237     private String[][] pieces;
    238     private int[] current;
    239     // Note: C will need two more fields, since arrays there don't have lengths
    240     // int pieces_length;
    241     // int[] pieces_lengths;
    242 
    243     // transient fields
    244     private transient StringBuilder buffer = new StringBuilder();
    245 
    246 
    247     // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
    248     private String[] getEquivalents(String segment) {
    249         Set<String> result = new HashSet<String>();
    250         Set<String> basic = getEquivalents2(segment);
    251         Set<String> permutations = new HashSet<String>();
    252 
    253         // now get all the permutations
    254         // add only the ones that are canonically equivalent
    255         // TODO: optimize by not permuting any class zero.
    256         Iterator<String> it = basic.iterator();
    257         while (it.hasNext()) {
    258             String item = it.next();
    259             permutations.clear();
    260             permute(item, SKIP_ZEROS, permutations);
    261             Iterator<String> it2 = permutations.iterator();
    262             while (it2.hasNext()) {
    263                 String possible = it2.next();
    264 
    265 /*
    266                 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
    267                 if (attempt.equals(segment)) {
    268 */
    269                 if (Normalizer.compare(possible, segment,0)==0) {
    270 
    271                     if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
    272                     result.add(possible);
    273 
    274                 } else {
    275                     if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
    276                 }
    277             }
    278         }
    279 
    280         // convert into a String[] to clean up storage
    281         String[] finalResult = new String[result.size()];
    282         result.toArray(finalResult);
    283         return finalResult;
    284     }
    285 
    286 
    287     private Set<String> getEquivalents2(String segment) {
    288 
    289         Set<String> result = new HashSet<String>();
    290 
    291         if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
    292 
    293         result.add(segment);
    294         StringBuffer workingBuffer = new StringBuffer();
    295         UnicodeSet starts = new UnicodeSet();
    296 
    297         // cycle through all the characters
    298         int cp;
    299         for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
    300 
    301             // see if any character is at the start of some decomposition
    302             cp = segment.codePointAt(i);
    303             if (!nfcImpl.getCanonStartSet(cp, starts)) {
    304               continue;
    305             }
    306             // if so, see which decompositions match
    307             for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
    308                 int cp2 = iter.codepoint;
    309                 Set<String> remainder = extract(cp2, segment, i, workingBuffer);
    310                 if (remainder == null) {
    311                     continue;
    312                 }
    313 
    314                 // there were some matches, so add all the possibilities to the set.
    315                 String prefix= segment.substring(0,i);
    316                 prefix += UTF16.valueOf(cp2);
    317                 for (String item : remainder) {
    318                     result.add(prefix + item);
    319                 }
    320             }
    321         }
    322         return result;
    323         /*
    324         Set result = new HashSet();
    325         if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
    326         result.add(segment);
    327         StringBuffer workingBuffer = new StringBuffer();
    328 
    329         // cycle through all the characters
    330         int cp;
    331 
    332         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
    333             // see if any character is at the start of some decomposition
    334             cp = UTF16.charAt(segment, i);
    335             NormalizerImpl.getCanonStartSet(c,fillSet)
    336             UnicodeSet starts = AT_START.get(cp);
    337             if (starts == null) continue;
    338             UnicodeSetIterator usi = new UnicodeSetIterator(starts);
    339             // if so, see which decompositions match
    340             while (usi.next()) {
    341                 int cp2 = usi.codepoint;
    342                 // we know that there are no strings in it
    343                 // so we don't have to check CharacterIterator.IS_STRING
    344                 Set remainder = extract(cp2, segment, i, workingBuffer);
    345                 if (remainder == null) continue;
    346 
    347                 // there were some matches, so add all the possibilities to the set.
    348                 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
    349                 Iterator it = remainder.iterator();
    350                 while (it.hasNext()) {
    351                     String item = (String) it.next();
    352                     if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
    353                     result.add(prefix + item);
    354                 }
    355             }
    356         }
    357         return result;
    358         */
    359     }
    360 
    361     /**
    362      * See if the decomposition of cp2 is at segment starting at segmentPos
    363      * (with canonical rearrangment!)
    364      * If so, take the remainder, and return the equivalents
    365      */
    366     private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) {
    367         if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
    368             + ", " + Utility.hex(segment.substring(segmentPos)));
    369 
    370         String decomp = nfcImpl.getDecomposition(comp);
    371         if (decomp == null) {
    372             decomp = UTF16.valueOf(comp);
    373         }
    374 
    375         // See if it matches the start of segment (at segmentPos)
    376         boolean ok = false;
    377         int cp;
    378         int decompPos = 0;
    379         int decompCp = UTF16.charAt(decomp,0);
    380         decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
    381         //int decompClass = getClass(decompCp);
    382         buf.setLength(0); // initialize working buffer, shared among callees
    383 
    384         for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
    385             cp = UTF16.charAt(segment, i);
    386             if (cp == decompCp) { // if equal, eat another cp from decomp
    387                 if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));
    388                 if (decompPos == decomp.length()) { // done, have all decomp characters!
    389                     buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
    390                     ok = true;
    391                     break;
    392                 }
    393                 decompCp = UTF16.charAt(decomp, decompPos);
    394                 decompPos += UTF16.getCharCount(decompCp);
    395                 //decompClass = getClass(decompCp);
    396             } else {
    397                 if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));
    398                 // brute force approach
    399                 UTF16.append(buf, cp);
    400                 /* TODO: optimize
    401                 // since we know that the classes are monotonically increasing, after zero
    402                 // e.g. 0 5 7 9 0 3
    403                 // we can do an optimization
    404                 // there are only a few cases that work: zero, less, same, greater
    405                 // if both classes are the same, we fail
    406                 // if the decomp class < the segment class, we fail
    407 
    408                 segClass = getClass(cp);
    409                 if (decompClass <= segClass) return null;
    410                 */
    411             }
    412         }
    413         if (!ok) return null; // we failed, characters left over
    414         if (PROGRESS) System.out.println("Matches");
    415         if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
    416         String remainder = buf.toString();
    417 
    418         // brute force approach
    419         // to check to make sure result is canonically equivalent
    420         /*
    421         String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
    422         if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
    423         */
    424 
    425         if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
    426 
    427         // get the remaining combinations
    428         return getEquivalents2(remainder);
    429     }
    430 
    431     /*
    432     // TODO: fix once we have a codepoint interface to get the canonical combining class
    433     // TODO: Need public access to canonical combining class in UCharacter!
    434     private static int getClass(int cp) {
    435         return Normalizer.getClass((char)cp);
    436     }
    437     */
    438 
    439    // ================= BUILDER =========================
    440     // TODO: Flatten this data so it doesn't have to be reconstructed each time!
    441 
    442     //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
    443     private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change
    444     static {
    445         SET_WITH_NULL_STRING.add("");
    446     }
    447 
    448   //  private static UnicodeSet SAFE_START = new UnicodeSet();
    449   //  private static CharMap AT_START = new CharMap();
    450 
    451         // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
    452         // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
    453   //  private static int LAST_UNICODE = 0x10FFFF;
    454     /*
    455     static {
    456         buildData();
    457     }
    458     */
    459     /*
    460     private static void buildData() {
    461 
    462         if (PROGRESS) System.out.println("Getting Safe Start");
    463         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
    464             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
    465             int cc = UCharacter.getCombiningClass(cp);
    466             if (cc == 0) SAFE_START.add(cp);
    467             // will fix to be really safe below
    468         }
    469         if (PROGRESS) System.out.println();
    470 
    471         if (PROGRESS) System.out.println("Getting Containment");
    472         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
    473             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
    474 
    475             if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
    476 
    477             //String istr = UTF16.valueOf(cp);
    478             String decomp = Normalizer.normalize(cp, Normalizer.NFD);
    479             //if (decomp.equals(istr)) continue;
    480 
    481             // add each character in the decomposition to canBeIn
    482 
    483             int component;
    484             for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
    485                 component = UTF16.charAt(decomp, i);
    486                 if (i == 0) {
    487                     AT_START.add(component, cp);
    488                 } else if (UCharacter.getCombiningClass(component) == 0) {
    489                     SAFE_START.remove(component);
    490                 }
    491             }
    492         }
    493         if (PROGRESS) System.out.println();
    494     }
    495         // the following is just for a map from characters to a set of characters
    496 
    497     private static class CharMap {
    498         Map storage = new HashMap();
    499         MutableInt probe = new MutableInt();
    500         boolean converted = false;
    501 
    502         public void add(int cp, int whatItIsIn) {
    503             UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
    504             if (result == null) {
    505                 result = new UnicodeSet();
    506                 storage.put(probe, result);
    507             }
    508             result.add(whatItIsIn);
    509         }
    510 
    511         public UnicodeSet get(int cp) {
    512             return (UnicodeSet) storage.get(probe.set(cp));
    513         }
    514     }
    515 
    516     private static class MutableInt {
    517         public int contents;
    518         public int hashCode() { return contents; }
    519         public boolean equals(Object other) {
    520             return ((MutableInt)other).contents == contents;
    521         }
    522         // allows chaining
    523         public MutableInt set(int contents) {
    524             this.contents = contents;
    525             return this;
    526         }
    527     }
    528     */
    529 
    530 }
    531