Home | History | Annotate | Download | only in text
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 1996-2014, International Business Machines Corporation and    *
      6  * others. All Rights Reserved.                                                *
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.text;
     10 
     11 import java.util.ArrayList;
     12 import java.util.HashSet;
     13 import java.util.Iterator;
     14 import java.util.List;
     15 import java.util.Set;
     16 
     17 import com.ibm.icu.impl.Norm2AllModes;
     18 import com.ibm.icu.impl.Normalizer2Impl;
     19 import com.ibm.icu.impl.Utility;
     20 import com.ibm.icu.lang.UCharacter;
     21 
     22 /**
     23  * This class allows one to iterate through all the strings that are canonically equivalent to a given
     24  * string. For example, here are some sample results:
     25  * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
     26  * <pre>
     27  1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
     28  2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
     29  3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
     30  4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
     31  5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
     32  6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
     33  7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
     34  8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
     35  9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
     36 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
     37 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
     38 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
     39  *</pre>
     40  *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
     41  * since it has not been optimized for that situation.
     42  * @author M. Davis
     43  * @stable ICU 2.4
     44  */
     45 
     46 public final class CanonicalIterator {
     47     /**
     48      * Construct a CanonicalIterator object
     49      * @param source string to get results for
     50      * @stable ICU 2.4
     51      */
     52     public CanonicalIterator(String source) {
     53         Norm2AllModes allModes = Norm2AllModes.getNFCInstance();
     54         nfd = allModes.decomp;
     55         nfcImpl = allModes.impl.ensureCanonIterData();
     56         setSource(source);
     57     }
     58 
     59     /**
     60      * Gets the NFD form of the current source we are iterating over.
     61      * @return gets the source: NOTE: it is the NFD form of the source originally passed in
     62      * @stable ICU 2.4
     63      */
     64     public String getSource() {
     65       return source;
     66     }
     67 
     68     /**
     69      * Resets the iterator so that one can start again from the beginning.
     70      * @stable ICU 2.4
     71      */
     72     public void reset() {
     73         done = false;
     74         for (int i = 0; i < current.length; ++i) {
     75             current[i] = 0;
     76         }
     77     }
     78 
     79     /**
     80      * Get the next canonically equivalent string.
     81      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
     82      * @return the next string that is canonically equivalent. The value null is returned when
     83      * the iteration is done.
     84      * @stable ICU 2.4
     85      */
     86     public String next() {
     87         if (done) return null;
     88 
     89         // construct return value
     90 
     91         buffer.setLength(0); // delete old contents
     92         for (int i = 0; i < pieces.length; ++i) {
     93             buffer.append(pieces[i][current[i]]);
     94         }
     95         String result = buffer.toString();
     96 
     97         // find next value for next time
     98 
     99         for (int i = current.length - 1; ; --i) {
    100             if (i < 0) {
    101                 done = true;
    102                 break;
    103             }
    104             current[i]++;
    105             if (current[i] < pieces[i].length) break; // got sequence
    106             current[i] = 0;
    107         }
    108         return result;
    109     }
    110 
    111     /**
    112      * Set a new source for this iterator. Allows object reuse.
    113      * @param newSource the source string to iterate against. This allows the same iterator to be used
    114      * while changing the source string, saving object creation.
    115      * @stable ICU 2.4
    116      */
    117     public void setSource(String newSource) {
    118         source = nfd.normalize(newSource);
    119         done = false;
    120 
    121         // catch degenerate case
    122         if (newSource.length() == 0) {
    123             pieces = new String[1][];
    124             current = new int[1];
    125             pieces[0] = new String[]{""};
    126             return;
    127         }
    128 
    129         // find the segments
    130         List<String> segmentList = new ArrayList<String>();
    131         int cp;
    132         int start = 0;
    133 
    134         // i should be the end of the first code point
    135         // break up the string into segements
    136 
    137         int i = UTF16.findOffsetFromCodePoint(source, 1);
    138 
    139         for (; i < source.length(); i += Character.charCount(cp)) {
    140             cp = source.codePointAt(i);
    141             if (nfcImpl.isCanonSegmentStarter(cp)) {
    142                 segmentList.add(source.substring(start, i)); // add up to i
    143                 start = i;
    144             }
    145         }
    146         segmentList.add(source.substring(start, i)); // add last one
    147 
    148         // allocate the arrays, and find the strings that are CE to each segment
    149         pieces = new String[segmentList.size()][];
    150         current = new int[segmentList.size()];
    151         for (i = 0; i < pieces.length; ++i) {
    152             if (PROGRESS) System.out.println("SEGMENT");
    153             pieces[i] = getEquivalents(segmentList.get(i));
    154         }
    155     }
    156 
    157     /**
    158      * Simple implementation of permutation.
    159      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
    160      * @param source the string to find permutations for
    161      * @param skipZeros set to true to skip characters with canonical combining class zero
    162      * @param output the set to add the results to
    163      * @internal
    164      * @deprecated This API is ICU internal only.
    165      */
    166     @Deprecated
    167     public static void permute(String source, boolean skipZeros, Set<String> output) {
    168         // TODO: optimize
    169         //if (PROGRESS) System.out.println("Permute: " + source);
    170 
    171         // optimization:
    172         // if zero or one character, just return a set with it
    173         // we check for length < 2 to keep from counting code points all the time
    174         if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
    175             output.add(source);
    176             return;
    177         }
    178 
    179         // otherwise iterate through the string, and recursively permute all the other characters
    180         Set<String> subpermute = new HashSet<String>();
    181         int cp;
    182         for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
    183             cp = UTF16.charAt(source, i);
    184 
    185             // optimization:
    186             // if the character is canonical combining class zero,
    187             // don't permute it
    188             if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
    189                 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
    190                 continue;
    191             }
    192 
    193             // see what the permutations of the characters before and after this one are
    194             subpermute.clear();
    195             permute(source.substring(0,i)
    196                 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
    197 
    198             // prefix this character to all of them
    199             String chStr = UTF16.valueOf(source, i);
    200             for (String s : subpermute) {
    201                 String piece = chStr + s;
    202                 //if (PROGRESS) System.out.println("  Piece: " + piece);
    203                 output.add(piece);
    204             }
    205         }
    206     }
    207 
    208     // FOR TESTING
    209 
    210     /*
    211      *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
    212      *
    213     public static UnicodeSet getSafeStart() {
    214         return (UnicodeSet) SAFE_START.clone();
    215     }
    216     */
    217     /*
    218      *@return the set of characters whose decompositions start with the given character
    219      *
    220     public static UnicodeSet getStarts(int cp) {
    221         UnicodeSet result = AT_START.get(cp);
    222         if (result == null) result = EMPTY;
    223         return (UnicodeSet) result.clone();
    224     }
    225     */
    226 
    227     // ===================== PRIVATES ==============================
    228 
    229     // debug
    230     private static boolean PROGRESS = false; // debug progress
    231     //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
    232     private static boolean SKIP_ZEROS = true;
    233 
    234     // fields
    235     private final Normalizer2 nfd;
    236     private final Normalizer2Impl nfcImpl;
    237     private String source;
    238     private boolean done;
    239     private String[][] pieces;
    240     private int[] current;
    241     // Note: C will need two more fields, since arrays there don't have lengths
    242     // int pieces_length;
    243     // int[] pieces_lengths;
    244 
    245     // transient fields
    246     private transient StringBuilder buffer = new StringBuilder();
    247 
    248 
    249     // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
    250     private String[] getEquivalents(String segment) {
    251         Set<String> result = new HashSet<String>();
    252         Set<String> basic = getEquivalents2(segment);
    253         Set<String> permutations = new HashSet<String>();
    254 
    255         // now get all the permutations
    256         // add only the ones that are canonically equivalent
    257         // TODO: optimize by not permuting any class zero.
    258         Iterator<String> it = basic.iterator();
    259         while (it.hasNext()) {
    260             String item = it.next();
    261             permutations.clear();
    262             permute(item, SKIP_ZEROS, permutations);
    263             Iterator<String> it2 = permutations.iterator();
    264             while (it2.hasNext()) {
    265                 String possible = it2.next();
    266 
    267 /*
    268                 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
    269                 if (attempt.equals(segment)) {
    270 */
    271                 if (Normalizer.compare(possible, segment,0)==0) {
    272 
    273                     if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
    274                     result.add(possible);
    275 
    276                 } else {
    277                     if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
    278                 }
    279             }
    280         }
    281 
    282         // convert into a String[] to clean up storage
    283         String[] finalResult = new String[result.size()];
    284         result.toArray(finalResult);
    285         return finalResult;
    286     }
    287 
    288 
    289     private Set<String> getEquivalents2(String segment) {
    290 
    291         Set<String> result = new HashSet<String>();
    292 
    293         if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
    294 
    295         result.add(segment);
    296         StringBuffer workingBuffer = new StringBuffer();
    297         UnicodeSet starts = new UnicodeSet();
    298 
    299         // cycle through all the characters
    300         int cp;
    301         for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
    302 
    303             // see if any character is at the start of some decomposition
    304             cp = segment.codePointAt(i);
    305             if (!nfcImpl.getCanonStartSet(cp, starts)) {
    306               continue;
    307             }
    308             // if so, see which decompositions match
    309             for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
    310                 int cp2 = iter.codepoint;
    311                 Set<String> remainder = extract(cp2, segment, i, workingBuffer);
    312                 if (remainder == null) {
    313                     continue;
    314                 }
    315 
    316                 // there were some matches, so add all the possibilities to the set.
    317                 String prefix= segment.substring(0,i);
    318                 prefix += UTF16.valueOf(cp2);
    319                 for (String item : remainder) {
    320                     result.add(prefix + item);
    321                 }
    322             }
    323         }
    324         return result;
    325         /*
    326         Set result = new HashSet();
    327         if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
    328         result.add(segment);
    329         StringBuffer workingBuffer = new StringBuffer();
    330 
    331         // cycle through all the characters
    332         int cp;
    333 
    334         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
    335             // see if any character is at the start of some decomposition
    336             cp = UTF16.charAt(segment, i);
    337             NormalizerImpl.getCanonStartSet(c,fillSet)
    338             UnicodeSet starts = AT_START.get(cp);
    339             if (starts == null) continue;
    340             UnicodeSetIterator usi = new UnicodeSetIterator(starts);
    341             // if so, see which decompositions match
    342             while (usi.next()) {
    343                 int cp2 = usi.codepoint;
    344                 // we know that there are no strings in it
    345                 // so we don't have to check CharacterIterator.IS_STRING
    346                 Set remainder = extract(cp2, segment, i, workingBuffer);
    347                 if (remainder == null) continue;
    348 
    349                 // there were some matches, so add all the possibilities to the set.
    350                 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
    351                 Iterator it = remainder.iterator();
    352                 while (it.hasNext()) {
    353                     String item = (String) it.next();
    354                     if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
    355                     result.add(prefix + item);
    356                 }
    357             }
    358         }
    359         return result;
    360         */
    361     }
    362 
    363     /**
    364      * See if the decomposition of cp2 is at segment starting at segmentPos
    365      * (with canonical rearrangment!)
    366      * If so, take the remainder, and return the equivalents
    367      */
    368     private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) {
    369         if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
    370             + ", " + Utility.hex(segment.substring(segmentPos)));
    371 
    372         String decomp = nfcImpl.getDecomposition(comp);
    373         if (decomp == null) {
    374             decomp = UTF16.valueOf(comp);
    375         }
    376 
    377         // See if it matches the start of segment (at segmentPos)
    378         boolean ok = false;
    379         int cp;
    380         int decompPos = 0;
    381         int decompCp = UTF16.charAt(decomp,0);
    382         decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
    383         //int decompClass = getClass(decompCp);
    384         buf.setLength(0); // initialize working buffer, shared among callees
    385 
    386         for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
    387             cp = UTF16.charAt(segment, i);
    388             if (cp == decompCp) { // if equal, eat another cp from decomp
    389                 if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));
    390                 if (decompPos == decomp.length()) { // done, have all decomp characters!
    391                     buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
    392                     ok = true;
    393                     break;
    394                 }
    395                 decompCp = UTF16.charAt(decomp, decompPos);
    396                 decompPos += UTF16.getCharCount(decompCp);
    397                 //decompClass = getClass(decompCp);
    398             } else {
    399                 if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));
    400                 // brute force approach
    401                 UTF16.append(buf, cp);
    402                 /* TODO: optimize
    403                 // since we know that the classes are monotonically increasing, after zero
    404                 // e.g. 0 5 7 9 0 3
    405                 // we can do an optimization
    406                 // there are only a few cases that work: zero, less, same, greater
    407                 // if both classes are the same, we fail
    408                 // if the decomp class < the segment class, we fail
    409 
    410                 segClass = getClass(cp);
    411                 if (decompClass <= segClass) return null;
    412                 */
    413             }
    414         }
    415         if (!ok) return null; // we failed, characters left over
    416         if (PROGRESS) System.out.println("Matches");
    417         if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
    418         String remainder = buf.toString();
    419 
    420         // brute force approach
    421         // to check to make sure result is canonically equivalent
    422         /*
    423         String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
    424         if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
    425         */
    426 
    427         if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
    428 
    429         // get the remaining combinations
    430         return getEquivalents2(remainder);
    431     }
    432 
    433     /*
    434     // TODO: fix once we have a codepoint interface to get the canonical combining class
    435     // TODO: Need public access to canonical combining class in UCharacter!
    436     private static int getClass(int cp) {
    437         return Normalizer.getClass((char)cp);
    438     }
    439     */
    440 
    441    // ================= BUILDER =========================
    442     // TODO: Flatten this data so it doesn't have to be reconstructed each time!
    443 
    444     //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
    445     private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change
    446     static {
    447         SET_WITH_NULL_STRING.add("");
    448     }
    449 
    450   //  private static UnicodeSet SAFE_START = new UnicodeSet();
    451   //  private static CharMap AT_START = new CharMap();
    452 
    453         // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
    454         // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
    455   //  private static int LAST_UNICODE = 0x10FFFF;
    456     /*
    457     static {
    458         buildData();
    459     }
    460     */
    461     /*
    462     private static void buildData() {
    463 
    464         if (PROGRESS) System.out.println("Getting Safe Start");
    465         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
    466             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
    467             int cc = UCharacter.getCombiningClass(cp);
    468             if (cc == 0) SAFE_START.add(cp);
    469             // will fix to be really safe below
    470         }
    471         if (PROGRESS) System.out.println();
    472 
    473         if (PROGRESS) System.out.println("Getting Containment");
    474         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
    475             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
    476 
    477             if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
    478 
    479             //String istr = UTF16.valueOf(cp);
    480             String decomp = Normalizer.normalize(cp, Normalizer.NFD);
    481             //if (decomp.equals(istr)) continue;
    482 
    483             // add each character in the decomposition to canBeIn
    484 
    485             int component;
    486             for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
    487                 component = UTF16.charAt(decomp, i);
    488                 if (i == 0) {
    489                     AT_START.add(component, cp);
    490                 } else if (UCharacter.getCombiningClass(component) == 0) {
    491                     SAFE_START.remove(component);
    492                 }
    493             }
    494         }
    495         if (PROGRESS) System.out.println();
    496     }
    497         // the following is just for a map from characters to a set of characters
    498 
    499     private static class CharMap {
    500         Map storage = new HashMap();
    501         MutableInt probe = new MutableInt();
    502         boolean converted = false;
    503 
    504         public void add(int cp, int whatItIsIn) {
    505             UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
    506             if (result == null) {
    507                 result = new UnicodeSet();
    508                 storage.put(probe, result);
    509             }
    510             result.add(whatItIsIn);
    511         }
    512 
    513         public UnicodeSet get(int cp) {
    514             return (UnicodeSet) storage.get(probe.set(cp));
    515         }
    516     }
    517 
    518     private static class MutableInt {
    519         public int contents;
    520         public int hashCode() { return contents; }
    521         public boolean equals(Object other) {
    522             return ((MutableInt)other).contents == contents;
    523         }
    524         // allows chaining
    525         public MutableInt set(int contents) {
    526             this.contents = contents;
    527             return this;
    528         }
    529     }
    530     */
    531 
    532 }
    533