Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 1996-2014, International Business Machines Corporation and    *
      7  * others. All Rights Reserved.                                                *
      8  *******************************************************************************
      9  */
     10 package android.icu.text;
     11 
     12 import java.util.ArrayList;
     13 import java.util.HashSet;
     14 import java.util.Iterator;
     15 import java.util.List;
     16 import java.util.Set;
     17 
     18 import android.icu.impl.Norm2AllModes;
     19 import android.icu.impl.Normalizer2Impl;
     20 import android.icu.impl.Utility;
     21 import android.icu.lang.UCharacter;
     22 
     23 /**
     24  * This class allows one to iterate through all the strings that are canonically equivalent to a given
     25  * string. For example, here are some sample results:
     26  * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
     27  * <pre>
     28  1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
     29  2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
     30  3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
     31  4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
     32  5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
     33  6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
     34  7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
     35  8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
     36  9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
     37 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
     38 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
     39 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
     40  *</pre>
     41  *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
     42  * since it has not been optimized for that situation.
     43  * @author M. Davis
     44  * @hide Only a subset of ICU is exposed in Android
     45  */
     46 
     47 public final class CanonicalIterator {
     48     /**
     49      * Construct a CanonicalIterator object
     50      * @param source string to get results for
     51      */
     52     public CanonicalIterator(String source) {
     53         Norm2AllModes allModes = Norm2AllModes.getNFCInstance();
     54         nfd = allModes.decomp;
     55         nfcImpl = allModes.impl.ensureCanonIterData();
     56         setSource(source);
     57     }
     58 
     59     /**
     60      * Gets the NFD form of the current source we are iterating over.
     61      * @return gets the source: NOTE: it is the NFD form of the source originally passed in
     62      */
     63     public String getSource() {
     64       return source;
     65     }
     66 
     67     /**
     68      * Resets the iterator so that one can start again from the beginning.
     69      */
     70     public void reset() {
     71         done = false;
     72         for (int i = 0; i < current.length; ++i) {
     73             current[i] = 0;
     74         }
     75     }
     76 
     77     /**
     78      * Get the next canonically equivalent string.
     79      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
     80      * @return the next string that is canonically equivalent. The value null is returned when
     81      * the iteration is done.
     82      */
     83     public String next() {
     84         if (done) return null;
     85 
     86         // construct return value
     87 
     88         buffer.setLength(0); // delete old contents
     89         for (int i = 0; i < pieces.length; ++i) {
     90             buffer.append(pieces[i][current[i]]);
     91         }
     92         String result = buffer.toString();
     93 
     94         // find next value for next time
     95 
     96         for (int i = current.length - 1; ; --i) {
     97             if (i < 0) {
     98                 done = true;
     99                 break;
    100             }
    101             current[i]++;
    102             if (current[i] < pieces[i].length) break; // got sequence
    103             current[i] = 0;
    104         }
    105         return result;
    106     }
    107 
    108     /**
    109      * Set a new source for this iterator. Allows object reuse.
    110      * @param newSource the source string to iterate against. This allows the same iterator to be used
    111      * while changing the source string, saving object creation.
    112      */
    113     public void setSource(String newSource) {
    114         source = nfd.normalize(newSource);
    115         done = false;
    116 
    117         // catch degenerate case
    118         if (newSource.length() == 0) {
    119             pieces = new String[1][];
    120             current = new int[1];
    121             pieces[0] = new String[]{""};
    122             return;
    123         }
    124 
    125         // find the segments
    126         List<String> segmentList = new ArrayList<String>();
    127         int cp;
    128         int start = 0;
    129 
    130         // i should be the end of the first code point
    131         // break up the string into segements
    132 
    133         int i = UTF16.findOffsetFromCodePoint(source, 1);
    134 
    135         for (; i < source.length(); i += Character.charCount(cp)) {
    136             cp = source.codePointAt(i);
    137             if (nfcImpl.isCanonSegmentStarter(cp)) {
    138                 segmentList.add(source.substring(start, i)); // add up to i
    139                 start = i;
    140             }
    141         }
    142         segmentList.add(source.substring(start, i)); // add last one
    143 
    144         // allocate the arrays, and find the strings that are CE to each segment
    145         pieces = new String[segmentList.size()][];
    146         current = new int[segmentList.size()];
    147         for (i = 0; i < pieces.length; ++i) {
    148             if (PROGRESS) System.out.println("SEGMENT");
    149             pieces[i] = getEquivalents(segmentList.get(i));
    150         }
    151     }
    152 
    153     /**
    154      * Simple implementation of permutation.
    155      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
    156      * @param source the string to find permutations for
    157      * @param skipZeros set to true to skip characters with canonical combining class zero
    158      * @param output the set to add the results to
    159      * @deprecated This API is ICU internal only.
    160      * @hide draft / provisional / internal are hidden on Android
    161      */
    162     @Deprecated
    163     public static void permute(String source, boolean skipZeros, Set<String> output) {
    164         // TODO: optimize
    165         //if (PROGRESS) System.out.println("Permute: " + source);
    166 
    167         // optimization:
    168         // if zero or one character, just return a set with it
    169         // we check for length < 2 to keep from counting code points all the time
    170         if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
    171             output.add(source);
    172             return;
    173         }
    174 
    175         // otherwise iterate through the string, and recursively permute all the other characters
    176         Set<String> subpermute = new HashSet<String>();
    177         int cp;
    178         for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
    179             cp = UTF16.charAt(source, i);
    180 
    181             // optimization:
    182             // if the character is canonical combining class zero,
    183             // don't permute it
    184             if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
    185                 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
    186                 continue;
    187             }
    188 
    189             // see what the permutations of the characters before and after this one are
    190             subpermute.clear();
    191             permute(source.substring(0,i)
    192                 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
    193 
    194             // prefix this character to all of them
    195             String chStr = UTF16.valueOf(source, i);
    196             for (String s : subpermute) {
    197                 String piece = chStr + s;
    198                 //if (PROGRESS) System.out.println("  Piece: " + piece);
    199                 output.add(piece);
    200             }
    201         }
    202     }
    203 
    204     // FOR TESTING
    205 
    206     /*
    207      *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
    208      *
    209     public static UnicodeSet getSafeStart() {
    210         return (UnicodeSet) SAFE_START.clone();
    211     }
    212     */
    213     /*
    214      *@return the set of characters whose decompositions start with the given character
    215      *
    216     public static UnicodeSet getStarts(int cp) {
    217         UnicodeSet result = AT_START.get(cp);
    218         if (result == null) result = EMPTY;
    219         return (UnicodeSet) result.clone();
    220     }
    221     */
    222 
    223     // ===================== PRIVATES ==============================
    224 
    225     // debug
    226     private static boolean PROGRESS = false; // debug progress
    227     //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
    228     private static boolean SKIP_ZEROS = true;
    229 
    230     // fields
    231     private final Normalizer2 nfd;
    232     private final Normalizer2Impl nfcImpl;
    233     private String source;
    234     private boolean done;
    235     private String[][] pieces;
    236     private int[] current;
    237     // Note: C will need two more fields, since arrays there don't have lengths
    238     // int pieces_length;
    239     // int[] pieces_lengths;
    240 
    241     // transient fields
    242     private transient StringBuilder buffer = new StringBuilder();
    243 
    244 
    245     // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
    246     private String[] getEquivalents(String segment) {
    247         Set<String> result = new HashSet<String>();
    248         Set<String> basic = getEquivalents2(segment);
    249         Set<String> permutations = new HashSet<String>();
    250 
    251         // now get all the permutations
    252         // add only the ones that are canonically equivalent
    253         // TODO: optimize by not permuting any class zero.
    254         Iterator<String> it = basic.iterator();
    255         while (it.hasNext()) {
    256             String item = it.next();
    257             permutations.clear();
    258             permute(item, SKIP_ZEROS, permutations);
    259             Iterator<String> it2 = permutations.iterator();
    260             while (it2.hasNext()) {
    261                 String possible = it2.next();
    262 
    263 /*
    264                 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
    265                 if (attempt.equals(segment)) {
    266 */
    267                 if (Normalizer.compare(possible, segment,0)==0) {
    268 
    269                     if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
    270                     result.add(possible);
    271 
    272                 } else {
    273                     if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
    274                 }
    275             }
    276         }
    277 
    278         // convert into a String[] to clean up storage
    279         String[] finalResult = new String[result.size()];
    280         result.toArray(finalResult);
    281         return finalResult;
    282     }
    283 
    284 
    285     private Set<String> getEquivalents2(String segment) {
    286 
    287         Set<String> result = new HashSet<String>();
    288 
    289         if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
    290 
    291         result.add(segment);
    292         StringBuffer workingBuffer = new StringBuffer();
    293         UnicodeSet starts = new UnicodeSet();
    294 
    295         // cycle through all the characters
    296         int cp;
    297         for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
    298 
    299             // see if any character is at the start of some decomposition
    300             cp = segment.codePointAt(i);
    301             if (!nfcImpl.getCanonStartSet(cp, starts)) {
    302               continue;
    303             }
    304             // if so, see which decompositions match
    305             for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
    306                 int cp2 = iter.codepoint;
    307                 Set<String> remainder = extract(cp2, segment, i, workingBuffer);
    308                 if (remainder == null) {
    309                     continue;
    310                 }
    311 
    312                 // there were some matches, so add all the possibilities to the set.
    313                 String prefix= segment.substring(0,i);
    314                 prefix += UTF16.valueOf(cp2);
    315                 for (String item : remainder) {
    316                     result.add(prefix + item);
    317                 }
    318             }
    319         }
    320         return result;
    321         /*
    322         Set result = new HashSet();
    323         if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
    324         result.add(segment);
    325         StringBuffer workingBuffer = new StringBuffer();
    326 
    327         // cycle through all the characters
    328         int cp;
    329 
    330         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
    331             // see if any character is at the start of some decomposition
    332             cp = UTF16.charAt(segment, i);
    333             NormalizerImpl.getCanonStartSet(c,fillSet)
    334             UnicodeSet starts = AT_START.get(cp);
    335             if (starts == null) continue;
    336             UnicodeSetIterator usi = new UnicodeSetIterator(starts);
    337             // if so, see which decompositions match
    338             while (usi.next()) {
    339                 int cp2 = usi.codepoint;
    340                 // we know that there are no strings in it
    341                 // so we don't have to check CharacterIterator.IS_STRING
    342                 Set remainder = extract(cp2, segment, i, workingBuffer);
    343                 if (remainder == null) continue;
    344 
    345                 // there were some matches, so add all the possibilities to the set.
    346                 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
    347                 Iterator it = remainder.iterator();
    348                 while (it.hasNext()) {
    349                     String item = (String) it.next();
    350                     if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
    351                     result.add(prefix + item);
    352                 }
    353             }
    354         }
    355         return result;
    356         */
    357     }
    358 
    359     /**
    360      * See if the decomposition of cp2 is at segment starting at segmentPos
    361      * (with canonical rearrangment!)
    362      * If so, take the remainder, and return the equivalents
    363      */
    364     private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) {
    365         if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
    366             + ", " + Utility.hex(segment.substring(segmentPos)));
    367 
    368         String decomp = nfcImpl.getDecomposition(comp);
    369         if (decomp == null) {
    370             decomp = UTF16.valueOf(comp);
    371         }
    372 
    373         // See if it matches the start of segment (at segmentPos)
    374         boolean ok = false;
    375         int cp;
    376         int decompPos = 0;
    377         int decompCp = UTF16.charAt(decomp,0);
    378         decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
    379         //int decompClass = getClass(decompCp);
    380         buf.setLength(0); // initialize working buffer, shared among callees
    381 
    382         for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
    383             cp = UTF16.charAt(segment, i);
    384             if (cp == decompCp) { // if equal, eat another cp from decomp
    385                 if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));
    386                 if (decompPos == decomp.length()) { // done, have all decomp characters!
    387                     buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
    388                     ok = true;
    389                     break;
    390                 }
    391                 decompCp = UTF16.charAt(decomp, decompPos);
    392                 decompPos += UTF16.getCharCount(decompCp);
    393                 //decompClass = getClass(decompCp);
    394             } else {
    395                 if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));
    396                 // brute force approach
    397                 UTF16.append(buf, cp);
    398                 /* TODO: optimize
    399                 // since we know that the classes are monotonically increasing, after zero
    400                 // e.g. 0 5 7 9 0 3
    401                 // we can do an optimization
    402                 // there are only a few cases that work: zero, less, same, greater
    403                 // if both classes are the same, we fail
    404                 // if the decomp class < the segment class, we fail
    405 
    406                 segClass = getClass(cp);
    407                 if (decompClass <= segClass) return null;
    408                 */
    409             }
    410         }
    411         if (!ok) return null; // we failed, characters left over
    412         if (PROGRESS) System.out.println("Matches");
    413         if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
    414         String remainder = buf.toString();
    415 
    416         // brute force approach
    417         // to check to make sure result is canonically equivalent
    418         /*
    419         String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
    420         if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
    421         */
    422 
    423         if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
    424 
    425         // get the remaining combinations
    426         return getEquivalents2(remainder);
    427     }
    428 
    429     /*
    430     // TODO: fix once we have a codepoint interface to get the canonical combining class
    431     // TODO: Need public access to canonical combining class in UCharacter!
    432     private static int getClass(int cp) {
    433         return Normalizer.getClass((char)cp);
    434     }
    435     */
    436 
    437    // ================= BUILDER =========================
    438     // TODO: Flatten this data so it doesn't have to be reconstructed each time!
    439 
    440     //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
    441     private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change
    442     static {
    443         SET_WITH_NULL_STRING.add("");
    444     }
    445 
    446   //  private static UnicodeSet SAFE_START = new UnicodeSet();
    447   //  private static CharMap AT_START = new CharMap();
    448 
    449         // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
    450         // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
    451   //  private static int LAST_UNICODE = 0x10FFFF;
    452     /*
    453     static {
    454         buildData();
    455     }
    456     */
    457     /*
    458     private static void buildData() {
    459 
    460         if (PROGRESS) System.out.println("Getting Safe Start");
    461         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
    462             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
    463             int cc = UCharacter.getCombiningClass(cp);
    464             if (cc == 0) SAFE_START.add(cp);
    465             // will fix to be really safe below
    466         }
    467         if (PROGRESS) System.out.println();
    468 
    469         if (PROGRESS) System.out.println("Getting Containment");
    470         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
    471             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
    472 
    473             if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
    474 
    475             //String istr = UTF16.valueOf(cp);
    476             String decomp = Normalizer.normalize(cp, Normalizer.NFD);
    477             //if (decomp.equals(istr)) continue;
    478 
    479             // add each character in the decomposition to canBeIn
    480 
    481             int component;
    482             for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
    483                 component = UTF16.charAt(decomp, i);
    484                 if (i == 0) {
    485                     AT_START.add(component, cp);
    486                 } else if (UCharacter.getCombiningClass(component) == 0) {
    487                     SAFE_START.remove(component);
    488                 }
    489             }
    490         }
    491         if (PROGRESS) System.out.println();
    492     }
    493         // the following is just for a map from characters to a set of characters
    494 
    495     private static class CharMap {
    496         Map storage = new HashMap();
    497         MutableInt probe = new MutableInt();
    498         boolean converted = false;
    499 
    500         public void add(int cp, int whatItIsIn) {
    501             UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
    502             if (result == null) {
    503                 result = new UnicodeSet();
    504                 storage.put(probe, result);
    505             }
    506             result.add(whatItIsIn);
    507         }
    508 
    509         public UnicodeSet get(int cp) {
    510             return (UnicodeSet) storage.get(probe.set(cp));
    511         }
    512     }
    513 
    514     private static class MutableInt {
    515         public int contents;
    516         public int hashCode() { return contents; }
    517         public boolean equals(Object other) {
    518             return ((MutableInt)other).contents == contents;
    519         }
    520         // allows chaining
    521         public MutableInt set(int contents) {
    522             this.contents = contents;
    523             return this;
    524         }
    525     }
    526     */
    527 
    528 }
    529