Home | History | Annotate | Download | only in normalizer
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 1996-2010, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.dev.test.normalizer;
     10 
     11 import java.util.Collection;
     12 import java.util.Iterator;
     13 import java.util.Set;
     14 import java.util.SortedSet;
     15 import java.util.TreeSet;
     16 
     17 import org.junit.Test;
     18 import org.junit.runner.RunWith;
     19 import org.junit.runners.JUnit4;
     20 
     21 import com.ibm.icu.dev.test.TestFmwk;
     22 import com.ibm.icu.impl.Utility;
     23 import com.ibm.icu.lang.UCharacter;
     24 import com.ibm.icu.text.CanonicalIterator;
     25 import com.ibm.icu.text.Normalizer;
     26 import com.ibm.icu.text.UTF16;
     27 
     28 
     29 // TODO: fit into test framework
     30 
     31 @RunWith(JUnit4.class)
     32 public class TestCanonicalIterator extends TestFmwk {
     33 
     34     static final boolean SHOW_NAMES = false;
     35 
     36     static final String testArray[][] = {
     37         {"\u00C5d\u0307\u0327", "A\u030Ad\u0307\u0327, A\u030Ad\u0327\u0307, A\u030A\u1E0B\u0327, "
     38             + "A\u030A\u1E11\u0307, \u00C5d\u0307\u0327, \u00C5d\u0327\u0307, "
     39             + "\u00C5\u1E0B\u0327, \u00C5\u1E11\u0307, \u212Bd\u0307\u0327, "
     40             + "\u212Bd\u0327\u0307, \u212B\u1E0B\u0327, \u212B\u1E11\u0307"},
     41         {"\u010d\u017E", "c\u030Cz\u030C, c\u030C\u017E, \u010Dz\u030C, \u010D\u017E"},
     42         {"x\u0307\u0327", "x\u0307\u0327, x\u0327\u0307, \u1E8B\u0327"},
     43     };
     44 
     45     @Test
     46     public void TestExhaustive() {
     47         int counter = 0;
     48         CanonicalIterator it = new CanonicalIterator("");
     49         /*
     50         CanonicalIterator slowIt = new CanonicalIterator("");
     51         slowIt.SKIP_ZEROS = false;
     52         */
     53         //Transliterator name = Transliterator.getInstance("[^\\u0020-\\u007F] name");
     54         //Set itSet = new TreeSet();
     55         //Set slowItSet = new TreeSet();
     56 
     57 
     58         for (int i = 0; i < 0x10FFFF; ++i) {
     59 
     60             // skip characters we know don't have decomps
     61             int type = UCharacter.getType(i);
     62             if (type == Character.UNASSIGNED || type == Character.PRIVATE_USE
     63                 || type == Character.SURROGATE) continue;
     64 
     65             if ((++counter % 5000) == 0) logln("Testing " + Utility.hex(i,0));
     66 
     67             String s = UTF16.valueOf(i);
     68             characterTest(s, i, it);
     69 
     70             characterTest(s + "\u0345", i, it);
     71         }
     72     }
     73 
     74     public int TestSpeed() {
     75          // skip unless verbose
     76         if (!isVerbose()) return 0;
     77 
     78            String s = "\uAC01\u0345";
     79 
     80         CanonicalIterator it = new CanonicalIterator(s);
     81         double start, end;
     82         int x = 0; // just to keep code from optimizing away.
     83         int iterations = 10000;
     84         double slowDelta = 0;
     85 
     86         /*
     87         CanonicalIterator slowIt = new CanonicalIterator(s);
     88         slowIt.SKIP_ZEROS = false;
     89 
     90         start = System.currentTimeMillis();
     91         for (int i = 0; i < iterations; ++i) {
     92             slowIt.setSource(s);
     93             while (true) {
     94                 String item = slowIt.next();
     95                 if (item == null) break;
     96                 x += item.length();
     97             }
     98         }
     99         end = System.currentTimeMillis();
    100         double slowDelta = (end-start) / iterations;
    101         logln("Slow iteration: " + slowDelta);
    102         */
    103 
    104         start = System.currentTimeMillis();
    105         for (int i = 0; i < iterations; ++i) {
    106             it.setSource(s);
    107             while (true) {
    108                 String item = it.next();
    109                 if (item == null) break;
    110                 x += item.length();
    111             }
    112         }
    113         end = System.currentTimeMillis();
    114         double fastDelta = (end-start) / iterations;
    115         logln("Fast iteration: " + fastDelta + (slowDelta != 0 ? ", " + (fastDelta/slowDelta) : ""));
    116 
    117 
    118         return x;
    119     }
    120 
    121     @Test
    122     public void TestBasic() {
    123 //      This is not interesting anymore as the data is already built
    124 //      beforehand
    125 
    126 //        check build
    127 //        UnicodeSet ss = CanonicalIterator.getSafeStart();
    128 //        logln("Safe Start: " + ss.toPattern(true));
    129 //        ss = CanonicalIterator.getStarts('a');
    130 //        expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
    131 //            new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
    132 //            + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
    133 //                );
    134 
    135         // check permute
    136         // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
    137 
    138         Set results = new TreeSet();
    139         CanonicalIterator.permute("ABC", false, results);
    140         expectEqual("Simple permutation ", "", collectionToString(results), "ABC, ACB, BAC, BCA, CAB, CBA");
    141 
    142         // try samples
    143         SortedSet set = new TreeSet();
    144         for (int i = 0; i < testArray.length; ++i) {
    145             //logln("Results for: " + name.transliterate(testArray[i]));
    146             CanonicalIterator it = new CanonicalIterator(testArray[i][0]);
    147            // int counter = 0;
    148             set.clear();
    149             String first = null;
    150             while (true) {
    151                 String result = it.next();
    152                 if(first==null){
    153                     first = result;
    154                 }
    155                 if (result == null) break;
    156                 set.add(result); // sort them
    157                 //logln(++counter + ": " + hex.transliterate(result));
    158                 //logln(" = " + name.transliterate(result));
    159             }
    160             expectEqual(i + ": ", testArray[i][0], collectionToString(set), testArray[i][1]);
    161             it.reset();
    162             if(!it.next().equals(first)){
    163                 errln("CanonicalIterator.reset() failed");
    164             }
    165             if(!it.getSource().equals(Normalizer.normalize(testArray[i][0],Normalizer.NFD))){
    166                 errln("CanonicalIterator.getSource() does not return NFD of input source");
    167             }
    168         }
    169     }
    170 
    171     private void expectEqual(String message, String item, Object a, Object b) {
    172         if (!a.equals(b)) {
    173             errln("FAIL: " + message + getReadable(item));
    174             errln("\t" + getReadable(a));
    175             errln("\t" + getReadable(b));
    176         } else {
    177             logln("Checked: " + message + getReadable(item));
    178             logln("\t" + getReadable(a));
    179             logln("\t" + getReadable(b));
    180         }
    181     }
    182 
    183     //Transliterator name = null;
    184     //Transliterator hex = null;
    185 
    186     public String getReadable(Object obj) {
    187         if (obj == null) return "null";
    188         String s = obj.toString();
    189         if (s.length() == 0) return "";
    190         // set up for readable display
    191         //if (name == null) name = Transliterator.getInstance("[^\\ -\\u007F] name");
    192         //if (hex == null) hex = Transliterator.getInstance("[^\\ -\\u007F] hex");
    193         return "[" + (SHOW_NAMES ? hex(s) + "; " : "") + hex(s) + "]";
    194     }
    195 
    196     private void characterTest(String s, int ch, CanonicalIterator it)
    197     {
    198         int mixedCounter = 0;
    199         int lastMixedCounter = -1;
    200         boolean gotDecomp = false;
    201         boolean gotComp = false;
    202         boolean gotSource = false;
    203         String decomp = Normalizer.decompose(s, false);
    204         String comp = Normalizer.compose(s, false);
    205 
    206         // skip characters that don't have either decomp.
    207         // need quick test for this!
    208         if (s.equals(decomp) && s.equals(comp)) return;
    209 
    210         it.setSource(s);
    211 
    212         while (true) {
    213             String item = it.next();
    214             if (item == null) break;
    215             if (item.equals(s)) gotSource = true;
    216             if (item.equals(decomp)) gotDecomp = true;
    217             if (item.equals(comp)) gotComp = true;
    218             if ((mixedCounter & 0x7F) == 0 && (ch < 0xAD00 || ch > 0xAC00 + 11172)) {
    219                 if (lastMixedCounter != mixedCounter) {
    220                     logln("");
    221                     lastMixedCounter = mixedCounter;
    222                 }
    223                 logln("\t" + mixedCounter + "\t" + hex(item)
    224                 + (item.equals(s) ? "\t(*original*)" : "")
    225                 + (item.equals(decomp) ? "\t(*decomp*)" : "")
    226                 + (item.equals(comp) ? "\t(*comp*)" : "")
    227                 );
    228             }
    229 
    230         }
    231 
    232         // check that zeros optimization doesn't mess up.
    233         /*
    234         if (true) {
    235             it.reset();
    236             itSet.clear();
    237             while (true) {
    238                 String item = it.next();
    239                 if (item == null) break;
    240                 itSet.add(item);
    241             }
    242             slowIt.setSource(s);
    243             slowItSet.clear();
    244             while (true) {
    245                 String item = slowIt.next();
    246                 if (item == null) break;
    247                 slowItSet.add(item);
    248             }
    249             if (!itSet.equals(slowItSet)) {
    250                 errln("Zero optimization failure with " + getReadable(s));
    251             }
    252         }
    253         */
    254 
    255         mixedCounter++;
    256         if (!gotSource || !gotDecomp || !gotComp) {
    257             errln("FAIL CanonicalIterator: " + s + " decomp: " +decomp+" comp: "+comp);
    258             it.reset();
    259             for(String item=it.next();item!=null;item=it.next()){
    260                 err(item + "    ");
    261             }
    262             errln("");
    263         }
    264     }
    265 
    266     static String collectionToString(Collection col) {
    267         StringBuffer result = new StringBuffer();
    268         Iterator it = col.iterator();
    269         while (it.hasNext()) {
    270             if (result.length() != 0) result.append(", ");
    271             result.append(it.next().toString());
    272         }
    273         return result.toString();
    274     }
    275 }