Home | History | Annotate | Download | only in translit
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /**
      4  *******************************************************************************
      5  * Copyright (C) 2001-2010, International Business Machines Corporation and    *
      6  * others. All Rights Reserved.                                                *
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.dev.demo.translit;
     10 import java.io.BufferedWriter;
     11 import java.io.File;
     12 import java.io.FileOutputStream;
     13 import java.io.IOException;
     14 import java.io.OutputStreamWriter;
     15 import java.io.PrintWriter;
     16 import java.util.Comparator;
     17 import java.util.HashMap;
     18 import java.util.Iterator;
     19 import java.util.Set;
     20 import java.util.TreeSet;
     21 
     22 import com.ibm.icu.impl.Utility;
     23 import com.ibm.icu.lang.UCharacter;
     24 import com.ibm.icu.lang.UScript;
     25 import com.ibm.icu.text.Normalizer;
     26 import com.ibm.icu.text.Transliterator;
     27 import com.ibm.icu.text.UTF16;
     28 import com.ibm.icu.text.UnicodeSet;
     29 import com.ibm.icu.text.UnicodeSetIterator;
     30 
     31 public class TransliterationChart {
     32     public static void main(String[] args) throws IOException {
     33         System.out.println("Start");
     34         UnicodeSet lengthMarks = new UnicodeSet("[\u09D7\u0B56-\u0B57\u0BD7\u0C56\u0CD5-\u0CD6\u0D57\u0C55\u0CD5]");
     35         int[] indicScripts = {
     36             UScript.LATIN,
     37             UScript.DEVANAGARI,
     38             UScript.BENGALI,
     39             UScript.GURMUKHI,
     40             UScript.GUJARATI,
     41             UScript.ORIYA,
     42             UScript.TAMIL,
     43             UScript.TELUGU,
     44             UScript.KANNADA,
     45             UScript.MALAYALAM,
     46         };
     47         String[] names = new String[indicScripts.length];
     48         UnicodeSet[] sets = new UnicodeSet[indicScripts.length];
     49         Transliterator[] fallbacks = new Transliterator[indicScripts.length];
     50         for (int i = 0; i < indicScripts.length; ++i) {
     51             names[i] = UScript.getName(indicScripts[i]);
     52             sets[i] = new UnicodeSet("[[:" + names[i] + ":]&[[:L:][:M:]]&[:age=3.1:]]");
     53             fallbacks[i] = Transliterator.getInstance("any-" + names[i]);
     54         }
     55         EquivClass eq = new EquivClass(new ReverseComparator());
     56         PrintWriter pw = openPrintWriter("transChart.html");
     57         pw.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
     58         pw.println("<title>Indic Transliteration Chart</title><style>");
     59         pw.println("td { text-align: Center; font-size: 200% }");
     60         pw.println("tt { font-size: 50% }");
     61         pw.println("td.miss { background-color: #CCCCFF }");
     62         pw.println("</style></head><body bgcolor='#FFFFFF'>");
     63 
     64         Transliterator anyToLatin = Transliterator.getInstance("any-latin");
     65 
     66         String testString = "\u0946\u093E";
     67 
     68         UnicodeSet failNorm = new UnicodeSet();
     69         Set latinFail = new TreeSet();
     70 
     71         for (int i = 0; i < indicScripts.length; ++i) {
     72             if (indicScripts[i] == UScript.LATIN) continue;
     73             String source = names[i];
     74             System.out.println(source);
     75             UnicodeSet sourceChars = sets[i];
     76 
     77             for (int j = 0; j < indicScripts.length; ++j) {
     78                 if (i == j) continue;
     79                 String target = names[j];
     80                 Transliterator forward = Transliterator.getInstance(source + '-' + target);
     81                 Transliterator backward = forward.getInverse();
     82                 UnicodeSetIterator it = new UnicodeSetIterator(sourceChars);
     83                 while (it.next()) {
     84                     if (lengthMarks.contains(it.codepoint)) continue;
     85                     String s = Normalizer.normalize(it.codepoint,Normalizer.NFC,0);
     86                     //if (!Normalizer.isNormalized(s,Normalizer.NFC,0)) continue;
     87                     if (!s.equals(Normalizer.normalize(s,Normalizer.NFD,0))) {
     88                         failNorm.add(it.codepoint);
     89                     }
     90                     String t = fix(forward.transliterate(s));
     91                     if (t.equals(testString)) {
     92                         System.out.println("debug");
     93                     }
     94 
     95                     String r = fix(backward.transliterate(t));
     96                     if (Normalizer.compare(s,r,0) == 0) {
     97                         if (indicScripts[j] != UScript.LATIN) eq.add(s,t);
     98                     } else {
     99                         if (indicScripts[j] == UScript.LATIN) {
    100                             latinFail.add(s + " - " + t + " - " + r);
    101                         }
    102                     }
    103                 }
    104             }
    105         }
    106         // collect equivalents
    107         pw.println("<table border='1' cellspacing='0'><tr>");
    108         for (int i = 0; i < indicScripts.length; ++i) {
    109             pw.print("<th width='10%'>" + names[i].substring(0,3) + "</th>");
    110         }
    111         pw.println("</tr>");
    112 
    113         Iterator rit = eq.getSetIterator(new MyComparator());
    114         while(rit.hasNext()) {
    115             Set equivs = (Set)rit.next();
    116             pw.print("<tr>");
    117             Iterator sit = equivs.iterator();
    118             String source = (String)sit.next();
    119             String item = anyToLatin.transliterate(source);
    120             if (item.equals("") || source.equals(item)) item = "&nbsp;";
    121             pw.print("<td>" + item + "</td>");
    122             for (int i = 1; i < indicScripts.length; ++i) {
    123                 sit = equivs.iterator();
    124                 item = "";
    125                 while (sit.hasNext()) {
    126                     String trial = (String)sit.next();
    127                     if (!sets[i].containsAll(trial)) continue;
    128                     item = trial;
    129                     break;
    130                 }
    131                 String classString = "";
    132                 if (item.equals("")) {
    133                     classString = " class='miss'";
    134                     String temp = fallbacks[i].transliterate(source);
    135                     if (!temp.equals("") && !temp.equals(source)) item = temp;
    136                 }
    137                 String backup = item.equals("") ? "&nbsp;" : item;
    138                 pw.print("<td" + classString + " title='" + getName(item, "; ") + "'>"
    139                     + backup + "<br><tt>" + Utility.hex(item) + "</tt></td>");
    140             }
    141             /*
    142             Iterator sit = equivs.iterator();
    143             while (sit.hasNext()) {
    144                 String item = (String)sit.next();
    145                 pw.print("<td>" + item + "</td>");
    146             }
    147             */
    148             pw.println("</tr>");
    149         }
    150         pw.println("</table>");
    151         if (true) {
    152             pw.println("<h2>Failed Normalization</h2>");
    153 
    154             UnicodeSetIterator it = new UnicodeSetIterator(failNorm);
    155             UnicodeSet pieces = new UnicodeSet();
    156             while (it.next()) {
    157                 String s = UTF16.valueOf(it.codepoint);
    158                 String d = Normalizer.normalize(s,Normalizer.NFD,0);
    159                 pw.println("Norm:" + s + ", " + Utility.hex(s) + " " + UCharacter.getName(it.codepoint)
    160                      + "; " + d + ", " + Utility.hex(d) + ", ");
    161                 pw.println(UCharacter.getName(d.charAt(1)) + "<br>");
    162                 if (UCharacter.getName(d.charAt(1)).indexOf("LENGTH") >= 0) pieces.add(d.charAt(1));
    163             }
    164             pw.println(pieces);
    165 
    166             pw.println("<h2>Failed Round-Trip</h2>");
    167             Iterator cit = latinFail.iterator();
    168             while (cit.hasNext()) {
    169                 pw.println(cit.next() + "<br>");
    170             }
    171         }
    172 
    173         pw.println("</table></body></html>");
    174         pw.close();
    175         System.out.println("Done");
    176     }
    177 
    178     public static String fix(String s) {
    179         if (s.equals("\u0946\u093E")) return "\u094A";
    180         if (s.equals("\u0C46\u0C3E")) return "\u0C4A";
    181         if (s.equals("\u0CC6\u0CBE")) return "\u0CCA";
    182 
    183         if (s.equals("\u0947\u093E")) return "\u094B";
    184         if (s.equals("\u0A47\u0A3E")) return "\u0A4B";
    185         if (s.equals("\u0AC7\u0ABE")) return "\u0ACB";
    186         if (s.equals("\u0C47\u0C3E")) return "\u0C4B";
    187         if (s.equals("\u0CC7\u0CBE")) return "\u0CCB";
    188 
    189         //return Normalizer.normalize(s,Normalizer.NFD,0);
    190         return s;
    191     }
    192 
    193     public static PrintWriter openPrintWriter(String fileName) throws IOException {
    194         File lf = new File(fileName);
    195         System.out.println("Creating file: " + lf.getAbsoluteFile());
    196 
    197         return new PrintWriter(
    198                 new BufferedWriter(
    199                     new OutputStreamWriter(
    200                         new FileOutputStream(fileName), "UTF8"), 4*1024));
    201     }
    202 
    203 
    204     public static String getName(String s, String separator) {
    205         int cp;
    206         StringBuffer sb = new StringBuffer();
    207         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
    208             cp = UTF16.charAt(s,i);
    209             if (i != 0) sb.append(separator);
    210             sb.append(UCharacter.getName(cp));
    211         }
    212         return sb.toString();
    213     }
    214 
    215     static class MyComparator implements Comparator {
    216        public int compare(Object o1, Object o2) {
    217             Iterator i1 = ((TreeSet) o1).iterator();
    218             Iterator i2 = ((TreeSet) o2).iterator();
    219             while (i1.hasNext() && i2.hasNext()) {
    220                 String a = (String)i1.next();
    221                 String b = (String)i2.next();
    222                 int result = a.compareTo(b);
    223                 if (result != 0) return result;
    224             }
    225             if (i1.hasNext()) return 1;
    226             if (i2.hasNext()) return -1;
    227             return 0;
    228         }
    229 
    230     }
    231     static class ReverseComparator implements Comparator {
    232         public int compare(Object o1, Object o2) {
    233             String a = o1.toString();
    234             char a1 = a.charAt(0);
    235             String b = o2.toString();
    236             char b1 = b.charAt(0);
    237             if (a1 < 0x900 && b1 > 0x900) return -1;
    238             if (a1 > 0x900 && b1 < 0x900) return +1;
    239             return a.compareTo(b);
    240         }
    241     }
    242 
    243     static class EquivClass {
    244         EquivClass(Comparator c) {
    245             comparator = c;
    246         }
    247         private HashMap itemToSet = new HashMap();
    248         private Comparator comparator;
    249 
    250         void add(Object a, Object b) {
    251             Set sa = (Set)itemToSet.get(a);
    252             Set sb = (Set)itemToSet.get(b);
    253             if (sa == null && sb == null) { // new set!
    254                 Set s = new TreeSet(comparator);
    255                 s.add(a);
    256                 s.add(b);
    257                 itemToSet.put(a, s);
    258                 itemToSet.put(b, s);
    259             } else if (sa == null) {
    260                 sb.add(a);
    261             } else if (sb == null) {
    262                 sa.add(b);
    263             } else { // merge sets, dumping sb
    264                 sa.addAll(sb);
    265                 Iterator it = sb.iterator();
    266                 while (it.hasNext()) {
    267                     itemToSet.put(it.next(), sa);
    268                 }
    269             }
    270         }
    271 
    272         private class MyIterator implements Iterator {
    273             private Iterator it;
    274             MyIterator (Comparator comp) {
    275                 TreeSet values = new TreeSet(comp);
    276                 values.addAll(itemToSet.values());
    277                 it = values.iterator();
    278             }
    279 
    280             public boolean hasNext() {
    281                 return it.hasNext();
    282             }
    283             public Object next() {
    284                 return it.next();
    285             }
    286             public void remove() {
    287                 throw new IllegalArgumentException("can't remove");
    288             }
    289         }
    290 
    291         public Iterator getSetIterator (Comparator comp) {
    292             return new MyIterator(comp);
    293         }
    294 
    295     }
    296 }