Home | History | Annotate | Download | only in tool
      1 package org.unicode.cldr.tool;
      2 
      3 import java.io.BufferedReader;
      4 import java.io.IOException;
      5 import java.io.PrintWriter;
      6 import java.lang.reflect.Field;
      7 import java.util.ArrayList;
      8 import java.util.Comparator;
      9 import java.util.List;
     10 import java.util.Locale;
     11 import java.util.Map;
     12 import java.util.Set;
     13 import java.util.TreeMap;
     14 import java.util.TreeSet;
     15 
     16 import org.unicode.cldr.draft.FileUtilities;
     17 import org.unicode.cldr.util.CldrUtility;
     18 import org.unicode.cldr.util.Pair;
     19 
     20 import com.ibm.icu.impl.Relation;
     21 import com.ibm.icu.lang.UCharacter;
     22 import com.ibm.icu.text.Collator;
     23 import com.ibm.icu.text.NumberFormat;
     24 import com.ibm.icu.text.Transliterator;
     25 import com.ibm.icu.text.UTF16;
     26 import com.ibm.icu.text.UnicodeSet;
     27 import com.ibm.icu.text.UnicodeSetIterator;
     28 import com.ibm.icu.util.ULocale;
     29 
     30 /**
     31  * Takes a list of mappings (tab delimited) from source to target and produces a
     32  * transliterator
     33  *
     34  * @author markdavis
     35  *         http://en.wikipedia.org/wiki/English_phonology
     36  */
     37 public class MakeTransliterator {
     38     // DEBUGGING
            // Source words shorter than this always get a trailing $x context in their generated rule,
            // and main() does not rebuild its working transliterator until sources reach this length.
     39     static int forceSeparateIfShorter = 4; // 4
     40 
            // When non-null, main() transliterates this word with the base English-to-IPA rules,
            // prints the result and returns.
     41     private static final String CHECK_BASE = null; // "vessel";
            // When non-null, main() loads the previously generated en-IPA.txt, transliterates this
            // word with it, prints the result and returns.
     42     private static final String CHECK_BUILT = null; // "vessel";
     43 
            // Source word for which main() prints extra diagnostics (null disables the checks).
     44     private static final String TEST_STRING = "territories";
            // When true, every applied or newly added override is printed to standard output.
     45     private static final boolean SHOW_OVERRIDES = true;
     46 
            // Targets read while the line's current frequency is below this value are skipped.
     47     private static final int MINIMUM_FREQUENCY = 9999;
     48 
            // Selects how many fields an input line must have: main() requires more than 2 pieces
            // when true, more than 1 when false.
     49     static boolean isIPA = true;
            // Only referenced from the commented-out writer at the end of main().
     50     static boolean onlyToTarget = true;
     51 
     52     // others
     53 
            // Formats the counts printed in main()'s summaries.
     54     static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH);
     55 
            // Root collator used by MyComparator to order sources of equal length.
     56     static Collator col = Collator.getInstance(ULocale.ROOT);
     57 
            // Directory holding the internal_*.txt inputs and receiving en-IPA.txt / en-IPA_count.txt.
            // NOTE(review): hard-coded Windows path; the tool cannot run elsewhere without editing it.
     58     static String cldrDataDir = "C:\\cvsdata\\unicode\\cldr\\tools\\java\\org\\unicode\\cldr\\util\\data\\transforms\\";
     59 
     /**
      * Generates the English-to-IPA rule file.
      * <p>
      * Steps, in order:
      * <ol>
      * <li>load the fix-bad-IPA rules, the overrides and the base English-to-IPA rules
      * (optionally just run one of the CHECK_* probes and return);</li>
      * <li>read {@code internal_raw_IPA.txt} line by line, normalise each source word and feed
      * every accepted target to {@code addSourceTarget};</li>
      * <li>add overrides whose word never appeared in the input;</li>
      * <li>walk the collected sources in {@code MyComparator} order and emit a rule for each
      * source whose targets are not already produced (exactly or "mostly") by the rules built
      * so far;</li>
      * <li>write the rules to {@code en-IPA.txt}, the per-frequency list of added rules to
      * {@code en-IPA_count.txt}, and print summary counts.</li>
      * </ol>
      * Skipped or notable input is logged to {@code C:\DATA\GEN\SkippedIPA.txt}.
      *
      * @param args not used
      * @throws IOException if one of the files cannot be opened or read
      */
     60     public static void main(String[] args) throws IOException {
     61         setTranslitDebug(true);
     62 
                // NOTE(review): the two Locale prints below feed nothing later in this method;
                // looks like leftover debugging — confirm before removing.
     63         Locale fil = new Locale("fil");
     64         System.out.println(fil);
     65         fil = new Locale("fil", "US");
     66         System.out.println(fil);
     67 
     68         String sourceFile = cldrDataDir + "internal_raw_IPA.txt";
     69         String targetFile = cldrDataDir + "en-IPA.txt";
     70         String targetCountFile = cldrDataDir + "en-IPA_count.txt";
     71         String skippedLinesFile = "C:\\DATA\\GEN\\SkippedIPA.txt";
     72 
                // NOTE(review): skippedOut is closed only at the very end of this method; the early
                // returns below and any exception leave it open.
     73         PrintWriter skippedOut = FileUtilities.openUTF8Writer("", skippedLinesFile);
     74 
     75         // String coreRules = getCoreTransliterator();
     76         String fixBadIpaRules = createFromFile(cldrDataDir + "internal_fixBadIpa.txt", null, null);
     77         fixBadIpa = Transliterator.createFromRules("foo", fixBadIpaRules, Transliterator.FORWARD);
     78 
                // must come after fixBadIpa is set: getOverrides() runs the override targets through it
     79         Map<String, String> overrides = getOverrides();
     80 
     81         String coreForeRules = createFromFile(cldrDataDir + "internal_baseEnglishToIpa.txt", null, null);
     82         coreBase = Transliterator.createFromRules("foo", coreForeRules, Transliterator.FORWARD);
     83         if (CHECK_BASE != null) {
     84             setTranslitDebug(true);
     85             System.out.println(coreBase.transliterate(CHECK_BASE));
     86             return;
     87         }
     88 
     89         if (CHECK_BUILT != null) {
     90             String foo = createFromFile(cldrDataDir + "en-IPA.txt", null, null);
     91             Transliterator fooTrans = Transliterator.createFromRules("foo", foo, Transliterator.FORWARD);
     92 
     93             setTranslitDebug(true);
     94             System.out.println(fooTrans.transliterate(CHECK_BUILT));
     95             return;
     96         }
     97 
     98         String coreBackRules = createFromFile(cldrDataDir + "internal_English-IPA-backwards.txt", null, null);
     99         checkCoreReversibility(skippedOut, coreForeRules, coreBackRules);
    100         String coreRules = coreForeRules + coreBackRules;
    101         System.out.println(coreRules);
    102 
    103         // C:\DATA\GEN\mergedIPA2.txt
    104         // we have to have items in order. Longest forms need to come first, on both
    105         // sides.
                // source word -> set of (target, frequency); keys are ordered by MyComparator
    106         Relation<String, Pair<String, Long>> store = Relation.of(new TreeMap<String, Set<Pair<String, Long>>>(MyComparator),
    107             TreeSet.class);
    108 
    109         targetCharacters = new UnicodeSet();
    110         sourceCharacters = new UnicodeSet();
    111         allowedSourceCharacters = new UnicodeSet(
    112             "[[:Letter:]\u2019]").freeze();
    113         allowedTargetCharacters = new UnicodeSet(
    114             "[\u00E6 \u0251 b d\u00F0 e \u0259 \u025B f-i \u026A j-n \u014B o p r s \u0283 t u \u028A v w z \u0292 \u03B8]")
    115                 .freeze();
    116         countSkipped = 0;
    117         totalFrequency = 0;
    118         skippedFrequency = 0;
                // a line must have more than this many fields, otherwise it is ignored
    119         int targetField = isIPA ? 2 : 1;
    120 
    121         BufferedReader in = FileUtilities.openUTF8Reader("", sourceFile);
    122         while (true) {
    123             String line = in.readLine();
    124             if (line == null)
    125                 break;
    126             if (line.startsWith("\uFEFF")) {
    127                 line = line.substring(1);
    128             }
    129             String originalLine = line;
    130             int commentCharPosition = line.indexOf('#');
    131             if (commentCharPosition >= 0) {
    132                 line = line.substring(0, commentCharPosition);
    133             }
    134             line = line.trim();
                    // reset per line; only a "%<number>" field further along the line gives it a real value
    135             frequency = -1;
                    // fields are separated by a tab or a comma, with optional surrounding spaces
    136             String[] pieces = line.split(" *[\\t,] *");
    137             if (pieces.length <= targetField) {
    138                 // skippedOut.println(originalLine + "\tno phonetics");
    139                 // countSkipped++;
    140                 continue; // no phonetics
    141             }
    142             String source = pieces[0];
    143             if (TEST_STRING != null && source.equals(TEST_STRING)) {
    144                 System.out.println(line); // for debugging
    145             }
    146 
    147             // Fix Source
    148             source = source.replace("'", "");
    149             source = UCharacter.toLowerCase(ULocale.ENGLISH, source);
    150             if (source.endsWith(".")) {
    151                 source = source.substring(0, source.length() - 1);
    152             }
    153             if (source.contains(" ") || source.contains("-")) {
    154                 skippedOut.println(originalLine + "\tspace or hyphen");
    155                 countSkipped++;
                        // NOTE(review): frequency is still -1 here (it is parsed further down), so this
                        // subtracts 1 per skipped line — confirm intent.
    156                 skippedFrequency += frequency;
    157                 continue;
    158             }
    159 
    160             //String bestTarget = null;
    161 
    162             String override = overrides.get(source);
    163             String spelling = spellout.transliterate(source);
    164 
    165             for (int i = 1; i < pieces.length; ++i) {
    166                 String target = pieces[i];
                        // "%<digits>" sets the frequency used for the targets that follow on this line
    167                 if (target.startsWith("%")) {
    168                     frequency = Long.parseLong(target.substring(1));
    169                     continue;
    170                 }
    171 
                        // An override replaces every target listed for the word; an empty override
                        // drops the word. Either way the rest of the line is ignored.
    172                 if (override != null) {
    173                     if (SHOW_OVERRIDES)
    174                         System.out.println("Overriding\t" + source + "  ! " + target + "  " + override);
    175                     if (override.length() != 0) {
    176                         if (TEST_STRING != null && source.equals(TEST_STRING)) {
    177                             setTranslitDebug(true);
    178                         }
    179                         target = fixBadIpa.transliterate(override);
    180                         setTranslitDebug(false);
    181                         addSourceTarget(skippedOut, source, target, originalLine, store);
    182                     }
    183                     break;
    184                 }
    185 
                        // also true when no "%" field has been seen yet on this line (frequency == -1)
    186                 if (frequency < MINIMUM_FREQUENCY) {
    187                     // skippedOut.println(originalLine + "\tno frequency");
    188                     countSkipped++;
    189                     continue;
    190                 }
    191 
    192                 target = UCharacter.toLowerCase(ULocale.ENGLISH, target);
    193                 target = target.replace(" ", ""); // remove extra spaces
    194 
                        // targets that start or end with a hyphen are ignored without logging
    195                 if (target.startsWith("-") || target.endsWith("-")) {
    196                     continue;
    197                 }
    198 
    199                 String oldTarget = target;
    200                 target = fixBadIpa.transliterate(target);
    201 
                        // the transcription equals the word read out letter by letter: log and skip
    202                 if (target.equals(spelling)) {
    203                     skippedOut.println(originalLine
    204                         + "\tspellout");
    205                     countSkipped++;
    206                     continue;
    207                 }
    208 
    209                 if (!target.equals(oldTarget)) {
    210                     skippedOut.println("\t### fixed IPA:\t" + source + "\t" + target
    211                         + "\twas: " + oldTarget);
    212                 }
    213 
    214                 addSourceTarget(skippedOut, source, target, originalLine, store);
    215             }
    216         }
    217 
    218         // add the overrides that are not in.
    219 
    220         for (String word : overrides.keySet()) {
    221             if (!store.containsKey(word)) {
    222                 String target = overrides.get(word);
    223                 if (target.length() != 0) {
    224                     if (SHOW_OVERRIDES) System.out.println("New overrides:\t" + word + "  " + target);
    225                     addSourceTarget(skippedOut, word, target, "overrides", store);
    226                 }
    227             }
    228         }
                // NOTE(review): the reader is closed only on this normal path; an exception in the
                // read loop above leaves it open.
    229         in.close();
    230         System.out.println("total count: " + nf.format(store.size()));
    231         System.out.println("skipped count: " + nf.format(countSkipped));
    232 
    233         System.out.println("total frequency-weighted: " + nf.format(totalFrequency));
    234         System.out.println("skipped frequency-weighted: " + nf.format(skippedFrequency));
    235 
                // disabled dump of the character repertoires collected by addSourceTarget
    236         if (false) {
    237             System.out.println(CldrUtility.LINE_SEPARATOR + "Source Characters ");
    238             showSet(sourceCharacters);
    239             System.out.println(CldrUtility.LINE_SEPARATOR + "Target Characters ");
    240             showSet(targetCharacters);
    241         }
    242 
    243         // Set<String> seenSource = new HashSet<String>();
    244         // Set<String> seenTarget = new HashSet<String>();
    245 
    246         int countAdded = 0;
    247         int countTotal = 0;
    248         long frequencyAdded = 0;
    249         long frequencySkipped = 0;
    250 
    251         Transliterator base = Transliterator.createFromRules("foo", coreRules, Transliterator.FORWARD);
    252         // build up the transliterator one length at a time.
    253         List<String> newRules = new ArrayList<String>();
    254         StringBuilder buffer = new StringBuilder();
    255 
    256         int lastSourceLength = 1;
    257 
                // keyed by negated frequency, so iterating the keys visits the most frequent entries first
    258         Relation<Long, String> count_failures = Relation.of(new TreeMap<Long, Set<String>>(), TreeSet.class);
    259 
                // store is ordered by MyComparator, so sources arrive shortest first
    260         sourceLoop: for (String source : store.keySet()) {
    261             if (TEST_STRING != null && source.equals(TEST_STRING)) {
    262                 System.out.println(source + "\t" + store.getAll(source));
    263             }
    264             countTotal++;
    265             // whenever the source changes in length, rebuild the transliterator
                    // (only once the length has reached forceSeparateIfShorter; the message reports
                    // the previous length)
    266             if (source.length() != lastSourceLength && source.length() >= forceSeparateIfShorter) {
    267                 System.out.println("Building transliterator for length " + lastSourceLength + " : " + newRules.size());
    268                 System.out.flush();
    269                 skippedOut.flush();
    270                 String rules = buildRules(coreRules, newRules, buffer);
    271                 // System.out.println(rules);
    272                 base = Transliterator.createFromRules("foo", rules, Transliterator.FORWARD);
    273 
    274                 lastSourceLength = source.length();
    275             }
    276             Set<Pair<String, Long>> targetSet = store.getAll(source);
    277             // see if any of the mappings fall out
    278             String targetUsingCore = base.transliterate(source);
    279 
    280             String bestTarget = null;
    281             int bestDistance = 999;
                    // local; shadows the static field of the same name
    282             long frequency = 0;
    283             for (Pair<String, Long> targetPair : targetSet) {
    284                 String target = targetPair.getFirst();
    285                 if (target.length() == 0) {
    286                     throw new IllegalArgumentException(source + "  " + target);
    287                 }
    288                 frequency = targetPair.getSecond();
    289 
    290                 if (targetUsingCore.equals(target)) {
    291                     // we have a match! skip this source
    292                     skippedOut.println("# skipping " + source + "  " + target + " ;");
    293                     frequencySkipped += frequency;
    294                     continue sourceLoop;
    295                 }
    296                 if (mostlyEqual(source, target, targetUsingCore)) {
    297                     // we have a match! skip this source
    298                     skippedOut.println("# skipping " + source + "  " + target + " ; # close enough to "
    299                         + targetUsingCore);
    300                     frequencySkipped += frequency;
    301                     continue sourceLoop;
    302                 }
                        // otherwise remember the target closest to what the current rules produce
    303                 int distance = distance(source, target, targetUsingCore);
    304                 if (bestDistance > distance) {
    305                     bestTarget = target;
    306                     bestDistance = distance;
    307                 }
    308             }
    309             // if we get to here, we have a new rule.
    310             if (bestTarget != null) {
    311                 boolean forceSeparate = false;
                        // short words, and targets more than 1.5 times as long as the source, are
                        // given the $x context on both sides
    312                 if (source.length() < forceSeparateIfShorter || bestTarget.length() * 2 > source.length() * 3) {
    313                     forceSeparate = true;
    314                 } else {
    315                     String spelling = spellout.transliterate(source);
    316                     if (bestTarget.equals(spelling)) {
    317                         forceSeparate = true;
    318                     } else {
    319                         // if it is likely that the word can have an extra letter added that changes the pronunciation
    320                         // force it to be separate
    321                         if (source.endsWith("e")) {
    322                             forceSeparate = true;
    323                         }
    324                     }
    325                 }
    326                 String targetUsingBaseCore = coreBase.transliterate(source);
    327 
    328                 if (forceSeparate) {
    329                     source = "$x{" + source + "}$x";
    330                 } else {
    331                     source = "$x{" + source;
    332                 }
    333                 // strange hack
                        // NOTE(review): source already starts with "$x{" at this point, so
                        // startsWith("use") cannot be true — confirm whether the bare word was meant.
    334                 String hackSource = source.startsWith("use") ? "'" + source + "'" : source;
                        // NOTE(review): the blank gap between source and target here (and in the log
                        // messages of this method) looks like a non-ASCII rule operator lost when
                        // this listing was extracted — TODO confirm against the upstream file.
    335                 newRules.add(hackSource + "  " + bestTarget + " ; # " + targetUsingCore
    336                     + (targetUsingBaseCore.equals(targetUsingCore) ? "" : "\t\t" + targetUsingBaseCore)
    337                     + CldrUtility.LINE_SEPARATOR);
    338                 skippedOut.println("# couldn't replace  " + source + "  " + bestTarget + " ; # " + targetUsingCore);
                        // NOTE(review): frequency holds the value of the last pair iterated above,
                        // which is not necessarily the pair that supplied bestTarget.
    339                 count_failures.put(-frequency, source + "  " + bestTarget + " ; # " + targetUsingCore);
    340                 countAdded++;
    341                 frequencyAdded += frequency;
    342             }
    343         }
    344 
    345         String rules = buildRules(coreRules, newRules, buffer);
    346         base = Transliterator.createFromRules("foo", rules, Transliterator.FORWARD); // verify that it builds
    347 
    348         PrintWriter out = FileUtilities.openUTF8Writer("", targetFile);
    349         out.println(rules);
    350         out.close();
    351 
    352         out = FileUtilities.openUTF8Writer("", targetCountFile);
    353         for (long count : count_failures.keySet()) {
    354             for (String line : count_failures.getAll(count)) {
    355                 out.println(count + "\t" + line);
    356             }
    357         }
    358         out.close();
    359 
    360         // if (false) {
    361         //
    362         // // now write out the transliterator file
    363         // PrintWriter out = FileUtilities.openUTF8Writer("", targetFile);
    364         // for (String source : store.keySet()) {
    365         // Set<String> targetSet = store.getAll(source);
    366         // for (String target : targetSet) {
    367         // if (seenSource.contains(source)) {
    368         // if (onlyToTarget) {
    369         // // nothing
    370         // } else if (seenTarget.contains(target)) {
    371         // skippedOut.println("# " + source + "  " + target + " ;");
    372         // countSkipped++;
    373         // } else {
    374         // out.println(source + "  " + target + " ;");
    375         // countSourceFromTarget++;
    376         // }
    377         // } else if (onlyToTarget || seenTarget.contains(target)) {
    378         // out.println(source + "  " + target + " ;");
    379         // countSourceToTarget++;
    380         // } else {
    381         // out.println(source + "  " + target + " ;");
    382         // countSourceAndTarget++;
    383         // }
    384         // seenSource.add(source);
    385         // seenTarget.add(target);
    386         // }
    387         // }
    388         // out.close();
    389         // }
    390         skippedOut.close();
    391         System.out.println("countTotal: " + nf.format(countTotal));
    392         System.out.println("countAdded: " + nf.format(countAdded));
                // derived from the rule-building loop's locals, not from the static countSkipped field
    393         System.out.println("countSkipped: " + nf.format(countTotal - countAdded));
    394         System.out.println("frequencyTotal: " + nf.format(frequencyAdded + frequencySkipped));
    395         System.out.println("frequencyAdded: " + nf.format(frequencyAdded));
    396         System.out.println("frequencySkipped: " + nf.format(frequencySkipped));
    397     }
    398 
    399     private static void setTranslitDebug(boolean newSetting) {
    400         // Transliterator.DEBUG = newSetting;
    401         try {
    402             Field debug = Transliterator.class.getField("DEBUG");
    403             debug.setBoolean(Transliterator.class, newSetting);
    404         } catch (Exception e) {
    405             e.printStackTrace();
    406         }
    407     }
    408 
    409     private static void addSourceTarget(PrintWriter skippedOut, String source, String target, String originalLine,
    410         Relation<String, Pair<String, Long>> store) {
    411         if (source.equals("teh")) {
    412             System.out.println("debug");
    413         }
    414         if (!allowedSourceCharacters.containsAll(source)) {
    415             skippedOut.println(originalLine
    416                 + "\t# Strange source values:\t"
    417                 + source
    418                 + "\t"
    419                 + new UnicodeSet().addAll(source)
    420                     .removeAll(allowedSourceCharacters).toPattern(false));
    421             countSkipped++;
    422             skippedFrequency += frequency;
    423             return;
    424         }
    425         if (!allowedTargetCharacters.containsAll(target)) {
    426             System.out.println(originalLine
    427                 + "\t# Strange target values:\t"
    428                 + target
    429                 + "\t"
    430                 + new UnicodeSet().addAll(target)
    431                     .removeAll(allowedTargetCharacters).toPattern(false));
    432             countSkipped++;
    433             skippedFrequency += frequency;
    434             return;
    435         }
    436 
    437         sourceCharacters.addAll(source);
    438         targetCharacters.addAll(target);
    439         store.put(source, new Pair<String, Long>(target, frequency));
    440         totalFrequency += frequency;
    441 
    442     }
    443 
    444     private static void checkCoreReversibility(PrintWriter skippedOut, String coreRules, String coreBackRules) {
    445         Transliterator base = Transliterator.createFromRules("foo", coreRules, Transliterator.FORWARD);
    446         Transliterator back = Transliterator.createFromRules("foo2", coreBackRules, Transliterator.REVERSE);
    447         String[] tests = "bat bait bet beet bit bite bot boat but bute bout boot book boy pat bat vat fat mat tat dat thew father nat sat zoo ash asia gate cat late rate hate yet rang chat jet"
    448             .split("\\s");
    449         for (String test : tests) {
    450             String test2 = base.transliterate(test);
    451             String test3 = back.transliterate(test2);
    452             skippedOut.println(test + "\t " + test2 + "\t " + test3);
    453         }
    454         skippedOut.flush();
    455     }
    456 
    457     private static String buildRules(String coreRules, List<String> newRules, StringBuilder buffer) {
    458         // Transliterator base;
    459         // build backwards!!
    460         buffer.setLength(0);
    461         buffer.append(
    462             "# Author: M Davis" + CldrUtility.LINE_SEPARATOR +
    463                 "# Email: mark.davis (at) icu-project.org" + CldrUtility.LINE_SEPARATOR +
    464                 "# Description: English to IPA" + CldrUtility.LINE_SEPARATOR +
    465                 // "$nletter {([A-Z]+)} $nletter > &en-IPA/spellout($1) ; " + Utility.LINE_SEPARATOR +
    466                 ":: lower(); " + CldrUtility.LINE_SEPARATOR +
    467                 "$x = [:^letter:] ;" + CldrUtility.LINE_SEPARATOR);
    468         for (int i = newRules.size() - 1; i >= 0; --i) {
    469             buffer.append(newRules.get(i));
    470         }
    471         buffer.append(coreRules);
    472         // System.out.println(buffer);
    473         String result = buffer.toString();
    474         // ensure it builds
    475         return result;
    476     }
    477 
    478     private static void showSet(UnicodeSet sourceCharacters) {
    479         for (UnicodeSetIterator it = new UnicodeSetIterator(sourceCharacters); it
    480             .next();) {
    481             System.out.println(com.ibm.icu.impl.Utility.hex(it.codepoint) + "\t("
    482                 + UTF16.valueOf(it.codepoint) + ")\t"
    483                 + UCharacter.getName(it.codepoint));
    484         }
    485     }
    486 
    // Character classes used by distance() (both sets) and mostlyEqual() (short_vowels).
    // NOTE(review): the runs of blanks inside the two patterns below look like non-ASCII (IPA)
    // characters that were lost when this listing was extracted; as written, short_vowels
    // lists no characters at all — TODO restore both patterns from the upstream file.
    487     public static UnicodeSet vowels = new UnicodeSet("[aeiou         ]").freeze();
    488     public static UnicodeSet short_vowels = new UnicodeSet("[     ]").freeze();
    // NOTE(review): the Javadoc below is attached to the targetChars field, but its text
    // ("Return true if ...") reads like the description of mostlyEqual(); its @param names do
    // not match any method in this class.
    489     /**
    490      * Return true if the strings are essentially the same.
    491      * Differences between schwas and short vowels are counted in certain cases
    492      *
    493      * @param targetDir
    494      * @param targetUsingCore
    495      * @param targetUsingCore2
    496      * @return
    497      */
    498     static UnicodeSet targetChars = new UnicodeSet();
    499     static UnicodeSet targetCoreChars = new UnicodeSet();
    500     static UnicodeSet tempDiff = new UnicodeSet();
    // The three sets above are scratch space that distance() clears and refills on every call.
    // distinguishLongVowels is applied to both strings in distance() before their character
    // sets are compared.
    // NOTE(review): the rule text below has empty left- and right-hand sides, presumably from
    // the same loss of non-ASCII characters — TODO confirm.
    501     static Transliterator distinguishLongVowels = Transliterator.createFromRules("faa",
    502         " >  ;" +
    503             " >  ;" +
    504             "o >  ;",
    505         Transliterator.FORWARD);
    506 
    507     private static int distance(String source, String target, String targetUsingCore) {
    508         if (target.equals(targetUsingCore)) return 0;
    509         if (mostlyEqual(source, target, targetUsingCore)) return 1;
    510         // first compare the consonants. Count each difference as 3
    511         String zappedTarget = distinguishLongVowels.transliterate(target);
    512         String zappedCoreTarget = distinguishLongVowels.transliterate(targetUsingCore);
    513 
    514         targetChars.clear().addAll(zappedTarget); //
    515         targetCoreChars.clear().addAll(zappedCoreTarget);
    516         if (targetChars.equals(targetCoreChars)) {
    517             return 3;
    518         }
    519         targetChars.removeAll(short_vowels);
    520         targetCoreChars.removeAll(short_vowels);
    521         if (targetChars.equals(targetCoreChars)) {
    522             return 5;
    523         }
    524 
    525         targetChars.removeAll(vowels);
    526         targetCoreChars.removeAll(vowels);
    527         if (targetChars.equals(targetCoreChars)) {
    528             return 5;
    529         }
    530 
    531         tempDiff.clear().addAll(targetChars).removeAll(targetCoreChars);
    532         int result = 7 + tempDiff.size();
    533         tempDiff.clear().addAll(targetCoreChars).removeAll(targetChars);
    534         result += tempDiff.size();
    535         return result;
    536     }
    537 
    // Transform applied by mostlyEqual() to both transcriptions before they are compared
    // character by character.
    // NOTE(review): several rules below have an empty or duplicated left-hand side
    // (" >  ;", three different "r > ..." rules, two identical "r > er ;"); this looks like
    // non-ASCII (IPA) characters lost when this listing was extracted — TODO restore from the
    // upstream file before relying on this text.
    538     static final Transliterator skeletonize = Transliterator.createFromRules("faa",
    539         " >  ;" +
    540             " > i ;" +
    541             "o > oi ;" +
    542             "r > r ;" +
    543             "r > er ;" +
    544             "r > er ;" +
    545             "r > ir ;" +
    546             "r > ur ;",
    547         Transliterator.FORWARD);
    548 
    /**
     * Reports whether two transcriptions are close enough to be treated as the same.
     * <p>
     * Strings of different length are never "mostly equal". Otherwise both are passed through
     * {@code skeletonize} and compared position by position; a mismatch is tolerated only for
     * the specific character pairs tested in the loop.
     * <p>
     * NOTE(review): the character literals in the two tolerance tests are empty in this copy
     * of the file (not valid Java); the characters they compared against were presumably
     * non-ASCII and lost during extraction — TODO restore from the upstream file.
     *
     * @param inSource          source word; not used by this method
     * @param inTarget          candidate transcription
     * @param inTargetUsingCore transcription produced by the current rules
     * @return true if every position matches or differs only in a tolerated way
     */
    549     private static boolean mostlyEqual(String inSource, String inTarget, String inTargetUsingCore) {
    550 
    551         if (inTarget.length() != inTargetUsingCore.length()) return false;
    552 
    553         // transform these -- simplest that way
    554         String target = skeletonize.transliterate(inTarget);
    555         String targetUsingCore = skeletonize.transliterate(inTargetUsingCore);
    556 
                // NOTE(review): the length guard above is applied before skeletonize; if the
                // transform changes the two strings' lengths differently, charAt(i) below could
                // go out of range — confirm once the rule text is restored.
    557         for (int i = 0; i < target.length(); ++i) {
    558             char ca = target.charAt(i);
    559             char cb = targetUsingCore.charAt(i);
    560             if (ca != cb) {
    561                 // disregard differences with short vowels
    562                 if (ca == '' && short_vowels.contains(cb) || short_vowels.contains(ca) && cb == '') {
    563                     continue;
    564                 }
    565                 // ")  && a.startsWith("")
    566                 if (ca == '' && cb == '' || ca == '' && cb == '') {
    567                     continue;
    568                 }
    569                 return false;
    570             }
    571         }
    572         return true; // return diffCount == 0 ? true : diffCount < vowelCount;
    573     }
    574 
    // One rule per lowercase ASCII letter. main() applies this to a source word and uses the
    // result to detect transcriptions that are just the word read out letter by letter.
    // NOTE(review): several right-hand sides are empty or visibly incomplete (for example
    // "i >  ;" and "f > f ;"); the missing characters were presumably non-ASCII (IPA) and lost
    // when this listing was extracted — TODO restore from the upstream file.
    575     static Transliterator spellout = Transliterator.createFromRules("foo",
    576         "a > e ;"
    577             + "b > bi ;"
    578             + "c > si ;"
    579             + "d > di ;"
    580             + "e > i ;"
    581             + "f > f ;"
    582             + "g > di ;"
    583             + "h > et ;"
    584             + "i >  ;"
    585             + "j > de ;"
    586             + "k > ke ;"
    587             + "l > l ;"
    588             + "m > m ;"
    589             + "n > n ;"
    590             + "o > o ;"
    591             + "p > pi ;"
    592             + "q > kwu ;"
    593             + "r > r ;"
    594             + "s > s ;"
    595             + "t > ti ;"
    596             + "u > ju ;"
    597             + "v > vi ;"
    598             + "w > dbj ;"
    599             + "x > ks ;"
    600             + "y > w ;"
    601             + "z > zi ;",
    602         Transliterator.FORWARD);
    603 
    604     /**
    605      * Returns items sorted alphabetically, shortest first
    606      */
    607     static Comparator MyComparator = new Comparator() {
    608 
    609         public int compare(Object a, Object b) {
    610             String as = (String) a;
    611             String bs = (String) b;
    612             if (as.length() < bs.length())
    613                 return -1;
    614             if (as.length() > bs.length())
    615                 return 1;
    616             int result = col.compare(as, bs);
    617             if (result != 0) {
    618                 return result;
    619             }
    620             return as.compareTo(bs);
    621         }
    622 
    623     };
    624 
    625     // static String dataDir = "C:\\cvsdata\\unicode\\ucd\\unicodetools\\dictionary\\Data\\";
    626 // private static String getCoreTransliterator() throws IOException {
    627     //
    628     // String accentRules = createFromFile(dataDir + "accentRules.txt", null, null);
    629     //
    630     // Transliterator doAccentRules = Transliterator.createFromRules("foo", accentRules, Transliterator.FORWARD);
    631     //
    632     // String markedToIpa = createFromFile(dataDir + "IPARules.txt", doAccentRules, null);
    633     // System.out.println(markedToIpa);
    634     // Transliterator doMarkedToIpa = Transliterator.createFromRules("foo", markedToIpa, Transliterator.FORWARD);
    635     //
    636     // String trial = "a>";
    637     // String result = doMarkedToIpa.transliterate(trial);
    638     // System.out.println("****" + result);
    639     //
    640     // String englishToIpaBase = createFromFile(dataDir + "reduceRules.txt", doAccentRules, doMarkedToIpa);
    641     //
    642     // System.out.println(englishToIpaBase);
    643     //
    644     // //Transform file name into id
    645     //
    646     // return englishToIpaBase;
    647     // }
    648 
    649     public static String createFromFile(String fileName, Transliterator pretrans, Transliterator pretrans2)
    650         throws IOException {
    651         StringBuilder buffer = new StringBuilder();
    652         BufferedReader fli = FileUtilities.openUTF8Reader("", fileName);
    653         while (true) {
    654             String line = fli.readLine();
    655             if (line == null) break;
    656             if (line.startsWith("\uFEFF")) line = line.substring(1);
    657             if (pretrans != null) {
    658                 line = pretrans.transliterate(line);
    659             }
    660             if (pretrans2 != null) {
    661                 line = pretrans2.transliterate(line);
    662             }
    663 
    664             buffer.append(line);
    665             buffer.append(CldrUtility.LINE_SEPARATOR); // separate with whitespace
    666         }
    667         fli.close();
    668         return buffer.toString();
    669     }
    670 
    // Bound tested by the read loop in getOverrides().
    // NOTE(review): the counter compared against it there is never incremented, so the bound
    // has no effect as the code stands.
    671     static int LIMIT = Integer.MAX_VALUE;
    // Transliterator built in main() from internal_fixBadIpa.txt; applied to every target and
    // to override values.
    672     private static Transliterator fixBadIpa;
    // Characters actually seen in accepted targets / sources (filled by addSourceTarget).
    673     private static UnicodeSet targetCharacters;
    674     private static UnicodeSet sourceCharacters;
    // Characters a source / target may contain; mappings using anything else are rejected.
    675     private static UnicodeSet allowedSourceCharacters;
    676     private static UnicodeSet allowedTargetCharacters;
    // Running totals updated by main() and addSourceTarget().
    677     private static int countSkipped;
    678     private static long skippedFrequency;
    // Frequency of the input line currently being processed; set in main()'s read loop and
    // read by addSourceTarget().
    679     private static long frequency;
    680     private static long totalFrequency;
    // Transliterator built in main() from internal_baseEnglishToIpa.txt alone.
    681     private static Transliterator coreBase;
    682 
    /**
     * Loads {@code internal_overrides.txt} from {@code cldrDataDir} into a sorted map of
     * word to replacement transcription.
     * <p>
     * Blank lines are ignored. Each remaining line is split into fields; the first field is the
     * word. If there is a second field it is passed through {@code fixBadIpa} and stored as the
     * value, otherwise the word is stored with an empty value. Only the first occurrence of a
     * word is kept; later ones are reported on standard output.
     * <p>
     * The static {@code fixBadIpa} must already be set when a line has a second field.
     *
     * @return map from word to transcription (possibly empty string)
     * @throws IOException if the file cannot be opened or read
     */
    683     public static Map<String, String> getOverrides() throws IOException {
    684         Map<String, String> result = new TreeMap<String, String>();
    685         BufferedReader br = FileUtilities.openUTF8Reader(cldrDataDir, "internal_overrides.txt");
    686         try {
                    // NOTE(review): counter is never incremented, so the LIMIT test is always true.
    687             int counter = 0;
    688             while (counter < LIMIT) {
    689                 String line = br.readLine();
    690                 if (line == null) break;
    691                 line = line.trim();
    692                 if (line.length() == 0) continue;
    693 
                        // NOTE(review): as written the pattern is just optional whitespace, which
                        // also matches the empty string; a delimiter between the two \\s* groups
                        // looks to have been lost (presumably a non-ASCII character) when this
                        // listing was extracted — TODO restore from the upstream file.
    694                 String[] iLine = line.split("\\s*\\s*");
    695                 String word = iLine[0].trim();
    696                 if (result.containsKey(word)) {
    697                     System.out.println("Overrides already contain: " + word);
    698                     continue;
    699                 }
    700                 if (iLine.length < 2) {
    701                     result.put(word, "");
    702                 } else {
    703                     String ipa = fixBadIpa.transliterate(iLine[1].trim());
    704                     result.put(word, ipa);
    705                 }
    706             }
    707         } finally {
    708             br.close();
    709         }
    710         return result;
    711     }
    712 
    713 }