1 package org.unicode.cldr.tool; 2 3 import java.io.BufferedReader; 4 import java.io.IOException; 5 import java.io.PrintWriter; 6 import java.lang.reflect.Field; 7 import java.util.ArrayList; 8 import java.util.Comparator; 9 import java.util.List; 10 import java.util.Locale; 11 import java.util.Map; 12 import java.util.Set; 13 import java.util.TreeMap; 14 import java.util.TreeSet; 15 16 import org.unicode.cldr.draft.FileUtilities; 17 import org.unicode.cldr.util.CldrUtility; 18 import org.unicode.cldr.util.Pair; 19 20 import com.ibm.icu.impl.Relation; 21 import com.ibm.icu.lang.UCharacter; 22 import com.ibm.icu.text.Collator; 23 import com.ibm.icu.text.NumberFormat; 24 import com.ibm.icu.text.Transliterator; 25 import com.ibm.icu.text.UTF16; 26 import com.ibm.icu.text.UnicodeSet; 27 import com.ibm.icu.text.UnicodeSetIterator; 28 import com.ibm.icu.util.ULocale; 29 30 /** 31 * Takes a list of mappings (tab delimited) from source to target and produces a 32 * transliterator 33 * 34 * @author markdavis 35 * http://en.wikipedia.org/wiki/English_phonology 36 */ 37 public class MakeTransliterator { 38 // DEBUGGING 39 static int forceSeparateIfShorter = 4; // 4 40 41 private static final String CHECK_BASE = null; // "vessel"; 42 private static final String CHECK_BUILT = null; // "vessel"; 43 44 private static final String TEST_STRING = "territories"; 45 private static final boolean SHOW_OVERRIDES = true; 46 47 private static final int MINIMUM_FREQUENCY = 9999; 48 49 static boolean isIPA = true; 50 static boolean onlyToTarget = true; 51 52 // others 53 54 static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH); 55 56 static Collator col = Collator.getInstance(ULocale.ROOT); 57 58 static String cldrDataDir = "C:\\cvsdata\\unicode\\cldr\\tools\\java\\org\\unicode\\cldr\\util\\data\\transforms\\"; 59 60 public static void main(String[] args) throws IOException { 61 setTranslitDebug(true); 62 63 Locale fil = new Locale("fil"); 64 System.out.println(fil); 65 fil = new Locale("fil", "US"); 66 System.out.println(fil); 67 68 String sourceFile = cldrDataDir + "internal_raw_IPA.txt"; 69 String targetFile = cldrDataDir + "en-IPA.txt"; 70 String targetCountFile = cldrDataDir + "en-IPA_count.txt"; 71 String skippedLinesFile = "C:\\DATA\\GEN\\SkippedIPA.txt"; 72 73 PrintWriter skippedOut = FileUtilities.openUTF8Writer("", skippedLinesFile); 74 75 // String coreRules = getCoreTransliterator(); 76 String fixBadIpaRules = createFromFile(cldrDataDir + "internal_fixBadIpa.txt", null, null); 77 fixBadIpa = Transliterator.createFromRules("foo", fixBadIpaRules, Transliterator.FORWARD); 78 79 Map<String, String> overrides = getOverrides(); 80 81 String coreForeRules = createFromFile(cldrDataDir + "internal_baseEnglishToIpa.txt", null, null); 82 coreBase = Transliterator.createFromRules("foo", coreForeRules, Transliterator.FORWARD); 83 if (CHECK_BASE != null) { 84 setTranslitDebug(true); 85 System.out.println(coreBase.transliterate(CHECK_BASE)); 86 return; 87 } 88 89 if (CHECK_BUILT != null) { 90 String foo = createFromFile(cldrDataDir + "en-IPA.txt", null, null); 91 Transliterator fooTrans = Transliterator.createFromRules("foo", foo, Transliterator.FORWARD); 92 93 setTranslitDebug(true); 94 System.out.println(fooTrans.transliterate(CHECK_BUILT)); 95 return; 96 } 97 98 String coreBackRules = createFromFile(cldrDataDir + "internal_English-IPA-backwards.txt", null, null); 99 checkCoreReversibility(skippedOut, coreForeRules, coreBackRules); 100 String coreRules = coreForeRules + coreBackRules; 101 System.out.println(coreRules); 102 103 // C:\DATA\GEN\mergedIPA2.txt 104 // we have to have items in order. Longest forms need to come first, on both 105 // sides. 106 Relation<String, Pair<String, Long>> store = Relation.of(new TreeMap<String, Set<Pair<String, Long>>>(MyComparator), 107 TreeSet.class); 108 109 targetCharacters = new UnicodeSet(); 110 sourceCharacters = new UnicodeSet(); 111 allowedSourceCharacters = new UnicodeSet( 112 "[[:Letter:]\u2019]").freeze(); 113 allowedTargetCharacters = new UnicodeSet( 114 "[\u00E6 \u0251 b d\u00F0 e \u0259 \u025B f-i \u026A j-n \u014B o p r s \u0283 t u \u028A v w z \u0292 \u03B8]") 115 .freeze(); 116 countSkipped = 0; 117 totalFrequency = 0; 118 skippedFrequency = 0; 119 int targetField = isIPA ? 2 : 1; 120 121 BufferedReader in = FileUtilities.openUTF8Reader("", sourceFile); 122 while (true) { 123 String line = in.readLine(); 124 if (line == null) 125 break; 126 if (line.startsWith("\uFEFF")) { 127 line = line.substring(1); 128 } 129 String originalLine = line; 130 int commentCharPosition = line.indexOf('#'); 131 if (commentCharPosition >= 0) { 132 line = line.substring(0, commentCharPosition); 133 } 134 line = line.trim(); 135 frequency = -1; 136 String[] pieces = line.split(" *[\\t,] *"); 137 if (pieces.length <= targetField) { 138 // skippedOut.println(originalLine + "\tno phonetics"); 139 // countSkipped++; 140 continue; // no phonetics 141 } 142 String source = pieces[0]; 143 if (TEST_STRING != null && source.equals(TEST_STRING)) { 144 System.out.println(line); // for debugging 145 } 146 147 // Fix Source 148 source = source.replace("'", ""); 149 source = UCharacter.toLowerCase(ULocale.ENGLISH, source); 150 if (source.endsWith(".")) { 151 source = source.substring(0, source.length() - 1); 152 } 153 if (source.contains(" ") || source.contains("-")) { 154 skippedOut.println(originalLine + "\tspace or hyphen"); 155 countSkipped++; 156 skippedFrequency += frequency; 157 continue; 158 } 159 160 //String bestTarget = null; 161 162 String override = overrides.get(source); 163 String spelling = spellout.transliterate(source); 164 165 for (int i = 1; i < pieces.length; ++i) { 166 String target = pieces[i]; 167 if (target.startsWith("%")) { 168 frequency = Long.parseLong(target.substring(1)); 169 continue; 170 } 171 172 if (override != null) { 173 if (SHOW_OVERRIDES) 174 System.out.println("Overriding\t" + source + " ! " + target + " " + override); 175 if (override.length() != 0) { 176 if (TEST_STRING != null && source.equals(TEST_STRING)) { 177 setTranslitDebug(true); 178 } 179 target = fixBadIpa.transliterate(override); 180 setTranslitDebug(false); 181 addSourceTarget(skippedOut, source, target, originalLine, store); 182 } 183 break; 184 } 185 186 if (frequency < MINIMUM_FREQUENCY) { 187 // skippedOut.println(originalLine + "\tno frequency"); 188 countSkipped++; 189 continue; 190 } 191 192 target = UCharacter.toLowerCase(ULocale.ENGLISH, target); 193 target = target.replace(" ", ""); // remove extra spaces 194 195 if (target.startsWith("-") || target.endsWith("-")) { 196 continue; 197 } 198 199 String oldTarget = target; 200 target = fixBadIpa.transliterate(target); 201 202 if (target.equals(spelling)) { 203 skippedOut.println(originalLine 204 + "\tspellout"); 205 countSkipped++; 206 continue; 207 } 208 209 if (!target.equals(oldTarget)) { 210 skippedOut.println("\t### fixed IPA:\t" + source + "\t" + target 211 + "\twas: " + oldTarget); 212 } 213 214 addSourceTarget(skippedOut, source, target, originalLine, store); 215 } 216 } 217 218 // add the overrides that are not in. 219 220 for (String word : overrides.keySet()) { 221 if (!store.containsKey(word)) { 222 String target = overrides.get(word); 223 if (target.length() != 0) { 224 if (SHOW_OVERRIDES) System.out.println("New overrides:\t" + word + " " + target); 225 addSourceTarget(skippedOut, word, target, "overrides", store); 226 } 227 } 228 } 229 in.close(); 230 System.out.println("total count: " + nf.format(store.size())); 231 System.out.println("skipped count: " + nf.format(countSkipped)); 232 233 System.out.println("total frequency-weighted: " + nf.format(totalFrequency)); 234 System.out.println("skipped frequency-weighted: " + nf.format(skippedFrequency)); 235 236 if (false) { 237 System.out.println(CldrUtility.LINE_SEPARATOR + "Source Characters "); 238 showSet(sourceCharacters); 239 System.out.println(CldrUtility.LINE_SEPARATOR + "Target Characters "); 240 showSet(targetCharacters); 241 } 242 243 // Set<String> seenSource = new HashSet<String>(); 244 // Set<String> seenTarget = new HashSet<String>(); 245 246 int countAdded = 0; 247 int countTotal = 0; 248 long frequencyAdded = 0; 249 long frequencySkipped = 0; 250 251 Transliterator base = Transliterator.createFromRules("foo", coreRules, Transliterator.FORWARD); 252 // build up the transliterator one length at a time. 253 List<String> newRules = new ArrayList<String>(); 254 StringBuilder buffer = new StringBuilder(); 255 256 int lastSourceLength = 1; 257 258 Relation<Long, String> count_failures = Relation.of(new TreeMap<Long, Set<String>>(), TreeSet.class); 259 260 sourceLoop: for (String source : store.keySet()) { 261 if (TEST_STRING != null && source.equals(TEST_STRING)) { 262 System.out.println(source + "\t" + store.getAll(source)); 263 } 264 countTotal++; 265 // whenever the source changes in length, rebuild the transliterator 266 if (source.length() != lastSourceLength && source.length() >= forceSeparateIfShorter) { 267 System.out.println("Building transliterator for length " + lastSourceLength + " : " + newRules.size()); 268 System.out.flush(); 269 skippedOut.flush(); 270 String rules = buildRules(coreRules, newRules, buffer); 271 // System.out.println(rules); 272 base = Transliterator.createFromRules("foo", rules, Transliterator.FORWARD); 273 274 lastSourceLength = source.length(); 275 } 276 Set<Pair<String, Long>> targetSet = store.getAll(source); 277 // see if any of the mappings fall out 278 String targetUsingCore = base.transliterate(source); 279 280 String bestTarget = null; 281 int bestDistance = 999; 282 long frequency = 0; 283 for (Pair<String, Long> targetPair : targetSet) { 284 String target = targetPair.getFirst(); 285 if (target.length() == 0) { 286 throw new IllegalArgumentException(source + " " + target); 287 } 288 frequency = targetPair.getSecond(); 289 290 if (targetUsingCore.equals(target)) { 291 // we have a match! skip this source 292 skippedOut.println("# skipping " + source + " " + target + " ;"); 293 frequencySkipped += frequency; 294 continue sourceLoop; 295 } 296 if (mostlyEqual(source, target, targetUsingCore)) { 297 // we have a match! skip this source 298 skippedOut.println("# skipping " + source + " " + target + " ; # close enough to " 299 + targetUsingCore); 300 frequencySkipped += frequency; 301 continue sourceLoop; 302 } 303 int distance = distance(source, target, targetUsingCore); 304 if (bestDistance > distance) { 305 bestTarget = target; 306 bestDistance = distance; 307 } 308 } 309 // if we get to here, we have a new rule. 310 if (bestTarget != null) { 311 boolean forceSeparate = false; 312 if (source.length() < forceSeparateIfShorter || bestTarget.length() * 2 > source.length() * 3) { 313 forceSeparate = true; 314 } else { 315 String spelling = spellout.transliterate(source); 316 if (bestTarget.equals(spelling)) { 317 forceSeparate = true; 318 } else { 319 // if it is likely that the word can have an extra letter added that changes the pronunciation 320 // force it to be separate 321 if (source.endsWith("e")) { 322 forceSeparate = true; 323 } 324 } 325 } 326 String targetUsingBaseCore = coreBase.transliterate(source); 327 328 if (forceSeparate) { 329 source = "$x{" + source + "}$x"; 330 } else { 331 source = "$x{" + source; 332 } 333 // strange hack 334 String hackSource = source.startsWith("use") ? "'" + source + "'" : source; 335 newRules.add(hackSource + " " + bestTarget + " ; # " + targetUsingCore 336 + (targetUsingBaseCore.equals(targetUsingCore) ? "" : "\t\t" + targetUsingBaseCore) 337 + CldrUtility.LINE_SEPARATOR); 338 skippedOut.println("# couldn't replace " + source + " " + bestTarget + " ; # " + targetUsingCore); 339 count_failures.put(-frequency, source + " " + bestTarget + " ; # " + targetUsingCore); 340 countAdded++; 341 frequencyAdded += frequency; 342 } 343 } 344 345 String rules = buildRules(coreRules, newRules, buffer); 346 base = Transliterator.createFromRules("foo", rules, Transliterator.FORWARD); // verify that it builds 347 348 PrintWriter out = FileUtilities.openUTF8Writer("", targetFile); 349 out.println(rules); 350 out.close(); 351 352 out = FileUtilities.openUTF8Writer("", targetCountFile); 353 for (long count : count_failures.keySet()) { 354 for (String line : count_failures.getAll(count)) { 355 out.println(count + "\t" + line); 356 } 357 } 358 out.close(); 359 360 // if (false) { 361 // 362 // // now write out the transliterator file 363 // PrintWriter out = FileUtilities.openUTF8Writer("", targetFile); 364 // for (String source : store.keySet()) { 365 // Set<String> targetSet = store.getAll(source); 366 // for (String target : targetSet) { 367 // if (seenSource.contains(source)) { 368 // if (onlyToTarget) { 369 // // nothing 370 // } else if (seenTarget.contains(target)) { 371 // skippedOut.println("# " + source + " " + target + " ;"); 372 // countSkipped++; 373 // } else { 374 // out.println(source + " " + target + " ;"); 375 // countSourceFromTarget++; 376 // } 377 // } else if (onlyToTarget || seenTarget.contains(target)) { 378 // out.println(source + " " + target + " ;"); 379 // countSourceToTarget++; 380 // } else { 381 // out.println(source + " " + target + " ;"); 382 // countSourceAndTarget++; 383 // } 384 // seenSource.add(source); 385 // seenTarget.add(target); 386 // } 387 // } 388 // out.close(); 389 // } 390 skippedOut.close(); 391 System.out.println("countTotal: " + nf.format(countTotal)); 392 System.out.println("countAdded: " + nf.format(countAdded)); 393 System.out.println("countSkipped: " + nf.format(countTotal - countAdded)); 394 System.out.println("frequencyTotal: " + nf.format(frequencyAdded + frequencySkipped)); 395 System.out.println("frequencyAdded: " + nf.format(frequencyAdded)); 396 System.out.println("frequencySkipped: " + nf.format(frequencySkipped)); 397 } 398 399 private static void setTranslitDebug(boolean newSetting) { 400 // Transliterator.DEBUG = newSetting; 401 try { 402 Field debug = Transliterator.class.getField("DEBUG"); 403 debug.setBoolean(Transliterator.class, newSetting); 404 } catch (Exception e) { 405 e.printStackTrace(); 406 } 407 } 408 409 private static void addSourceTarget(PrintWriter skippedOut, String source, String target, String originalLine, 410 Relation<String, Pair<String, Long>> store) { 411 if (source.equals("teh")) { 412 System.out.println("debug"); 413 } 414 if (!allowedSourceCharacters.containsAll(source)) { 415 skippedOut.println(originalLine 416 + "\t# Strange source values:\t" 417 + source 418 + "\t" 419 + new UnicodeSet().addAll(source) 420 .removeAll(allowedSourceCharacters).toPattern(false)); 421 countSkipped++; 422 skippedFrequency += frequency; 423 return; 424 } 425 if (!allowedTargetCharacters.containsAll(target)) { 426 System.out.println(originalLine 427 + "\t# Strange target values:\t" 428 + target 429 + "\t" 430 + new UnicodeSet().addAll(target) 431 .removeAll(allowedTargetCharacters).toPattern(false)); 432 countSkipped++; 433 skippedFrequency += frequency; 434 return; 435 } 436 437 sourceCharacters.addAll(source); 438 targetCharacters.addAll(target); 439 store.put(source, new Pair<String, Long>(target, frequency)); 440 totalFrequency += frequency; 441 442 } 443 444 private static void checkCoreReversibility(PrintWriter skippedOut, String coreRules, String coreBackRules) { 445 Transliterator base = Transliterator.createFromRules("foo", coreRules, Transliterator.FORWARD); 446 Transliterator back = Transliterator.createFromRules("foo2", coreBackRules, Transliterator.REVERSE); 447 String[] tests = "bat bait bet beet bit bite bot boat but bute bout boot book boy pat bat vat fat mat tat dat thew father nat sat zoo ash asia gate cat late rate hate yet rang chat jet" 448 .split("\\s"); 449 for (String test : tests) { 450 String test2 = base.transliterate(test); 451 String test3 = back.transliterate(test2); 452 skippedOut.println(test + "\t " + test2 + "\t " + test3); 453 } 454 skippedOut.flush(); 455 } 456 457 private static String buildRules(String coreRules, List<String> newRules, StringBuilder buffer) { 458 // Transliterator base; 459 // build backwards!! 460 buffer.setLength(0); 461 buffer.append( 462 "# Author: M Davis" + CldrUtility.LINE_SEPARATOR + 463 "# Email: mark.davis (at) icu-project.org" + CldrUtility.LINE_SEPARATOR + 464 "# Description: English to IPA" + CldrUtility.LINE_SEPARATOR + 465 // "$nletter {([A-Z]+)} $nletter > &en-IPA/spellout($1) ; " + Utility.LINE_SEPARATOR + 466 ":: lower(); " + CldrUtility.LINE_SEPARATOR + 467 "$x = [:^letter:] ;" + CldrUtility.LINE_SEPARATOR); 468 for (int i = newRules.size() - 1; i >= 0; --i) { 469 buffer.append(newRules.get(i)); 470 } 471 buffer.append(coreRules); 472 // System.out.println(buffer); 473 String result = buffer.toString(); 474 // ensure it builds 475 return result; 476 } 477 478 private static void showSet(UnicodeSet sourceCharacters) { 479 for (UnicodeSetIterator it = new UnicodeSetIterator(sourceCharacters); it 480 .next();) { 481 System.out.println(com.ibm.icu.impl.Utility.hex(it.codepoint) + "\t(" 482 + UTF16.valueOf(it.codepoint) + ")\t" 483 + UCharacter.getName(it.codepoint)); 484 } 485 } 486 487 public static UnicodeSet vowels = new UnicodeSet("[aeiou ]").freeze(); 488 public static UnicodeSet short_vowels = new UnicodeSet("[ ]").freeze(); 489 /** 490 * Return true if the strings are essentially the same. 491 * Differences between schwas and short vowels are counted in certain cases 492 * 493 * @param targetDir 494 * @param targetUsingCore 495 * @param targetUsingCore2 496 * @return 497 */ 498 static UnicodeSet targetChars = new UnicodeSet(); 499 static UnicodeSet targetCoreChars = new UnicodeSet(); 500 static UnicodeSet tempDiff = new UnicodeSet(); 501 static Transliterator distinguishLongVowels = Transliterator.createFromRules("faa", 502 " > ;" + 503 " > ;" + 504 "o > ;", 505 Transliterator.FORWARD); 506 507 private static int distance(String source, String target, String targetUsingCore) { 508 if (target.equals(targetUsingCore)) return 0; 509 if (mostlyEqual(source, target, targetUsingCore)) return 1; 510 // first compare the consonants. Count each difference as 3 511 String zappedTarget = distinguishLongVowels.transliterate(target); 512 String zappedCoreTarget = distinguishLongVowels.transliterate(targetUsingCore); 513 514 targetChars.clear().addAll(zappedTarget); // 515 targetCoreChars.clear().addAll(zappedCoreTarget); 516 if (targetChars.equals(targetCoreChars)) { 517 return 3; 518 } 519 targetChars.removeAll(short_vowels); 520 targetCoreChars.removeAll(short_vowels); 521 if (targetChars.equals(targetCoreChars)) { 522 return 5; 523 } 524 525 targetChars.removeAll(vowels); 526 targetCoreChars.removeAll(vowels); 527 if (targetChars.equals(targetCoreChars)) { 528 return 5; 529 } 530 531 tempDiff.clear().addAll(targetChars).removeAll(targetCoreChars); 532 int result = 7 + tempDiff.size(); 533 tempDiff.clear().addAll(targetCoreChars).removeAll(targetChars); 534 result += tempDiff.size(); 535 return result; 536 } 537 538 static final Transliterator skeletonize = Transliterator.createFromRules("faa", 539 " > ;" + 540 " > i ;" + 541 "o > oi ;" + 542 "r > r ;" + 543 "r > er ;" + 544 "r > er ;" + 545 "r > ir ;" + 546 "r > ur ;", 547 Transliterator.FORWARD); 548 549 private static boolean mostlyEqual(String inSource, String inTarget, String inTargetUsingCore) { 550 551 if (inTarget.length() != inTargetUsingCore.length()) return false; 552 553 // transform these -- simplest that way 554 String target = skeletonize.transliterate(inTarget); 555 String targetUsingCore = skeletonize.transliterate(inTargetUsingCore); 556 557 for (int i = 0; i < target.length(); ++i) { 558 char ca = target.charAt(i); 559 char cb = targetUsingCore.charAt(i); 560 if (ca != cb) { 561 // disregard differences with short vowels 562 if (ca == '' && short_vowels.contains(cb) || short_vowels.contains(ca) && cb == '') { 563 continue; 564 } 565 // ") && a.startsWith("") 566 if (ca == '' && cb == '' || ca == '' && cb == '') { 567 continue; 568 } 569 return false; 570 } 571 } 572 return true; // return diffCount == 0 ? true : diffCount < vowelCount; 573 } 574 575 static Transliterator spellout = Transliterator.createFromRules("foo", 576 "a > e ;" 577 + "b > bi ;" 578 + "c > si ;" 579 + "d > di ;" 580 + "e > i ;" 581 + "f > f ;" 582 + "g > di ;" 583 + "h > et ;" 584 + "i > ;" 585 + "j > de ;" 586 + "k > ke ;" 587 + "l > l ;" 588 + "m > m ;" 589 + "n > n ;" 590 + "o > o ;" 591 + "p > pi ;" 592 + "q > kwu ;" 593 + "r > r ;" 594 + "s > s ;" 595 + "t > ti ;" 596 + "u > ju ;" 597 + "v > vi ;" 598 + "w > dbj ;" 599 + "x > ks ;" 600 + "y > w ;" 601 + "z > zi ;", 602 Transliterator.FORWARD); 603 604 /** 605 * Returns items sorted alphabetically, shortest first 606 */ 607 static Comparator MyComparator = new Comparator() { 608 609 public int compare(Object a, Object b) { 610 String as = (String) a; 611 String bs = (String) b; 612 if (as.length() < bs.length()) 613 return -1; 614 if (as.length() > bs.length()) 615 return 1; 616 int result = col.compare(as, bs); 617 if (result != 0) { 618 return result; 619 } 620 return as.compareTo(bs); 621 } 622 623 }; 624 625 // static String dataDir = "C:\\cvsdata\\unicode\\ucd\\unicodetools\\dictionary\\Data\\"; 626 // private static String getCoreTransliterator() throws IOException { 627 // 628 // String accentRules = createFromFile(dataDir + "accentRules.txt", null, null); 629 // 630 // Transliterator doAccentRules = Transliterator.createFromRules("foo", accentRules, Transliterator.FORWARD); 631 // 632 // String markedToIpa = createFromFile(dataDir + "IPARules.txt", doAccentRules, null); 633 // System.out.println(markedToIpa); 634 // Transliterator doMarkedToIpa = Transliterator.createFromRules("foo", markedToIpa, Transliterator.FORWARD); 635 // 636 // String trial = "a>"; 637 // String result = doMarkedToIpa.transliterate(trial); 638 // System.out.println("****" + result); 639 // 640 // String englishToIpaBase = createFromFile(dataDir + "reduceRules.txt", doAccentRules, doMarkedToIpa); 641 // 642 // System.out.println(englishToIpaBase); 643 // 644 // //Transform file name into id 645 // 646 // return englishToIpaBase; 647 // } 648 649 public static String createFromFile(String fileName, Transliterator pretrans, Transliterator pretrans2) 650 throws IOException { 651 StringBuilder buffer = new StringBuilder(); 652 BufferedReader fli = FileUtilities.openUTF8Reader("", fileName); 653 while (true) { 654 String line = fli.readLine(); 655 if (line == null) break; 656 if (line.startsWith("\uFEFF")) line = line.substring(1); 657 if (pretrans != null) { 658 line = pretrans.transliterate(line); 659 } 660 if (pretrans2 != null) { 661 line = pretrans2.transliterate(line); 662 } 663 664 buffer.append(line); 665 buffer.append(CldrUtility.LINE_SEPARATOR); // separate with whitespace 666 } 667 fli.close(); 668 return buffer.toString(); 669 } 670 671 static int LIMIT = Integer.MAX_VALUE; 672 private static Transliterator fixBadIpa; 673 private static UnicodeSet targetCharacters; 674 private static UnicodeSet sourceCharacters; 675 private static UnicodeSet allowedSourceCharacters; 676 private static UnicodeSet allowedTargetCharacters; 677 private static int countSkipped; 678 private static long skippedFrequency; 679 private static long frequency; 680 private static long totalFrequency; 681 private static Transliterator coreBase; 682 683 public static Map<String, String> getOverrides() throws IOException { 684 Map<String, String> result = new TreeMap<String, String>(); 685 BufferedReader br = FileUtilities.openUTF8Reader(cldrDataDir, "internal_overrides.txt"); 686 try { 687 int counter = 0; 688 while (counter < LIMIT) { 689 String line = br.readLine(); 690 if (line == null) break; 691 line = line.trim(); 692 if (line.length() == 0) continue; 693 694 String[] iLine = line.split("\\s*\\s*"); 695 String word = iLine[0].trim(); 696 if (result.containsKey(word)) { 697 System.out.println("Overrides already contain: " + word); 698 continue; 699 } 700 if (iLine.length < 2) { 701 result.put(word, ""); 702 } else { 703 String ipa = fixBadIpa.transliterate(iLine[1].trim()); 704 result.put(word, ipa); 705 } 706 } 707 } finally { 708 br.close(); 709 } 710 return result; 711 } 712 713 }