1 /* 2 * Created on May 19, 2005 3 * Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others. 4 * For terms of use, see http://www.unicode.org/terms_of_use.html 5 */ 6 package org.unicode.cldr.tool; 7 8 import java.io.BufferedReader; 9 import java.io.IOException; 10 import java.io.PrintWriter; 11 import java.util.Comparator; 12 import java.util.HashMap; 13 import java.util.HashSet; 14 import java.util.Iterator; 15 import java.util.Map; 16 import java.util.Set; 17 import java.util.TreeMap; 18 import java.util.TreeSet; 19 20 import org.unicode.cldr.draft.FileUtilities; 21 import org.unicode.cldr.util.ArrayComparator; 22 import org.unicode.cldr.util.CLDRFile; 23 import org.unicode.cldr.util.CldrUtility; 24 import org.unicode.cldr.util.Factory; 25 import org.unicode.cldr.util.LanguageTagParser; 26 import org.unicode.cldr.util.Log; 27 import org.unicode.cldr.util.StandardCodes; 28 import org.unicode.cldr.util.TransliteratorUtilities; 29 30 import com.ibm.icu.lang.UCharacter; 31 import com.ibm.icu.text.Collator; 32 import com.ibm.icu.text.Transliterator; 33 import com.ibm.icu.text.UnicodeSet; 34 import com.ibm.icu.util.ICUUncheckedIOException; 35 import com.ibm.icu.util.ULocale; 36 37 /** 38 * @throws IOException 39 * 40 */ 41 class GenerateStatistics { 42 static final boolean HACK = true; 43 static CLDRFile english; 44 static Factory factory; 45 static LanguageTagParser ltp = new LanguageTagParser(); 46 static Collator col = Collator.getInstance(ULocale.ENGLISH); 47 static boolean notitlecase = true; 48 49 public static void generateSize(String sourceDir, String logDir, String match, boolean transliterate) 50 throws IOException { 51 factory = Factory.make(sourceDir, match); 52 ToolUtilities.registerExtraTransliterators(); 53 54 PrintWriter logHtml = FileUtilities.openUTF8Writer(logDir, "test_generation_log.html"); 55 //String dir = logDir + "main" + File.separator; 56 // DraftChecker dc = new DraftChecker(dir); 57 english = factory.make("en", true); 58 Set<String> languages = new TreeSet<String>(col), countries = new TreeSet<String>(col), draftLanguages = new TreeSet<String>( 59 col), draftCountries = new TreeSet<String>(col); 60 Set<Object> nativeLanguages = new TreeSet<Object>(), nativeCountries = new TreeSet<Object>(), draftNativeLanguages = new TreeSet<Object>(), 61 draftNativeCountries = new TreeSet<Object>(); 62 int localeCount = 0; 63 int draftLocaleCount = 0; 64 65 Set<String> contents = removeSingleLanguagesWhereWeHaveScripts(factory.getAvailable()); 66 67 for (Iterator<String> it = contents.iterator(); it.hasNext();) { 68 String localeID = it.next(); 69 if (CLDRFile.isSupplementalName(localeID)) continue; 70 if (localeID.equals("root")) 71 continue; // skip root 72 System.out.println("Collecting info for:\t" + localeID.replace("_", "\t")); 73 boolean draft = false; // dc.isDraft(localeName); 74 if (draft) { 75 draftLocaleCount++; 76 addCounts(localeID, true, draftLanguages, 77 draftCountries, draftNativeLanguages, 78 draftNativeCountries); 79 } else { 80 localeCount++; 81 addCounts(localeID, false, languages, 82 countries, nativeLanguages, nativeCountries); 83 } 84 if (false) 85 Log.logln(draft + ", " + localeCount + ", " 86 + languages.size() + ", " + countries.size() + ", " 87 + draftLocaleCount + ", " + draftLanguages.size() 88 + ", " + draftCountries.size()); 89 } 90 draftLanguages.removeAll(languages); 91 for (Iterator<Object> it = nativeLanguages.iterator(); it.hasNext();) { 92 draftNativeLanguages.remove(it.next()); 93 } 94 logHtml.println("<html><head>"); 95 logHtml 96 .println("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>"); 97 logHtml.println("</head><body>"); 98 logHtml.println("<p><b>Locales (" + localeCount + "):</b>"); 99 logHtml.println("<p><b>Languages (" + languages.size() + "):</b>"); 100 logHtml.println(showSet(nativeLanguages, transliterate, true)); 101 logHtml.println("<p><b>Territories (" + countries.size() + "):</b>"); 102 logHtml.println(showSet(nativeCountries, transliterate, false)); 103 logHtml.println("<p><b>Draft locales (" + draftLocaleCount + "):</b>"); 104 logHtml.println("<p><b>Draft languages (" + draftLanguages.size() 105 + "):</b>"); 106 logHtml.println(showSet(draftNativeLanguages, transliterate, true)); 107 logHtml.println("<p><b>Draft countries (" + draftCountries.size() 108 + "):</b>"); 109 logHtml.println(showSet(draftNativeCountries, transliterate, false)); 110 logHtml.println(CldrUtility.ANALYTICS); 111 logHtml.println("</body></html>"); 112 logHtml.close(); 113 } 114 115 /** 116 * 117 */ 118 private static Set<String> removeSingleLanguagesWhereWeHaveScripts(Set<String> contents) { 119 StandardCodes sc = StandardCodes.make(); 120 contents = new TreeSet<String>(contents); // make writable 121 if (false && HACK) { 122 contents.add("bs_Latn"); 123 contents.add("bs_Cyrl"); 124 contents.add("bs_Latn_BA"); 125 contents.add("bs_Cyrl_BA"); 126 } 127 // find the languages with scripts 128 Set<String> toRemove = new HashSet<String>(); 129 if (HACK) toRemove.add("sh"); 130 131 for (Iterator<String> it = contents.iterator(); it.hasNext();) { 132 String localeID = it.next(); 133 if (CLDRFile.isSupplementalName(localeID)) { 134 continue; 135 } 136 // if there is a lang_script, then remove everything starting with lang that doesn't have "a" script 137 String lang = ltp.set(localeID).getLanguage(); 138 String territory = ltp.set(localeID).getRegion(); 139 if (!sc.getGoodAvailableCodes("language").contains(lang)) { 140 System.out.println("Odd language, removing: " + localeID); 141 it.remove(); 142 continue; 143 } 144 if (territory.length() != 0 && !sc.getGoodAvailableCodes("territory").contains(territory)) { 145 System.out.println("Odd territory, removing: " + localeID); 146 it.remove(); 147 continue; 148 } 149 String langscript = ltp.set(localeID).getLanguageScript(); 150 if (!lang.equals(langscript)) toRemove.add(lang); 151 } 152 153 for (Iterator<String> it = contents.iterator(); it.hasNext();) { 154 String localeID = it.next(); 155 if (CLDRFile.isSupplementalName(localeID)) { 156 continue; 157 } 158 // if there is a lang_script, then remove everything starting with lang that doesn't have "a" script 159 String lang = ltp.set(localeID).getLanguage(); 160 if (!toRemove.contains(lang)) continue; 161 String langscript = ltp.set(localeID).getLanguageScript(); 162 if (lang.equals(langscript)) it.remove(); 163 } 164 return contents; 165 } 166 167 static final UnicodeSet NON_LATIN = new UnicodeSet("[^[:latin:][:common:][:inherited:]]"); 168 169 /** 170 * @param nativeCountries 171 * @param transliterate 172 * TODO 173 * @param isLanguage 174 * TODO 175 */ 176 @SuppressWarnings({ "unchecked", "rawtypes" }) 177 private static String showSet(Set nativeCountries, boolean transliterate, 178 boolean isLanguage) { 179 UnicodeSet BIDI_R = new UnicodeSet( 180 "[[:Bidi_Class=R:][:Bidi_Class=AL:]]"); 181 StringBuffer result = new StringBuffer(); 182 Map sb = new TreeMap(LanguageList.col); 183 // collect multiples by English name 184 for (Iterator it = nativeCountries.iterator(); it.hasNext();) { 185 LanguageList llist = (LanguageList) it.next(); 186 Set s = (Set) sb.get(llist.getEnglishName()); 187 if (s == null) 188 sb.put(llist.getEnglishName(), s = new TreeSet()); 189 s.add(llist); 190 } 191 192 Set<String> titleSet = new TreeSet<String>(col); 193 Set<String> qualifierSet = new TreeSet<String>(col); 194 195 for (Iterator<String> it = sb.keySet().iterator(); it.hasNext();) { 196 String englishName = it.next(); 197 Set s = (Set) sb.get(englishName); 198 if (result.length() != 0) { 199 result.append("; "); 200 } 201 String code = ""; 202 boolean needQualifier = s.size() != 1; 203 titleSet.clear(); 204 qualifierSet.clear(); 205 206 for (Iterator<LanguageList> it2 = s.iterator(); it2.hasNext();) { 207 LanguageList llist = it2.next(); 208 String localName = llist.getLocalName(); 209 String locale = llist.getLocale(); 210 211 // see if we need qualifier 212 String lang = locale, country = ""; 213 if (locale.length() > 3 214 && locale.charAt(locale.length() - 3) == '_') { 215 lang = locale.substring(0, locale.length() - 3); 216 country = locale.substring(locale.length() - 2); 217 } 218 219 // fix 220 if (BIDI_R.containsSome(localName)) 221 localName = '\u200E' + localName + '\u200E'; 222 223 // qualifiers += lang; 224 225 if (isLanguage) { 226 code = lang; 227 } else { 228 code = country; 229 } 230 231 if (!localName.equalsIgnoreCase(englishName)) { 232 needQualifier = true; 233 qualifierSet.add(localName); 234 235 if (transliterate && NON_LATIN.containsSome(localName) 236 && !lang.equals("ja")) { 237 String transName = localName; 238 try { 239 transName = fixedTitleCase("en", 240 toLatin.transliterate(localName)); 241 } catch (RuntimeException e) { 242 System.out.println("\t" + e.getMessage()); 243 } 244 if (NON_LATIN.containsSome(transName)) { 245 Log.logln("Can't transliterate " + localName 246 + ": " + transName); 247 } else { 248 titleSet.add(transName); 249 } 250 } 251 } 252 } 253 String title = code + (titleSet.isEmpty() ? "" : ": " + titleSet.toString()); 254 String before = "", after = ""; 255 if (title.length() != 0) { 256 before = "<span title=\'" 257 + TransliteratorUtilities.toHTML.transliterate(title) + "'>"; 258 after = "</span>"; 259 } 260 String qualifiers = qualifierSet.toString(); 261 if (!needQualifier || qualifierSet.isEmpty()) 262 qualifiers = ""; 263 else 264 qualifiers = " " + qualifiers; // qualifiers = " (" + qualifiers + ")"; 265 266 // fix 267 if (englishName.endsWith(", China")) { 268 englishName = englishName.substring(0, englishName.length() 269 - ", China".length()) 270 + " China"; 271 } 272 273 result.append(before) 274 .append( 275 TransliteratorUtilities.toHTML.transliterate(englishName 276 + qualifiers)) 277 .append(after); 278 } 279 return result.toString(); 280 } 281 282 /** 283 * @param localeID 284 * @param isDraft 285 * TODO 286 * @param draftLanguages 287 * @param draftCountries 288 * @param draftNativeLanguages 289 * @param draftNativeCountries 290 */ 291 private static void addCounts(String localeID, boolean isDraft, Set<String> draftLanguages, Set<String> draftCountries, 292 Set<Object> draftNativeLanguages, Set<Object> draftNativeCountries) { 293 // ULocale uloc = new ULocale(localeName); 294 ltp.set(localeID); 295 String lang = ltp.getLanguage(); 296 String langScript = ltp.getLanguageScript(); 297 String country = ltp.getRegion(); 298 299 // dump aliases 300 // if ((country.equals("TW") || country.equals("HK") || country.equals("MO")) && lang.equals("zh")) return; 301 // if (lang.equals("zh_Hans") || lang.equals("sr_Cyrl") || lang.equals("sh")) return; 302 303 String nativeName, englishName; 304 draftLanguages.add(lang); 305 nativeName = getFixedLanguageName(localeID, langScript); 306 englishName = english.getName(langScript); 307 if (!lang.equals("en") && nativeName.equals(englishName)) { 308 Log.logln((isDraft ? "D" : "") + "\tWarning: in " + localeID + ", display name for " + lang 309 + " equals English: " + nativeName); 310 } 311 312 draftNativeLanguages.add(new LanguageList(langScript, englishName, fixedTitleCase("en", nativeName))); 313 314 if (!country.equals("")) { 315 draftCountries.add(country); 316 nativeName = getFixedDisplayCountry(localeID, country); 317 englishName = getFixedDisplayCountry("en", country); 318 if (!lang.equals("en") && nativeName.equals(englishName)) { 319 Log.logln((isDraft ? "D" : "") + "\tWarning: in " + localeID + ", display name for " + country 320 + " equals English: " + nativeName); 321 } 322 draftNativeCountries.add(new LanguageList(localeID, englishName, fixedTitleCase("en", nativeName))); 323 } 324 } 325 326 private static class LanguageList implements Comparable<Object> { 327 Object[] contents; 328 static Collator col = Collator.getInstance(ULocale.ENGLISH); 329 static Comparator<Object[]> comp = new ArrayComparator(new Collator[] { col, col, null }); 330 331 LanguageList(String locale, String englishName, String localName) { 332 contents = new Object[] { englishName, locale, localName }; 333 } 334 335 public int compareTo(Object o) { 336 return comp.compare(contents, ((LanguageList) o).contents); 337 } 338 339 String getLocale() { 340 return (String) contents[1]; 341 } 342 343 String getEnglishName() { 344 return (String) contents[0]; 345 } 346 347 String getLocalName() { 348 return (String) contents[2]; 349 } 350 } 351 352 static String fixedTitleCase(String localeID, String in) { 353 if (notitlecase) return in; 354 String result = UCharacter.toTitleCase(new ULocale(localeID), in, null); 355 if (HACK) { 356 result = GenerateCldrTests.replace(result, "U.s.", "U.S."); 357 result = GenerateCldrTests.replace(result, "S.a.r.", "S.A.R."); 358 } 359 return result; 360 } 361 362 /* 363 * static void addMapSet(Map m, Object key, Object value, Comparator com) { 364 * Set valueSet = (Set) m.get(key); 365 * if (valueSet == null) { 366 * valueSet = new TreeSet(com); 367 * m.put(key, valueSet); 368 * } 369 * valueSet.add(value); 370 * } 371 */ 372 373 /** 374 * 375 */ 376 private static String getFixedLanguageName(String localeID, String lang) { 377 if (HACK) { 378 if (localeID.equals("bs") || localeID.startsWith("bs_")) { 379 if (lang.equals("bs") || lang.startsWith("bs_")) return "Bosanski"; 380 } 381 } 382 CLDRFile cldr = factory.make(localeID, true); 383 return cldr.getName(lang); 384 } 385 386 /** 387 * @param uloc 388 * @return 389 */ 390 private static String getFixedDisplayCountry(String localeID, String country) { 391 if (HACK) { 392 if (localeID.equals("bs") || localeID.startsWith("bs_")) { 393 if (country.equals("BA")) 394 return "\u0411\u043E\u0441\u043D\u0430 \u0438 \u0425\u0435\u0440\u0446\u0435\u0433\u043E\u0432\u0438\u043D\u0430"; 395 } 396 } 397 CLDRFile cldr = factory.make(localeID, true); 398 String name = cldr.getName("territory", country); 399 if (false && HACK) { 400 Object trial = fixCountryNames.get(name); 401 if (trial != null) { 402 return (String) trial; 403 } 404 } 405 return name; 406 } 407 408 static Map<String, String> fixCountryNames = new HashMap<String, String>(); 409 static { 410 fixCountryNames.put("\u0408\u0443\u0433\u043E\u0441\u043B\u0430\u0432\u0438\u0458\u0430", 411 "\u0421\u0440\u0431\u0438\u0458\u0430 \u0438 \u0426\u0440\u043D\u0430 \u0413\u043E\u0440\u0430"); 412 fixCountryNames.put("Jugoslavija", "Srbija i Crna Gora"); 413 fixCountryNames.put("Yugoslavia", "Serbia and Montenegro"); 414 } 415 public static final Transliterator toLatin = Transliterator.getInstance("any-latin"); 416 417 public static class DraftChecker { 418 String dir; 419 Map<String, Object> cache = new HashMap<String, Object>(); 420 Object TRUE = new Object(); 421 Object FALSE = new Object(); 422 423 public DraftChecker(String dir) { 424 this.dir = dir; 425 } 426 427 public boolean isDraft(String localeName) { 428 Object check = cache.get(localeName); 429 if (check != null) { 430 return check == TRUE; 431 } 432 BufferedReader pw = null; 433 //boolean result = true; 434 try { 435 pw = FileUtilities.openUTF8Reader(dir, localeName + ".xml"); 436 while (true) { 437 String line = pw.readLine(); 438 if (line == null) { 439 throw new IllegalArgumentException("Internal Error: should never get here."); 440 } 441 if (line.indexOf("<ldml") >= 0) { 442 if (line.indexOf("draft") >= 0) { 443 check = TRUE; 444 } else { 445 check = FALSE; 446 } 447 break; 448 } 449 } 450 pw.close(); 451 } catch (IOException e) { 452 throw new ICUUncheckedIOException("Failure on " + localeName + ": " + dir + localeName + ".xml", e); 453 } 454 cache.put(localeName, check); 455 return check == TRUE; 456 } 457 } 458 459 }