Home | History | Annotate | Download | only in tool
      1 /*
      2  * Created on May 19, 2005
      3  * Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others.
      4  * For terms of use, see http://www.unicode.org/terms_of_use.html
      5  */
      6 package org.unicode.cldr.tool;
      7 
      8 import java.io.BufferedReader;
      9 import java.io.IOException;
     10 import java.io.PrintWriter;
     11 import java.util.Comparator;
     12 import java.util.HashMap;
     13 import java.util.HashSet;
     14 import java.util.Iterator;
     15 import java.util.Map;
     16 import java.util.Set;
     17 import java.util.TreeMap;
     18 import java.util.TreeSet;
     19 
     20 import org.unicode.cldr.draft.FileUtilities;
     21 import org.unicode.cldr.util.ArrayComparator;
     22 import org.unicode.cldr.util.CLDRFile;
     23 import org.unicode.cldr.util.CldrUtility;
     24 import org.unicode.cldr.util.Factory;
     25 import org.unicode.cldr.util.LanguageTagParser;
     26 import org.unicode.cldr.util.Log;
     27 import org.unicode.cldr.util.StandardCodes;
     28 import org.unicode.cldr.util.TransliteratorUtilities;
     29 
     30 import com.ibm.icu.lang.UCharacter;
     31 import com.ibm.icu.text.Collator;
     32 import com.ibm.icu.text.Transliterator;
     33 import com.ibm.icu.text.UnicodeSet;
     34 import com.ibm.icu.util.ICUUncheckedIOException;
     35 import com.ibm.icu.util.ULocale;
     36 
     37 /**
     38  * @throws IOException
     39  *
     40  */
     41 class GenerateStatistics {
     42     static final boolean HACK = true;
     43     static CLDRFile english;
     44     static Factory factory;
     45     static LanguageTagParser ltp = new LanguageTagParser();
     46     static Collator col = Collator.getInstance(ULocale.ENGLISH);
     47     static boolean notitlecase = true;
     48 
     49     public static void generateSize(String sourceDir, String logDir, String match, boolean transliterate)
     50         throws IOException {
     51         factory = Factory.make(sourceDir, match);
     52         ToolUtilities.registerExtraTransliterators();
     53 
     54         PrintWriter logHtml = FileUtilities.openUTF8Writer(logDir, "test_generation_log.html");
     55         //String dir = logDir + "main" + File.separator;
     56         // DraftChecker dc = new DraftChecker(dir);
     57         english = factory.make("en", true);
     58         Set<String> languages = new TreeSet<String>(col), countries = new TreeSet<String>(col), draftLanguages = new TreeSet<String>(
     59             col), draftCountries = new TreeSet<String>(col);
     60         Set<Object> nativeLanguages = new TreeSet<Object>(), nativeCountries = new TreeSet<Object>(), draftNativeLanguages = new TreeSet<Object>(),
     61             draftNativeCountries = new TreeSet<Object>();
     62         int localeCount = 0;
     63         int draftLocaleCount = 0;
     64 
     65         Set<String> contents = removeSingleLanguagesWhereWeHaveScripts(factory.getAvailable());
     66 
     67         for (Iterator<String> it = contents.iterator(); it.hasNext();) {
     68             String localeID = it.next();
     69             if (CLDRFile.isSupplementalName(localeID)) continue;
     70             if (localeID.equals("root"))
     71                 continue; // skip root
     72             System.out.println("Collecting info for:\t" + localeID.replace("_", "\t"));
     73             boolean draft = false; // dc.isDraft(localeName);
     74             if (draft) {
     75                 draftLocaleCount++;
     76                 addCounts(localeID, true, draftLanguages,
     77                     draftCountries, draftNativeLanguages,
     78                     draftNativeCountries);
     79             } else {
     80                 localeCount++;
     81                 addCounts(localeID, false, languages,
     82                     countries, nativeLanguages, nativeCountries);
     83             }
     84             if (false)
     85                 Log.logln(draft + ", " + localeCount + ", "
     86                     + languages.size() + ", " + countries.size() + ", "
     87                     + draftLocaleCount + ", " + draftLanguages.size()
     88                     + ", " + draftCountries.size());
     89         }
     90         draftLanguages.removeAll(languages);
     91         for (Iterator<Object> it = nativeLanguages.iterator(); it.hasNext();) {
     92             draftNativeLanguages.remove(it.next());
     93         }
     94         logHtml.println("<html><head>");
     95         logHtml
     96             .println("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
     97         logHtml.println("</head><body>");
     98         logHtml.println("<p><b>Locales (" + localeCount + "):</b>");
     99         logHtml.println("<p><b>Languages (" + languages.size() + "):</b>");
    100         logHtml.println(showSet(nativeLanguages, transliterate, true));
    101         logHtml.println("<p><b>Territories (" + countries.size() + "):</b>");
    102         logHtml.println(showSet(nativeCountries, transliterate, false));
    103         logHtml.println("<p><b>Draft locales (" + draftLocaleCount + "):</b>");
    104         logHtml.println("<p><b>Draft languages (" + draftLanguages.size()
    105             + "):</b>");
    106         logHtml.println(showSet(draftNativeLanguages, transliterate, true));
    107         logHtml.println("<p><b>Draft countries (" + draftCountries.size()
    108             + "):</b>");
    109         logHtml.println(showSet(draftNativeCountries, transliterate, false));
    110         logHtml.println(CldrUtility.ANALYTICS);
    111         logHtml.println("</body></html>");
    112         logHtml.close();
    113     }
    114 
    115     /**
    116      *
    117      */
    118     private static Set<String> removeSingleLanguagesWhereWeHaveScripts(Set<String> contents) {
    119         StandardCodes sc = StandardCodes.make();
    120         contents = new TreeSet<String>(contents); // make writable
    121         if (false && HACK) {
    122             contents.add("bs_Latn");
    123             contents.add("bs_Cyrl");
    124             contents.add("bs_Latn_BA");
    125             contents.add("bs_Cyrl_BA");
    126         }
    127         // find the languages with scripts
    128         Set<String> toRemove = new HashSet<String>();
    129         if (HACK) toRemove.add("sh");
    130 
    131         for (Iterator<String> it = contents.iterator(); it.hasNext();) {
    132             String localeID = it.next();
    133             if (CLDRFile.isSupplementalName(localeID)) {
    134                 continue;
    135             }
    136             // if there is a lang_script, then remove everything starting with lang that doesn't have "a" script
    137             String lang = ltp.set(localeID).getLanguage();
    138             String territory = ltp.set(localeID).getRegion();
    139             if (!sc.getGoodAvailableCodes("language").contains(lang)) {
    140                 System.out.println("Odd language, removing: " + localeID);
    141                 it.remove();
    142                 continue;
    143             }
    144             if (territory.length() != 0 && !sc.getGoodAvailableCodes("territory").contains(territory)) {
    145                 System.out.println("Odd territory, removing: " + localeID);
    146                 it.remove();
    147                 continue;
    148             }
    149             String langscript = ltp.set(localeID).getLanguageScript();
    150             if (!lang.equals(langscript)) toRemove.add(lang);
    151         }
    152 
    153         for (Iterator<String> it = contents.iterator(); it.hasNext();) {
    154             String localeID = it.next();
    155             if (CLDRFile.isSupplementalName(localeID)) {
    156                 continue;
    157             }
    158             // if there is a lang_script, then remove everything starting with lang that doesn't have "a" script
    159             String lang = ltp.set(localeID).getLanguage();
    160             if (!toRemove.contains(lang)) continue;
    161             String langscript = ltp.set(localeID).getLanguageScript();
    162             if (lang.equals(langscript)) it.remove();
    163         }
    164         return contents;
    165     }
    166 
    167     static final UnicodeSet NON_LATIN = new UnicodeSet("[^[:latin:][:common:][:inherited:]]");
    168 
    169     /**
    170      * @param nativeCountries
    171      * @param transliterate
    172      *            TODO
    173      * @param isLanguage
    174      *            TODO
    175      */
    176     @SuppressWarnings({ "unchecked", "rawtypes" })
    177     private static String showSet(Set nativeCountries, boolean transliterate,
    178         boolean isLanguage) {
    179         UnicodeSet BIDI_R = new UnicodeSet(
    180             "[[:Bidi_Class=R:][:Bidi_Class=AL:]]");
    181         StringBuffer result = new StringBuffer();
    182         Map sb = new TreeMap(LanguageList.col);
    183         // collect multiples by English name
    184         for (Iterator it = nativeCountries.iterator(); it.hasNext();) {
    185             LanguageList llist = (LanguageList) it.next();
    186             Set s = (Set) sb.get(llist.getEnglishName());
    187             if (s == null)
    188                 sb.put(llist.getEnglishName(), s = new TreeSet());
    189             s.add(llist);
    190         }
    191 
    192         Set<String> titleSet = new TreeSet<String>(col);
    193         Set<String> qualifierSet = new TreeSet<String>(col);
    194 
    195         for (Iterator<String> it = sb.keySet().iterator(); it.hasNext();) {
    196             String englishName = it.next();
    197             Set s = (Set) sb.get(englishName);
    198             if (result.length() != 0) {
    199                 result.append("; ");
    200             }
    201             String code = "";
    202             boolean needQualifier = s.size() != 1;
    203             titleSet.clear();
    204             qualifierSet.clear();
    205 
    206             for (Iterator<LanguageList> it2 = s.iterator(); it2.hasNext();) {
    207                 LanguageList llist = it2.next();
    208                 String localName = llist.getLocalName();
    209                 String locale = llist.getLocale();
    210 
    211                 // see if we need qualifier
    212                 String lang = locale, country = "";
    213                 if (locale.length() > 3
    214                     && locale.charAt(locale.length() - 3) == '_') {
    215                     lang = locale.substring(0, locale.length() - 3);
    216                     country = locale.substring(locale.length() - 2);
    217                 }
    218 
    219                 // fix
    220                 if (BIDI_R.containsSome(localName))
    221                     localName = '\u200E' + localName + '\u200E';
    222 
    223                 // qualifiers += lang;
    224 
    225                 if (isLanguage) {
    226                     code = lang;
    227                 } else {
    228                     code = country;
    229                 }
    230 
    231                 if (!localName.equalsIgnoreCase(englishName)) {
    232                     needQualifier = true;
    233                     qualifierSet.add(localName);
    234 
    235                     if (transliterate && NON_LATIN.containsSome(localName)
    236                         && !lang.equals("ja")) {
    237                         String transName = localName;
    238                         try {
    239                             transName = fixedTitleCase("en",
    240                                 toLatin.transliterate(localName));
    241                         } catch (RuntimeException e) {
    242                             System.out.println("\t" + e.getMessage());
    243                         }
    244                         if (NON_LATIN.containsSome(transName)) {
    245                             Log.logln("Can't transliterate " + localName
    246                                 + ": " + transName);
    247                         } else {
    248                             titleSet.add(transName);
    249                         }
    250                     }
    251                 }
    252             }
    253             String title = code + (titleSet.isEmpty() ? "" : ": " + titleSet.toString());
    254             String before = "", after = "";
    255             if (title.length() != 0) {
    256                 before = "<span title=\'"
    257                     + TransliteratorUtilities.toHTML.transliterate(title) + "'>";
    258                 after = "</span>";
    259             }
    260             String qualifiers = qualifierSet.toString();
    261             if (!needQualifier || qualifierSet.isEmpty())
    262                 qualifiers = "";
    263             else
    264                 qualifiers = " " + qualifiers; // qualifiers = " (" + qualifiers + ")";
    265 
    266             // fix
    267             if (englishName.endsWith(", China")) {
    268                 englishName = englishName.substring(0, englishName.length()
    269                     - ", China".length())
    270                     + " China";
    271             }
    272 
    273             result.append(before)
    274                 .append(
    275                     TransliteratorUtilities.toHTML.transliterate(englishName
    276                         + qualifiers))
    277                 .append(after);
    278         }
    279         return result.toString();
    280     }
    281 
    282     /**
    283      * @param localeID
    284      * @param isDraft
    285      *            TODO
    286      * @param draftLanguages
    287      * @param draftCountries
    288      * @param draftNativeLanguages
    289      * @param draftNativeCountries
    290      */
    291     private static void addCounts(String localeID, boolean isDraft, Set<String> draftLanguages, Set<String> draftCountries,
    292         Set<Object> draftNativeLanguages, Set<Object> draftNativeCountries) {
    293         // ULocale uloc = new ULocale(localeName);
    294         ltp.set(localeID);
    295         String lang = ltp.getLanguage();
    296         String langScript = ltp.getLanguageScript();
    297         String country = ltp.getRegion();
    298 
    299         // dump aliases
    300         // if ((country.equals("TW") || country.equals("HK") || country.equals("MO")) && lang.equals("zh")) return;
    301         // if (lang.equals("zh_Hans") || lang.equals("sr_Cyrl") || lang.equals("sh")) return;
    302 
    303         String nativeName, englishName;
    304         draftLanguages.add(lang);
    305         nativeName = getFixedLanguageName(localeID, langScript);
    306         englishName = english.getName(langScript);
    307         if (!lang.equals("en") && nativeName.equals(englishName)) {
    308             Log.logln((isDraft ? "D" : "") + "\tWarning: in " + localeID + ", display name for " + lang
    309                 + " equals English: " + nativeName);
    310         }
    311 
    312         draftNativeLanguages.add(new LanguageList(langScript, englishName, fixedTitleCase("en", nativeName)));
    313 
    314         if (!country.equals("")) {
    315             draftCountries.add(country);
    316             nativeName = getFixedDisplayCountry(localeID, country);
    317             englishName = getFixedDisplayCountry("en", country);
    318             if (!lang.equals("en") && nativeName.equals(englishName)) {
    319                 Log.logln((isDraft ? "D" : "") + "\tWarning: in " + localeID + ", display name for " + country
    320                     + " equals English: " + nativeName);
    321             }
    322             draftNativeCountries.add(new LanguageList(localeID, englishName, fixedTitleCase("en", nativeName)));
    323         }
    324     }
    325 
    326     private static class LanguageList implements Comparable<Object> {
    327         Object[] contents;
    328         static Collator col = Collator.getInstance(ULocale.ENGLISH);
    329         static Comparator<Object[]> comp = new ArrayComparator(new Collator[] { col, col, null });
    330 
    331         LanguageList(String locale, String englishName, String localName) {
    332             contents = new Object[] { englishName, locale, localName };
    333         }
    334 
    335         public int compareTo(Object o) {
    336             return comp.compare(contents, ((LanguageList) o).contents);
    337         }
    338 
    339         String getLocale() {
    340             return (String) contents[1];
    341         }
    342 
    343         String getEnglishName() {
    344             return (String) contents[0];
    345         }
    346 
    347         String getLocalName() {
    348             return (String) contents[2];
    349         }
    350     }
    351 
    352     static String fixedTitleCase(String localeID, String in) {
    353         if (notitlecase) return in;
    354         String result = UCharacter.toTitleCase(new ULocale(localeID), in, null);
    355         if (HACK) {
    356             result = GenerateCldrTests.replace(result, "U.s.", "U.S.");
    357             result = GenerateCldrTests.replace(result, "S.a.r.", "S.A.R.");
    358         }
    359         return result;
    360     }
    361 
    362     /*
    363      * static void addMapSet(Map m, Object key, Object value, Comparator com) {
    364      * Set valueSet = (Set) m.get(key);
    365      * if (valueSet == null) {
    366      * valueSet = new TreeSet(com);
    367      * m.put(key, valueSet);
    368      * }
    369      * valueSet.add(value);
    370      * }
    371      */
    372 
    373     /**
    374      *
    375      */
    376     private static String getFixedLanguageName(String localeID, String lang) {
    377         if (HACK) {
    378             if (localeID.equals("bs") || localeID.startsWith("bs_")) {
    379                 if (lang.equals("bs") || lang.startsWith("bs_")) return "Bosanski";
    380             }
    381         }
    382         CLDRFile cldr = factory.make(localeID, true);
    383         return cldr.getName(lang);
    384     }
    385 
    386     /**
    387      * @param uloc
    388      * @return
    389      */
    390     private static String getFixedDisplayCountry(String localeID, String country) {
    391         if (HACK) {
    392             if (localeID.equals("bs") || localeID.startsWith("bs_")) {
    393                 if (country.equals("BA"))
    394                     return "\u0411\u043E\u0441\u043D\u0430 \u0438 \u0425\u0435\u0440\u0446\u0435\u0433\u043E\u0432\u0438\u043D\u0430";
    395             }
    396         }
    397         CLDRFile cldr = factory.make(localeID, true);
    398         String name = cldr.getName("territory", country);
    399         if (false && HACK) {
    400             Object trial = fixCountryNames.get(name);
    401             if (trial != null) {
    402                 return (String) trial;
    403             }
    404         }
    405         return name;
    406     }
    407 
    408     static Map<String, String> fixCountryNames = new HashMap<String, String>();
    409     static {
    410         fixCountryNames.put("\u0408\u0443\u0433\u043E\u0441\u043B\u0430\u0432\u0438\u0458\u0430",
    411             "\u0421\u0440\u0431\u0438\u0458\u0430 \u0438 \u0426\u0440\u043D\u0430 \u0413\u043E\u0440\u0430");
    412         fixCountryNames.put("Jugoslavija", "Srbija i Crna Gora");
    413         fixCountryNames.put("Yugoslavia", "Serbia and Montenegro");
    414     }
    415     public static final Transliterator toLatin = Transliterator.getInstance("any-latin");
    416 
    417     public static class DraftChecker {
    418         String dir;
    419         Map<String, Object> cache = new HashMap<String, Object>();
    420         Object TRUE = new Object();
    421         Object FALSE = new Object();
    422 
    423         public DraftChecker(String dir) {
    424             this.dir = dir;
    425         }
    426 
    427         public boolean isDraft(String localeName) {
    428             Object check = cache.get(localeName);
    429             if (check != null) {
    430                 return check == TRUE;
    431             }
    432             BufferedReader pw = null;
    433             //boolean result = true;
    434             try {
    435                 pw = FileUtilities.openUTF8Reader(dir, localeName + ".xml");
    436                 while (true) {
    437                     String line = pw.readLine();
    438                     if (line == null) {
    439                         throw new IllegalArgumentException("Internal Error: should never get here.");
    440                     }
    441                     if (line.indexOf("<ldml") >= 0) {
    442                         if (line.indexOf("draft") >= 0) {
    443                             check = TRUE;
    444                         } else {
    445                             check = FALSE;
    446                         }
    447                         break;
    448                     }
    449                 }
    450                 pw.close();
    451             } catch (IOException e) {
    452                 throw new ICUUncheckedIOException("Failure on " + localeName + ": " + dir + localeName + ".xml", e);
    453             }
    454             cache.put(localeName, check);
    455             return check == TRUE;
    456         }
    457     }
    458 
    459 }