Home | History | Annotate | Download | only in unittest
      1 package org.unicode.cldr.unittest;
      2 
      3 import java.util.Arrays;
      4 import java.util.HashSet;
      5 import java.util.Map;
      6 import java.util.Map.Entry;
      7 import java.util.Set;
      8 import java.util.TreeMap;
      9 import java.util.TreeSet;
     10 
     11 import org.unicode.cldr.draft.ScriptMetadata;
     12 import org.unicode.cldr.draft.ScriptMetadata.Info;
     13 import org.unicode.cldr.tool.LikelySubtags;
     14 import org.unicode.cldr.util.CLDRConfig;
     15 import org.unicode.cldr.util.CLDRFile;
     16 import org.unicode.cldr.util.ChainedMap;
     17 import org.unicode.cldr.util.ChainedMap.M3;
     18 import org.unicode.cldr.util.Containment;
     19 import org.unicode.cldr.util.LanguageTagParser;
     20 import org.unicode.cldr.util.StandardCodes;
     21 import org.unicode.cldr.util.SupplementalDataInfo;
     22 
     23 import com.ibm.icu.dev.test.TestFmwk;
     24 import com.ibm.icu.lang.UCharacter;
     25 import com.ibm.icu.lang.UProperty;
     26 import com.ibm.icu.lang.UScript;
     27 import com.ibm.icu.text.UnicodeSet;
     28 import com.ibm.icu.util.VersionInfo;
     29 
     30 public class LikelySubtagsTest extends TestFmwk {
     31 
     32     private boolean DEBUG = false;
     33     private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig
     34         .getInstance().getSupplementalDataInfo();
     35     static final Map<String, String> likely = SUPPLEMENTAL_DATA_INFO
     36         .getLikelySubtags();
     37     static final LikelySubtags LIKELY = new LikelySubtags(
     38         SUPPLEMENTAL_DATA_INFO, likely);
     39 
     40     public static void main(String[] args) {
     41         new LikelySubtagsTest().run(args);
     42     }
     43 
     44     static class Tags {
     45         final Set<String> languages = new TreeSet<String>();
     46         final Set<String> scripts = new TreeSet<String>();
     47         final Set<String> regions = new TreeSet<String>();
     48         final Set<String> scriptRegion = new TreeSet<String>();
     49         final Set<String> languageScript = new TreeSet<String>();
     50         final Set<String> languageRegion = new TreeSet<String>();
     51         final Set<String> all = new TreeSet<String>();
     52         final ChainedMap.M4<String, String, String, Boolean> languageToScriptToRegions = ChainedMap
     53             .of(new TreeMap<String, Object>(),
     54                 new TreeMap<String, Object>(),
     55                 new TreeMap<String, Object>(), Boolean.class);
     56         final ChainedMap.M3<String, String, Boolean> languageToRegions = ChainedMap
     57             .of(new TreeMap<String, Object>(),
     58                 new TreeMap<String, Object>(), Boolean.class);
     59 
     60         public Tags() {
     61             final LanguageTagParser ltp = new LanguageTagParser();
     62             for (Entry<String, String> entry : likely.entrySet()) {
     63                 add(ltp.set(entry.getKey()), true);
     64                 add(ltp.set(entry.getValue()), false);
     65             }
     66             // add unfamiliar script, unfamiliar region
     67             for (String lang : languageToScriptToRegions.keySet()) {
     68                 if (lang.equals("und")) {
     69                     continue;
     70                 }
     71                 M3<String, String, Boolean> scriptToRegion = languageToScriptToRegions
     72                     .get(lang);
     73                 final Set<String> scriptsFor = scriptToRegion.keySet();
     74                 final Set<String> regionsFor = languageToRegions.get(lang)
     75                     .keySet();
     76 
     77                 String firstScriptNotIn = getNonEmptyNotIn(scripts, scriptsFor);
     78                 String firstRegionNotIn = getNonEmptyNotIn(regions, regionsFor);
     79 
     80                 languageToScriptToRegions.put(lang, firstScriptNotIn,
     81                     firstRegionNotIn, Boolean.TRUE);
     82                 // clone for safety before iterating
     83                 for (String script : new HashSet<String>(scriptsFor)) {
     84                     languageToScriptToRegions.put(lang, script,
     85                         firstRegionNotIn, Boolean.TRUE);
     86                 }
     87                 for (String region : new HashSet<String>(regionsFor)) {
     88                     languageToScriptToRegions.put(lang, firstScriptNotIn,
     89                         region, Boolean.TRUE);
     90                 }
     91             }
     92 
     93             // System.out.println("all: " + all);
     94             // System.out.println("scriptRegion: " + scriptRegion);
     95             // System.out.println("languageScript: " + languageScript);
     96             // System.out.println("languageRegion: " + languageRegion);
     97         }
     98 
     99         private static <T> T getNonEmptyNotIn(Iterable<T> a, Set<T> b) {
    100             for (T x : a) {
    101                 if (!b.contains(x) && !x.toString().isEmpty()) {
    102                     return x;
    103                 }
    104             }
    105             throw new IllegalArgumentException();
    106         }
    107 
    108         void add(LanguageTagParser ltp, boolean source) {
    109             String sourceLanguage = ltp.getLanguage();
    110             String sourceScript = ltp.getScript();
    111             String sourceRegion = ltp.getRegion();
    112             languageToScriptToRegions.put(sourceLanguage, sourceScript,
    113                 sourceRegion, Boolean.TRUE);
    114             languageToScriptToRegions.put(sourceLanguage, sourceScript, "",
    115                 Boolean.TRUE);
    116             languageToScriptToRegions.put(sourceLanguage, "", "", Boolean.TRUE);
    117             languageToRegions.put(sourceLanguage, "", Boolean.TRUE);
    118             if (StandardCodes.isCountry(sourceRegion)) {
    119                 languageToScriptToRegions.put(sourceLanguage, "", sourceRegion,
    120                     Boolean.TRUE);
    121                 languageToRegions.put(sourceLanguage, sourceRegion,
    122                     Boolean.TRUE);
    123             }
    124 
    125             // capture all cases of 2 items
    126             if (source) {
    127                 if (!sourceScript.isEmpty() && !sourceRegion.isEmpty()) {
    128                     if (!sourceLanguage.equals("und")) {
    129                         all.add(ltp.toString());
    130                     } else {
    131                         scriptRegion.add(ltp.toString());
    132                     }
    133                 } else if (!sourceLanguage.equals("und")) {
    134                     if (!sourceScript.isEmpty()) {
    135                         languageScript.add(ltp.toString());
    136                     } else if (!sourceRegion.isEmpty()) {
    137                         languageRegion.add(ltp.toString());
    138                     }
    139                 }
    140             }
    141             languages.add(sourceLanguage);
    142             scripts.add(sourceScript);
    143             if (StandardCodes.isCountry(sourceRegion) || sourceRegion.isEmpty()) {
    144                 regions.add(sourceRegion);
    145             }
    146         }
    147     }
    148 
    149     static final Tags TAGS = new Tags();
    150 
    151     final LanguageTagParser maxLtp = new LanguageTagParser();
    152     final LanguageTagParser sourceLtp = new LanguageTagParser();
    153 
    154     /**
    155      * Return false if we should skip the language
    156      *
    157      * @param source
    158      * @return
    159      */
    160     public boolean checkAdding(String source) {
    161         // if X maps to Y, then adding a field from Y to X will still map to Y
    162         // Example:
    163         // und_AF => fa_Arab_AF
    164         // therefore, the following should also be true:
    165         // und_Arab_AF => fa_Arab_AF
    166         // fa_AF => fa_Arab_AF
    167         // fa_Arab_AF => fa_Arab_AF
    168 
    169         String max = LIKELY.maximize(source);
    170         if (!assertNotEquals("Maximize " + source, null, max)) {
    171             return source.contains("_");
    172         }
    173         sourceLtp.set(source);
    174         if (!sourceLtp.getRegion().isEmpty()
    175             && !StandardCodes.isCountry(sourceLtp.getRegion())) {
    176             return true;
    177         }
    178         maxLtp.set(max);
    179         for (int i = 1; i < 8; ++i) {
    180             if ((i & 1) != 0) {
    181                 if (!sourceLtp.getLanguage().equals("und"))
    182                     continue;
    183                 sourceLtp.setLanguage(maxLtp.getLanguage());
    184             }
    185             if ((i & 2) != 0) {
    186                 if (!sourceLtp.getScript().isEmpty())
    187                     continue;
    188                 sourceLtp.setScript(maxLtp.getScript());
    189             }
    190             if ((i & 4) != 0) {
    191                 if (!sourceLtp.getRegion().isEmpty())
    192                     continue;
    193                 sourceLtp.setRegion(maxLtp.getRegion());
    194             }
    195             String test = sourceLtp.toString();
    196             final String maximize = LIKELY.maximize(test);
    197             if (!max.equals(maximize)) {
    198                 if (!assertEquals(source + " -> " + max + ", so testing "
    199                     + test, max, maximize)) {
    200                     LIKELY.maximize(test); // do again for debugging
    201                 }
    202             }
    203             sourceLtp.set(source); // restore
    204         }
    205         return true;
    206     }
    207 
    208     public void TestCompleteness() {
    209         // if (logKnownIssue("Cldrbug:7121",
    210         // "Problems with likely subtags test")) {
    211         // return;
    212         // }
    213         // checkAdding("und_Bopo");
    214         // checkAdding("und_Brai");
    215         // checkAdding("und_Limb");
    216         // checkAdding("und_Cakm");
    217         // checkAdding("und_Shaw");
    218 
    219         final LanguageTagParser ltp = new LanguageTagParser();
    220         if (DEBUG) {
    221             System.out.println(TAGS.languages.size() + "\t" + TAGS.languages);
    222             System.out.println(TAGS.scripts.size() + "\t" + TAGS.scripts);
    223             System.out.println(TAGS.regions.size() + "\t" + TAGS.regions);
    224         }
    225         main: for (Entry<String, Map<String, Map<String, Boolean>>> languageScriptRegion : TAGS.languageToScriptToRegions) {
    226             String language = languageScriptRegion.getKey();
    227             ltp.set(language); // clears script, region
    228             for (Entry<String, Map<String, Boolean>> scriptRegion : languageScriptRegion
    229                 .getValue().entrySet()) {
    230                 String script = scriptRegion.getKey();
    231                 ltp.setScript(script);
    232                 for (String region : scriptRegion.getValue().keySet()) {
    233                     ltp.setRegion(region);
    234                     String testTag = ltp.toString();
    235                     // System.out.println(testTag);
    236                     if (!checkAdding(testTag)) {
    237                         continue main;
    238                     }
    239                 }
    240             }
    241         }
    242     }
    243 
    244     static Set<String> exceptions = new HashSet<String>(Arrays.asList("Zyyy",
    245         "Zinh", "Zzzz", "Brai"));
    246 
    247     public void TestStability() {
    248         // when maximized must never change
    249         // first get all the subtags
    250         // then test all the combinations
    251         LanguageTagParser ltp = new LanguageTagParser();
    252         for (Entry<String, String> entry : likely.entrySet()) {
    253             ltp.set(entry.getKey());
    254             String sourceLanguage = ltp.getLanguage();
    255             if (sourceLanguage.equals("und")) {
    256                 sourceLanguage = "";
    257             }
    258             String sourceScript = ltp.getScript();
    259             String sourceRegion = ltp.getRegion();
    260             ltp.set(entry.getValue());
    261             String targetLanguage = ltp.getLanguage();
    262             String targetScript = ltp.getScript();
    263             String targetRegion = ltp.getRegion();
    264             if (!sourceLanguage.isEmpty()) {
    265                 assertEquals("language", sourceLanguage, targetLanguage);
    266             }
    267             if (!sourceScript.isEmpty()) {
    268                 assertEquals("script", sourceScript, targetScript);
    269             }
    270             if (!sourceRegion.isEmpty()) {
    271                 if (Containment.isLeaf(sourceRegion)) {
    272                     assertEquals("region", sourceRegion, targetRegion);
    273                 }
    274             }
    275         }
    276 
    277     }
    278 
    279     public void TestForMissingScriptMetadata() {
    280         TreeSet<String> metadataScripts = new TreeSet<String>(
    281             ScriptMetadata.getScripts());
    282         UnicodeSet current = new UnicodeSet(0, 0x10FFFF);
    283         UnicodeSet toRemove = new UnicodeSet();
    284 
    285         while (!current.isEmpty()) {
    286             int ch = current.charAt(0);
    287             int script = UScript.getScript(ch);
    288             String shortName = UScript.getShortName(script);
    289             Info i = ScriptMetadata.getInfo(shortName);
    290             if (i == null) {
    291                 errln("Script Metadata is missing: " + shortName);
    292                 continue;
    293             }
    294             if (i.likelyLanguage.equals("und")
    295                 && !exceptions.contains(shortName)) {
    296                 errln("Script has no likely language: " + shortName);
    297             }
    298             toRemove.applyIntPropertyValue(UProperty.SCRIPT, script);
    299             current.removeAll(toRemove);
    300             metadataScripts.remove(shortName);
    301         }
    302         metadataScripts
    303             .removeAll(Arrays.asList("Hans", "Hant", "Hanb", "Jamo", "Jpan", "Kore")); // remove
    304         // "combo"
    305         // scripts
    306         if (!metadataScripts.isEmpty()) {
    307             // Warning, not error, so that we can add scripts to the script metadata
    308             // and later update to the Unicode version that has characters for those scripts.
    309             warnln("Script Metadata for characters not in Unicode: "
    310                 + metadataScripts);
    311         }
    312     }
    313 
    314     public void TestMissingInfoForLanguage() {
    315         CLDRFile english = CLDRConfig.getInstance().getEnglish();
    316 
    317         for (String language : CLDRConfig.getInstance().getCldrFactory()
    318             .getAvailableLanguages()) {
    319             if (language.contains("_") || language.equals("root")) {
    320                 continue;
    321             }
    322             String likelyExpansion = likely.get(language);
    323             if (likelyExpansion == null) {
    324                 errln("Missing likely subtags for: " + language);
    325             } else {
    326                 logln("Likely subtags for " + language + ":\t " + likely);
    327             }
    328             String path = CLDRFile.getKey(CLDRFile.LANGUAGE_NAME, language);
    329             String englishName = english.getStringValue(path);
    330             if (englishName == null) {
    331                 errln("Missing English translation for: " + language);
    332             }
    333         }
    334     }
    335 
    336     public void TestMissingInfoForRegion() {
    337         CLDRFile english = CLDRConfig.getInstance().getEnglish();
    338 
    339         for (String region : StandardCodes.make().getGoodAvailableCodes(
    340             "territory")) {
    341             String likelyExpansion = likely.get("und_" + region);
    342             if (likelyExpansion == null) {
    343                 if (region.equals("ZZ") || region.equals("001") || region.equals("UN")
    344                     || SUPPLEMENTAL_DATA_INFO.getContained(region) == null) { // not
    345                     // container
    346                     String likelyTag = LikelySubtags.maximize("und_" + region,
    347                         likely);
    348                     if (likelyTag == null || !likelyTag.startsWith("en_Latn_")) {
    349                         errln("Missing likely subtags for region: " + region
    350                             + "\t" + english.getName("territory", region));
    351                     }
    352                 } else { // container
    353                     errln("Missing likely subtags for macroregion (fix to exclude regions having 'en'): "
    354                         + region
    355                         + "\t"
    356                         + english.getName("territory", region));
    357                 }
    358             } else {
    359                 logln("Likely subtags for region: " + region + ":\t " + likely);
    360             }
    361             String path = CLDRFile.getKey(CLDRFile.TERRITORY_NAME, region);
    362             String englishName = english.getStringValue(path);
    363             if (englishName == null) {
    364                 errln("Missing English translation for: " + region);
    365             }
    366         }
    367     }
    368 
    369     public void TestMissingInfoForScript() {
    370         VersionInfo icuUnicodeVersion = UCharacter.getUnicodeVersion();
    371         TreeSet<String> sorted = new TreeSet<String>(
    372             ScriptMetadata.getScripts());
    373         Set<String> exceptions2 = new HashSet<String>(
    374             Arrays.asList("zh_Hans_CN"));
    375         for (String script : sorted) {
    376             if (exceptions.contains(script) || script.equals("Latn")
    377                 || script.equals("Dsrt")) {
    378                 // we minimize away und_X, when the code puts in en...US
    379                 continue;
    380             }
    381             Info i = ScriptMetadata.getInfo(script);
    382             // System.out.println(i);
    383             String likelyLanguage = i.likelyLanguage;
    384             String originCountry = i.originCountry;
    385             String undScript = "und_" + script;
    386             String langScript = likelyLanguage + "_" + script + "_";
    387             String likelyExpansion = likely.get(undScript);
    388             if (likelyExpansion == null) {
    389                 String msg = "Missing likely language for script (und_" + script
    390                     + ")  should be something like:\t "
    391                     + showOverride(script, originCountry, langScript);
    392                 if (i.age.compareTo(icuUnicodeVersion) <= 0) {
    393                     // Error: Missing data for a script in ICU's Unicode version.
    394                     errln(msg);
    395                 } else {
    396                     // Warning: Missing data for a script in a future Unicode version.
    397                     warnln(msg);
    398                 }
    399             } else if (!exceptions2.contains(likelyExpansion)
    400                 && !likelyExpansion.startsWith(langScript)) {
    401                 // if
    402                 // (logKnownIssue("Cldrbug:7181","Missing script metadata for "
    403                 // + script)
    404                 // && (script.equals("Tfng") || script.equals("Brah"))) {
    405                 // logln("Wrong likely language for script (und_" + script +
    406                 // "). Should not be " + likelyExpansion
    407                 // + ", but something like:\t " + showOverride(script,
    408                 // originCountry, langScript));
    409                 // } else {
    410                 errln("Wrong likely language for script (und_" + script
    411                     + "). Should not be " + likelyExpansion
    412                     + ", but something like:\t "
    413                     + showOverride(script, originCountry, langScript));
    414                 // }
    415             } else {
    416                 logln("OK: " + undScript + " => " + likelyExpansion);
    417             }
    418         }
    419         /**
    420          * und_Bopo => zh_Bopo_TW und_Copt => cop_Copt_EG // fix 002 und_Dsrt =>
    421          * en_Dsrt_US // fix US
    422          */
    423     }
    424 
    425     public String showOverride(String script, String originCountry,
    426         String langScript) {
    427         return "{\"und_" + script + "\", \"" + langScript + originCountry
    428             + "\"},";
    429     }
    430 }
    431