Home | History | Annotate | Download | only in test
      1 package org.unicode.cldr.test;
      2 
      3 import java.util.Collections;
      4 import java.util.EnumMap;
      5 import java.util.List;
      6 import java.util.Map;
      7 import java.util.Set;
      8 import java.util.TreeSet;
      9 import java.util.regex.Matcher;
     10 
     11 import org.unicode.cldr.draft.ScriptMetadata;
     12 import org.unicode.cldr.draft.ScriptMetadata.Info;
     13 import org.unicode.cldr.draft.ScriptMetadata.Trinary;
     14 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
     15 import org.unicode.cldr.tool.LikelySubtags;
     16 import org.unicode.cldr.util.CLDRFile;
     17 import org.unicode.cldr.util.CldrUtility;
     18 import org.unicode.cldr.util.Counter;
     19 import org.unicode.cldr.util.Factory;
     20 import org.unicode.cldr.util.PathStarrer;
     21 import org.unicode.cldr.util.PatternCache;
     22 import org.unicode.cldr.util.RegexLookup;
     23 import org.unicode.cldr.util.XPathParts;
     24 
     25 import com.ibm.icu.dev.util.CollectionUtilities;
     26 import com.ibm.icu.lang.UCharacter;
     27 import com.ibm.icu.text.BreakIterator;
     28 import com.ibm.icu.util.ULocale;
     29 
     30 public class CheckConsistentCasing extends FactoryCheckCLDR {
     31 
    // When true, getSamples() collects the paths that map to no Category and prints them.
    private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false);

    // Dominance ratio used by getSamples(): the lowercase (or titlecase) count must be at least
    // this many times the opposite count before a category is labelled with that casing.
    private static final double MIN_FACTOR = 2.5;
    // remember to add this class to the list in CheckCLDR.getCheckAll
    // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.* -t.*Currencies.*

    XPathParts parts = new XPathParts(); // used to parse out a path
    // NOTE(review): uLocale and breaker are never assigned or read in the visible part of this
    // class - confirm whether they are still needed.
    ULocale uLocale = null;
    BreakIterator breaker = null;
    // Locale ID of the file currently being checked; set in setCldrFileToCheck().
    private String locale;
    // Source of per-locale casing data; created in the constructor from the same factory.
    CasingInfo casingInfo;
    // True when setCldrFileToCheck() ended up with a non-empty types map; handleCheck() skips
    // every path while this is false.
    private boolean hasCasingInfo;
     44 
     45     public CheckConsistentCasing(Factory factory) {
     46         super(factory);
     47         casingInfo = new CasingInfo(factory);
     48     }
     49 
     50     @Override
     51     public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
     52         List<CheckStatus> possibleErrors) {
     53         if (cldrFileToCheck == null) return this;
     54         super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
     55         locale = cldrFileToCheck.getLocaleID();
     56         // get info about casing; note that this is done in two steps since
     57         // ScriptMetadata.getInfo() returns null, in some instances.
     58         // OLD: Info localeInfo = ScriptMetadata.getInfo(locale);
     59         String script = new LikelySubtags().getLikelyScript(locale);
     60         Info localeInfo = ScriptMetadata.getInfo(script);
     61 
     62         if (localeInfo != null && localeInfo.hasCase == Trinary.YES) {
     63             // this script has casing info, so we can request it here
     64             try {
     65                 types = casingInfo.getLocaleCasing(locale);
     66             } catch (Exception e) {
     67                 types = Collections.emptyMap();
     68             }
     69         } else {
     70             // no casing info - since the types Map is global, and null checks aren't done,
     71             // we are better off  with an empty map here
     72             types = Collections.emptyMap();
     73         }
     74         if (types == null || types.isEmpty()) {
     75             possibleErrors.add(new CheckStatus().setCause(this)
     76                 .setMainType(CheckStatus.warningType)
     77                 .setSubtype(Subtype.incorrectCasing)
     78                 .setMessage("Could not load casing info for {0}", locale));
     79         }
     80         // types may be null, avoid NPE
     81         hasCasingInfo = (types == null) ? false : types.size() > 0;
     82         return this;
     83     }
     84 
     85     // If you don't need any file initialization or postprocessing, you only need this one routine
     86     public CheckCLDR handleCheck(String path, String fullPath, String value, Options options,
     87         List<CheckStatus> result) {
     88         // it helps performance to have a quick reject of most paths
     89         if (fullPath == null) return this; // skip paths that we don't have
     90         if (!hasCasingInfo) return this;
     91 
     92         String locale2 = getCldrFileToCheck().getSourceLocaleID(path, null);
     93         if (locale2.equals(locale) && value != null && value.length() > 0) {
     94             Category category = getCategory(path);
     95             if (category != null) {
     96                 checkConsistentCasing(category, path, fullPath, value, options, result);
     97             }
     98         }
     99         return this;
    100     }
    101 
    // Matcher for numbered placeholders such as {0}. It is not referenced by any live code in the
    // visible part of this class.
    // NOTE(review): a java.util.regex.Matcher is not safe for concurrent use, so sharing one
    // statically needs care if it is ever put back into use - confirm.
    static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher("");
    103 
    104     /**
    105      * The casing type of a given string.
    106      */
    107     public enum CasingType {
    108         titlecase, lowercase, other;
    109         public static CasingType from(String s) {
    110             if (s == null || s.length() == 0) {
    111                 return other;
    112             }
    113             int cp;
    114             // Look for the first meaningful character in the string to determine case.
    115             for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
    116                 cp = s.codePointAt(i);
    117                 // used to skip the placeholders, but works better to have them be 'other'
    118                 // if (cp == '{') {
    119                 // if (placeholder.reset(s).region(i,s.length()).lookingAt()) {
    120                 // i = placeholder.end() - 1; // skip
    121                 // continue;
    122                 // }
    123                 // }
    124                 int type = UCharacter.getType(cp);
    125                 switch (type) {
    126 
    127                 case UCharacter.LOWERCASE_LETTER:
    128                     return lowercase;
    129 
    130                 case UCharacter.UPPERCASE_LETTER:
    131                 case UCharacter.TITLECASE_LETTER:
    132                     return titlecase;
    133 
    134                 // for other letters / numbers / symbols, return other
    135                 case UCharacter.OTHER_LETTER:
    136                 case UCharacter.DECIMAL_DIGIT_NUMBER:
    137                 case UCharacter.LETTER_NUMBER:
    138                 case UCharacter.OTHER_NUMBER:
    139                 case UCharacter.MATH_SYMBOL:
    140                 case UCharacter.CURRENCY_SYMBOL:
    141                 case UCharacter.MODIFIER_SYMBOL:
    142                 case UCharacter.OTHER_SYMBOL:
    143                     return other;
    144                 // ignore everything else (whitespace, punctuation, etc) and keep going
    145                 }
    146             }
    147             return other;
    148         }
    149 
    150         /**
    151          * Return true if either is other, or they are identical.
    152          */
    153         public boolean worksWith(CasingType otherType) {
    154             return otherType == null || this == otherType || this == CasingType.other || otherType == CasingType.other;
    155         }
    156     }
    157 
    158     public enum CasingTypeAndErrFlag {
    159         titlecase_mismatchWarn(CasingType.titlecase, false), titlecase_mismatchErr(CasingType.titlecase, true), lowercase_mismatchWarn(CasingType.lowercase,
    160             false), lowercase_mismatchErr(CasingType.lowercase, true), other_mismatchWarn(CasingType.other, false), other_mismatchErr(CasingType.other, true);
    161 
    162         private final CasingType type;
    163         private final boolean flag; // force error instead of warning for mismatch
    164 
    165         private CasingTypeAndErrFlag(CasingType type, boolean flag) {
    166             this.type = type;
    167             this.flag = flag;
    168         }
    169 
    170         public CasingType type() {
    171             return type;
    172         }
    173 
    174         public boolean flag() {
    175             return flag;
    176         }
    177     }
    178 
    179     static final RegexLookup<Category> pathToBucket = new RegexLookup<Category>()
    180         .add("//ldml/localeDisplayNames/languages/language", Category.language)
    181         .add("//ldml/localeDisplayNames/scripts/script", Category.script)
    182         .add("//ldml/localeDisplayNames/territories/territory", Category.territory)
    183         .add("//ldml/localeDisplayNames/variants/variant", Category.variant)
    184         .add("//ldml/localeDisplayNames/keys/key", Category.key)
    185         .add("//ldml/localeDisplayNames/types/type", Category.keyValue)
    186         .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow)
    187         .add("//ldml/dates/calendars/calendar.*/months.*format", Category.month_format_except_narrow)
    188         .add("//ldml/dates/calendars/calendar.*/months", Category.month_standalone_except_narrow)
    189         .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow)
    190         .add("//ldml/dates/calendars/calendar.*/days.*format", Category.day_format_except_narrow)
    191         .add("//ldml/dates/calendars/calendar.*/days", Category.day_standalone_except_narrow)
    192         .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow)
    193         .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr)
    194         .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name)
    195         .add("//ldml/dates/calendars/calendar.*/quarters.*narrow", Category.quarter_narrow)
    196         .add("//ldml/dates/calendars/calendar.*/quarters.*abbreviated", Category.quarter_abbreviated)
    197         .add("//ldml/dates/calendars/calendar.*/quarters.*format", Category.quarter_format_wide)
    198         .add("//ldml/dates/calendars/calendar.*/quarters", Category.quarter_standalone_wide)
    199         .add("//ldml/.*/relative", Category.relative)
    200         .add("//ldml/dates/fields", Category.calendar_field)
    201         .add("//ldml/dates/timeZoneNames/zone.*/exemplarCity", Category.zone_exemplarCity)
    202         .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short)
    203         .add("//ldml/dates/timeZoneNames/zone", Category.zone_long)
    204         .add("//ldml/dates/timeZoneNames/metazone.*/commonlyUsed", Category.NOT_USED) // just to remove them from the other cases
    205         .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_long)
    206         .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long)
    207         .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol)
    208         .add("//ldml/numbers/currencies/currency.*/displayName.*@count", Category.currencyName_count)
    209         .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName)
    210         .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative)
    211         .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern)
    212     // ldml/localeDisplayNames/keys/key[@type=".*"]
    213     // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"]
    214     // ldml/localeDisplayNames/transformNames/transformName[@type=".*"]
    215     ;
    216 
    // Expected casing (and mismatch severity) per category for the locale being checked.
    // setCldrFileToCheck() replaces this initial map with either the result of
    // casingInfo.getLocaleCasing(locale) or Collections.emptyMap(), so it must not be
    // assumed to be mutable (or even an EnumMap) afterwards.
    Map<Category, CasingTypeAndErrFlag> types = new EnumMap<Category, CasingTypeAndErrFlag>(Category.class);
    218 
    /**
     * The buckets into which paths are grouped for casing purposes. Paths are assigned to a
     * bucket through {@code pathToBucket}; {@code NOT_USED} marks paths that are matched only so
     * that they are kept out of the other buckets.
     */
    public enum Category {
        // locale display names
        language,
        script,
        territory,
        variant,
        keyValue,
        // months and days
        month_narrow,
        month_format_except_narrow,
        month_standalone_except_narrow,
        day_narrow,
        day_format_except_narrow,
        day_standalone_except_narrow,
        // eras and quarters
        era_narrow,
        era_abbr,
        era_name,
        quarter_narrow,
        quarter_abbreviated,
        quarter_format_wide,
        quarter_standalone_wide,
        calendar_field,
        // time zones and metazones
        zone_exemplarCity,
        zone_short,
        zone_long,
        NOT_USED,
        metazone_short,
        metazone_long,
        // currencies
        symbol,
        currencyName_count,
        currencyName,
        // relative times and units
        relative,
        unit_pattern,
        key;
    }
    222 
    223     // //ldml/numbers/currencies/currency[@type="ADP"]/displayName
    224     // //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"]
    225     // //ldml/numbers/currencies/currency[@type="BYB"]/symbol
    226 
    227     static Category getCategory(String path) {
    228         return pathToBucket.get(path);
    229     }
    230 
    231     /**
    232      * Calculates casing information using data from the specified CLDRFile.
    233      *
    234      * @param resolved
    235      *            the resolved CLDRFile to calculate casing information from
    236      * @return
    237      */
    238     public static Map<Category, CasingType> getSamples(CLDRFile resolved) {
    239         // Use EnumMap instead of an array for type safety.
    240         Map<Category, Counter<CasingType>> counters = new EnumMap<Category, Counter<CasingType>>(Category.class);
    241 
    242         for (Category category : Category.values()) {
    243             counters.put(category, new Counter<CasingType>());
    244         }
    245         PathStarrer starrer = new PathStarrer();
    246         boolean isRoot = "root".equals(resolved.getLocaleID());
    247         Set<String> missing = !DEBUG ? null : new TreeSet<String>();
    248 
    249         for (String path : resolved) {
    250             if (!isRoot) {
    251                 String locale2 = resolved.getSourceLocaleID(path, null);
    252                 if (locale2.equals("root") || locale2.equals("code-fallback")) {
    253                     continue;
    254                 }
    255             }
    256             String winningPath = resolved.getWinningPath(path);
    257             if (!winningPath.equals(path)) {
    258                 continue;
    259             }
    260             Category category = getCategory(path);
    261             if (category != null) {
    262                 String value = resolved.getStringValue(path);
    263                 if (value == null || value.length() == 0) continue;
    264                 CasingType ft = CasingType.from(value);
    265                 counters.get(category).add(ft, 1);
    266             } else if (DEBUG) {
    267                 String starred = starrer.set(path);
    268                 missing.add(starred);
    269             }
    270         }
    271 
    272         Map<Category, CasingType> info = new EnumMap<Category, CasingType>(Category.class);
    273         for (Category category : Category.values()) {
    274             if (category == Category.NOT_USED) continue;
    275             Counter<CasingType> counter = counters.get(category);
    276             long countLower = counter.getCount(CasingType.lowercase);
    277             long countUpper = counter.getCount(CasingType.titlecase);
    278             long countOther = counter.getCount(CasingType.other);
    279             CasingType type;
    280             if (countLower + countUpper == 0) {
    281                 type = CasingType.other;
    282             } else if (countLower >= countUpper * MIN_FACTOR && countLower >= countOther) {
    283                 type = CasingType.lowercase;
    284             } else if (countUpper >= countLower * MIN_FACTOR && countUpper >= countOther) {
    285                 type = CasingType.titlecase;
    286             } else {
    287                 type = CasingType.other;
    288             }
    289             info.put(category, type);
    290         }
    291         if (DEBUG && missing.size() != 0) {
    292             System.out.println("Paths skipped:\n" + CollectionUtilities.join(missing, "\n"));
    293         }
    294         return info;
    295     }
    296 
    /**
     * Message pattern reported for a casing mismatch. As filled in by checkConsistentCasing:
     * {0} is the checked value, {1} the casing detected for it, {2} the category of the path and
     * {3} the casing expected for that category.
     */
    private static final String CASE_WARNING = "The first letter of {0} is {1}, which differs from what is expected " +
        "for the {2} category: that almost all values be {3}.\n\n" +
        "For guidance, see http://cldr.org/translation/capitalization. " +
        "If this warning is wrong, please file a ticket at http://unicode.org/cldr/trac/.";
    301 
    302     private void checkConsistentCasing(Category category, String path, String fullPath, String value,
    303         Options options, List<CheckStatus> result) {
    304         // Avoid NPE
    305         if (types != null) {
    306             CasingType ft = CasingType.from(value);
    307             CasingTypeAndErrFlag typeAndFlagFromCat = types.get(category);
    308             if (typeAndFlagFromCat == null) {
    309                 typeAndFlagFromCat = CasingTypeAndErrFlag.other_mismatchWarn;
    310             }
    311             if (!ft.worksWith(typeAndFlagFromCat.type())) {
    312                 result.add(new CheckStatus().setCause(this)
    313                     .setMainType(typeAndFlagFromCat.flag() ? CheckStatus.errorType : CheckStatus.warningType)
    314                     .setSubtype(Subtype.incorrectCasing) // typically warningType or errorType
    315                     .setMessage(CASE_WARNING, value, ft, category, typeAndFlagFromCat.type())); // the message; can be MessageFormat with arguments
    316             }
    317         }
    318     }
    319 }