Home | History | Annotate | Download | only in test
      1 package org.unicode.cldr.test;
      2 
      3 import java.util.BitSet;
      4 import java.util.List;
      5 
      6 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
      7 import org.unicode.cldr.util.CLDRConfig;
      8 import org.unicode.cldr.util.CLDRFile;
      9 import org.unicode.cldr.util.Factory;
     10 import org.unicode.cldr.util.SupplementalDataInfo;
     11 import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
     12 import org.unicode.cldr.util.XPathParts;
     13 
     14 import com.ibm.icu.lang.UCharacter;
     15 import com.ibm.icu.lang.UCharacterDirection;
     16 import com.ibm.icu.lang.UProperty;
     17 import com.ibm.icu.lang.UScript;
     18 import com.ibm.icu.text.Collator;
     19 import com.ibm.icu.text.UnicodeSet;
     20 import com.ibm.icu.text.UnicodeSetIterator;
     21 import com.ibm.icu.util.ULocale;
     22 
     23 public class CheckExemplars extends FactoryCheckCLDR {
     24 
     25     public static final boolean USE_PUNCTUATION = false;
     26     private static final boolean SUPPRESS_AUX_EMPTY_CHECK = true;
     27     private static final String[] QUOTE_ELEMENTS = {
     28         "quotationStart", "quotationEnd",
     29         "alternateQuotationStart", "alternateQuotationEnd" };
     30     static final SupplementalDataInfo SUP = CLDRConfig.getInstance().getSupplementalDataInfo();
     31 
     32     Collator col;
     33     Collator spaceCol;
     34     boolean isRoot;
     35     UnicodeSetPrettyPrinter prettyPrinter;
     36 
     37     static final UnicodeSet HangulSyllables = new UnicodeSet(
     38         "[[:Hangul_Syllable_Type=LVT:][:Hangul_Syllable_Type=LV:]]").freeze();
     39 
     40     public static final UnicodeSet AlwaysOK;
     41     static {
     42         if (USE_PUNCTUATION) {
     43             AlwaysOK = new UnicodeSet("[\\u0020\\u00A0]");
     44         } else {
     45             AlwaysOK = new UnicodeSet(
     46                 "[[[:Nd:][:script=common:][:script=inherited:]-[:Default_Ignorable_Code_Point:]-[:C:] - [_]] [\u05BE \u05F3 \u066A-\u066C]" +
     47                     "[[][ ][]{}]" + // TODO Fix this Hack
     48                     "]"); // [\\u200c-\\u200f] [:script=common:][:script=inherited:]
     49         }
     50         AlwaysOK.freeze();
     51     }
     52     // TODO Fix some of these characters
     53     private static final UnicodeSet SPECIAL_ALLOW = new UnicodeSet(
     54         "[\u061C\\u200E\\u200F\\u200c\\u200d"
     55             +
     56             "[\u064B\u064E-\u0651\u0670][:Nd:][\u0951\u0952][\u064B-\u0652\u0654-\u0657\u0670][\u0A66-\u0A6F][\u0ED0-\u0ED9][\u064B-\u0652][\\u02BB\\u02BC][\u0CE6-\u0CEF][\u0966-\u096F]"
     57             +
     58             "[:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] ]" // restore
     59     // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
     60     ).freeze(); // add RLM, LRM [\u200C\u200D]
     61 
     62     public static final UnicodeSet UAllowedInExemplars = new UnicodeSet("[[:assigned:]-[:Z:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
     63         .removeAll(AlwaysOK) // this will remove some
     64         // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we restore them
     65         // in SPECIAL_ALLOW
     66         .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]
     67         .freeze();
     68 
     69     public static final UnicodeSet UAllowedInNumbers = new UnicodeSet("[\u00A0\u202F[:N:][:P:][:Sm:][:Letter_Number:][:Numeric_Type=Numeric:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
     70         .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]
     71         .freeze();
     72 
     73     public static final UnicodeSet AllowedInExemplars = new UnicodeSet(UAllowedInExemplars)
     74         .removeAll(new UnicodeSet("[[:Uppercase:]-[\u0130]]"))
     75         .freeze();
     76 
     77     public static final UnicodeSet ALLOWED_IN_PUNCTUATION = new UnicodeSet("[[:P:][:S:]-[:Sc:]]")
     78         .freeze();
     79 
     80     public static final UnicodeSet ALLOWED_IN_AUX = new UnicodeSet(AllowedInExemplars)
     81         .addAll(ALLOWED_IN_PUNCTUATION)
     82         .removeAll(AlwaysOK) // this will remove some
     83         // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we restore them
     84         // in SPECIAL_ALLOW
     85         .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]
     86         .freeze();
     87 
     88     public enum ExemplarType {
     89         main(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", true), punctuation(ALLOWED_IN_PUNCTUATION, "punctuation",
     90             false), auxiliary(ALLOWED_IN_AUX, "(specific-script - uppercase - invisibles + \u0130)",
     91                 true), index(UAllowedInExemplars, "(specific-script - invisibles)", false), numbers(UAllowedInNumbers, "(specific-script - invisibles)", false),
     92         // currencySymbol(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", false)
     93         ;
     94 
     95         public final UnicodeSet allowed;
     96         public final UnicodeSet toRemove;
     97         public final String message;
     98         public final boolean convertUppercase;
     99 
    100         ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase) {
    101             if (!allowed.isFrozen()) {
    102                 throw new IllegalArgumentException("Internal Error");
    103             }
    104             this.allowed = allowed;
    105             this.message = message;
    106             this.toRemove = new UnicodeSet(allowed).complement().freeze();
    107             this.convertUppercase = convertUppercase;
    108         }
    109     }
    110 
    /**
     * Creates the check; the factory is handed straight to the superclass.
     *
     * @param factory passed unchanged to the {@code FactoryCheckCLDR} constructor
     */
    public CheckExemplars(Factory factory) {
        super(factory);
    }
    114 
    115     // Allowed[:script=common:][:script=inherited:][:alphabetic=false:]
    116 
    117     @Override
    118     public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
    119         List<CheckStatus> possibleErrors) {
    120         if (cldrFileToCheck == null) return this;
    121         super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
    122         String locale = cldrFileToCheck.getLocaleID();
    123         col = Collator.getInstance(new ULocale(locale));
    124         spaceCol = Collator.getInstance(new ULocale(locale));
    125         spaceCol.setStrength(Collator.PRIMARY);
    126         isRoot = cldrFileToCheck.getLocaleID().equals("root");
    127         prettyPrinter = new UnicodeSetPrettyPrinter()
    128             .setOrdering(col != null ? col : Collator.getInstance(ULocale.ROOT))
    129             .setSpaceComparator(col != null ? col : Collator.getInstance(ULocale.ROOT)
    130                 .setStrength2(Collator.PRIMARY))
    131             .setCompressRanges(true);
    132 
    133         // check for auxiliary anyway
    134         if (!SUPPRESS_AUX_EMPTY_CHECK) {
    135             UnicodeSet auxiliarySet = getResolvedCldrFileToCheck().getExemplarSet("auxiliary",
    136                 CLDRFile.WinningChoice.WINNING);
    137 
    138             if (auxiliarySet == null) {
    139                 possibleErrors.add(
    140                     new CheckStatus().setCause(this)
    141                         .setMainType(CheckStatus.warningType)
    142                         .setSubtype(Subtype.missingAuxiliaryExemplars)
    143                         .setMessage("Most languages allow <i>some<i> auxiliary characters, so review this."));
    144             }
    145         }
    146         return this;
    147     }
    148 
    149     public CheckCLDR handleCheck(String path, String fullPath, String value, Options options,
    150         List<CheckStatus> result) {
    151         if (fullPath == null) return this; // skip paths that we don't have
    152         if (path.indexOf("/exemplarCharacters") < 0) {
    153             if (path.contains("parseLenient")) {
    154                 checkParse(path, fullPath, value, options, result);
    155             }
    156             return this;
    157         }
    158         XPathParts oparts = XPathParts.getFrozenInstance(path);
    159         final String exemplarString = oparts.findAttributeValue("exemplarCharacters", "type");
    160         ExemplarType type = exemplarString == null ? ExemplarType.main : ExemplarType.valueOf(exemplarString);
    161         checkExemplar(value, result, type);
    162 
    163         // check relation to auxiliary set
    164         try {
    165             UnicodeSet mainSet = getResolvedCldrFileToCheck().getExemplarSet("", CLDRFile.WinningChoice.WINNING);
    166             if (type == ExemplarType.auxiliary) {
    167                 UnicodeSet auxiliarySet = new UnicodeSet(value);
    168 
    169                 UnicodeSet combined = new UnicodeSet(mainSet).addAll(auxiliarySet);
    170                 checkMixedScripts("main+auxiliary", combined, result);
    171 
    172                 if (auxiliarySet.containsSome(mainSet)) {
    173                     UnicodeSet overlap = new UnicodeSet(mainSet).retainAll(auxiliarySet).removeAll(HangulSyllables);
    174                     if (overlap.size() != 0) {
    175                         String fixedExemplar1 = new UnicodeSetPrettyPrinter()
    176                             .setOrdering(col != null ? col : Collator.getInstance(ULocale.ROOT))
    177                             .setSpaceComparator(col != null ? col : Collator.getInstance(ULocale.ROOT)
    178                                 .setStrength2(Collator.PRIMARY))
    179                             .setCompressRanges(true)
    180                             .format(overlap);
    181                         result
    182                             .add(new CheckStatus()
    183                                 .setCause(this)
    184                                 .setMainType(CheckStatus.errorType)
    185                                 .setSubtype(Subtype.auxiliaryExemplarsOverlap)
    186                                 .setMessage("Auxiliary characters also exist in main: \u200E{0}\u200E",
    187                                     new Object[] { fixedExemplar1 }));
    188                     }
    189                 }
    190             } else if (type == ExemplarType.punctuation) {
    191                 // Check that the punctuation exemplar characters include quotation marks.
    192                 UnicodeSet punctuationSet = new UnicodeSet(value);
    193                 UnicodeSet quoteSet = new UnicodeSet();
    194                 for (String element : QUOTE_ELEMENTS) {
    195                     quoteSet.add(getResolvedCldrFileToCheck().getWinningValue("//ldml/delimiters/" + element));
    196                 }
    197                 if (!punctuationSet.containsAll(quoteSet)) {
    198                     quoteSet.removeAll(punctuationSet);
    199                     // go ahead and list the characters separately, with space between, for clarity.
    200                     StringBuilder characters = new StringBuilder();
    201                     for (String item : quoteSet) {
    202                         if (characters.length() != 0) {
    203                             characters.append(" ");
    204                         }
    205                         characters.append(item);
    206                     }
    207                     // String characters = quoteSet.toPattern(false);
    208                     CheckStatus message = new CheckStatus().setCause(this)
    209                         .setMainType(CheckStatus.warningType)
    210                         .setSubtype(Subtype.missingPunctuationCharacters)
    211                         .setMessage("Punctuation exemplar characters are missing quotation marks for this locale: {0}",
    212                             characters);
    213                     result.add(message);
    214                 }
    215             } else if (type == ExemplarType.index) {
    216                 // Check that the index exemplar characters are in case-completed union of main and auxiliary exemplars
    217                 UnicodeSet auxiliarySet = getResolvedCldrFileToCheck().getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING);
    218                 if (auxiliarySet == null) {
    219                     auxiliarySet = new UnicodeSet();
    220                 }
    221                 UnicodeSet mainAndAuxAllCase = new UnicodeSet(mainSet).addAll(auxiliarySet).closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
    222                 UnicodeSet indexBadChars = new UnicodeSet(value).removeAll(mainAndAuxAllCase);
    223 
    224                 if (!indexBadChars.isEmpty()) {
    225                     CheckStatus message = new CheckStatus().setCause(this)
    226                         .setMainType(CheckStatus.warningType)
    227                         .setSubtype(Subtype.charactersNotInMainOrAuxiliaryExemplars)
    228                         .setMessage("Index exemplars include characters not in main or auxiliary exemplars: {0}",
    229                             indexBadChars.toPattern(false));
    230                     result.add(message);
    231                 }
    232             }
    233 
    234             // check for consistency with RTL
    235 
    236             Boolean localeIsRTL = false;
    237             String charOrientation = getResolvedCldrFileToCheck().getStringValue(
    238                 "//ldml/layout/orientation/characterOrder");
    239             if (charOrientation.equals("right-to-left")) {
    240                 localeIsRTL = true;
    241             }
    242 
    243             UnicodeSetIterator mi = new UnicodeSetIterator(mainSet);
    244             while (mi.next()) {
    245                 if (mi.codepoint != UnicodeSetIterator.IS_STRING &&
    246                     (UCharacter.getDirection(mi.codepoint) == UCharacterDirection.RIGHT_TO_LEFT ||
    247                         UCharacter.getDirection(mi.codepoint) == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)
    248                     &&
    249                     !localeIsRTL) {
    250                     result.add(new CheckStatus()
    251                         .setCause(this)
    252                         .setMainType(CheckStatus.errorType)
    253                         .setSubtype(Subtype.orientationDisagreesWithExemplars)
    254                         .setMessage(
    255                             "Main exemplar set contains RTL characters, but orientation of this locale is not RTL."));
    256                     break;
    257                 }
    258             }
    259 
    260         } catch (Exception e) {
    261         } // if these didn't parse, checkExemplar will be called anyway at some point
    262         return this;
    263     }
    264 
    265     private void checkParse(String path, String fullPath, String value, Options options, List<CheckStatus> result) {
    266         try {
    267             XPathParts oparts = XPathParts.getFrozenInstance(path);
    268             // only thing we do is make sure that the sample is in the value
    269             UnicodeSet us = new UnicodeSet(value);
    270             String sampleValue = oparts.getAttributeValue(-1, "sample");
    271             if (!us.contains(sampleValue)) {
    272                 CheckStatus message = new CheckStatus().setCause(this)
    273                     .setMainType(CheckStatus.errorType)
    274                     .setSubtype(Subtype.badParseLenient)
    275                     .setMessage("ParseLenient sample not in value: {0}  {1}", us, sampleValue);
    276                 result.add(message);
    277             }
    278         } catch (Exception e) {
    279             CheckStatus message = new CheckStatus().setCause(this)
    280                 .setMainType(CheckStatus.errorType)
    281                 .setSubtype(Subtype.badParseLenient)
    282                 .setMessage(e.getMessage());
    283             result.add(message);
    284         }
    285     }
    286 
    /** Script combination accepted by checkMixedScripts: Han + Hiragana + Katakana. */
    static final BitSet Japn = new BitSet();
    /** Script combination accepted by checkMixedScripts: Han + Hangul. */
    static final BitSet Kore = new BitSet();
    static {
        for (int script : new int[] { UScript.HAN, UScript.HIRAGANA, UScript.KATAKANA }) {
            Japn.set(script);
        }
        for (int script : new int[] { UScript.HAN, UScript.HANGUL }) {
            Kore.set(script);
        }
    }
    296 
    297     private void checkMixedScripts(String title, UnicodeSet set, List<CheckStatus> result) {
    298         BitSet s = new BitSet();
    299         for (String item : set) {
    300             int script = UScript.getScript(item.codePointAt(0));
    301             if (script != UScript.COMMON && script != UScript.INHERITED) {
    302                 s.set(script);
    303             }
    304         }
    305         final int cardinality = s.cardinality();
    306         if (cardinality < 2) {
    307             return;
    308         }
    309         if (cardinality == 2 && title.equals("currencySymbol") && s.get(UScript.LATIN)) {
    310             return; // allow 2 scripts in exemplars for currencies.
    311         }
    312         // allowable combinations
    313         if (s.equals(Japn) || s.equals(Kore)) {
    314             return;
    315         }
    316         StringBuilder scripts = new StringBuilder();
    317         for (int i = s.nextSetBit(0); i >= 0; i = s.nextSetBit(i + 1)) {
    318             if (scripts.length() != 0) {
    319                 scripts.append(", ");
    320             }
    321             scripts.append(UScript.getName(i));
    322             UnicodeSet inSet = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, i).retainAll(set);
    323             int count = 0;
    324             scripts.append(" (");
    325             for (String cp : inSet) {
    326                 if (count != 0) {
    327                     scripts.append(",");
    328                 }
    329                 scripts.append(cp);
    330                 count++;
    331                 if (count > 3) {
    332                     scripts.append('\u2026');
    333                     break;
    334                 }
    335             }
    336             scripts.append(")");
    337         }
    338         result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
    339             .setSubtype(Subtype.illegalExemplarSet)
    340             .setMessage("{0} exemplars contain multiple scripts: {1}", new Object[] { title, scripts }));
    341         return;
    342     }
    343 
    344     private void checkExemplar(String v, List<CheckStatus> result, ExemplarType exemplarType) {
    345         if (v == null) return;
    346         final UnicodeSet exemplar1;
    347         try {
    348             exemplar1 = new UnicodeSet(v).freeze();
    349         } catch (Exception e) {
    350             result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
    351                 .setSubtype(Subtype.illegalExemplarSet)
    352                 .setMessage("This field must be a set of the form [a b c-d ...]: ", new Object[] { e.getMessage() }));
    353             return;
    354         }
    355 
    356         // check for mixed scripts
    357 
    358         checkMixedScripts(exemplarType.toString(), exemplar1, result);
    359 
    360         // check that the formatting is correct
    361 
    362         String fixedExemplar1 = prettyPrinter.format(exemplar1);
    363         UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar1);
    364         if (!doubleCheck.equals(exemplar1)) {
    365             result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
    366                 .setSubtype(Subtype.internalUnicodeSetFormattingError)
    367                 .setMessage("Internal Error: formatting not working for {0}", new Object[] { exemplar1 }));
    368         }
    369         // else if (!v.equals(fixedExemplar1)) {
    370         // result.add(new CheckStatus().setCause(this).setType(CheckStatus.warningType)
    371         // .setMessage("Better formatting would be \u200E{0}\u200E", new Object[]{fixedExemplar1}));
    372         // }
    373 
    374         // now check that only allowed characters are in the set
    375 
    376         if (!exemplarType.allowed.containsAll(exemplar1)) {
    377             UnicodeSet remainder0 = new UnicodeSet(exemplar1).removeAll(exemplarType.allowed);
    378 
    379             // we do allow for punctuation & combining marks in strings
    380             UnicodeSet remainder = new UnicodeSet();
    381             for (String s : remainder0) {
    382                 if (Character.codePointCount(s, 0, s.length()) == 1) {
    383                     remainder.add(s);
    384                 } else {
    385                     // just check normalization
    386                 }
    387             }
    388 
    389             // after a first check, we check again in case we flattened
    390 
    391             if (remainder.size() != 0) {
    392                 fixedExemplar1 = prettyPrinter.format(exemplar1);
    393                 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
    394                     .setSubtype(Subtype.illegalCharactersInExemplars)
    395                     .setMessage("Should be limited to " + exemplarType.message + "; thus not contain: \u200E{0}\u200E",
    396                         new Object[] { remainder }));
    397             }
    398         }
    399 
    400         // now check for empty
    401 
    402         if (!isRoot && exemplar1.size() == 0) {
    403             switch (exemplarType) {
    404 //            case currencySymbol: // ok if empty
    405 //                break;
    406             case auxiliary:
    407                 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.warningType)
    408                     .setSubtype(Subtype.missingAuxiliaryExemplars)
    409                     .setMessage("Most languages allow <i>some<i> auxiliary characters, so review this."));
    410                 break;
    411             case index:
    412             case punctuation:
    413             case main:
    414                 result.add(new CheckStatus()
    415                     .setCause(this)
    416                     .setMainType(CheckStatus.errorType)
    417                     .setSubtype(Subtype.missingMainExemplars)
    418                     .setMessage(
    419                         "Exemplar set (" + exemplarType
    420                             + ") must not be empty -- that would imply that this language uses no " +
    421                             (exemplarType == ExemplarType.punctuation ? "punctuation" : "letters") + "!"));
    422                 break;
    423             }
    424         }
    425     }
    426 }
    427