Home | History | Annotate | Download | only in test
      1 /* Copyright (C) 2007-2013 Google and others.  All Rights Reserved. */
      2 /* Copyright (C) 2007-2013 IBM Corp. and others. All Rights Reserved. */
      3 
      4 package org.unicode.cldr.test;
      5 
      6 import java.util.Arrays;
      7 import java.util.HashMap;
      8 import java.util.HashSet;
      9 import java.util.List;
     10 import java.util.Map;
     11 import java.util.Set;
     12 import java.util.TreeSet;
     13 import java.util.regex.Matcher;
     14 import java.util.regex.Pattern;
     15 
     16 import org.unicode.cldr.test.CheckExemplars.ExemplarType;
     17 import org.unicode.cldr.util.Builder;
     18 import org.unicode.cldr.util.CLDRFile;
     19 import org.unicode.cldr.util.CLDRLocale;
     20 import org.unicode.cldr.util.CldrUtility;
     21 import org.unicode.cldr.util.DateTimeCanonicalizer;
     22 import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType;
     23 import org.unicode.cldr.util.Emoji;
     24 import org.unicode.cldr.util.ICUServiceBuilder;
     25 import org.unicode.cldr.util.PatternCache;
     26 import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
     27 import org.unicode.cldr.util.With;
     28 import org.unicode.cldr.util.XPathParts;
     29 
     30 import com.google.common.base.Joiner;
     31 import com.google.common.base.Splitter;
     32 import com.google.myanmartools.ZawgyiDetector;
     33 import com.ibm.icu.lang.UCharacter;
     34 import com.ibm.icu.text.Collator;
     35 import com.ibm.icu.text.DateIntervalInfo;
     36 import com.ibm.icu.text.DateTimePatternGenerator;
     37 import com.ibm.icu.text.DecimalFormat;
     38 import com.ibm.icu.text.Normalizer;
     39 import com.ibm.icu.text.RuleBasedCollator;
     40 import com.ibm.icu.text.Transform;
     41 import com.ibm.icu.text.Transliterator;
     42 import com.ibm.icu.text.UnicodeSet;
     43 import com.ibm.icu.text.UnicodeSetIterator;
     44 import com.ibm.icu.util.ULocale;
     45 
     46 /**
     47  * Class for processing the input and output of CLDR data for use in the
     48  * Survey Tool and other tools.
     49  */
     50 public class DisplayAndInputProcessor {
     51 
     52     private static final boolean FIX_YEARS = true;
     53 
     54     public static final boolean DEBUG_DAIP = CldrUtility.getProperty("DEBUG_DAIP", false);
     55 
     56     public static final UnicodeSet RTL = new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]")
     57         .freeze();
     58 
     59     public static final UnicodeSet TO_QUOTE = (UnicodeSet) new UnicodeSet(
     60         "[[:Cn:]" +
     61             "[:Default_Ignorable_Code_Point:]" +
     62             "[:patternwhitespace:]" +
     63             "[:Me:][:Mn:]]" // add non-spacing marks
     64     ).freeze();
     65 
     66     public static final Pattern NUMBER_FORMAT_XPATH = Pattern
     67         .compile("//ldml/numbers/.*Format\\[@type=\"standard\"]/pattern.*");
     68 
     69     public static final Pattern NUMBER_SEPARATOR_PATTERN = Pattern
     70         .compile("//ldml/numbers/symbols.*/(decimal|group)");
     71 
     72     private static final Pattern APOSTROPHE_SKIP_PATHS = PatternCache.get("//ldml/("
     73         + "localeDisplayNames/languages/language\\[@type=\"mic\"].*|"
     74         + "characters/.*|"
     75         + "delimiters/.*|"
     76         + "dates/.+/(pattern|intervalFormatItem|dateFormatItem).*|"
     77         + "units/.+/unitPattern.*|"
     78         + "units/.+/durationUnitPattern.*|"
     79         + "numbers/symbols.*|"
     80         + "numbers/miscPatterns.*|"
     81         + "numbers/(decimal|currency|percent|scientific)Formats.+/(decimal|currency|percent|scientific)Format.*)");
     82     private static final Pattern INTERVAL_FORMAT_PATHS = PatternCache.get("//ldml/dates/.+/intervalFormatItem.*");
     83     private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])");
     84     private static final Pattern WHITESPACE_NO_NBSP_TO_NORMALIZE = PatternCache.get("\\s+"); // string of whitespace not
     85     // including NBSP, i.e. [
     86     // \t\n\r]+
     87     private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE = PatternCache.get("[\\s\\u00A0]+"); // string of
     88     // whitespace
     89     // including NBSP,
     90     // i.e. [
     91     // \u00A0\t\n\r]+
     92     private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();
     93 
     94     private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml");
     95     private static final CLDRLocale ROMANIAN = CLDRLocale.getInstance("ro");
     96     private static final CLDRLocale CATALAN = CLDRLocale.getInstance("ca");
     97     private static final CLDRLocale NGOMBA = CLDRLocale.getInstance("jgo");
     98     private static final CLDRLocale KWASIO = CLDRLocale.getInstance("nmg");
     99     private static final CLDRLocale HEBREW = CLDRLocale.getInstance("he");
    100     private static final CLDRLocale MYANMAR = CLDRLocale.getInstance("my");
    101     private static final CLDRLocale KYRGYZ = CLDRLocale.getInstance("ky");
    102     private static final CLDRLocale URDU = CLDRLocale.getInstance("ur");
    103     private static final CLDRLocale PASHTO = CLDRLocale.getInstance("ps");
    104     private static final CLDRLocale FARSI = CLDRLocale.getInstance("fa");
    105     private static final CLDRLocale GERMAN_SWITZERLAND = CLDRLocale.getInstance("de_CH");
    106     private static final CLDRLocale SWISS_GERMAN = CLDRLocale.getInstance("gsw");
    107     public static final Set<String> LANGUAGES_USING_MODIFIER_APOSTROPHE = new HashSet<String>(
    108         Arrays.asList("br", "bss", "cch", "gn", "ha", "ha_Latn", "lkt", "mgo", "moh", "nnh", "qu", "quc", "uk", "uz", "uz_Latn"));
    109 
    110     //      =>     
    111     private static final char[][] ROMANIAN_CONVERSIONS = {
    112         { '\u015E', '\u0218' }, { '\u015F', '\u0219' }, { '\u0162', '\u021A' },
    113         { '\u0163', '\u021B' } };
    114 
    115     private static final char[][] CATALAN_CONVERSIONS = {
    116         { '\u013F', '\u004C', '\u00B7' }, //  -> L
    117         { '\u0140', '\u006C', '\u00B7' } }; //  -> l
    118 
    119     private static final char[][] NGOMBA_CONVERSIONS = {
    120         { '\u0251', '\u0061' }, { '\u0261', '\u0067' }, //   -> a ,  -> g , See ticket #5691
    121         { '\u2019', '\uA78C' }, { '\u02BC', '\uA78C' } }; //  Saltillo, see ticket #6805
    122 
    123     private static final char[][] KWASIO_CONVERSIONS = {
    124         { '\u0306', '\u030C' }, // See ticket #6571, use caron instead of breve
    125         { '\u0103', '\u01CE' }, { '\u0102', '\u01CD' }, // a-breve -> a-caron
    126         { '\u0115', '\u011B' }, { '\u011A', '\u01CD' }, // e-breve -> e-caron
    127         { '\u012D', '\u01D0' }, { '\u012C', '\u01CF' }, // i-breve -> i-caron
    128         { '\u014F', '\u01D2' }, { '\u014E', '\u01D1' }, // o-breve -> o-caron
    129         { '\u016D', '\u01D4' }, { '\u016C', '\u01D3' } // u-breve -> u-caron
    130     };
    131 
    132     private static final char[][] HEBREW_CONVERSIONS = {
    133         { '\'', '\u05F3' }, { '"', '\u05F4' } }; //  ' -> geresh  " -> gershayim
    134 
    135     private static final char[][] KYRGYZ_CONVERSIONS = {
    136         { '', '' }, { '', '' } }; //  right modifier
    137 
    138     private static final char[][] URDU_PLUS_CONVERSIONS = {
    139         { '\u0643', '\u06A9' }}; //  wrong char
    140 
    141     private static final ZawgyiDetector detector = new ZawgyiDetector();
    142     private static final Transliterator zawgyiUnicodeTransliterator =
    143         Transliterator.getInstance("Zawgyi-my");
    144 
    145     private Collator col;
    146 
    147     private Collator spaceCol;
    148 
    149     private UnicodeSetPrettyPrinter pp = null;
    150 
    151     final private CLDRLocale locale;
    152     private boolean isPosix;
    153 
    154     /**
    155      * Constructor, taking cldrFile.
    156      *
    157      * @param cldrFileToCheck
    158      */
    159     public DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator) {
    160         init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), needsCollator);
    161     }
    162 
    163     public DisplayAndInputProcessor(CLDRFile cldrFileToCheck) {
    164         init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), true);
    165     }
    166 
    167     void init(CLDRLocale locale, boolean needsCollator) {
    168         isPosix = locale.toString().indexOf("POSIX") >= 0;
    169         if (needsCollator) {
    170             ICUServiceBuilder isb = null;
    171             try {
    172                 isb = ICUServiceBuilder.forLocale(locale);
    173             } catch (Exception e) {
    174             }
    175 
    176             if (isb != null) {
    177                 try {
    178                     col = isb.getRuleBasedCollator();
    179                 } catch (Exception e) {
    180                     col = Collator.getInstance(ULocale.ROOT);
    181                 }
    182             } else {
    183                 col = Collator.getInstance(ULocale.ROOT);
    184             }
    185 
    186             spaceCol = Collator.getInstance(locale.toULocale());
    187             if (spaceCol instanceof RuleBasedCollator) {
    188                 ((RuleBasedCollator) spaceCol).setAlternateHandlingShifted(false);
    189             }
    190             pp = new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance(ULocale.ROOT))
    191                 .setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY))
    192                 .setCompressRanges(true)
    193                 .setToQuote(new UnicodeSet(TO_QUOTE))
    194                 .setOrdering(col)
    195                 .setSpaceComparator(spaceCol);
    196         }
    197     }
    198 
    199     public UnicodeSetPrettyPrinter getPrettyPrinter() {
    200         return pp;
    201     }
    202 
    203     /**
    204      * Constructor, taking locale.
    205      *
    206      * @param locale
    207      */
    208     public DisplayAndInputProcessor(ULocale locale, boolean needsCollator) {
    209         init(this.locale = CLDRLocale.getInstance(locale), needsCollator);
    210     }
    211 
    212     /**
    213      * Constructor, taking locale.
    214      *
    215      * @param locale
    216      */
    217     public DisplayAndInputProcessor(ULocale locale) {
    218         init(this.locale = CLDRLocale.getInstance(locale), true);
    219     }
    220 
    /**
     * Creates a processor for the given CLDR locale.
     *
     * @param locale locale to process values for
     * @param needsCollator whether collators and the UnicodeSet pretty printer should be set up
     */
    public DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator) {
        this.locale = locale;
        init(locale, needsCollator);
    }
    229 
    /**
     * Creates a processor for the given CLDR locale, with collators set up.
     *
     * @param locale locale to process values for
     */
    public DisplayAndInputProcessor(CLDRLocale locale) {
        this.locale = locale;
        init(locale, true);
    }
    238 
    239     /**
    240      * Process the value for display. The result is a string for display in the
    241      * Survey tool or similar program.
    242      *
    243      * @param path
    244      * @param value
    245      * @param fullPath
    246      * @return
    247      */
    248     public synchronized String processForDisplay(String path, String value) {
    249         value = Normalizer.compose(value, false); // Always normalize all text to NFC.
    250         if (hasUnicodeSetValue(path)) {
    251             value = displayUnicodeSet(value);
    252         } else if (path.contains("stopword")) {
    253             return value.trim().isEmpty() ? "NONE" : value;
    254         } else {
    255             NumericType numericType = NumericType.getNumericType(path);
    256             if (numericType != NumericType.NOT_NUMERIC) {
    257                 // Canonicalize existing values that aren't canonicalized yet.
    258                 // New values will be canonicalized on input using processInput().
    259                 try {
    260                     value = getCanonicalPattern(value, numericType, isPosix);
    261                 } catch (IllegalArgumentException e) {
    262                     if (DEBUG_DAIP) System.err.println("Illegal pattern: " + value);
    263                 }
    264                 if (numericType != NumericType.CURRENCY && numericType != NumericType.CURRENCY_ABBREVIATED) {
    265                     value = value.replace("'", "");
    266                 }
    267             }
    268         }
    269         // Fix up any apostrophes in number symbols
    270         if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
    271             value = value.replace('\'', '\u2019');
    272         }
    273         // Fix up any apostrophes as appropriate (Don't do so for things like date patterns...
    274         if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
    275             value = normalizeApostrophes(value);
    276         }
    277         // Fix up hyphens, replacing with N-dash as appropriate
    278         if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
    279             value = normalizeIntervalHyphens(value);
    280         } else {
    281             value = normalizeHyphens(value);
    282         }
    283         return value;
    284     }
    285 
    286     private boolean hasUnicodeSetValue(String path) {
    287         return path.startsWith("//ldml/characters/exemplarCharacters") || path.startsWith("//ldml/characters/parseLenients");
    288     }
    289 
    /** All characters with the Unicode White_Space property (same pattern as UNICODE_WHITESPACE). */
    static final UnicodeSet WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();
    /** Shared date/time pattern canonicalizer, constructed with FIX_YEARS. */
    static final DateTimeCanonicalizer dtc = new DateTimeCanonicalizer(FIX_YEARS);

    /** Splits on '|', trimming each piece and dropping empty pieces. */
    public static final Splitter SPLIT_BAR = Splitter.on('|').trimResults().omitEmptyStrings();
    /** Splits on a single space, trimming each piece and dropping empty pieces. */
    static final Splitter SPLIT_SPACE = Splitter.on(' ').trimResults().omitEmptyStrings();
    /** Joins pieces with " | " between them. */
    static final Joiner JOIN_BAR = Joiner.on(" | ");
    296 
    297     /**
    298      * Process the value for input. The result is a cleaned-up value. For example,
    299      * an exemplar set is modified to be in the normal format, and any missing [ ]
    300      * are added (a common omission on entry). If there are any failures then the
    301      * original value is returned, so that the proper error message can be given.
    302      *
    303      * @param path
    304      * @param value
    305      * @param internalException
    306      *            TODO
    307      * @param fullPath
    308      * @return
    309      */
    310     public synchronized String processInput(String path, String value, Exception[] internalException) {
    311         String original = value;
    312         value = Normalizer.compose(value, false); // Always normalize all input to NFC.
    313         if (internalException != null) {
    314             internalException[0] = null;
    315         }
    316         try {
    317             // Normalise Malayalam characters.
    318             boolean isUnicodeSet = hasUnicodeSetValue(path);
    319             if (locale.childOf(MALAYALAM)) {
    320                 String newvalue = normalizeMalayalam(value);
    321                 if (DEBUG_DAIP) System.out.println("DAIP: Normalized Malayalam '" + value + "' to '" + newvalue + "'");
    322                 value = newvalue;
    323             } else if (locale.childOf(ROMANIAN) && !isUnicodeSet) {
    324                 value = standardizeRomanian(value);
    325             } else if (locale.childOf(CATALAN) && !isUnicodeSet) {
    326                 value = standardizeCatalan(value);
    327             } else if (locale.childOf(NGOMBA) && !isUnicodeSet) {
    328                 value = standardizeNgomba(value);
    329             } else if (locale.childOf(KWASIO) && !isUnicodeSet) {
    330                 value = standardizeKwasio(value);
    331             } else if (locale.childOf(HEBREW) && !APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
    332                 value = replaceChars(path, value, HEBREW_CONVERSIONS, false);
    333             } else if ((locale.childOf(SWISS_GERMAN) || locale.childOf(GERMAN_SWITZERLAND)) && !isUnicodeSet) {
    334                 value = standardizeSwissGerman(value);
    335             } else if (locale.childOf(MYANMAR) && !isUnicodeSet) {
    336                 value = standardizeMyanmar(value);
    337             } else if (locale.childOf(KYRGYZ)) {
    338                 value = replaceChars(path, value, KYRGYZ_CONVERSIONS, false);
    339             } else if (locale.childOf(URDU) || locale.childOf(PASHTO) || locale.childOf(FARSI)) {
    340                 value = replaceChars(path, value, URDU_PLUS_CONVERSIONS, true);
    341             }
    342 
    343             if (UNICODE_WHITESPACE.containsSome(value)) {
    344                 value = normalizeWhitespace(path, value);
    345             }
    346 
    347             // all of our values should not have leading or trailing spaces, except insertBetween
    348             if (!path.contains("/insertBetween") && !isUnicodeSet) {
    349                 value = value.trim();
    350             }
    351 
    352             // fix grouping separator if space
    353             if (path.startsWith("//ldml/numbers/symbols") && !path.contains("/alias")) {
    354                 if (value.isEmpty()) {
    355                     value = "\u00A0";
    356                 }
    357                 value = value.replace(' ', '\u00A0');
    358             }
    359 
    360             // fix date patterns
    361             DateTimePatternType datetimePatternType = DateTimePatternType.fromPath(path);
    362             if (DateTimePatternType.STOCK_AVAILABLE_INTERVAL_PATTERNS.contains(datetimePatternType)) {
    363                 try {
    364                     value = dtc.getCanonicalDatePattern(path, value, datetimePatternType);
    365                 } catch (IllegalArgumentException ex) {
    366                     return value;
    367                 }
    368             }
    369 
    370             if (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("displayName")) {
    371                 value = normalizeCurrencyDisplayName(value);
    372             }
    373             NumericType numericType = NumericType.getNumericType(path);
    374             if (numericType != NumericType.NOT_NUMERIC) {
    375                 if (numericType == NumericType.CURRENCY) {
    376                     value = value.replaceAll(" ", "\u00A0");
    377                     if (numericType == NumericType.CURRENCY_ABBREVIATED) {
    378                         value = value.replaceAll("0\\.0+", "0");
    379                     }
    380                 } else {
    381                     value = value.replaceAll("([%\u00A4]) ", "$1\u00A0")
    382                         .replaceAll(" ([%\u00A4])", "\u00A0$1");
    383                     value = replace(NON_DECIMAL_PERIOD, value, "'.'");
    384                     if (numericType == NumericType.DECIMAL_ABBREVIATED) {
    385                         value = value.replaceAll("0\\.0+", "0");
    386                     }
    387                 }
    388                 value = getCanonicalPattern(value, numericType, isPosix);
    389             }
    390 
    391             // fix [,]
    392             if (path.startsWith("//ldml/localeDisplayNames/languages/language")
    393                 || path.startsWith("//ldml/localeDisplayNames/scripts/script")
    394                 || path.startsWith("//ldml/localeDisplayNames/territories/territory")
    395                 || path.startsWith("//ldml/localeDisplayNames/variants/variant")
    396                 || path.startsWith("//ldml/localeDisplayNames/keys/key")
    397                 || path.startsWith("//ldml/localeDisplayNames/types/type")) {
    398                 value = value.replace('[', '(').replace(']', ')').replace('', '').replace('', '');
    399             }
    400 
    401             // Normalize two single quotes for the inches symbol.
    402             if (path.contains("/units")) {
    403                 value = value.replace("''", "");
    404             }
    405 
    406             // check specific cases
    407             if (isUnicodeSet) {
    408                 value = inputUnicodeSet(path, value);
    409             } else if (path.contains("stopword")) {
    410                 if (value.equals("NONE")) {
    411                     value = "";
    412                 }
    413             }
    414 
    415             // Normalize ellipsis data.
    416             if (path.startsWith("//ldml/characters/ellipsis")) {
    417                 value = value.replace("...", "");
    418             }
    419 
    420             // Replace Arabic presentation forms with their nominal counterparts
    421             value = replaceArabicPresentationForms(value);
    422 
    423             // Fix up any apostrophes as appropriate (Don't do so for things like date patterns...
    424             if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
    425                 value = normalizeApostrophes(value);
    426             }
    427             // Fix up any apostrophes in number symbols
    428             if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
    429                 value = value.replace('\'', '\u2019');
    430             }
    431             // Fix up hyphens, replacing with N-dash as appropriate
    432             if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
    433                 value = normalizeIntervalHyphens(value);
    434             } else if (!isUnicodeSet) {
    435                 value = normalizeHyphens(value);
    436             }
    437 
    438             if (path.startsWith("//ldml/annotations/annotation")) {
    439                 if (path.contains(Emoji.TYPE_TTS)) {
    440                     // The row has something like " -name" in the first column. Cf. namePath, getNamePaths.
    441                     // Normally the value is like "zebra" or "unicorn face", without "|".
    442                     // If the user enters a value with "|",  discard anything after "|"; e.g., change "a | b | c" to "a".
    443                     value = SPLIT_BAR.split(value).iterator().next();
    444                 } else {
    445                     // The row has something like " keywords" in the first column. Cf. keywordPath, getKeywordPaths.
    446                     // Normally the value is like "stripe | zebra", with "|".
    447                     value = annotationsForDisplay(value);
    448                 }
    449             }
    450 
    451             return value;
    452         } catch (RuntimeException e) {
    453             if (internalException != null) {
    454                 internalException[0] = e;
    455             }
    456             return original;
    457         }
    458     }
    459 
    460     private static final boolean REMOVE_COVERED_KEYWORDS = true;
    461 
    462     /**
    463      * Produce a modification of the given annotation by sorting its components and filtering covered keywords.
    464      *
    465      * Examples: Given "b | a", return "a | b". Given "bear | panda | panda bear", return "bear | panda".
    466      *
    467      * @param value the string
    468      * @return the possibly modified string
    469      */
    470     private static String annotationsForDisplay(String value) {
    471         TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT));
    472         sorted.addAll(SPLIT_BAR.splitToList(value));
    473         if (REMOVE_COVERED_KEYWORDS) {
    474             filterCoveredKeywords(sorted);
    475         }
    476         value = JOIN_BAR.join(sorted);
    477         return value;
    478     }
    479 
    480     /**
    481      * Filter from the given set some keywords that include spaces, if they duplicate,
    482      * or are "covered by", other keywords in the set.
    483      *
    484      * For example, if the set is {"bear", "panda", "panda bear"} (annotation was "bear | panda | panda bear"),
    485      * then remove "panda bear", treating it as "covered" since the set already includes "panda" and "bear".
    486      *
    487      * @param sorted the set from which items may be removed
    488      */
    489     public static void filterCoveredKeywords(TreeSet<String> sorted) {
    490         // for now, just do single items
    491         HashSet<String> toRemove = new HashSet<>();
    492 
    493         for (String item : sorted) {
    494             List<String> list = SPLIT_SPACE.splitToList(item);
    495             if (list.size() < 2) {
    496                 continue;
    497             }
    498             if (sorted.containsAll(list)) {
    499                 toRemove.add(item);
    500             }
    501         }
    502         sorted.removeAll(toRemove);
    503     }
    504 
    505     private String displayUnicodeSet(String value) {
    506         if (value.startsWith("[") && value.endsWith("]")) {
    507             value = value.substring(1, value.length() - 1);
    508         }
    509 
    510         value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3");
    511         value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3");
    512 
    513         // if (RTL.containsSome(value) && value.startsWith("[") && value.endsWith("]")) {
    514         // return "\u200E[\u200E" + value.substring(1,value.length()-2) + "\u200E]\u200E";
    515         // }
    516         return value;
    517     }
    518 
    519     private String inputUnicodeSet(String path, String value) {
    520         // clean up the user's input.
    521         // first, fix up the '['
    522         value = value.trim();
    523 
    524         // remove brackets and trim again before regex
    525         if (value.startsWith("[")) {
    526             value = value.substring(1);
    527         }
    528         if (value.endsWith("]") && (!value.endsWith("\\]") || value.endsWith("\\\\]"))) {
    529             value = value.substring(0, value.length() - 1);
    530         }
    531         value = value.trim();
    532 
    533         value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3");
    534         value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3");
    535 
    536         // re-add brackets.
    537         value = "[" + value + "]";
    538 
    539         UnicodeSet exemplar = new UnicodeSet(value);
    540         XPathParts parts = XPathParts.getFrozenInstance(path); // new XPathParts().set(path);
    541         if (parts.getElement(2).equals("parseLenients")) {
    542             return exemplar.toPattern(false);
    543         }
    544         final String type = parts.getAttributeValue(-1, "type");
    545         ExemplarType exemplarType = type == null ? ExemplarType.main : ExemplarType.valueOf(type);
    546         value = getCleanedUnicodeSet(exemplar, pp, exemplarType);
    547         return value;
    548     }
    549 
    550     private String normalizeWhitespace(String path, String value) {
    551         // turn all whitespace sequences (including tab and newline, and NBSP for certain paths)
    552         // into a single space or a single NBSP depending on path.
    553         if ((path.contains("/dateFormatLength") && path.contains("/pattern")) ||
    554             path.contains("/availableFormats/dateFormatItem") ||
    555             (path.startsWith("//ldml/dates/timeZoneNames/metazone") && path.contains("/long")) ||
    556             path.startsWith("//ldml/dates/timeZoneNames/regionFormat") ||
    557             path.startsWith("//ldml/localeDisplayNames/codePatterns/codePattern") ||
    558             path.startsWith("//ldml/localeDisplayNames/languages/language") ||
    559             path.startsWith("//ldml/localeDisplayNames/territories/territory") ||
    560             path.startsWith("//ldml/localeDisplayNames/types/type") ||
    561             (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("/displayName")) ||
    562             (path.contains("/decimalFormatLength[@type=\"long\"]") && path.contains("/pattern")) ||
    563             path.startsWith("//ldml/posix/messages") ||
    564             (path.startsWith("//ldml/units/uni") && path.contains("/unitPattern "))) {
    565             value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space
    566         } else if ((path.contains("/currencies/currency") && (path.contains("/group") || path.contains("/pattern")))
    567             ||
    568             (path.contains("/currencyFormatLength") && path.contains("/pattern")) ||
    569             (path.contains("/currencySpacing") && path.contains("/insertBetween")) ||
    570             (path.contains("/decimalFormatLength") && path.contains("/pattern")) || // i.e. the non-long ones
    571             (path.contains("/percentFormatLength") && path.contains("/pattern")) ||
    572             (path.startsWith("//ldml/numbers/symbols") && (path.contains("/group") || path.contains("/nan")))) {
    573             value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0"); // replace with NBSP
    574         } else {
    575             // in this case don't normalize away NBSP
    576             value = WHITESPACE_NO_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space
    577         }
    578         return value;
    579     }
    580 
    581     private String normalizeCurrencyDisplayName(String value) {
    582         StringBuilder result = new StringBuilder();
    583         boolean inParentheses = false;
    584         for (int i = 0; i < value.length(); i++) {
    585             char c = value.charAt(i);
    586             if (c == '(') {
    587                 inParentheses = true;
    588             } else if (c == ')') {
    589                 inParentheses = false;
    590             }
    591             if (inParentheses && c == '-' && Character.isDigit(value.charAt(i - 1))) {
    592                 c = 0x2013; /* Replace hyphen-minus with dash for date ranges */
    593             }
    594             result.append(c);
    595         }
    596         return result.toString();
    597     }
    598 
    599     private String normalizeApostrophes(String value) {
    600         // If our DAIP always had a CLDRFile to work with, then we could just check the exemplar set in it to see.
    601         // But since we don't, we just maintain the list internally and use it.
    602         if (LANGUAGES_USING_MODIFIER_APOSTROPHE.contains(locale.getLanguage())) {
    603             return value.replace('\'', '\u02bc');
    604         } else {
    605             char prev = 0;
    606             StringBuilder builder = new StringBuilder();
    607             for (char c : value.toCharArray()) {
    608                 if (c == '\'') {
    609                     if (Character.isLetter(prev)) {
    610                         builder.append('\u2019');
    611                     } else {
    612                         builder.append('\u2018');
    613                     }
    614                 } else {
    615                     builder.append(c);
    616                 }
    617                 prev = c;
    618             }
    619             return builder.toString();
    620         }
    621     }
    622 
    623     private String normalizeIntervalHyphens(String value) {
    624         DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser();
    625         fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
    626         List<Object> items = fp.getItems();
    627         Object last = items.get(items.size() - 1);
    628         if (last instanceof String) {
    629             String separator = last.toString();
    630             if (separator.contains("-")) {
    631                 StringBuilder sb = new StringBuilder();
    632                 sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
    633                 if (sb.lastIndexOf(separator) >= 0) {
    634                     sb.delete(sb.lastIndexOf(separator), sb.length());
    635                     sb.append(separator.replace("-", "\u2013"));
    636                     sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart());
    637                     return sb.toString();
    638                 }
    639             }
    640         }
    641         return value;
    642     }
    643 
    644     private String normalizeHyphens(String value) {
    645         int hyphenLocation = value.indexOf("-");
    646         if (hyphenLocation > 0 &&
    647             Character.isDigit(value.charAt(hyphenLocation - 1)) &&
    648             hyphenLocation < value.length() - 1 &&
    649             Character.isDigit(value.charAt(hyphenLocation + 1))) {
    650             StringBuilder sb = new StringBuilder();
    651             sb.append(value.substring(0, hyphenLocation));
    652             sb.append("\u2013");
    653             sb.append(value.substring(hyphenLocation + 1));
    654             return sb.toString();
    655         }
    656         return value;
    657     }
    658 
    659     private String standardizeRomanian(String value) {
    660         StringBuilder builder = new StringBuilder();
    661         for (char c : value.toCharArray()) {
    662             for (char[] pair : ROMANIAN_CONVERSIONS) {
    663                 if (c == pair[0]) {
    664                     c = pair[1];
    665                     break;
    666                 }
    667             }
    668             builder.append(c);
    669         }
    670         return builder.toString();
    671     }
    672 
    673     private String standardizeKwasio(String value) {
    674         StringBuilder builder = new StringBuilder();
    675         for (char c : value.toCharArray()) {
    676             for (char[] pair : KWASIO_CONVERSIONS) {
    677                 if (c == pair[0]) {
    678                     c = pair[1];
    679                     break;
    680                 }
    681             }
    682             builder.append(c);
    683         }
    684         return builder.toString();
    685     }
    686 
    687     // Use the myanmar-tools detector.
    688     private String standardizeMyanmar(String value) {
    689         if (detector.getZawgyiProbability(value) > 0.90) {
    690             return zawgyiUnicodeTransliterator.transform(value);
    691         }
    692         return value;
    693     }
    694 
    695     private String standardizeNgomba(String value) {
    696         StringBuilder builder = new StringBuilder();
    697         char[] charArray = value.toCharArray();
    698         for (int i = 0; i < charArray.length; i++) {
    699             char c = charArray[i];
    700             boolean convertedSaltillo = false;
    701             for (char[] pair : NGOMBA_CONVERSIONS) {
    702                 if (c == pair[0]) {
    703                     c = pair[1];
    704                     if (c == '\uA78C') {
    705                         convertedSaltillo = true;
    706                     }
    707                     break;
    708                 }
    709             }
    710             if (convertedSaltillo &&
    711                 ((i > 0 && i < charArray.length - 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i + 1])) ||
    712                     (i > 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i - 2])))) {
    713                 c = '\uA78B'; // UPPER CASE SALTILLO
    714             }
    715             builder.append(c);
    716         }
    717         return builder.toString();
    718     }
    719 
    720     private String replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars) {
    721         if (skipAuxExemplars && path.contains("/exemplarCharacters[@type=\"auxiliary\"]")) {
    722             return value;
    723         }
    724         StringBuilder builder = new StringBuilder();
    725         for (char c : value.toCharArray()) {
    726             for (char[] pair : charsToReplace) {
    727                 if (c == pair[0]) {
    728                     c = pair[1];
    729                     break;
    730                 }
    731             }
    732             builder.append(c);
    733         }
    734         return builder.toString();
    735     }
    736 
    737     private String standardizeSwissGerman(String value) {
    738         return value.replaceAll("\u00DF", "ss");
    739     }
    740 
    741     private String standardizeCatalan(String value) {
    742         StringBuilder builder = new StringBuilder();
    743         for (char c : value.toCharArray()) {
    744             boolean didSubstitute = false;
    745             for (char[] triple : CATALAN_CONVERSIONS) {
    746                 if (c == triple[0]) {
    747                     builder.append(triple[1]);
    748                     builder.append(triple[2]);
    749                     didSubstitute = true;
    750                     break;
    751                 }
    752             }
    753             if (!didSubstitute) {
    754                 builder.append(c);
    755             }
    756         }
    757         return builder.toString();
    758     }
    759 
    760     private String replace(Pattern pattern, String value, String replacement) {
    761         String value2 = pattern.matcher(value).replaceAll(replacement);
    762         if (DEBUG_DAIP && !value.equals(value2)) {
    763             System.out.println("\n" + value + " => " + value2);
    764         }
    765         return value2;
    766     }
    767 
    768     private static Pattern UNNORMALIZED_MALAYALAM = PatternCache.get(
    769         "(\u0D23|\u0D28|\u0D30|\u0D32|\u0D33|\u0D15)\u0D4D\u200D");
    770 
    771     private static Map<Character, Character> NORMALIZING_MAP = Builder.with(new HashMap<Character, Character>())
    772         .put('\u0D23', '\u0D7A').put('\u0D28', '\u0D7B')
    773         .put('\u0D30', '\u0D7C').put('\u0D32', '\u0D7D')
    774         .put('\u0D33', '\u0D7E').put('\u0D15', '\u0D7F').get();
    775 
    776     /**
    777      * Normalizes the Malayalam characters in the specified input.
    778      *
    779      * @param value
    780      *            the input to be normalized
    781      * @return
    782      */
    783     private String normalizeMalayalam(String value) {
    784         // Normalize Malayalam characters.
    785         Matcher matcher = UNNORMALIZED_MALAYALAM.matcher(value);
    786         if (matcher.find()) {
    787             StringBuffer buffer = new StringBuffer();
    788             int start = 0;
    789             do {
    790                 buffer.append(value.substring(start, matcher.start(0)));
    791                 char codePoint = matcher.group(1).charAt(0);
    792                 buffer.append(NORMALIZING_MAP.get(codePoint));
    793                 start = matcher.end(0);
    794             } while (matcher.find());
    795             buffer.append(value.substring(start));
    796             value = buffer.toString();
    797         }
    798         return value;
    799     }
    800 
    /**
     * Transform created from the transliterator ID below: an NFKC transform whose filter set is the
     * Arabic Presentation Forms-A and Arabic Presentation Forms-B blocks.
     */
    static final Transform<String, String> fixArabicPresentation = Transliterator.getInstance(
        "[[:block=Arabic_Presentation_Forms_A:][:block=Arabic_Presentation_Forms_B:]] nfkc");
    803 
    804     /**
    805      * Normalizes the Arabic presentation forms characters in the specified input.
    806      *
    807      * @param value
    808      *            the input to be normalized
    809      * @return
    810      */
    811     private String replaceArabicPresentationForms(String value) {
    812         value = fixArabicPresentation.transform(value);
    813         return value;
    814     }
    815 
    // NOTE(review): none of these four patterns is referenced in the part of the file visible
    // alongside them; confirm where (and whether) they are still used.

    // Group 1: a whitespace character. Group 2: a backslash followed by one of - } ] &.
    // Group 3 is always empty.
    static Pattern REMOVE_QUOTE1 = PatternCache.get("(\\s)(\\\\[-\\}\\]\\&])()");
    // Group 1: a backslash followed by one of - { [ &. Group 2: a whitespace character.
    static Pattern REMOVE_QUOTE2 = PatternCache.get("(\\\\[\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s)

    // Group 1: a whitespace character or the $ anchor. Group 2: one of - } ] &.
    // Group 3 is always empty.
    static Pattern NEEDS_QUOTE1 = PatternCache.get("(\\s|$)([-\\}\\]\\&])()");
    // Group 1: any character other than a backslash. Group 2: one of - { [ &.
    // Group 3: a whitespace character.
    static Pattern NEEDS_QUOTE2 = PatternCache.get("([^\\\\])([\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s)
    821 
    822     public static String getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter,
    823         ExemplarType exemplarType) {
    824         if (prettyPrinter == null) {
    825             return exemplar.toPattern(false);
    826         }
    827         String value;
    828         prettyPrinter.setCompressRanges(exemplar.size() > 300);
    829         value = exemplar.toPattern(false);
    830         UnicodeSet toAdd = new UnicodeSet();
    831 
    832         for (UnicodeSetIterator usi = new UnicodeSetIterator(exemplar); usi.next();) {
    833             String string = usi.getString();
    834             if (string.equals("") || string.equals("")) {
    835                 toAdd.add(string);
    836                 continue;
    837             }
    838             if (exemplarType.convertUppercase) {
    839                 string = UCharacter.toLowerCase(ULocale.ENGLISH, string);
    840             }
    841             toAdd.add(string);
    842             String composed = Normalizer.compose(string, false);
    843             if (!string.equals(composed)) {
    844                 toAdd.add(composed);
    845             }
    846         }
    847 
    848         toAdd.removeAll(exemplarType.toRemove);
    849 
    850         if (DEBUG_DAIP && !toAdd.equals(exemplar)) {
    851             UnicodeSet oldOnly = new UnicodeSet(exemplar).removeAll(toAdd);
    852             UnicodeSet newOnly = new UnicodeSet(toAdd).removeAll(exemplar);
    853             System.out.println("Exemplar:\t" + exemplarType + ",\tremoved\t" + oldOnly + ",\tadded\t" + newOnly);
    854         }
    855 
    856         String fixedExemplar = prettyPrinter.format(toAdd);
    857         UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar);
    858         if (!toAdd.equals(doubleCheck)) {
    859             // something went wrong, leave as is
    860         } else if (!value.equals(fixedExemplar)) { // put in this condition just for debugging
    861             if (DEBUG_DAIP) {
    862                 System.out.println(TestMetadata.showDifference(
    863                     With.codePoints(value),
    864                     With.codePoints(fixedExemplar),
    865                     "\n"));
    866             }
    867             value = fixedExemplar;
    868         }
    869         return value;
    870     }
    871 
    872     /**
    873      * @return a canonical numeric pattern, based on the type, and the isPOSIX flag. The latter is set for en_US_POSIX.
    874      */
    875     static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults();
    876 
    877     public static String getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX) {
    878         // TODO fix later to properly handle quoted ;
    879 
    880         DecimalFormat df = new DecimalFormat(inpattern);
    881         if (type == NumericType.DECIMAL_ABBREVIATED || type == NumericType.CURRENCY_ABBREVIATED
    882             || CldrUtility.INHERITANCE_MARKER.equals(inpattern)) {
    883             return inpattern; // TODO fix when ICU bug is fixed
    884             // df.setMaximumFractionDigits(df.getMinimumFractionDigits());
    885             // df.setMaximumIntegerDigits(Math.max(1, df.getMinimumIntegerDigits()));
    886         } else {
    887             // int decimals = type == CURRENCY_TYPE ? 2 : 1;
    888             int[] digits = isPOSIX ? type.posixDigitCount : type.digitCount;
    889             df.setMinimumIntegerDigits(digits[0]);
    890             df.setMinimumFractionDigits(digits[1]);
    891             df.setMaximumFractionDigits(digits[2]);
    892         }
    893         String pattern = df.toPattern();
    894         List<String> parts = SEMI_SPLITTER.splitToList(pattern);
    895         String pattern2 = parts.get(0);
    896         if (parts.size() > 1) {
    897             pattern2 += ";" + parts.get(1);
    898         }
    899         if (!pattern2.equals(pattern)) {
    900             pattern = pattern2;
    901         }
    902         // int pos = pattern.indexOf(';');
    903         // if (pos < 0) return pattern + ";-" + pattern;
    904         return pattern;
    905     }
    906 
    907     /*
    908      * This tests what type a numeric pattern is.
    909      */
    910     public enum NumericType {
    911         CURRENCY(new int[] { 1, 2, 2 }, new int[] { 1, 2, 2 }), CURRENCY_ABBREVIATED(), DECIMAL(new int[] { 1, 0, 3 },
    912             new int[] { 1, 0, 6 }), DECIMAL_ABBREVIATED(), PERCENT(new int[] { 1, 0, 0 },
    913                 new int[] { 1, 0, 0 }), SCIENTIFIC(new int[] { 0, 0, 0 }, new int[] { 1, 6, 6 }), NOT_NUMERIC;
    914 
    915         private static final Pattern NUMBER_PATH = Pattern
    916             .compile("//ldml/numbers/((currency|decimal|percent|scientific)Formats|currencies/currency).*");
    917         private int[] digitCount;
    918         private int[] posixDigitCount;
    919 
    920         private NumericType() {
    921         };
    922 
    923         private NumericType(int[] digitCount, int[] posixDigitCount) {
    924             this.digitCount = digitCount;
    925             this.posixDigitCount = posixDigitCount;
    926         }
    927 
    928         /**
    929          * @return the numeric type of the xpath
    930          */
    931         public static NumericType getNumericType(String xpath) {
    932             Matcher matcher = NUMBER_PATH.matcher(xpath);
    933             if (xpath.indexOf("/pattern") < 0) {
    934                 return NOT_NUMERIC;
    935             } else if (matcher.matches()) {
    936                 if (matcher.group(1).equals("currencies/currency")) {
    937                     return CURRENCY;
    938                 } else {
    939                     NumericType type = NumericType.valueOf(matcher.group(2).toUpperCase());
    940                     if (xpath.contains("=\"1000")) {
    941                         if (type == DECIMAL) {
    942                             type = DECIMAL_ABBREVIATED;
    943                         } else if (type == CURRENCY) {
    944                             type = CURRENCY_ABBREVIATED;
    945                         } else {
    946                             throw new IllegalArgumentException("Internal Error");
    947                         }
    948                     }
    949                     return type;
    950                 }
    951             } else {
    952                 return NOT_NUMERIC;
    953             }
    954         }
    955 
    956         public int[] getDigitCount() {
    957             return digitCount;
    958         }
    959 
    960         public int[] getPosixDigitCount() {
    961             return posixDigitCount;
    962         }
    963     };
    964 }
    965