Home | History | Annotate | Download | only in util
      1 /*
      2  **********************************************************************
      3  * Copyright (c) 2002-2011, International Business Machines
      4  * Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  * Author: Mark Davis
      7  **********************************************************************
      8  */
      9 package org.unicode.cldr.util;
     10 
     11 import java.util.Collection;
     12 import java.util.Collections;
     13 import java.util.EnumSet;
     14 import java.util.Iterator;
     15 import java.util.List;
     16 import java.util.Locale;
     17 import java.util.Map;
     18 import java.util.Map.Entry;
     19 import java.util.NoSuchElementException;
     20 import java.util.Set;
     21 import java.util.StringTokenizer;
     22 import java.util.TreeMap;
     23 import java.util.TreeSet;
     24 import java.util.regex.Pattern;
     25 
     26 import org.unicode.cldr.tool.LikelySubtags;
     27 
     28 import com.google.common.base.CharMatcher;
     29 import com.google.common.base.Joiner;
     30 import com.google.common.base.Splitter;
     31 import com.google.common.collect.ImmutableList;
     32 import com.google.common.collect.ImmutableMap;
     33 import com.ibm.icu.impl.Relation;
     34 import com.ibm.icu.impl.Row.R2;
     35 import com.ibm.icu.text.UnicodeSet;
     36 
     37 public class LanguageTagParser {
     38     /**
     39      * @return Returns the language, or "" if none.
     40      */
     41     public String getLanguage() {
     42         return language;
     43     }
     44 
     45     /**
     46      * @return Returns the script, or "" if none.
     47      */
     48     public String getScript() {
     49         return script;
     50     }
     51 
     52     /**
     53      * @return Returns the region, or "" if none.
     54      */
     55     public String getRegion() {
     56         return region;
     57     }
     58 
     59     /**
     60      * @return Returns the variants.
     61      */
     62     public List<String> getVariants() {
     63         return ImmutableList.copyOf(variants);
     64     }
     65 
     66     /**
     67      * @return Returns the grandfathered flag
     68      */
     69     public boolean isGrandfathered() {
     70         return grandfathered;
     71     }
     72 
     73     /**
     74      * @return Returns the extensions.
     75      */
     76     @Deprecated
     77     public Map<String, String> getExtensions() {
     78         return OutputOption.ICU.convert(extensions);
     79     }
     80 
     81     /**
     82      * @return Returns the localeExtensions.
     83      */
     84     @Deprecated
     85     public Map<String, String> getLocaleExtensions() {
     86         return OutputOption.ICU.convert(localeExtensions);
     87     }
     88 
     89     /**
     90      * @return Returns the extensions.
     91      */
     92     public Map<String, List<String>> getExtensionsDetailed() {
     93         return ImmutableMap.copyOf(extensions);
     94     }
     95 
     96     /**
     97      * @return Returns the localeExtensions.
     98      */
     99     public Map<String, List<String>> getLocaleExtensionsDetailed() {
    100         return ImmutableMap.copyOf(localeExtensions);
    101     }
    102 
    103     /**
    104      * @return Returns the original, preparsed language tag
    105      */
    106     public String getOriginal() {
    107         return original;
    108     }
    109 
    110     /**
    111      * @return Returns the language-script (or language) part of a tag.
    112      */
    113     public String getLanguageScript() {
    114         if (script.length() != 0) return language + "_" + script;
    115         return language;
    116     }
    117 
    118     /**
    119      * @param in
    120      *            Collection of language tag strings
    121      * @return Returns each of the language-script tags in the collection.
    122      */
    123     public static Set<String> getLanguageScript(Collection<String> in) {
    124         return getLanguageAndScript(in, null);
    125     }
    126 
    127     /**
    128      * @param in
    129      *            Collection of language tag strings
    130      * @return Returns each of the language-script tags in the collection.
    131      */
    132     public static Set<String> getLanguageAndScript(Collection<String> in, Set<String> output) {
    133         if (output == null) output = new TreeSet<String>();
    134         LanguageTagParser lparser = new LanguageTagParser();
    135         for (Iterator<String> it = in.iterator(); it.hasNext();) {
    136             output.add(lparser.set(it.next()).getLanguageScript());
    137         }
    138         return output;
    139     }
    140 
    141     // private fields
    142 
    143     private String original;
    144     private boolean grandfathered = false;
    145     private String language;
    146     private String script;
    147     private String region;
    148     private Set<String> variants = new TreeSet<String>();
    149     private Map<String, List<String>> extensions = new TreeMap<String, List<String>>(); // use tree map
    150     private Map<String, List<String>> localeExtensions = new TreeMap<String, List<String>>();
    151 
    152     private static final UnicodeSet ALPHA = new UnicodeSet("[a-zA-Z]").freeze();
    153     private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze();
    154     private static final UnicodeSet ALPHANUM = new UnicodeSet("[0-9a-zA-Z]").freeze();
    155     private static final UnicodeSet EXTENSION_VALUE = new UnicodeSet("[0-9a-zA-Z/_]").freeze();
    156     private static final UnicodeSet X = new UnicodeSet("[xX]").freeze();
    157     private static final UnicodeSet ALPHA_MINUS_X = new UnicodeSet(ALPHA).removeAll(X).freeze();
    158     private static StandardCodes standardCodes = StandardCodes.make();
    159     private static final Set<String> grandfatheredCodes = standardCodes.getAvailableCodes("grandfathered");
    160     private static final String separator = "-_"; // '-' alone for 3066bis language tags
    161     private static final UnicodeSet SEPARATORS = new UnicodeSet().addAll(separator).freeze();
    162     private static final Splitter SPLIT_BAR = Splitter.on(CharMatcher.anyOf(separator));
    163     private static final Splitter SPLIT_COLON = Splitter.on(';');
    164     private static final Splitter SPLIT_EQUAL = Splitter.on('=');
    165     private static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
    166     private static final Relation<R2<String, String>, String> BCP47_ALIASES = SDI.getBcp47Aliases();
    167 
    168     /**
    169      * Parses out a language tag, setting a number of fields that can subsequently be retrieved.
    170      * If a private-use field is found, it is returned as the last extension.<br>
    171      * This only checks for well-formedness (syntax), not for validity (subtags in registry). For the latter, see
    172      * isValid.
    173      *
    174      * @param languageTag
    175      * @return
    176      */
    177     public LanguageTagParser set(String languageTag) {
    178         if (languageTag.length() == 0) {
    179             throw new IllegalArgumentException("Language tag cannot be empty");
    180         }
    181         languageTag = languageTag.toLowerCase(Locale.ROOT);
    182 
    183         // clear everything out
    184         language = region = script = "";
    185         grandfathered = false;
    186         variants.clear();
    187         extensions.clear();
    188         localeExtensions.clear();
    189         original = languageTag;
    190         int localeExtensionsPosition = languageTag.indexOf('@');
    191         if (localeExtensionsPosition >= 0) {
    192             final String localeExtensionsString = languageTag.substring(localeExtensionsPosition + 1);
    193             for (String keyValue : SPLIT_COLON.split(localeExtensionsString)) {
    194                 final Iterator<String> keyValuePair = SPLIT_EQUAL.split(keyValue).iterator();
    195                 final String key = keyValuePair.next();
    196                 final String value = keyValuePair.next();
    197                 if (keyValuePair.hasNext() || !ALPHANUM.containsAll(key) || !EXTENSION_VALUE.containsAll(value)) {
    198                     throwError(keyValue, "Invalid key/value pair");
    199                 }
    200                 localeExtensions.put(key, SPLIT_BAR.splitToList(value));
    201             }
    202             languageTag = languageTag.substring(0, localeExtensionsPosition);
    203         }
    204 
    205         // first test for grandfathered
    206         if (grandfatheredCodes.contains(languageTag)) {
    207             language = languageTag;
    208             grandfathered = true;
    209             return this;
    210         }
    211 
    212         // each time we fetch a token, we check for length from 1..8, and all alphanum
    213         StringTokenizer st = new StringTokenizer(languageTag, separator);
    214         String subtag;
    215         try {
    216             subtag = getSubtag(st);
    217         } catch (Exception e1) {
    218             throw new IllegalArgumentException("Illegal language tag: " + languageTag, e1);
    219         }
    220 
    221         // check for private use (x-...) and return if so
    222         if (subtag.equalsIgnoreCase("x")) {
    223             getExtension(subtag, st, 1);
    224             return this;
    225         }
    226 
    227         // check that language subtag is valid
    228         if (!ALPHA.containsAll(subtag) || subtag.length() < 2) {
    229             throwError(subtag, "Invalid language subtag");
    230         }
    231         try { // The try block is to catch the out-of-tokens case. Easier than checking each time.
    232             language = subtag;
    233             subtag = getSubtag(st); // prepare for next
    234 
    235             // check for script, 4 letters
    236             if (subtag.length() == 4 && ALPHA.containsAll(subtag)) {
    237                 script = subtag;
    238                 script = script.substring(0, 1).toUpperCase(Locale.ROOT)
    239                     + script.substring(1);
    240                 subtag = getSubtag(st); // prepare for next
    241             }
    242 
    243             // check for region, 2 letters or 3 digits
    244             if (subtag.length() == 2 && ALPHA.containsAll(subtag)
    245                 || subtag.length() == 3 && DIGIT.containsAll(subtag)) {
    246                 region = subtag.toUpperCase(Locale.ENGLISH);
    247                 subtag = getSubtag(st); // prepare for next
    248             }
    249 
    250             // get variants: length > 4 or len=4 & starts with digit
    251             while (isValidVariant(subtag)) {
    252                 variants.add(subtag);
    253                 subtag = getSubtag(st); // prepare for next
    254             }
    255 
    256             // get extensions: singleton '-' subtag (2-8 long)
    257             while (subtag.length() == 1 && ALPHA_MINUS_X.contains(subtag)) {
    258                 subtag = getExtension(subtag, st, 2);
    259                 if (subtag == null) return this; // done
    260             }
    261 
    262             if (subtag.equalsIgnoreCase("x")) {
    263                 getExtension(subtag, st, 1);
    264                 return this;
    265             }
    266 
    267             // if we make it to this point, then we have an error
    268             throwError(subtag, "Illegal subtag");
    269 
    270         } catch (NoSuchElementException e) {
    271             // this exception just means we ran out of tokens. That's ok, so we just return.
    272         }
    273         return this;
    274     }
    275 
    276     private boolean isValidVariant(String subtag) {
    277         return subtag != null && ALPHANUM.containsAll(subtag)
    278             && (subtag.length() > 4 || subtag.length() == 4 && DIGIT.contains(subtag.charAt(0)));
    279     }
    280 
    281     /**
    282      *
    283      * @return true iff the language tag validates
    284      */
    285     public boolean isValid() {
    286         if (grandfathered) return true; // don't need further checking, since we already did so when parsing
    287         if (!validates(language, "language")) return false;
    288         if (!validates(script, "script")) return false;
    289         if (!validates(region, "territory")) return false;
    290         for (Iterator<String> it = variants.iterator(); it.hasNext();) {
    291             if (!validates(it.next(), "variant")) return false;
    292         }
    293         return true; // passed the gauntlet
    294     }
    295 
    296     public enum Status {
    297         WELL_FORMED, VALID, CANONICAL, MINIMAL
    298     }
    299 
    300     public Status getStatus(Set<String> errors) {
    301         errors.clear();
    302         if (!isValid()) {
    303             return Status.WELL_FORMED;
    304             // TODO, check the bcp47 extension codes also
    305         }
    306         Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo();
    307         Map<String, Map<String, String>> languageInfo = StandardCodes.getLStreg().get("language");
    308 
    309         if (aliasInfo.get("language").containsKey(language)) {
    310             errors.add("Non-canonical language: " + language);
    311         }
    312         Map<String, String> lstrInfo = languageInfo.get(language);
    313         if (lstrInfo != null) {
    314             String scope = lstrInfo.get("Scope");
    315             if ("collection".equals(scope)) {
    316                 errors.add("Collection language: " + language);
    317             }
    318         }
    319         if (aliasInfo.get("script").containsKey(script)) {
    320             errors.add("Non-canonical script: " + script);
    321         }
    322         if (aliasInfo.get("territory").containsKey(region)) {
    323             errors.add("Non-canonical region: " + region);
    324         }
    325         if (!errors.isEmpty()) {
    326             return Status.VALID;
    327         }
    328         String tag = language + (script.isEmpty() ? "" : "_" + script) + (region.isEmpty() ? "" : "_" + region);
    329         String minimized = LikelySubtags.minimize(tag, SDI.getLikelySubtags(), false);
    330         if (minimized == null) {
    331             errors.add("No minimal data for:" + tag);
    332             if (script.isEmpty() && region.isEmpty()) {
    333                 return Status.MINIMAL;
    334             } else {
    335                 return Status.CANONICAL;
    336             }
    337         }
    338         if (!tag.equals(minimized)) {
    339             errors.add("Not minimal:" + tag + "-->" + minimized);
    340             return Status.CANONICAL;
    341         }
    342         return Status.MINIMAL;
    343     }
    344 
    345     /**
    346      * @param subtag
    347      * @param type
    348      * @return true if the subtag is empty, or if it is in the registry
    349      */
    350     private boolean validates(String subtag, String type) {
    351         return subtag.length() == 0 || standardCodes.getAvailableCodes(type).contains(subtag);
    352     }
    353 
    354     /**
    355      * Internal method
    356      *
    357      * @param minLength
    358      *            TODO
    359      */
    360     private String getExtension(String subtag, StringTokenizer st, int minLength) {
    361         final String key = subtag;
    362         if (extensions.containsKey(key)) {
    363             throwError(subtag, "Can't have two extensions with the same key");
    364         }
    365         if (!st.hasMoreElements()) {
    366             throwError(subtag, "Private Use / Extension requires subsequent subtag");
    367         }
    368         ImmutableList.Builder<String> result = ImmutableList.builder();
    369         try {
    370             while (st.hasMoreElements()) {
    371                 subtag = getSubtag(st);
    372                 if (subtag.length() < minLength) {
    373                     return subtag;
    374                 }
    375                 result.add(subtag);
    376             }
    377             return null;
    378         } finally {
    379             extensions.put(key, result.build());
    380         }
    381     }
    382 
    383     /**
    384      * Internal method
    385      */
    386     private String getSubtag(StringTokenizer st) {
    387         String result = st.nextToken();
    388         if (result.length() < 1 || result.length() > 8) {
    389             throwError(result, "Illegal length (must be 1..8)");
    390         }
    391         if (!ALPHANUM.containsAll(result)) {
    392             throwError(result, "Illegal characters (" + new UnicodeSet().addAll(result).removeAll(ALPHANUM) + ")");
    393         }
    394         return result;
    395     }
    396 
    397     /**
    398      * Internal method
    399      */
    400     private void throwError(String subtag, String errorText) {
    401         throw new IllegalArgumentException(errorText + ": " + subtag + " in " + original);
    402     }
    403 
    404     public LanguageTagParser setRegion(String region) {
    405         this.region = region;
    406         return this;
    407     }
    408 
    409     public LanguageTagParser setScript(String script) {
    410         this.script = script;
    411         return this;
    412     }
    413 
    414     public enum OutputOption {
    415         ICU('_'), BCP47('-');
    416         final char separator;
    417         final Joiner joiner;
    418 
    419         private OutputOption(char separator) {
    420             this.separator = separator;
    421             joiner = Joiner.on(separator);
    422         }
    423 
    424         public Map<String, String> convert(Map<String, List<String>> mapToList) {
    425             if (mapToList.isEmpty()) {
    426                 return Collections.emptyMap();
    427             }
    428             ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
    429             for (Entry<String, List<String>> entry : mapToList.entrySet()) {
    430                 builder.put(entry.getKey(), joiner.join(entry.getValue()));
    431             }
    432             return builder.build();
    433         }
    434     }
    435 
    436     public String toString() {
    437         return toString(OutputOption.ICU);
    438     }
    439 
    440     public String toString(OutputOption oo) {
    441         StringBuilder result = new StringBuilder(language); // optimize for the simple cases
    442         if (this.script.length() != 0) result.append(oo.separator).append(script);
    443         if (this.region.length() != 0) result.append(oo.separator).append(region);
    444         if (this.variants.size() != 0) {
    445             for (String variant : variants) {
    446                 result.append(oo.separator).append(oo != OutputOption.ICU ? variant : variant.toUpperCase(Locale.ROOT));
    447             }
    448         }
    449         if (this.extensions.size() != 0) {
    450             for (Entry<String, List<String>> extension : extensions.entrySet()) {
    451                 String key = extension.getKey();
    452                 String value = oo.joiner.join(extension.getValue());
    453                 result.append(oo.separator).append(key)
    454                     .append(oo.separator).append(value);
    455             }
    456         }
    457         if (this.localeExtensions.size() != 0) {
    458             if (oo == OutputOption.BCP47) {
    459                 throw new IllegalArgumentException("Cannot represent as BCP47 without canonicalizing first");
    460             }
    461             result.append('@');
    462             for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
    463                 String key = extension.getKey();
    464                 String value = oo.joiner.join(extension.getValue());
    465                 result.append(oo != OutputOption.ICU ? key : key.toUpperCase(Locale.ROOT))
    466                     .append('=').append(oo != OutputOption.ICU ? value : value.toUpperCase(Locale.ROOT));
    467             }
    468         }
    469         return result.toString();
    470     }
    471 
    472     /**
    473      * Return just the language, script, and region (no variants or extensions)
    474      * @return
    475      */
    476     public String toLSR() {
    477         String result = language; // optimize for the simple cases
    478         if (this.script.length() != 0) result += "_" + script;
    479         if (this.region.length() != 0) result += "_" + region;
    480         return result;
    481     }
    482 
    483     public enum Fields {
    484         LANGUAGE, SCRIPT, REGION, VARIANTS
    485     };
    486 
    487     public static Set<Fields> LANGUAGE_SCRIPT = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT));
    488     public static Set<Fields> LANGUAGE_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.REGION));
    489     public static Set<Fields> LANGUAGE_SCRIPT_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE,
    490         Fields.SCRIPT, Fields.REGION));
    491 
    492     public String toString(Set<Fields> selection) {
    493         String result = language;
    494         if (selection.contains(Fields.SCRIPT) && script.length() != 0) result += "_" + script;
    495         if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + region;
    496         if (selection.contains(Fields.VARIANTS) && variants.size() != 0) {
    497             for (String variant : (Collection<String>) variants) {
    498                 result += "_" + variant;
    499             }
    500         }
    501         return result;
    502     }
    503 
    504     public LanguageTagParser setLanguage(String language) {
    505         if (SEPARATORS.containsSome(language)) {
    506             String oldScript = script;
    507             String oldRegion = region;
    508             Set<String> oldVariants = variants;
    509             set(language);
    510             if (script.length() == 0) {
    511                 script = oldScript;
    512             }
    513             if (region.length() == 0) {
    514                 region = oldRegion;
    515             }
    516             if (oldVariants.size() != 0) {
    517                 variants = oldVariants;
    518             }
    519         } else {
    520             this.language = language;
    521         }
    522         return this;
    523     }
    524 
    525     public LanguageTagParser setLocaleExtensions(Map<String, String> localeExtensions) {
    526         this.localeExtensions = expandMap(localeExtensions, 1, Integer.MAX_VALUE);
    527         return this;
    528     }
    529 
    530     public LanguageTagParser setVariants(Collection<String> newVariants) {
    531         for (String variant : newVariants) {
    532             if (!isValidVariant(variant)) {
    533                 throw new IllegalArgumentException("Illegal variant: " + variant);
    534             }
    535         }
    536         variants.clear();
    537         variants.addAll(newVariants);
    538         return this;
    539     }
    540 
    541     static final Pattern EXTENSION_PATTERN = PatternCache.get("([0-9a-zA-Z]{2,8}(-[0-9a-zA-Z]{2,8})*)?");
    542 
    543     public LanguageTagParser setExtensions(Map<String, String> newExtensions) {
    544         this.extensions = expandMap(newExtensions, 2, 8);
    545         return this;
    546     }
    547 
    548     public static String getSimpleParent(String s) {
    549         int lastBar = s.lastIndexOf('_');
    550         return lastBar >= 0 ? s.substring(0, lastBar) : "";
    551     }
    552 
    553     private Map<String, List<String>> expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength) {
    554         if (newLocaleExtensions.isEmpty()) {
    555             return Collections.emptyMap();
    556         }
    557         ImmutableMap.Builder<String, List<String>> result = ImmutableMap.builder();
    558         for (Entry<String, String> entry : newLocaleExtensions.entrySet()) {
    559             result.put(entry.getKey(), split(entry.getValue(), minLength, maxLength));
    560         }
    561         return result.build();
    562     }
    563 
    564     private List<String> split(String value, int minLength, int maxLength) {
    565         List<String> values = SPLIT_BAR.splitToList(value);
    566         for (String s : values) {
    567             if (s.length() < minLength || s.length() > maxLength) {
    568                 throw new IllegalArgumentException("Illegal subtag length for: " + s);
    569             }
    570             if (!ALPHANUM.contains(s)) {
    571                 throw new IllegalArgumentException("Illegal locale character in: " + s);
    572             }
    573         }
    574         return values;
    575     }
    576 }