Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 ******************************************************************************
      5 * Copyright (C) 2003-2011, International Business Machines Corporation and   *
      6 * others. All Rights Reserved.                                               *
      7 ******************************************************************************
      8 */
      9 
     10 package com.ibm.icu.impl;
     11 
     12 import java.util.Collections;
     13 import java.util.Comparator;
     14 import java.util.Iterator;
     15 import java.util.Map;
     16 import java.util.TreeMap;
     17 
     18 import com.ibm.icu.impl.locale.AsciiUtil;
     19 
     20 /**
     21  * Utility class to parse and normalize locale ids (including POSIX style)
     22  */
     23 public final class LocaleIDParser {
     24 
     25     /**
     26      * Char array representing the locale ID.
     27      */
     28     private char[] id;
     29 
     30     /**
     31      * Current position in {@link #id} (while parsing).
     32      */
     33     private int index;
     34 
     35     /**
     36      * Temporary buffer for parsed sections of data.
     37      */
     38     private StringBuilder buffer;
     39 
     40     // um, don't handle POSIX ids unless we request it.  why not?  well... because.
     41     private boolean canonicalize;
     42     private boolean hadCountry;
     43 
     44     // used when canonicalizing
     45     Map<String, String> keywords;
     46     String baseName;
     47 
     48     /**
     49      * Parsing constants.
     50      */
     51     private static final char KEYWORD_SEPARATOR     = '@';
     52     private static final char HYPHEN                = '-';
     53     private static final char KEYWORD_ASSIGN        = '=';
     54     private static final char COMMA                 = ',';
     55     private static final char ITEM_SEPARATOR        = ';';
     56     private static final char DOT                   = '.';
     57     private static final char UNDERSCORE            = '_';
     58 
     59     public LocaleIDParser(String localeID) {
     60         this(localeID, false);
     61     }
     62 
     63     public LocaleIDParser(String localeID, boolean canonicalize) {
     64         id = localeID.toCharArray();
     65         index = 0;
     66         buffer = new StringBuilder(id.length + 5);
     67         this.canonicalize = canonicalize;
     68     }
     69 
     70     private void reset() {
     71         index = 0;
     72         buffer = new StringBuilder(id.length + 5);
     73     }
     74 
     75     // utilities for working on text in the buffer
     76 
     77     /**
     78      * Append c to the buffer.
     79      */
     80     private void append(char c) {
     81         buffer.append(c);
     82     }
     83 
     84     private void addSeparator() {
     85         append(UNDERSCORE);
     86     }
     87 
     88     /**
     89      * Returns the text in the buffer from start to blen as a String.
     90      */
     91     private String getString(int start) {
     92         return buffer.substring(start);
     93     }
     94 
     95     /**
     96      * Set the length of the buffer to pos, then append the string.
     97      */
     98     private void set(int pos, String s) {
     99         buffer.delete(pos, buffer.length());
    100         buffer.insert(pos, s);
    101     }
    102 
    103     /**
    104      * Append the string to the buffer.
    105      */
    106     private void append(String s) {
    107         buffer.append(s);
    108     }
    109 
    110     // utilities for parsing text out of the id
    111 
    112     /**
    113      * Character to indicate no more text is available in the id.
    114      */
    115     private static final char DONE = '\uffff';
    116 
    117     /**
    118      * Returns the character at index in the id, and advance index.  The returned character
    119      * is DONE if index was at the limit of the buffer.  The index is advanced regardless
    120      * so that decrementing the index will always 'unget' the last character returned.
    121      */
    122     private char next() {
    123         if (index == id.length) {
    124             index++;
    125             return DONE;
    126         }
    127 
    128         return id[index++];
    129     }
    130 
    131     /**
    132      * Advance index until the next terminator or id separator, and leave it there.
    133      */
    134     private void skipUntilTerminatorOrIDSeparator() {
    135         while (!isTerminatorOrIDSeparator(next()));
    136         --index;
    137     }
    138 
    139     /**
    140      * Returns true if the character at index in the id is a terminator.
    141      */
    142     private boolean atTerminator() {
    143         return index >= id.length || isTerminator(id[index]);
    144     }
    145 
    146     /**
    147      * Returns true if the character is a terminator (keyword separator, dot, or DONE).
    148      * Dot is a terminator because of the POSIX form, where dot precedes the codepage.
    149      */
    150     private boolean isTerminator(char c) {
    151         // always terminate at DOT, even if not handling POSIX.  It's an error...
    152         return c == KEYWORD_SEPARATOR || c == DONE || c == DOT;
    153     }
    154 
    155     /**
    156      * Returns true if the character is a terminator or id separator.
    157      */
    158     private boolean isTerminatorOrIDSeparator(char c) {
    159         return c == UNDERSCORE || c == HYPHEN || isTerminator(c);
    160     }
    161 
    162     /**
    163      * Returns true if the start of the buffer has an experimental or private language
    164      * prefix, the pattern '[ixIX][-_].' shows the syntax checked.
    165      */
    166     private boolean haveExperimentalLanguagePrefix() {
    167         if (id.length > 2) {
    168             char c = id[1];
    169             if (c == HYPHEN || c == UNDERSCORE) {
    170                 c = id[0];
    171                 return c == 'x' || c == 'X' || c == 'i' || c == 'I';
    172             }
    173         }
    174         return false;
    175     }
    176 
    177     /**
    178      * Returns true if a value separator occurs at or after index.
    179      */
    180     private boolean haveKeywordAssign() {
    181         // assume it is safe to start from index
    182         for (int i = index; i < id.length; ++i) {
    183             if (id[i] == KEYWORD_ASSIGN) {
    184                 return true;
    185             }
    186         }
    187         return false;
    188     }
    189 
    190     /**
    191      * Advance index past language, and accumulate normalized language code in buffer.
    192      * Index must be at 0 when this is called.  Index is left at a terminator or id
    193      * separator.  Returns the start of the language code in the buffer.
    194      */
    195     private int parseLanguage() {
    196         int startLength = buffer.length();
    197 
    198         if (haveExperimentalLanguagePrefix()) {
    199             append(AsciiUtil.toLower(id[0]));
    200             append(HYPHEN);
    201             index = 2;
    202         }
    203 
    204         char c;
    205         while(!isTerminatorOrIDSeparator(c = next())) {
    206             append(AsciiUtil.toLower(c));
    207         }
    208         --index; // unget
    209 
    210         if (buffer.length() - startLength == 3) {
    211             String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0));
    212             if (lang != null) {
    213                 set(0, lang);
    214             }
    215         }
    216 
    217         return 0;
    218     }
    219 
    220     /**
    221      * Advance index past language.  Index must be at 0 when this is called.  Index
    222      * is left at a terminator or id separator.
    223      */
    224     private void skipLanguage() {
    225         if (haveExperimentalLanguagePrefix()) {
    226             index = 2;
    227         }
    228         skipUntilTerminatorOrIDSeparator();
    229     }
    230 
    231     /**
    232      * Advance index past script, and accumulate normalized script in buffer.
    233      * Index must be immediately after the language.
    234      * If the item at this position is not a script (is not four characters
    235      * long) leave index and buffer unchanged.  Otherwise index is left at
    236      * a terminator or id separator.  Returns the start of the script code
    237      * in the buffer (this may be equal to the buffer length, if there is no
    238      * script).
    239      */
    240     private int parseScript() {
    241         if (!atTerminator()) {
    242             int oldIndex = index; // save original index
    243             ++index;
    244 
    245             int oldBlen = buffer.length(); // get before append hyphen, if we truncate everything is undone
    246             char c;
    247             boolean firstPass = true;
    248             while(!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)) {
    249                 if (firstPass) {
    250                     addSeparator();
    251                     append(AsciiUtil.toUpper(c));
    252                     firstPass = false;
    253                 } else {
    254                     append(AsciiUtil.toLower(c));
    255                 }
    256             }
    257             --index; // unget
    258 
    259             /* If it's not exactly 4 characters long, then it's not a script. */
    260             if (index - oldIndex != 5) { // +1 to account for separator
    261                 index = oldIndex;
    262                 buffer.delete(oldBlen, buffer.length());
    263             } else {
    264                 oldBlen++; // index past hyphen, for clients who want to extract just the script
    265             }
    266 
    267             return oldBlen;
    268         }
    269         return buffer.length();
    270     }
    271 
    272     /**
    273      * Advance index past script.
    274      * Index must be immediately after the language and IDSeparator.
    275      * If the item at this position is not a script (is not four characters
    276      * long) leave index.  Otherwise index is left at a terminator or
    277      * id separator.
    278      */
    279     private void skipScript() {
    280         if (!atTerminator()) {
    281             int oldIndex = index;
    282             ++index;
    283 
    284             char c;
    285             while (!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c));
    286             --index;
    287 
    288             if (index - oldIndex != 5) { // +1 to account for separator
    289                 index = oldIndex;
    290             }
    291         }
    292     }
    293 
    294     /**
    295      * Advance index past country, and accumulate normalized country in buffer.
    296      * Index must be immediately after the script (if there is one, else language)
    297      * and IDSeparator.  Return the start of the country code in the buffer.
    298      */
    299     private int parseCountry() {
    300         if (!atTerminator()) {
    301             int oldIndex = index;
    302             ++index;
    303 
    304             int oldBlen = buffer.length();
    305             char c;
    306             boolean firstPass = true;
    307             while (!isTerminatorOrIDSeparator(c = next())) {
    308                 if (firstPass) { // first, add hyphen
    309                     hadCountry = true; // we have a country, let variant parsing know
    310                     addSeparator();
    311                     ++oldBlen; // increment past hyphen
    312                     firstPass = false;
    313                 }
    314                 append(AsciiUtil.toUpper(c));
    315             }
    316             --index; // unget
    317 
    318             int charsAppended = buffer.length() - oldBlen;
    319 
    320             if (charsAppended == 0) {
    321                 // Do nothing.
    322             }
    323             else if (charsAppended < 2 || charsAppended > 3) {
    324                 // It's not a country, so return index and blen to
    325                 // their previous values.
    326                 index = oldIndex;
    327                 --oldBlen;
    328                 buffer.delete(oldBlen, buffer.length());
    329                 hadCountry = false;
    330             }
    331             else if (charsAppended == 3) {
    332                 String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen));
    333                 if (region != null) {
    334                     set(oldBlen, region);
    335                 }
    336             }
    337 
    338             return oldBlen;
    339         }
    340 
    341         return buffer.length();
    342     }
    343 
    344     /**
    345      * Advance index past country.
    346      * Index must be immediately after the script (if there is one, else language)
    347      * and IDSeparator.
    348      */
    349     private void skipCountry() {
    350         if (!atTerminator()) {
    351             if (id[index] == UNDERSCORE || id[index] == HYPHEN) {
    352                 ++index;
    353             }
    354             /*
    355              * Save the index point after the separator, since the format
    356              * requires two separators if the country is not present.
    357              */
    358             int oldIndex = index;
    359 
    360             skipUntilTerminatorOrIDSeparator();
    361             int charsSkipped = index - oldIndex;
    362             if (charsSkipped < 2 || charsSkipped > 3) {
    363                 index = oldIndex;
    364             }
    365         }
    366     }
    367 
    368     /**
    369      * Advance index past variant, and accumulate normalized variant in buffer.  This ignores
    370      * the codepage information from POSIX ids.  Index must be immediately after the country
    371      * or script.  Index is left at the keyword separator or at the end of the text.  Return
    372      * the start of the variant code in the buffer.
    373      *
    374      * In standard form, we can have the following forms:
    375      * ll__VVVV
    376      * ll_CC_VVVV
    377      * ll_Ssss_VVVV
    378      * ll_Ssss_CC_VVVV
    379      *
    380      * This also handles POSIX ids, which can have the following forms (pppp is code page id):
    381      * ll_CC.pppp          --> ll_CC
    382      * ll_CC.pppp@VVVV     --> ll_CC_VVVV
    383      * ll_CC@VVVV          --> ll_CC_VVVV
    384      *
    385      * We identify this use of '@' in POSIX ids by looking for an '=' following
    386      * the '@'.  If there is one, we consider '@' to start a keyword list, instead of
    387      * being part of a POSIX id.
    388      *
    389      * Note:  since it was decided that we want an option to not handle POSIX ids, this
    390      * becomes a bit more complex.
    391      */
    392     private int parseVariant() {
    393         int oldBlen = buffer.length();
    394 
    395         boolean start = true;
    396         boolean needSeparator = true;
    397         boolean skipping = false;
    398         char c;
    399         boolean firstPass = true;
    400 
    401         while ((c = next()) != DONE) {
    402             if (c == DOT) {
    403                 start = false;
    404                 skipping = true;
    405             } else if (c == KEYWORD_SEPARATOR) {
    406                 if (haveKeywordAssign()) {
    407                     break;
    408                 }
    409                 skipping = false;
    410                 start = false;
    411                 needSeparator = true; // add another underscore if we have more text
    412             } else if (start) {
    413                 start = false;
    414                 if (c != UNDERSCORE && c != HYPHEN) {
    415                     index--;
    416                 }
    417             } else if (!skipping) {
    418                 if (needSeparator) {
    419                     needSeparator = false;
    420                     if (firstPass && !hadCountry) { // no country, we'll need two
    421                         addSeparator();
    422                         ++oldBlen; // for sure
    423                     }
    424                     addSeparator();
    425                     if (firstPass) { // only for the first separator
    426                         ++oldBlen;
    427                         firstPass = false;
    428                     }
    429                 }
    430                 c = AsciiUtil.toUpper(c);
    431                 if (c == HYPHEN || c == COMMA) {
    432                     c = UNDERSCORE;
    433                 }
    434                 append(c);
    435             }
    436         }
    437         --index; // unget
    438 
    439         return oldBlen;
    440     }
    441 
    442     // no need for skipvariant, to get the keywords we'll just scan directly for
    443     // the keyword separator
    444 
    445     /**
    446      * Returns the normalized language id, or the empty string.
    447      */
    448     public String getLanguage() {
    449         reset();
    450         return getString(parseLanguage());
    451     }
    452 
    453     /**
    454      * Returns the normalized script id, or the empty string.
    455      */
    456     public String getScript() {
    457         reset();
    458         skipLanguage();
    459         return getString(parseScript());
    460     }
    461 
    462     /**
    463      * return the normalized country id, or the empty string.
    464      */
    465     public String getCountry() {
    466         reset();
    467         skipLanguage();
    468         skipScript();
    469         return getString(parseCountry());
    470     }
    471 
    472     /**
    473      * Returns the normalized variant id, or the empty string.
    474      */
    475     public String getVariant() {
    476         reset();
    477         skipLanguage();
    478         skipScript();
    479         skipCountry();
    480         return getString(parseVariant());
    481     }
    482 
    483     /**
    484      * Returns the language, script, country, and variant as separate strings.
    485      */
    486     public String[] getLanguageScriptCountryVariant() {
    487         reset();
    488         return new String[] {
    489                 getString(parseLanguage()),
    490                 getString(parseScript()),
    491                 getString(parseCountry()),
    492                 getString(parseVariant())
    493         };
    494     }
    495 
    496     public void setBaseName(String baseName) {
    497         this.baseName = baseName;
    498     }
    499 
    500     public void parseBaseName() {
    501         if (baseName != null) {
    502             set(0, baseName);
    503         } else {
    504             reset();
    505             parseLanguage();
    506             parseScript();
    507             parseCountry();
    508             parseVariant();
    509 
    510             // catch unwanted trailing underscore after country if there was no variant
    511             int len = buffer.length();
    512             if (len > 0 && buffer.charAt(len - 1) == UNDERSCORE) {
    513                 buffer.deleteCharAt(len - 1);
    514             }
    515         }
    516     }
    517 
    518     /**
    519      * Returns the normalized base form of the locale id.  The base
    520      * form does not include keywords.
    521      */
    522     public String getBaseName() {
    523         if (baseName != null) {
    524             return baseName;
    525         }
    526         parseBaseName();
    527         return getString(0);
    528     }
    529 
    530     /**
    531      * Returns the normalized full form of the locale id.  The full
    532      * form includes keywords if they are present.
    533      */
    534     public String getName() {
    535         parseBaseName();
    536         parseKeywords();
    537         return getString(0);
    538     }
    539 
    540     // keyword utilities
    541 
    542     /**
    543      * If we have keywords, advance index to the start of the keywords and return true,
    544      * otherwise return false.
    545      */
    546     private boolean setToKeywordStart() {
    547         for (int i = index; i < id.length; ++i) {
    548             if (id[i] == KEYWORD_SEPARATOR) {
    549                 if (canonicalize) {
    550                     for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
    551                         if (id[j] == KEYWORD_ASSIGN) {
    552                             index = i;
    553                             return true;
    554                         }
    555                     }
    556                 } else {
    557                     if (++i < id.length) {
    558                         index = i;
    559                         return true;
    560                     }
    561                 }
    562                 break;
    563             }
    564         }
    565         return false;
    566     }
    567 
    568     private static boolean isDoneOrKeywordAssign(char c) {
    569         return c == DONE || c == KEYWORD_ASSIGN;
    570     }
    571 
    572     private static boolean isDoneOrItemSeparator(char c) {
    573         return c == DONE || c == ITEM_SEPARATOR;
    574     }
    575 
    576     private String getKeyword() {
    577         int start = index;
    578         while (!isDoneOrKeywordAssign(next())) {
    579         }
    580         --index;
    581         return AsciiUtil.toLowerString(new String(id, start, index-start).trim());
    582     }
    583 
    584     private String getValue() {
    585         int start = index;
    586         while (!isDoneOrItemSeparator(next())) {
    587         }
    588         --index;
    589         return new String(id, start, index-start).trim(); // leave case alone
    590     }
    591 
    592     private Comparator<String> getKeyComparator() {
    593         final Comparator<String> comp = new Comparator<String>() {
    594             @Override
    595             public int compare(String lhs, String rhs) {
    596                 return lhs.compareTo(rhs);
    597             }
    598         };
    599         return comp;
    600     }
    601 
    602     /**
    603      * Returns a map of the keywords and values, or null if there are none.
    604      */
    605     public Map<String, String> getKeywordMap() {
    606         if (keywords == null) {
    607             TreeMap<String, String> m = null;
    608             if (setToKeywordStart()) {
    609                 // trim spaces and convert to lower case, both keywords and values.
    610                 do {
    611                     String key = getKeyword();
    612                     if (key.length() == 0) {
    613                         break;
    614                     }
    615                     char c = next();
    616                     if (c != KEYWORD_ASSIGN) {
    617                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
    618                         if (c == DONE) {
    619                             break;
    620                         } else {
    621                             continue;
    622                         }
    623                     }
    624                     String value = getValue();
    625                     if (value.length() == 0) {
    626                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
    627                         continue;
    628                     }
    629                     if (m == null) {
    630                         m = new TreeMap<String, String>(getKeyComparator());
    631                     } else if (m.containsKey(key)) {
    632                         // throw new IllegalArgumentException("key '" + key + "' already has a value.");
    633                         continue;
    634                     }
    635                     m.put(key, value);
    636                 } while (next() == ITEM_SEPARATOR);
    637             }
    638             keywords = m != null ? m : Collections.<String, String>emptyMap();
    639         }
    640 
    641         return keywords;
    642     }
    643 
    644 
    645     /**
    646      * Parse the keywords and return start of the string in the buffer.
    647      */
    648     private int parseKeywords() {
    649         int oldBlen = buffer.length();
    650         Map<String, String> m = getKeywordMap();
    651         if (!m.isEmpty()) {
    652             boolean first = true;
    653             for (Map.Entry<String, String> e : m.entrySet()) {
    654                 append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
    655                 first = false;
    656                 append(e.getKey());
    657                 append(KEYWORD_ASSIGN);
    658                 append(e.getValue());
    659             }
    660             if (first == false) {
    661                 ++oldBlen;
    662             }
    663         }
    664         return oldBlen;
    665     }
    666 
    667     /**
    668      * Returns an iterator over the keywords, or null if we have an empty map.
    669      */
    670     public Iterator<String> getKeywords() {
    671         Map<String, String> m = getKeywordMap();
    672         return m.isEmpty() ? null : m.keySet().iterator();
    673     }
    674 
    675     /**
    676      * Returns the value for the named keyword, or null if the keyword is not
    677      * present.
    678      */
    679     public String getKeywordValue(String keywordName) {
    680         Map<String, String> m = getKeywordMap();
    681         return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim()));
    682     }
    683 
    684     /**
    685      * Set the keyword value only if it is not already set to something else.
    686      */
    687     public void defaultKeywordValue(String keywordName, String value) {
    688         setKeywordValue(keywordName, value, false);
    689     }
    690 
    691     /**
    692      * Set the value for the named keyword, or unset it if value is null.  If
    693      * keywordName itself is null, unset all keywords.  If keywordName is not null,
    694      * value must not be null.
    695      */
    696     public void setKeywordValue(String keywordName, String value) {
    697         setKeywordValue(keywordName, value, true);
    698     }
    699 
    700     /**
    701      * Set the value for the named keyword, or unset it if value is null.  If
    702      * keywordName itself is null, unset all keywords.  If keywordName is not null,
    703      * value must not be null.  If reset is true, ignore any previous value for
    704      * the keyword, otherwise do not change the keyword (including removal of
    705      * one or all keywords).
    706      */
    707     private void setKeywordValue(String keywordName, String value, boolean reset) {
    708         if (keywordName == null) {
    709             if (reset) {
    710                 // force new map, ignore value
    711                 keywords = Collections.<String, String>emptyMap();
    712             }
    713         } else {
    714             keywordName = AsciiUtil.toLowerString(keywordName.trim());
    715             if (keywordName.length() == 0) {
    716                 throw new IllegalArgumentException("keyword must not be empty");
    717             }
    718             if (value != null) {
    719                 value = value.trim();
    720                 if (value.length() == 0) {
    721                     throw new IllegalArgumentException("value must not be empty");
    722                 }
    723             }
    724             Map<String, String> m = getKeywordMap();
    725             if (m.isEmpty()) { // it is EMPTY_MAP
    726                 if (value != null) {
    727                     // force new map
    728                     keywords = new TreeMap<String, String>(getKeyComparator());
    729                     keywords.put(keywordName, value.trim());
    730                 }
    731             } else {
    732                 if (reset || !m.containsKey(keywordName)) {
    733                     if (value != null) {
    734                         m.put(keywordName, value);
    735                     } else {
    736                         m.remove(keywordName);
    737                         if (m.isEmpty()) {
    738                             // force new map
    739                             keywords = Collections.<String, String>emptyMap();
    740                         }
    741                     }
    742                 }
    743             }
    744         }
    745     }
    746 }
    747