Home | History | Annotate | Download | only in impl
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 ******************************************************************************
      6 * Copyright (C) 2003-2011, International Business Machines Corporation and   *
      7 * others. All Rights Reserved.                                               *
      8 ******************************************************************************
      9 */
     10 
     11 package android.icu.impl;
     12 
     13 import java.util.Collections;
     14 import java.util.Comparator;
     15 import java.util.Iterator;
     16 import java.util.Map;
     17 import java.util.TreeMap;
     18 
     19 import android.icu.impl.locale.AsciiUtil;
     20 
     21 /**
     22  * Utility class to parse and normalize locale ids (including POSIX style)
     23  * @hide Only a subset of ICU is exposed in Android
     24  */
     25 public final class LocaleIDParser {
     26 
     27     /**
     28      * Char array representing the locale ID.
     29      */
     30     private char[] id;
     31 
     32     /**
     33      * Current position in {@link #id} (while parsing).
     34      */
     35     private int index;
     36 
     37     /**
     38      * Temporary buffer for parsed sections of data.
     39      */
     40     private StringBuilder buffer;
     41 
     42     // um, don't handle POSIX ids unless we request it.  why not?  well... because.
     43     private boolean canonicalize;
     44     private boolean hadCountry;
     45 
     46     // used when canonicalizing
     47     Map<String, String> keywords;
     48     String baseName;
     49 
     50     /**
     51      * Parsing constants.
     52      */
     53     private static final char KEYWORD_SEPARATOR     = '@';
     54     private static final char HYPHEN                = '-';
     55     private static final char KEYWORD_ASSIGN        = '=';
     56     private static final char COMMA                 = ',';
     57     private static final char ITEM_SEPARATOR        = ';';
     58     private static final char DOT                   = '.';
     59     private static final char UNDERSCORE            = '_';
     60 
     61     public LocaleIDParser(String localeID) {
     62         this(localeID, false);
     63     }
     64 
     65     public LocaleIDParser(String localeID, boolean canonicalize) {
     66         id = localeID.toCharArray();
     67         index = 0;
     68         buffer = new StringBuilder(id.length + 5);
     69         this.canonicalize = canonicalize;
     70     }
     71 
     72     private void reset() {
     73         index = 0;
     74         buffer = new StringBuilder(id.length + 5);
     75     }
     76 
     77     // utilities for working on text in the buffer
     78 
     79     /**
     80      * Append c to the buffer.
     81      */
     82     private void append(char c) {
     83         buffer.append(c);
     84     }
     85 
     86     private void addSeparator() {
     87         append(UNDERSCORE);
     88     }
     89 
     90     /**
     91      * Returns the text in the buffer from start to blen as a String.
     92      */
     93     private String getString(int start) {
     94         return buffer.substring(start);
     95     }
     96 
     97     /**
     98      * Set the length of the buffer to pos, then append the string.
     99      */
    100     private void set(int pos, String s) {
    101         buffer.delete(pos, buffer.length());
    102         buffer.insert(pos, s);
    103     }
    104 
    105     /**
    106      * Append the string to the buffer.
    107      */
    108     private void append(String s) {
    109         buffer.append(s);
    110     }
    111 
    112     // utilities for parsing text out of the id
    113 
    114     /**
    115      * Character to indicate no more text is available in the id.
    116      */
    117     private static final char DONE = '\uffff';
    118 
    119     /**
    120      * Returns the character at index in the id, and advance index.  The returned character
    121      * is DONE if index was at the limit of the buffer.  The index is advanced regardless
    122      * so that decrementing the index will always 'unget' the last character returned.
    123      */
    124     private char next() {
    125         if (index == id.length) {
    126             index++;
    127             return DONE;
    128         }
    129 
    130         return id[index++];
    131     }
    132 
    133     /**
    134      * Advance index until the next terminator or id separator, and leave it there.
    135      */
    136     private void skipUntilTerminatorOrIDSeparator() {
    137         while (!isTerminatorOrIDSeparator(next()));
    138         --index;
    139     }
    140 
    141     /**
    142      * Returns true if the character at index in the id is a terminator.
    143      */
    144     private boolean atTerminator() {
    145         return index >= id.length || isTerminator(id[index]);
    146     }
    147 
    148     /**
    149      * Returns true if the character is a terminator (keyword separator, dot, or DONE).
    150      * Dot is a terminator because of the POSIX form, where dot precedes the codepage.
    151      */
    152     private boolean isTerminator(char c) {
    153         // always terminate at DOT, even if not handling POSIX.  It's an error...
    154         return c == KEYWORD_SEPARATOR || c == DONE || c == DOT;
    155     }
    156 
    157     /**
    158      * Returns true if the character is a terminator or id separator.
    159      */
    160     private boolean isTerminatorOrIDSeparator(char c) {
    161         return c == UNDERSCORE || c == HYPHEN || isTerminator(c);
    162     }
    163 
    164     /**
    165      * Returns true if the start of the buffer has an experimental or private language
    166      * prefix, the pattern '[ixIX][-_].' shows the syntax checked.
    167      */
    168     private boolean haveExperimentalLanguagePrefix() {
    169         if (id.length > 2) {
    170             char c = id[1];
    171             if (c == HYPHEN || c == UNDERSCORE) {
    172                 c = id[0];
    173                 return c == 'x' || c == 'X' || c == 'i' || c == 'I';
    174             }
    175         }
    176         return false;
    177     }
    178 
    179     /**
    180      * Returns true if a value separator occurs at or after index.
    181      */
    182     private boolean haveKeywordAssign() {
    183         // assume it is safe to start from index
    184         for (int i = index; i < id.length; ++i) {
    185             if (id[i] == KEYWORD_ASSIGN) {
    186                 return true;
    187             }
    188         }
    189         return false;
    190     }
    191 
    192     /**
    193      * Advance index past language, and accumulate normalized language code in buffer.
    194      * Index must be at 0 when this is called.  Index is left at a terminator or id
    195      * separator.  Returns the start of the language code in the buffer.
    196      */
    197     private int parseLanguage() {
    198         int startLength = buffer.length();
    199 
    200         if (haveExperimentalLanguagePrefix()) {
    201             append(AsciiUtil.toLower(id[0]));
    202             append(HYPHEN);
    203             index = 2;
    204         }
    205 
    206         char c;
    207         while(!isTerminatorOrIDSeparator(c = next())) {
    208             append(AsciiUtil.toLower(c));
    209         }
    210         --index; // unget
    211 
    212         if (buffer.length() - startLength == 3) {
    213             String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0));
    214             if (lang != null) {
    215                 set(0, lang);
    216             }
    217         }
    218 
    219         return 0;
    220     }
    221 
    222     /**
    223      * Advance index past language.  Index must be at 0 when this is called.  Index
    224      * is left at a terminator or id separator.
    225      */
    226     private void skipLanguage() {
    227         if (haveExperimentalLanguagePrefix()) {
    228             index = 2;
    229         }
    230         skipUntilTerminatorOrIDSeparator();
    231     }
    232 
    233     /**
    234      * Advance index past script, and accumulate normalized script in buffer.
    235      * Index must be immediately after the language.
    236      * If the item at this position is not a script (is not four characters
    237      * long) leave index and buffer unchanged.  Otherwise index is left at
    238      * a terminator or id separator.  Returns the start of the script code
    239      * in the buffer (this may be equal to the buffer length, if there is no
    240      * script).
    241      */
    242     private int parseScript() {
    243         if (!atTerminator()) {
    244             int oldIndex = index; // save original index
    245             ++index;
    246 
    247             int oldBlen = buffer.length(); // get before append hyphen, if we truncate everything is undone
    248             char c;
    249             boolean firstPass = true;
    250             while(!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)) {
    251                 if (firstPass) {
    252                     addSeparator();
    253                     append(AsciiUtil.toUpper(c));
    254                     firstPass = false;
    255                 } else {
    256                     append(AsciiUtil.toLower(c));
    257                 }
    258             }
    259             --index; // unget
    260 
    261             /* If it's not exactly 4 characters long, then it's not a script. */
    262             if (index - oldIndex != 5) { // +1 to account for separator
    263                 index = oldIndex;
    264                 buffer.delete(oldBlen, buffer.length());
    265             } else {
    266                 oldBlen++; // index past hyphen, for clients who want to extract just the script
    267             }
    268 
    269             return oldBlen;
    270         }
    271         return buffer.length();
    272     }
    273 
    274     /**
    275      * Advance index past script.
    276      * Index must be immediately after the language and IDSeparator.
    277      * If the item at this position is not a script (is not four characters
    278      * long) leave index.  Otherwise index is left at a terminator or
    279      * id separator.
    280      */
    281     private void skipScript() {
    282         if (!atTerminator()) {
    283             int oldIndex = index;
    284             ++index;
    285 
    286             char c;
    287             while (!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c));
    288             --index;
    289 
    290             if (index - oldIndex != 5) { // +1 to account for separator
    291                 index = oldIndex;
    292             }
    293         }
    294     }
    295 
    296     /**
    297      * Advance index past country, and accumulate normalized country in buffer.
    298      * Index must be immediately after the script (if there is one, else language)
    299      * and IDSeparator.  Return the start of the country code in the buffer.
    300      */
    301     private int parseCountry() {
    302         if (!atTerminator()) {
    303             int oldIndex = index;
    304             ++index;
    305 
    306             int oldBlen = buffer.length();
    307             char c;
    308             boolean firstPass = true;
    309             while (!isTerminatorOrIDSeparator(c = next())) {
    310                 if (firstPass) { // first, add hyphen
    311                     hadCountry = true; // we have a country, let variant parsing know
    312                     addSeparator();
    313                     ++oldBlen; // increment past hyphen
    314                     firstPass = false;
    315                 }
    316                 append(AsciiUtil.toUpper(c));
    317             }
    318             --index; // unget
    319 
    320             int charsAppended = buffer.length() - oldBlen;
    321 
    322             if (charsAppended == 0) {
    323                 // Do nothing.
    324             }
    325             else if (charsAppended < 2 || charsAppended > 3) {
    326                 // It's not a country, so return index and blen to
    327                 // their previous values.
    328                 index = oldIndex;
    329                 --oldBlen;
    330                 buffer.delete(oldBlen, buffer.length());
    331                 hadCountry = false;
    332             }
    333             else if (charsAppended == 3) {
    334                 String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen));
    335                 if (region != null) {
    336                     set(oldBlen, region);
    337                 }
    338             }
    339 
    340             return oldBlen;
    341         }
    342 
    343         return buffer.length();
    344     }
    345 
    346     /**
    347      * Advance index past country.
    348      * Index must be immediately after the script (if there is one, else language)
    349      * and IDSeparator.
    350      */
    351     private void skipCountry() {
    352         if (!atTerminator()) {
    353             if (id[index] == UNDERSCORE || id[index] == HYPHEN) {
    354                 ++index;
    355             }
    356             /*
    357              * Save the index point after the separator, since the format
    358              * requires two separators if the country is not present.
    359              */
    360             int oldIndex = index;
    361 
    362             skipUntilTerminatorOrIDSeparator();
    363             int charsSkipped = index - oldIndex;
    364             if (charsSkipped < 2 || charsSkipped > 3) {
    365                 index = oldIndex;
    366             }
    367         }
    368     }
    369 
    370     /**
    371      * Advance index past variant, and accumulate normalized variant in buffer.  This ignores
    372      * the codepage information from POSIX ids.  Index must be immediately after the country
    373      * or script.  Index is left at the keyword separator or at the end of the text.  Return
    374      * the start of the variant code in the buffer.
    375      *
    376      * In standard form, we can have the following forms:
    377      * ll__VVVV
    378      * ll_CC_VVVV
    379      * ll_Ssss_VVVV
    380      * ll_Ssss_CC_VVVV
    381      *
    382      * This also handles POSIX ids, which can have the following forms (pppp is code page id):
    383      * ll_CC.pppp          --> ll_CC
    384      * ll_CC.pppp@VVVV     --> ll_CC_VVVV
    385      * ll_CC@VVVV          --> ll_CC_VVVV
    386      *
    387      * We identify this use of '@' in POSIX ids by looking for an '=' following
    388      * the '@'.  If there is one, we consider '@' to start a keyword list, instead of
    389      * being part of a POSIX id.
    390      *
    391      * Note:  since it was decided that we want an option to not handle POSIX ids, this
    392      * becomes a bit more complex.
    393      */
    394     private int parseVariant() {
    395         int oldBlen = buffer.length();
    396 
    397         boolean start = true;
    398         boolean needSeparator = true;
    399         boolean skipping = false;
    400         char c;
    401         boolean firstPass = true;
    402 
    403         while ((c = next()) != DONE) {
    404             if (c == DOT) {
    405                 start = false;
    406                 skipping = true;
    407             } else if (c == KEYWORD_SEPARATOR) {
    408                 if (haveKeywordAssign()) {
    409                     break;
    410                 }
    411                 skipping = false;
    412                 start = false;
    413                 needSeparator = true; // add another underscore if we have more text
    414             } else if (start) {
    415                 start = false;
    416                 if (c != UNDERSCORE && c != HYPHEN) {
    417                     index--;
    418                 }
    419             } else if (!skipping) {
    420                 if (needSeparator) {
    421                     needSeparator = false;
    422                     if (firstPass && !hadCountry) { // no country, we'll need two
    423                         addSeparator();
    424                         ++oldBlen; // for sure
    425                     }
    426                     addSeparator();
    427                     if (firstPass) { // only for the first separator
    428                         ++oldBlen;
    429                         firstPass = false;
    430                     }
    431                 }
    432                 c = AsciiUtil.toUpper(c);
    433                 if (c == HYPHEN || c == COMMA) {
    434                     c = UNDERSCORE;
    435                 }
    436                 append(c);
    437             }
    438         }
    439         --index; // unget
    440 
    441         return oldBlen;
    442     }
    443 
    444     // no need for skipvariant, to get the keywords we'll just scan directly for
    445     // the keyword separator
    446 
    447     /**
    448      * Returns the normalized language id, or the empty string.
    449      */
    450     public String getLanguage() {
    451         reset();
    452         return getString(parseLanguage());
    453     }
    454 
    455     /**
    456      * Returns the normalized script id, or the empty string.
    457      */
    458     public String getScript() {
    459         reset();
    460         skipLanguage();
    461         return getString(parseScript());
    462     }
    463 
    464     /**
    465      * return the normalized country id, or the empty string.
    466      */
    467     public String getCountry() {
    468         reset();
    469         skipLanguage();
    470         skipScript();
    471         return getString(parseCountry());
    472     }
    473 
    474     /**
    475      * Returns the normalized variant id, or the empty string.
    476      */
    477     public String getVariant() {
    478         reset();
    479         skipLanguage();
    480         skipScript();
    481         skipCountry();
    482         return getString(parseVariant());
    483     }
    484 
    485     /**
    486      * Returns the language, script, country, and variant as separate strings.
    487      */
    488     public String[] getLanguageScriptCountryVariant() {
    489         reset();
    490         return new String[] {
    491                 getString(parseLanguage()),
    492                 getString(parseScript()),
    493                 getString(parseCountry()),
    494                 getString(parseVariant())
    495         };
    496     }
    497 
    498     public void setBaseName(String baseName) {
    499         this.baseName = baseName;
    500     }
    501 
    502     public void parseBaseName() {
    503         if (baseName != null) {
    504             set(0, baseName);
    505         } else {
    506             reset();
    507             parseLanguage();
    508             parseScript();
    509             parseCountry();
    510             parseVariant();
    511 
    512             // catch unwanted trailing underscore after country if there was no variant
    513             int len = buffer.length();
    514             if (len > 0 && buffer.charAt(len - 1) == UNDERSCORE) {
    515                 buffer.deleteCharAt(len - 1);
    516             }
    517         }
    518     }
    519 
    520     /**
    521      * Returns the normalized base form of the locale id.  The base
    522      * form does not include keywords.
    523      */
    524     public String getBaseName() {
    525         if (baseName != null) {
    526             return baseName;
    527         }
    528         parseBaseName();
    529         return getString(0);
    530     }
    531 
    532     /**
    533      * Returns the normalized full form of the locale id.  The full
    534      * form includes keywords if they are present.
    535      */
    536     public String getName() {
    537         parseBaseName();
    538         parseKeywords();
    539         return getString(0);
    540     }
    541 
    542     // keyword utilities
    543 
    544     /**
    545      * If we have keywords, advance index to the start of the keywords and return true,
    546      * otherwise return false.
    547      */
    548     private boolean setToKeywordStart() {
    549         for (int i = index; i < id.length; ++i) {
    550             if (id[i] == KEYWORD_SEPARATOR) {
    551                 if (canonicalize) {
    552                     for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
    553                         if (id[j] == KEYWORD_ASSIGN) {
    554                             index = i;
    555                             return true;
    556                         }
    557                     }
    558                 } else {
    559                     if (++i < id.length) {
    560                         index = i;
    561                         return true;
    562                     }
    563                 }
    564                 break;
    565             }
    566         }
    567         return false;
    568     }
    569 
    570     private static boolean isDoneOrKeywordAssign(char c) {
    571         return c == DONE || c == KEYWORD_ASSIGN;
    572     }
    573 
    574     private static boolean isDoneOrItemSeparator(char c) {
    575         return c == DONE || c == ITEM_SEPARATOR;
    576     }
    577 
    578     private String getKeyword() {
    579         int start = index;
    580         while (!isDoneOrKeywordAssign(next())) {
    581         }
    582         --index;
    583         return AsciiUtil.toLowerString(new String(id, start, index-start).trim());
    584     }
    585 
    586     private String getValue() {
    587         int start = index;
    588         while (!isDoneOrItemSeparator(next())) {
    589         }
    590         --index;
    591         return new String(id, start, index-start).trim(); // leave case alone
    592     }
    593 
    594     private Comparator<String> getKeyComparator() {
    595         final Comparator<String> comp = new Comparator<String>() {
    596             @Override
    597             public int compare(String lhs, String rhs) {
    598                 return lhs.compareTo(rhs);
    599             }
    600         };
    601         return comp;
    602     }
    603 
    604     /**
    605      * Returns a map of the keywords and values, or null if there are none.
    606      */
    607     public Map<String, String> getKeywordMap() {
    608         if (keywords == null) {
    609             TreeMap<String, String> m = null;
    610             if (setToKeywordStart()) {
    611                 // trim spaces and convert to lower case, both keywords and values.
    612                 do {
    613                     String key = getKeyword();
    614                     if (key.length() == 0) {
    615                         break;
    616                     }
    617                     char c = next();
    618                     if (c != KEYWORD_ASSIGN) {
    619                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
    620                         if (c == DONE) {
    621                             break;
    622                         } else {
    623                             continue;
    624                         }
    625                     }
    626                     String value = getValue();
    627                     if (value.length() == 0) {
    628                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
    629                         continue;
    630                     }
    631                     if (m == null) {
    632                         m = new TreeMap<String, String>(getKeyComparator());
    633                     } else if (m.containsKey(key)) {
    634                         // throw new IllegalArgumentException("key '" + key + "' already has a value.");
    635                         continue;
    636                     }
    637                     m.put(key, value);
    638                 } while (next() == ITEM_SEPARATOR);
    639             }
    640             keywords = m != null ? m : Collections.<String, String>emptyMap();
    641         }
    642 
    643         return keywords;
    644     }
    645 
    646 
    647     /**
    648      * Parse the keywords and return start of the string in the buffer.
    649      */
    650     private int parseKeywords() {
    651         int oldBlen = buffer.length();
    652         Map<String, String> m = getKeywordMap();
    653         if (!m.isEmpty()) {
    654             boolean first = true;
    655             for (Map.Entry<String, String> e : m.entrySet()) {
    656                 append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
    657                 first = false;
    658                 append(e.getKey());
    659                 append(KEYWORD_ASSIGN);
    660                 append(e.getValue());
    661             }
    662             if (first == false) {
    663                 ++oldBlen;
    664             }
    665         }
    666         return oldBlen;
    667     }
    668 
    669     /**
    670      * Returns an iterator over the keywords, or null if we have an empty map.
    671      */
    672     public Iterator<String> getKeywords() {
    673         Map<String, String> m = getKeywordMap();
    674         return m.isEmpty() ? null : m.keySet().iterator();
    675     }
    676 
    677     /**
    678      * Returns the value for the named keyword, or null if the keyword is not
    679      * present.
    680      */
    681     public String getKeywordValue(String keywordName) {
    682         Map<String, String> m = getKeywordMap();
    683         return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim()));
    684     }
    685 
    686     /**
    687      * Set the keyword value only if it is not already set to something else.
    688      */
    689     public void defaultKeywordValue(String keywordName, String value) {
    690         setKeywordValue(keywordName, value, false);
    691     }
    692 
    693     /**
    694      * Set the value for the named keyword, or unset it if value is null.  If
    695      * keywordName itself is null, unset all keywords.  If keywordName is not null,
    696      * value must not be null.
    697      */
    698     public void setKeywordValue(String keywordName, String value) {
    699         setKeywordValue(keywordName, value, true);
    700     }
    701 
    702     /**
    703      * Set the value for the named keyword, or unset it if value is null.  If
    704      * keywordName itself is null, unset all keywords.  If keywordName is not null,
    705      * value must not be null.  If reset is true, ignore any previous value for
    706      * the keyword, otherwise do not change the keyword (including removal of
    707      * one or all keywords).
    708      */
    709     private void setKeywordValue(String keywordName, String value, boolean reset) {
    710         if (keywordName == null) {
    711             if (reset) {
    712                 // force new map, ignore value
    713                 keywords = Collections.<String, String>emptyMap();
    714             }
    715         } else {
    716             keywordName = AsciiUtil.toLowerString(keywordName.trim());
    717             if (keywordName.length() == 0) {
    718                 throw new IllegalArgumentException("keyword must not be empty");
    719             }
    720             if (value != null) {
    721                 value = value.trim();
    722                 if (value.length() == 0) {
    723                     throw new IllegalArgumentException("value must not be empty");
    724                 }
    725             }
    726             Map<String, String> m = getKeywordMap();
    727             if (m.isEmpty()) { // it is EMPTY_MAP
    728                 if (value != null) {
    729                     // force new map
    730                     keywords = new TreeMap<String, String>(getKeyComparator());
    731                     keywords.put(keywordName, value.trim());
    732                 }
    733             } else {
    734                 if (reset || !m.containsKey(keywordName)) {
    735                     if (value != null) {
    736                         m.put(keywordName, value);
    737                     } else {
    738                         m.remove(keywordName);
    739                         if (m.isEmpty()) {
    740                             // force new map
    741                             keywords = Collections.<String, String>emptyMap();
    742                         }
    743                     }
    744                 }
    745             }
    746         }
    747     }
    748 }
    749