Home | History | Annotate | Download | only in coll
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 *******************************************************************************
      6 * Copyright (C) 2013-2015, International Business Machines
      7 * Corporation and others.  All Rights Reserved.
      8 *******************************************************************************
      9 * CollationRuleParser.java, ported from collationruleparser.h/.cpp
     10 *
     11 * C++ version created on: 2013apr10
     12 * created by: Markus W. Scherer
     13 */
     14 
     15 package android.icu.impl.coll;
     16 
     17 import java.text.ParseException;
     18 import java.util.ArrayList;
     19 
     20 import android.icu.impl.IllegalIcuArgumentException;
     21 import android.icu.impl.PatternProps;
     22 import android.icu.lang.UCharacter;
     23 import android.icu.lang.UProperty;
     24 import android.icu.text.Collator;
     25 import android.icu.text.Normalizer2;
     26 import android.icu.text.UTF16;
     27 import android.icu.text.UnicodeSet;
     28 import android.icu.util.ULocale;
     29 
     30 /**
     31  * @hide Only a subset of ICU is exposed in Android
     32  */
     33 public final class CollationRuleParser {
     34     /** Special reset positions. */
     35     enum Position {
     36         FIRST_TERTIARY_IGNORABLE,
     37         LAST_TERTIARY_IGNORABLE,
     38         FIRST_SECONDARY_IGNORABLE,
     39         LAST_SECONDARY_IGNORABLE,
     40         FIRST_PRIMARY_IGNORABLE,
     41         LAST_PRIMARY_IGNORABLE,
     42         FIRST_VARIABLE,
     43         LAST_VARIABLE,
     44         FIRST_REGULAR,
     45         LAST_REGULAR,
     46         FIRST_IMPLICIT,
     47         LAST_IMPLICIT,
     48         FIRST_TRAILING,
     49         LAST_TRAILING
     50     }
     51     static final Position[] POSITION_VALUES = Position.values();
     52 
     53     /**
     54      * First character of contractions that encode special reset positions.
     55      * U+FFFE cannot be tailored via rule syntax.
     56      *
     57      * The second contraction character is POS_BASE + Position.
     58      */
     59     static final char POS_LEAD = 0xfffe;
     60     /**
     61      * Base for the second character of contractions that encode special reset positions.
     62      * Braille characters U+28xx are printable and normalization-inert.
     63      * @see POS_LEAD
     64      */
     65     static final char POS_BASE = 0x2800;
     66 
     67     static abstract class Sink {
     68         /**
     69          * Adds a reset.
     70          * strength=UCOL_IDENTICAL for &str.
     71          * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
     72          */
     73         abstract void addReset(int strength, CharSequence str);
     74         /**
     75          * Adds a relation with strength and prefix | str / extension.
     76          */
     77         abstract void addRelation(int strength, CharSequence prefix,
     78                 CharSequence str, CharSequence extension);
     79 
     80         void suppressContractions(UnicodeSet set) {}
     81 
     82         void optimize(UnicodeSet set) {}
     83     }
     84 
     85     interface Importer {
     86         String getRules(String localeID, String collationType);
     87     }
     88 
     89     /**
     90      * Constructor.
     91      * The Sink must be set before parsing.
     92      * The Importer can be set, otherwise [import locale] syntax is not supported.
     93      */
     94     CollationRuleParser(CollationData base) {
     95         baseData = base;
     96     }
     97 
     98     /**
     99      * Sets the pointer to a Sink object.
    100      * The pointer is aliased: Pointer copy without cloning or taking ownership.
    101      */
    102     void setSink(Sink sinkAlias) {
    103         sink = sinkAlias;
    104     }
    105 
    106     /**
    107      * Sets the pointer to an Importer object.
    108      * The pointer is aliased: Pointer copy without cloning or taking ownership.
    109      */
    110     void setImporter(Importer importerAlias) {
    111         importer = importerAlias;
    112     }
    113 
    114     void parse(String ruleString, CollationSettings outSettings) throws ParseException {
    115         settings = outSettings;
    116         parse(ruleString);
    117     }
    118 
    119     private static final int UCOL_DEFAULT = -1;
    120     private static final int UCOL_OFF = 0;
    121     private static final int UCOL_ON = 1;
    122 
    123     /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
    124     private static final int STRENGTH_MASK = 0xf;
    125     private static final int STARRED_FLAG = 0x10;
    126     private static final int OFFSET_SHIFT = 8;
    127 
    128     private static final String BEFORE = "[before";
    129 
    130     // In C++, we parse into temporary UnicodeString objects named "raw" or "str".
    131     // In Java, we reuse this StringBuilder.
    132     private final StringBuilder rawBuilder = new StringBuilder();
    133 
    134     private void parse(String ruleString) throws ParseException {
    135         rules = ruleString;
    136         ruleIndex = 0;
    137 
    138         while(ruleIndex < rules.length()) {
    139             char c = rules.charAt(ruleIndex);
    140             if(PatternProps.isWhiteSpace(c)) {
    141                 ++ruleIndex;
    142                 continue;
    143             }
    144             switch(c) {
    145             case 0x26:  // '&'
    146                 parseRuleChain();
    147                 break;
    148             case 0x5b:  // '['
    149                 parseSetting();
    150                 break;
    151             case 0x23:  // '#' starts a comment, until the end of the line
    152                 ruleIndex = skipComment(ruleIndex + 1);
    153                 break;
    154             case 0x40:  // '@' is equivalent to [backwards 2]
    155                 settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
    156                 ++ruleIndex;
    157                 break;
    158             case 0x21:  // '!' used to turn on Thai/Lao character reversal
    159                 // Accept but ignore. The root collator has contractions
    160                 // that are equivalent to the character reversal, where appropriate.
    161                 ++ruleIndex;
    162                 break;
    163             default:
    164                 setParseError("expected a reset or setting or comment");
    165                 break;
    166             }
    167         }
    168     }
    169 
    170     private void parseRuleChain() throws ParseException {
    171         int resetStrength = parseResetAndPosition();
    172         boolean isFirstRelation = true;
    173         for(;;) {
    174             int result = parseRelationOperator();
    175             if(result < 0) {
    176                 if(ruleIndex < rules.length() && rules.charAt(ruleIndex) == 0x23) {
    177                     // '#' starts a comment, until the end of the line
    178                     ruleIndex = skipComment(ruleIndex + 1);
    179                     continue;
    180                 }
    181                 if(isFirstRelation) {
    182                     setParseError("reset not followed by a relation");
    183                 }
    184                 return;
    185             }
    186             int strength = result & STRENGTH_MASK;
    187             if(resetStrength < Collator.IDENTICAL) {
    188                 // reset-before rule chain
    189                 if(isFirstRelation) {
    190                     if(strength != resetStrength) {
    191                         setParseError("reset-before strength differs from its first relation");
    192                         return;
    193                     }
    194                 } else {
    195                     if(strength < resetStrength) {
    196                         setParseError("reset-before strength followed by a stronger relation");
    197                         return;
    198                     }
    199                 }
    200             }
    201             int i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
    202             if((result & STARRED_FLAG) == 0) {
    203                 parseRelationStrings(strength, i);
    204             } else {
    205                 parseStarredCharacters(strength, i);
    206             }
    207             isFirstRelation = false;
    208         }
    209     }
    210 
    211     private int parseResetAndPosition() throws ParseException {
    212         int i = skipWhiteSpace(ruleIndex + 1);
    213         int j;
    214         char c;
    215         int resetStrength;
    216         if(rules.regionMatches(i, BEFORE, 0, BEFORE.length()) &&
    217                 (j = i + BEFORE.length()) < rules.length() &&
    218                 PatternProps.isWhiteSpace(rules.charAt(j)) &&
    219                 ((j = skipWhiteSpace(j + 1)) + 1) < rules.length() &&
    220                 0x31 <= (c = rules.charAt(j)) && c <= 0x33 &&
    221                 rules.charAt(j + 1) == 0x5d) {
    222             // &[before n] with n=1 or 2 or 3
    223             resetStrength = Collator.PRIMARY + (c - 0x31);
    224             i = skipWhiteSpace(j + 2);
    225         } else {
    226             resetStrength = Collator.IDENTICAL;
    227         }
    228         if(i >= rules.length()) {
    229             setParseError("reset without position");
    230             return UCOL_DEFAULT;
    231         }
    232         if(rules.charAt(i) == 0x5b) {  // '['
    233             i = parseSpecialPosition(i, rawBuilder);
    234         } else {
    235             i = parseTailoringString(i, rawBuilder);
    236         }
    237         try {
    238             sink.addReset(resetStrength, rawBuilder);
    239         } catch(Exception e) {
    240             setParseError("adding reset failed", e);
    241             return UCOL_DEFAULT;
    242         }
    243         ruleIndex = i;
    244         return resetStrength;
    245     }
    246 
    247     private int parseRelationOperator() {
    248         ruleIndex = skipWhiteSpace(ruleIndex);
    249         if(ruleIndex >= rules.length()) { return UCOL_DEFAULT; }
    250         int strength;
    251         int i = ruleIndex;
    252         char c = rules.charAt(i++);
    253         switch(c) {
    254         case 0x3c:  // '<'
    255             if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<
    256                 ++i;
    257                 if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<
    258                     ++i;
    259                     if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<<
    260                         ++i;
    261                         strength = Collator.QUATERNARY;
    262                     } else {
    263                         strength = Collator.TERTIARY;
    264                     }
    265                 } else {
    266                     strength = Collator.SECONDARY;
    267                 }
    268             } else {
    269                 strength = Collator.PRIMARY;
    270             }
    271             if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
    272                 ++i;
    273                 strength |= STARRED_FLAG;
    274             }
    275             break;
    276         case 0x3b:  // ';' same as <<
    277             strength = Collator.SECONDARY;
    278             break;
    279         case 0x2c:  // ',' same as <<<
    280             strength = Collator.TERTIARY;
    281             break;
    282         case 0x3d:  // '='
    283             strength = Collator.IDENTICAL;
    284             if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
    285                 ++i;
    286                 strength |= STARRED_FLAG;
    287             }
    288             break;
    289         default:
    290             return UCOL_DEFAULT;
    291         }
    292         return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
    293     }
    294 
    295     private void parseRelationStrings(int strength, int i) throws ParseException {
    296         // Parse
    297         //     prefix | str / extension
    298         // where prefix and extension are optional.
    299         String prefix = "";
    300         CharSequence extension = "";
    301         i = parseTailoringString(i, rawBuilder);
    302         char next = (i < rules.length()) ? rules.charAt(i) : 0;
    303         if(next == 0x7c) {  // '|' separates the context prefix from the string.
    304             prefix = rawBuilder.toString();
    305             i = parseTailoringString(i + 1, rawBuilder);
    306             next = (i < rules.length()) ? rules.charAt(i) : 0;
    307         }
    308         // str = rawBuilder (do not modify rawBuilder any more in this function)
    309         if(next == 0x2f) {  // '/' separates the string from the extension.
    310             StringBuilder extBuilder = new StringBuilder();
    311             i = parseTailoringString(i + 1, extBuilder);
    312             extension = extBuilder;
    313         }
    314         if(prefix.length() != 0) {
    315             int prefix0 = prefix.codePointAt(0);
    316             int c = rawBuilder.codePointAt(0);
    317             if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
    318                 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary");
    319                 return;
    320             }
    321         }
    322         try {
    323             sink.addRelation(strength, prefix, rawBuilder, extension);
    324         } catch(Exception e) {
    325             setParseError("adding relation failed", e);
    326             return;
    327         }
    328         ruleIndex = i;
    329     }
    330 
    331     private void parseStarredCharacters(int strength, int i) throws ParseException {
    332         String empty = "";
    333         i = parseString(skipWhiteSpace(i), rawBuilder);
    334         if(rawBuilder.length() == 0) {
    335             setParseError("missing starred-relation string");
    336             return;
    337         }
    338         int prev = -1;
    339         int j = 0;
    340         for(;;) {
    341             while(j < rawBuilder.length()) {
    342                 int c = rawBuilder.codePointAt(j);
    343                 if(!nfd.isInert(c)) {
    344                     setParseError("starred-relation string is not all NFD-inert");
    345                     return;
    346                 }
    347                 try {
    348                     sink.addRelation(strength, empty, UTF16.valueOf(c), empty);
    349                 } catch(Exception e) {
    350                     setParseError("adding relation failed", e);
    351                     return;
    352                 }
    353                 j += Character.charCount(c);
    354                 prev = c;
    355             }
    356             if(i >= rules.length() || rules.charAt(i) != 0x2d) {  // '-'
    357                 break;
    358             }
    359             if(prev < 0) {
    360                 setParseError("range without start in starred-relation string");
    361                 return;
    362             }
    363             i = parseString(i + 1, rawBuilder);
    364             if(rawBuilder.length() == 0) {
    365                 setParseError("range without end in starred-relation string");
    366                 return;
    367             }
    368             int c = rawBuilder.codePointAt(0);
    369             if(c < prev) {
    370                 setParseError("range start greater than end in starred-relation string");
    371                 return;
    372             }
    373             // range prev-c
    374             while(++prev <= c) {
    375                 if(!nfd.isInert(prev)) {
    376                     setParseError("starred-relation string range is not all NFD-inert");
    377                     return;
    378                 }
    379                 if(isSurrogate(prev)) {
    380                     setParseError("starred-relation string range contains a surrogate");
    381                     return;
    382                 }
    383                 if(0xfffd <= prev && prev <= 0xffff) {
    384                     setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF");
    385                     return;
    386                 }
    387                 try {
    388                     sink.addRelation(strength, empty, UTF16.valueOf(prev), empty);
    389                 } catch(Exception e) {
    390                     setParseError("adding relation failed", e);
    391                     return;
    392                 }
    393             }
    394             prev = -1;
    395             j = Character.charCount(c);
    396         }
    397         ruleIndex = skipWhiteSpace(i);
    398     }
    399 
    400     private int parseTailoringString(int i, StringBuilder raw) throws ParseException {
    401         i = parseString(skipWhiteSpace(i), raw);
    402         if(raw.length() == 0) {
    403             setParseError("missing relation string");
    404         }
    405         return skipWhiteSpace(i);
    406     }
    407 
    408     private int parseString(int i, StringBuilder raw) throws ParseException {
    409         raw.setLength(0);
    410         while(i < rules.length()) {
    411             char c = rules.charAt(i++);
    412             if(isSyntaxChar(c)) {
    413                 if(c == 0x27) {  // apostrophe
    414                     if(i < rules.length() && rules.charAt(i) == 0x27) {
    415                         // Double apostrophe, encodes a single one.
    416                         raw.append((char)0x27);
    417                         ++i;
    418                         continue;
    419                     }
    420                     // Quote literal text until the next single apostrophe.
    421                     for(;;) {
    422                         if(i == rules.length()) {
    423                             setParseError("quoted literal text missing terminating apostrophe");
    424                             return i;
    425                         }
    426                         c = rules.charAt(i++);
    427                         if(c == 0x27) {
    428                             if(i < rules.length() && rules.charAt(i) == 0x27) {
    429                                 // Double apostrophe inside quoted literal text,
    430                                 // still encodes a single apostrophe.
    431                                 ++i;
    432                             } else {
    433                                 break;
    434                             }
    435                         }
    436                         raw.append(c);
    437                     }
    438                 } else if(c == 0x5c) {  // backslash
    439                     if(i == rules.length()) {
    440                         setParseError("backslash escape at the end of the rule string");
    441                         return i;
    442                     }
    443                     int cp = rules.codePointAt(i);
    444                     raw.appendCodePoint(cp);
    445                     i += Character.charCount(cp);
    446                 } else {
    447                     // Any other syntax character terminates a string.
    448                     --i;
    449                     break;
    450                 }
    451             } else if(PatternProps.isWhiteSpace(c)) {
    452                 // Unquoted white space terminates a string.
    453                 --i;
    454                 break;
    455             } else {
    456                 raw.append(c);
    457             }
    458         }
    459         for(int j = 0; j < raw.length();) {
    460             int c = raw.codePointAt(j);
    461             if(isSurrogate(c)) {
    462                 setParseError("string contains an unpaired surrogate");
    463                 return i;
    464             }
    465             if(0xfffd <= c && c <= 0xffff) {
    466                 setParseError("string contains U+FFFD, U+FFFE or U+FFFF");
    467                 return i;
    468             }
    469             j += Character.charCount(c);
    470         }
    471         return i;
    472     }
    473 
    474     // TODO: Widen UTF16.isSurrogate(char16) to take an int.
    475     private static final boolean isSurrogate(int c) {
    476         return (c & 0xfffff800) == 0xd800;
    477     }
    478 
    479     private static final String[] positions = {
    480         "first tertiary ignorable",
    481         "last tertiary ignorable",
    482         "first secondary ignorable",
    483         "last secondary ignorable",
    484         "first primary ignorable",
    485         "last primary ignorable",
    486         "first variable",
    487         "last variable",
    488         "first regular",
    489         "last regular",
    490         "first implicit",
    491         "last implicit",
    492         "first trailing",
    493         "last trailing"
    494     };
    495 
    496     /**
    497      * Sets str to a contraction of U+FFFE and (U+2800 + Position).
    498      * @return rule index after the special reset position
    499      * @throws ParseException
    500      */
    501     private int parseSpecialPosition(int i, StringBuilder str) throws ParseException {
    502         int j = readWords(i + 1, rawBuilder);
    503         if(j > i && rules.charAt(j) == 0x5d && rawBuilder.length() != 0) {  // words end with ]
    504             ++j;
    505             String raw = rawBuilder.toString();
    506             str.setLength(0);
    507             for(int pos = 0; pos < positions.length; ++pos) {
    508                 if(raw.equals(positions[pos])) {
    509                     str.append(POS_LEAD).append((char)(POS_BASE + pos));
    510                     return j;
    511                 }
    512             }
    513             if(raw.equals("top")) {
    514                 str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_REGULAR.ordinal()));
    515                 return j;
    516             }
    517             if(raw.equals("variable top")) {
    518                 str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_VARIABLE.ordinal()));
    519                 return j;
    520             }
    521         }
    522         setParseError("not a valid special reset position");
    523         return i;
    524     }
    525 
    526     private void parseSetting() throws ParseException {
    527         int i = ruleIndex + 1;
    528         int j = readWords(i, rawBuilder);
    529         if(j <= i || rawBuilder.length() == 0) {
    530             setParseError("expected a setting/option at '['");
    531         }
    532         // startsWith() etc. are available for String but not CharSequence/StringBuilder.
    533         String raw = rawBuilder.toString();
    534         if(rules.charAt(j) == 0x5d) {  // words end with ]
    535             ++j;
    536             if(raw.startsWith("reorder") &&
    537                     (raw.length() == 7 || raw.charAt(7) == 0x20)) {
    538                 parseReordering(raw);
    539                 ruleIndex = j;
    540                 return;
    541             }
    542             if(raw.equals("backwards 2")) {
    543                 settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
    544                 ruleIndex = j;
    545                 return;
    546             }
    547             String v;
    548             int valueIndex = raw.lastIndexOf(0x20);
    549             if(valueIndex >= 0) {
    550                 v = raw.substring(valueIndex + 1);
    551                 raw = raw.substring(0, valueIndex);
    552             } else {
    553                 v = "";
    554             }
    555             if(raw.equals("strength") && v.length() == 1) {
    556                 int value = UCOL_DEFAULT;
    557                 char c = v.charAt(0);
    558                 if(0x31 <= c && c <= 0x34) {  // 1..4
    559                     value = Collator.PRIMARY + (c - 0x31);
    560                 } else if(c == 0x49) {  // 'I'
    561                     value = Collator.IDENTICAL;
    562                 }
    563                 if(value != UCOL_DEFAULT) {
    564                     settings.setStrength(value);
    565                     ruleIndex = j;
    566                     return;
    567                 }
    568             } else if(raw.equals("alternate")) {
    569                 int value = UCOL_DEFAULT;
    570                 if(v.equals("non-ignorable")) {
    571                     value = 0;  // UCOL_NON_IGNORABLE
    572                 } else if(v.equals("shifted")) {
    573                     value = 1;  // UCOL_SHIFTED
    574                 }
    575                 if(value != UCOL_DEFAULT) {
    576                     settings.setAlternateHandlingShifted(value > 0);
    577                     ruleIndex = j;
    578                     return;
    579                 }
    580             } else if(raw.equals("maxVariable")) {
    581                 int value = UCOL_DEFAULT;
    582                 if(v.equals("space")) {
    583                     value = CollationSettings.MAX_VAR_SPACE;
    584                 } else if(v.equals("punct")) {
    585                     value = CollationSettings.MAX_VAR_PUNCT;
    586                 } else if(v.equals("symbol")) {
    587                     value = CollationSettings.MAX_VAR_SYMBOL;
    588                 } else if(v.equals("currency")) {
    589                     value = CollationSettings.MAX_VAR_CURRENCY;
    590                 }
    591                 if(value != UCOL_DEFAULT) {
    592                     settings.setMaxVariable(value, 0);
    593                     settings.variableTop = baseData.getLastPrimaryForGroup(
    594                         Collator.ReorderCodes.FIRST + value);
    595                     assert(settings.variableTop != 0);
    596                     ruleIndex = j;
    597                     return;
    598                 }
    599             } else if(raw.equals("caseFirst")) {
    600                 int value = UCOL_DEFAULT;
    601                 if(v.equals("off")) {
    602                     value = UCOL_OFF;
    603                 } else if(v.equals("lower")) {
    604                     value = CollationSettings.CASE_FIRST;  // UCOL_LOWER_FIRST
    605                 } else if(v.equals("upper")) {
    606                     value = CollationSettings.CASE_FIRST_AND_UPPER_MASK;  // UCOL_UPPER_FIRST
    607                 }
    608                 if(value != UCOL_DEFAULT) {
    609                     settings.setCaseFirst(value);
    610                     ruleIndex = j;
    611                     return;
    612                 }
    613             } else if(raw.equals("caseLevel")) {
    614                 int value = getOnOffValue(v);
    615                 if(value != UCOL_DEFAULT) {
    616                     settings.setFlag(CollationSettings.CASE_LEVEL, value > 0);
    617                     ruleIndex = j;
    618                     return;
    619                 }
    620             } else if(raw.equals("normalization")) {
    621                 int value = getOnOffValue(v);
    622                 if(value != UCOL_DEFAULT) {
    623                     settings.setFlag(CollationSettings.CHECK_FCD, value > 0);
    624                     ruleIndex = j;
    625                     return;
    626                 }
    627             } else if(raw.equals("numericOrdering")) {
    628                 int value = getOnOffValue(v);
    629                 if(value != UCOL_DEFAULT) {
    630                     settings.setFlag(CollationSettings.NUMERIC, value > 0);
    631                     ruleIndex = j;
    632                     return;
    633                 }
    634             } else if(raw.equals("hiraganaQ")) {
    635                 int value = getOnOffValue(v);
    636                 if(value != UCOL_DEFAULT) {
    637                     if(value == UCOL_ON) {
    638                         setParseError("[hiraganaQ on] is not supported");
    639                     }
    640                     ruleIndex = j;
    641                     return;
    642                 }
    643             } else if(raw.equals("import")) {
    644                 // BCP 47 language tag -> ICU locale ID
    645                 ULocale localeID;
    646                 try {
    647                     localeID = new ULocale.Builder().setLanguageTag(v).build();
    648                 } catch(Exception e) {
    649                     setParseError("expected language tag in [import langTag]", e);
    650                     return;
    651                 }
    652                 // localeID minus all keywords
    653                 String baseID = localeID.getBaseName();
    654                 // @collation=type, or length=0 if not specified
    655                 String collationType = localeID.getKeywordValue("collation");
    656                 if(importer == null) {
    657                     setParseError("[import langTag] is not supported");
    658                 } else {
    659                     String importedRules;
    660                     try {
    661                         importedRules =
    662                             importer.getRules(baseID,
    663                                     collationType != null ? collationType : "standard");
    664                     } catch(Exception e) {
    665                         setParseError("[import langTag] failed", e);
    666                         return;
    667                     }
    668                     String outerRules = rules;
    669                     int outerRuleIndex = ruleIndex;
    670                     try {
    671                         parse(importedRules);
    672                     } catch(Exception e) {
    673                         ruleIndex = outerRuleIndex;  // Restore the original index for error reporting.
    674                         setParseError("parsing imported rules failed", e);
    675                     }
    676                     rules = outerRules;
    677                     ruleIndex = j;
    678                 }
    679                 return;
    680             }
    681         } else if(rules.charAt(j) == 0x5b) {  // words end with [
    682             UnicodeSet set = new UnicodeSet();
    683             j = parseUnicodeSet(j, set);
    684             if(raw.equals("optimize")) {
    685                 try {
    686                     sink.optimize(set);
    687                 } catch(Exception e) {
    688                     setParseError("[optimize set] failed", e);
    689                 }
    690                 ruleIndex = j;
    691                 return;
    692             } else if(raw.equals("suppressContractions")) {
    693                 try {
    694                     sink.suppressContractions(set);
    695                 } catch(Exception e) {
    696                     setParseError("[suppressContractions set] failed", e);
    697                 }
    698                 ruleIndex = j;
    699                 return;
    700             }
    701         }
    702         setParseError("not a valid setting/option");
    703     }
    704 
    705     private void parseReordering(CharSequence raw) throws ParseException {
    706         int i = 7;  // after "reorder"
    707         if(i == raw.length()) {
    708             // empty [reorder] with no codes
    709             settings.resetReordering();
    710             return;
    711         }
    712         // Parse the codes in [reorder aa bb cc].
    713         ArrayList<Integer> reorderCodes = new ArrayList<Integer>();
    714         while(i < raw.length()) {
    715             ++i;  // skip the word-separating space
    716             int limit = i;
    717             while(limit < raw.length() && raw.charAt(limit) != ' ') { ++limit; }
    718             String word = raw.subSequence(i, limit).toString();
    719             int code = getReorderCode(word);
    720             if(code < 0) {
    721                 setParseError("unknown script or reorder code");
    722                 return;
    723             }
    724             reorderCodes.add(code);
    725             i = limit;
    726         }
    727         if(reorderCodes.isEmpty()) {
    728             settings.resetReordering();
    729         } else {
    730             int[] codes = new int[reorderCodes.size()];
    731             int j = 0;
    732             for(Integer code : reorderCodes) { codes[j++] = code; }
    733             settings.setReordering(baseData, codes);
    734         }
    735     }
    736 
    737     private static final String[] gSpecialReorderCodes = {
    738         "space", "punct", "symbol", "currency", "digit"
    739     };
    740 
    741     /**
    742      * Gets a script or reorder code from its string representation.
    743      * @return the script/reorder code, or
    744      * -1 if not recognized
    745      */
    746     public static int getReorderCode(String word) {
    747         for(int i = 0; i < gSpecialReorderCodes.length; ++i) {
    748             if(word.equalsIgnoreCase(gSpecialReorderCodes[i])) {
    749                 return Collator.ReorderCodes.FIRST + i;
    750             }
    751         }
    752         try {
    753             int script = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, word);
    754             if(script >= 0) {
    755                 return script;
    756             }
    757         } catch (IllegalIcuArgumentException e) {
    758             // fall through
    759         }
    760         if(word.equalsIgnoreCase("others")) {
    761             return Collator.ReorderCodes.OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
    762         }
    763         return -1;
    764     }
    765 
    766     private static int getOnOffValue(String s) {
    767         if(s.equals("on")) {
    768             return UCOL_ON;
    769         } else if(s.equals("off")) {
    770             return UCOL_OFF;
    771         } else {
    772             return UCOL_DEFAULT;
    773         }
    774     }
    775 
    776     private int parseUnicodeSet(int i, UnicodeSet set) throws ParseException {
    777         // Collect a UnicodeSet pattern between a balanced pair of [brackets].
    778         int level = 0;
    779         int j = i;
    780         for(;;) {
    781             if(j == rules.length()) {
    782                 setParseError("unbalanced UnicodeSet pattern brackets");
    783                 return j;
    784             }
    785             char c = rules.charAt(j++);
    786             if(c == 0x5b) {  // '['
    787                 ++level;
    788             } else if(c == 0x5d) {  // ']'
    789                 if(--level == 0) { break; }
    790             }
    791         }
    792         try {
    793             set.applyPattern(rules.substring(i, j));
    794         } catch(Exception e) {
    795             setParseError("not a valid UnicodeSet pattern: " + e.getMessage());
    796         }
    797         j = skipWhiteSpace(j);
    798         if(j == rules.length() || rules.charAt(j) != 0x5d) {
    799             setParseError("missing option-terminating ']' after UnicodeSet pattern");
    800             return j;
    801         }
    802         return ++j;
    803     }
    804 
    805     private int readWords(int i, StringBuilder raw) {
    806         raw.setLength(0);
    807         i = skipWhiteSpace(i);
    808         for(;;) {
    809             if(i >= rules.length()) { return 0; }
    810             char c = rules.charAt(i);
    811             if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
    812                 if(raw.length() == 0) { return i; }
    813                 int lastIndex = raw.length() - 1;
    814                 if(raw.charAt(lastIndex) == ' ') {  // remove trailing space
    815                     raw.setLength(lastIndex);
    816                 }
    817                 return i;
    818             }
    819             if(PatternProps.isWhiteSpace(c)) {
    820                 raw.append(' ');
    821                 i = skipWhiteSpace(i + 1);
    822             } else {
    823                 raw.append(c);
    824                 ++i;
    825             }
    826         }
    827     }
    828 
    829     private int skipComment(int i) {
    830         // skip to past the newline
    831         while(i < rules.length()) {
    832             char c = rules.charAt(i++);
    833             // LF or FF or CR or NEL or LS or PS
    834             if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
    835                 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
    836                 // NLF (new line function) = CR or LF or CR+LF or NEL.
    837                 // No need to collect all of CR+LF because a following LF will be ignored anyway.
    838                 break;
    839             }
    840         }
    841         return i;
    842     }
    843 
    844     private void setParseError(String reason) throws ParseException {
    845         throw makeParseException(reason);
    846     }
    847 
    848     private void setParseError(String reason, Exception e) throws ParseException {
    849         ParseException newExc = makeParseException(reason + ": " + e.getMessage());
    850         newExc.initCause(e);
    851         throw newExc;
    852     }
    853 
    854     private ParseException makeParseException(String reason) {
    855         return new ParseException(appendErrorContext(reason), ruleIndex);
    856     }
    857 
    858     private static final int U_PARSE_CONTEXT_LEN = 16;
    859 
    860     // C++ setErrorContext()
    861     private String appendErrorContext(String reason) {
    862         // Note: This relies on the calling code maintaining the ruleIndex
    863         // at a position that is useful for debugging.
    864         // For example, at the beginning of a reset or relation etc.
    865         StringBuilder msg = new StringBuilder(reason);
    866         msg.append(" at index ").append(ruleIndex);
    867         // We are not counting line numbers.
    868 
    869         msg.append(" near \"");
    870         // before ruleIndex
    871         int start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
    872         if(start < 0) {
    873             start = 0;
    874         } else if(start > 0 && Character.isLowSurrogate(rules.charAt(start))) {
    875             ++start;
    876         }
    877         msg.append(rules, start, ruleIndex);
    878 
    879         msg.append('!');
    880         // starting from ruleIndex
    881         int length = rules.length() - ruleIndex;
    882         if(length >= U_PARSE_CONTEXT_LEN) {
    883             length = U_PARSE_CONTEXT_LEN - 1;
    884             if(Character.isHighSurrogate(rules.charAt(ruleIndex + length - 1))) {
    885                 --length;
    886             }
    887         }
    888         msg.append(rules, ruleIndex, ruleIndex + length);
    889         return msg.append('\"').toString();
    890     }
    891 
    892     /**
    893      * ASCII [:P:] and [:S:]:
    894      * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
    895      */
    896     private static boolean isSyntaxChar(int c) {
    897         return 0x21 <= c && c <= 0x7e &&
    898                 (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
    899                 (0x5b <= c && c <= 0x60) || (0x7b <= c));
    900     }
    901 
    902     private int skipWhiteSpace(int i) {
    903         while(i < rules.length() && PatternProps.isWhiteSpace(rules.charAt(i))) {
    904             ++i;
    905         }
    906         return i;
    907     }
    908 
    909     private Normalizer2 nfd = Normalizer2.getNFDInstance();
    910     private Normalizer2 nfc = Normalizer2.getNFCInstance();
    911 
    912     private String rules;
    913     private final CollationData baseData;
    914     private CollationSettings settings;
    915 
    916     private Sink sink;
    917     private Importer importer;
    918 
    919     private int ruleIndex;
    920 }
    921