Home | History | Annotate | Download | only in coll
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * CollationRuleParser.java, ported from collationruleparser.h/.cpp
      9 *
     10 * C++ version created on: 2013apr10
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 package com.ibm.icu.impl.coll;
     15 
     16 import java.text.ParseException;
     17 import java.util.ArrayList;
     18 
     19 import com.ibm.icu.impl.IllegalIcuArgumentException;
     20 import com.ibm.icu.impl.PatternProps;
     21 import com.ibm.icu.lang.UCharacter;
     22 import com.ibm.icu.lang.UProperty;
     23 import com.ibm.icu.text.Collator;
     24 import com.ibm.icu.text.Normalizer2;
     25 import com.ibm.icu.text.UTF16;
     26 import com.ibm.icu.text.UnicodeSet;
     27 import com.ibm.icu.util.ULocale;
     28 
     29 public final class CollationRuleParser {
     30     /**
             * Special reset positions.
             * The declaration order is significant: parseSpecialPosition() encodes a position
             * as POS_BASE plus an index into the parallel "positions" name table, and uses
             * Position.LAST_REGULAR.ordinal() / Position.LAST_VARIABLE.ordinal() directly,
             * so the constants must stay in this order.
             */
     31     enum Position {
     32         FIRST_TERTIARY_IGNORABLE,
     33         LAST_TERTIARY_IGNORABLE,
     34         FIRST_SECONDARY_IGNORABLE,
     35         LAST_SECONDARY_IGNORABLE,
     36         FIRST_PRIMARY_IGNORABLE,
     37         LAST_PRIMARY_IGNORABLE,
     38         FIRST_VARIABLE,
     39         LAST_VARIABLE,
     40         FIRST_REGULAR,
     41         LAST_REGULAR,
     42         FIRST_IMPLICIT,
     43         LAST_IMPLICIT,
     44         FIRST_TRAILING,
     45         LAST_TRAILING
     46     }
     47     static final Position[] POSITION_VALUES = Position.values();  // cached copy of Position.values(), indexed by ordinal
     48 
     49     /**
     50      * First character of contractions that encode special reset positions.
     51      * U+FFFE cannot be tailored via rule syntax.
     52      *
     53      * The second contraction character is POS_BASE + Position.
     54      */
     55     static final char POS_LEAD = 0xfffe;
     56     /**
     57      * Base for the second character of contractions that encode special reset positions.
     58      * Braille characters U+28xx are printable and normalization-inert.
     59      * @see #POS_LEAD
     60      */
     61     static final char POS_BASE = 0x2800;
     62 
     63     static abstract class Sink {
                // Callback receiver for parse results: the parser calls addReset() and
                // addRelation() for each parsed rule, and suppressContractions()/optimize()
                // for the settings of the same names.
                // The parser wraps each of these calls in try/catch and reports any
                // exception thrown here as a parse error.
     64         /**
     65          * Adds a reset.
     66          * strength=UCOL_IDENTICAL for &str.
     67          * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
     68          */
     69         abstract void addReset(int strength, CharSequence str);
     70         /**
     71          * Adds a relation with strength and prefix | str / extension.
     72          */
     73         abstract void addRelation(int strength, CharSequence prefix,
     74                 CharSequence str, CharSequence extension);
     75 
                /** Called for [suppressContractions set]; this default implementation does nothing. */
     76         void suppressContractions(UnicodeSet set) {}
     77 
                /** Called for [optimize set]; this default implementation does nothing. */
     78         void optimize(UnicodeSet set) {}
     79     }
     80 
     81     interface Importer {
                /**
                 * Supplies the rule string for an [import langTag] setting.
                 * @param localeID the base name of the imported locale (locale ID without keywords)
                 * @param collationType the collation type from the language tag; the parser
                 *        passes "standard" when the tag does not specify one
                 * @return the rules that the parser then parses in place of the [import] setting
                 */
     82         String getRules(String localeID, String collationType);
     83     }
     84 
     85     /**
     86      * Constructor.
     87      * The Sink must be set before parsing.
     88      * The Importer can be set, otherwise [import locale] syntax is not supported.
     89      */
     90     CollationRuleParser(CollationData base) {
     91         baseData = base;
     92     }
     93 
     94     /**
     95      * Sets the pointer to a Sink object.
     96      * The pointer is aliased: Pointer copy without cloning or taking ownership.
     97      */
     98     void setSink(Sink sinkAlias) {
     99         sink = sinkAlias;
    100     }
    101 
    102     /**
    103      * Sets the pointer to an Importer object.
    104      * The pointer is aliased: Pointer copy without cloning or taking ownership.
    105      */
    106     void setImporter(Importer importerAlias) {
    107         importer = importerAlias;
    108     }
    109 
    110     void parse(String ruleString, CollationSettings outSettings) throws ParseException {
    111         settings = outSettings;
    112         parse(ruleString);
    113     }
    114 
    115     private static final int UCOL_DEFAULT = -1;  // "no value" marker; also returned by parser helpers on failure
    116     private static final int UCOL_OFF = 0;
    117     private static final int UCOL_ON = 1;
    118 
            // parseRelationOperator() packs its result into one int:
            //   bits 0..3  relation strength (STRENGTH_MASK)
            //   bit  4     set for the starred forms such as <* and =* (STARRED_FLAG)
            //   bits 8..   length of the operator in chars (shifted by OFFSET_SHIFT)
    119     /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
    120     private static final int STRENGTH_MASK = 0xf;
    121     private static final int STARRED_FLAG = 0x10;
    122     private static final int OFFSET_SHIFT = 8;
    123 
            // Prefix of the &[before n] reset syntax, matched by parseResetAndPosition().
    124     private static final String BEFORE = "[before";
    125 
    126     // In C++, we parse into temporary UnicodeString objects named "raw" or "str".
    127     // In Java, we reuse this StringBuilder.
    128     private final StringBuilder rawBuilder = new StringBuilder();
    129 
    130     private void parse(String ruleString) throws ParseException {
    131         rules = ruleString;
    132         ruleIndex = 0;
    133 
    134         while(ruleIndex < rules.length()) {
    135             char c = rules.charAt(ruleIndex);
    136             if(PatternProps.isWhiteSpace(c)) {
    137                 ++ruleIndex;
    138                 continue;
    139             }
    140             switch(c) {
    141             case 0x26:  // '&'
    142                 parseRuleChain();
    143                 break;
    144             case 0x5b:  // '['
    145                 parseSetting();
    146                 break;
    147             case 0x23:  // '#' starts a comment, until the end of the line
    148                 ruleIndex = skipComment(ruleIndex + 1);
    149                 break;
    150             case 0x40:  // '@' is equivalent to [backwards 2]
    151                 settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
    152                 ++ruleIndex;
    153                 break;
    154             case 0x21:  // '!' used to turn on Thai/Lao character reversal
    155                 // Accept but ignore. The root collator has contractions
    156                 // that are equivalent to the character reversal, where appropriate.
    157                 ++ruleIndex;
    158                 break;
    159             default:
    160                 setParseError("expected a reset or setting or comment");
    161                 break;
    162             }
    163         }
    164     }
    165 
    166     private void parseRuleChain() throws ParseException {
    167         int resetStrength = parseResetAndPosition();
    168         boolean isFirstRelation = true;
    169         for(;;) {
    170             int result = parseRelationOperator();
    171             if(result < 0) {
    172                 if(ruleIndex < rules.length() && rules.charAt(ruleIndex) == 0x23) {
    173                     // '#' starts a comment, until the end of the line
    174                     ruleIndex = skipComment(ruleIndex + 1);
    175                     continue;
    176                 }
    177                 if(isFirstRelation) {
    178                     setParseError("reset not followed by a relation");
    179                 }
    180                 return;
    181             }
    182             int strength = result & STRENGTH_MASK;
    183             if(resetStrength < Collator.IDENTICAL) {
    184                 // reset-before rule chain
    185                 if(isFirstRelation) {
    186                     if(strength != resetStrength) {
    187                         setParseError("reset-before strength differs from its first relation");
    188                         return;
    189                     }
    190                 } else {
    191                     if(strength < resetStrength) {
    192                         setParseError("reset-before strength followed by a stronger relation");
    193                         return;
    194                     }
    195                 }
    196             }
    197             int i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
    198             if((result & STARRED_FLAG) == 0) {
    199                 parseRelationStrings(strength, i);
    200             } else {
    201                 parseStarredCharacters(strength, i);
    202             }
    203             isFirstRelation = false;
    204         }
    205     }
    206 
    207     private int parseResetAndPosition() throws ParseException {
    208         int i = skipWhiteSpace(ruleIndex + 1);
    209         int j;
    210         char c;
    211         int resetStrength;
    212         if(rules.regionMatches(i, BEFORE, 0, BEFORE.length()) &&
    213                 (j = i + BEFORE.length()) < rules.length() &&
    214                 PatternProps.isWhiteSpace(rules.charAt(j)) &&
    215                 ((j = skipWhiteSpace(j + 1)) + 1) < rules.length() &&
    216                 0x31 <= (c = rules.charAt(j)) && c <= 0x33 &&
    217                 rules.charAt(j + 1) == 0x5d) {
    218             // &[before n] with n=1 or 2 or 3
    219             resetStrength = Collator.PRIMARY + (c - 0x31);
    220             i = skipWhiteSpace(j + 2);
    221         } else {
    222             resetStrength = Collator.IDENTICAL;
    223         }
    224         if(i >= rules.length()) {
    225             setParseError("reset without position");
    226             return UCOL_DEFAULT;
    227         }
    228         if(rules.charAt(i) == 0x5b) {  // '['
    229             i = parseSpecialPosition(i, rawBuilder);
    230         } else {
    231             i = parseTailoringString(i, rawBuilder);
    232         }
    233         try {
    234             sink.addReset(resetStrength, rawBuilder);
    235         } catch(Exception e) {
    236             setParseError("adding reset failed", e);
    237             return UCOL_DEFAULT;
    238         }
    239         ruleIndex = i;
    240         return resetStrength;
    241     }
    242 
    243     private int parseRelationOperator() {
    244         ruleIndex = skipWhiteSpace(ruleIndex);
    245         if(ruleIndex >= rules.length()) { return UCOL_DEFAULT; }
    246         int strength;
    247         int i = ruleIndex;
    248         char c = rules.charAt(i++);
    249         switch(c) {
    250         case 0x3c:  // '<'
    251             if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<
    252                 ++i;
    253                 if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<
    254                     ++i;
    255                     if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<<
    256                         ++i;
    257                         strength = Collator.QUATERNARY;
    258                     } else {
    259                         strength = Collator.TERTIARY;
    260                     }
    261                 } else {
    262                     strength = Collator.SECONDARY;
    263                 }
    264             } else {
    265                 strength = Collator.PRIMARY;
    266             }
    267             if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
    268                 ++i;
    269                 strength |= STARRED_FLAG;
    270             }
    271             break;
    272         case 0x3b:  // ';' same as <<
    273             strength = Collator.SECONDARY;
    274             break;
    275         case 0x2c:  // ',' same as <<<
    276             strength = Collator.TERTIARY;
    277             break;
    278         case 0x3d:  // '='
    279             strength = Collator.IDENTICAL;
    280             if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
    281                 ++i;
    282                 strength |= STARRED_FLAG;
    283             }
    284             break;
    285         default:
    286             return UCOL_DEFAULT;
    287         }
    288         return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
    289     }
    290 
    291     private void parseRelationStrings(int strength, int i) throws ParseException {
    292         // Parse
    293         //     prefix | str / extension
    294         // where prefix and extension are optional.
    295         String prefix = "";
    296         CharSequence extension = "";
    297         i = parseTailoringString(i, rawBuilder);
    298         char next = (i < rules.length()) ? rules.charAt(i) : 0;
    299         if(next == 0x7c) {  // '|' separates the context prefix from the string.
    300             prefix = rawBuilder.toString();
    301             i = parseTailoringString(i + 1, rawBuilder);
    302             next = (i < rules.length()) ? rules.charAt(i) : 0;
    303         }
    304         // str = rawBuilder (do not modify rawBuilder any more in this function)
    305         if(next == 0x2f) {  // '/' separates the string from the extension.
    306             StringBuilder extBuilder = new StringBuilder();
    307             i = parseTailoringString(i + 1, extBuilder);
    308             extension = extBuilder;
    309         }
    310         if(prefix.length() != 0) {
    311             int prefix0 = prefix.codePointAt(0);
    312             int c = rawBuilder.codePointAt(0);
    313             if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
    314                 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary");
    315                 return;
    316             }
    317         }
    318         try {
    319             sink.addRelation(strength, prefix, rawBuilder, extension);
    320         } catch(Exception e) {
    321             setParseError("adding relation failed", e);
    322             return;
    323         }
    324         ruleIndex = i;
    325     }
    326 
    327     private void parseStarredCharacters(int strength, int i) throws ParseException {
    328         String empty = "";
    329         i = parseString(skipWhiteSpace(i), rawBuilder);
    330         if(rawBuilder.length() == 0) {
    331             setParseError("missing starred-relation string");
    332             return;
    333         }
    334         int prev = -1;
    335         int j = 0;
    336         for(;;) {
    337             while(j < rawBuilder.length()) {
    338                 int c = rawBuilder.codePointAt(j);
    339                 if(!nfd.isInert(c)) {
    340                     setParseError("starred-relation string is not all NFD-inert");
    341                     return;
    342                 }
    343                 try {
    344                     sink.addRelation(strength, empty, UTF16.valueOf(c), empty);
    345                 } catch(Exception e) {
    346                     setParseError("adding relation failed", e);
    347                     return;
    348                 }
    349                 j += Character.charCount(c);
    350                 prev = c;
    351             }
    352             if(i >= rules.length() || rules.charAt(i) != 0x2d) {  // '-'
    353                 break;
    354             }
    355             if(prev < 0) {
    356                 setParseError("range without start in starred-relation string");
    357                 return;
    358             }
    359             i = parseString(i + 1, rawBuilder);
    360             if(rawBuilder.length() == 0) {
    361                 setParseError("range without end in starred-relation string");
    362                 return;
    363             }
    364             int c = rawBuilder.codePointAt(0);
    365             if(c < prev) {
    366                 setParseError("range start greater than end in starred-relation string");
    367                 return;
    368             }
    369             // range prev-c
    370             while(++prev <= c) {
    371                 if(!nfd.isInert(prev)) {
    372                     setParseError("starred-relation string range is not all NFD-inert");
    373                     return;
    374                 }
    375                 if(isSurrogate(prev)) {
    376                     setParseError("starred-relation string range contains a surrogate");
    377                     return;
    378                 }
    379                 if(0xfffd <= prev && prev <= 0xffff) {
    380                     setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF");
    381                     return;
    382                 }
    383                 try {
    384                     sink.addRelation(strength, empty, UTF16.valueOf(prev), empty);
    385                 } catch(Exception e) {
    386                     setParseError("adding relation failed", e);
    387                     return;
    388                 }
    389             }
    390             prev = -1;
    391             j = Character.charCount(c);
    392         }
    393         ruleIndex = skipWhiteSpace(i);
    394     }
    395 
    396     private int parseTailoringString(int i, StringBuilder raw) throws ParseException {
    397         i = parseString(skipWhiteSpace(i), raw);
    398         if(raw.length() == 0) {
    399             setParseError("missing relation string");
    400         }
    401         return skipWhiteSpace(i);
    402     }
    403 
    404     private int parseString(int i, StringBuilder raw) throws ParseException {
    405         raw.setLength(0);
    406         while(i < rules.length()) {
    407             char c = rules.charAt(i++);
    408             if(isSyntaxChar(c)) {
    409                 if(c == 0x27) {  // apostrophe
    410                     if(i < rules.length() && rules.charAt(i) == 0x27) {
    411                         // Double apostrophe, encodes a single one.
    412                         raw.append((char)0x27);
    413                         ++i;
    414                         continue;
    415                     }
    416                     // Quote literal text until the next single apostrophe.
    417                     for(;;) {
    418                         if(i == rules.length()) {
    419                             setParseError("quoted literal text missing terminating apostrophe");
    420                             return i;
    421                         }
    422                         c = rules.charAt(i++);
    423                         if(c == 0x27) {
    424                             if(i < rules.length() && rules.charAt(i) == 0x27) {
    425                                 // Double apostrophe inside quoted literal text,
    426                                 // still encodes a single apostrophe.
    427                                 ++i;
    428                             } else {
    429                                 break;
    430                             }
    431                         }
    432                         raw.append(c);
    433                     }
    434                 } else if(c == 0x5c) {  // backslash
    435                     if(i == rules.length()) {
    436                         setParseError("backslash escape at the end of the rule string");
    437                         return i;
    438                     }
    439                     int cp = rules.codePointAt(i);
    440                     raw.appendCodePoint(cp);
    441                     i += Character.charCount(cp);
    442                 } else {
    443                     // Any other syntax character terminates a string.
    444                     --i;
    445                     break;
    446                 }
    447             } else if(PatternProps.isWhiteSpace(c)) {
    448                 // Unquoted white space terminates a string.
    449                 --i;
    450                 break;
    451             } else {
    452                 raw.append(c);
    453             }
    454         }
    455         for(int j = 0; j < raw.length();) {
    456             int c = raw.codePointAt(j);
    457             if(isSurrogate(c)) {
    458                 setParseError("string contains an unpaired surrogate");
    459                 return i;
    460             }
    461             if(0xfffd <= c && c <= 0xffff) {
    462                 setParseError("string contains U+FFFD, U+FFFE or U+FFFF");
    463                 return i;
    464             }
    465             j += Character.charCount(c);
    466         }
    467         return i;
    468     }
    469 
    470     // TODO: Widen UTF16.isSurrogate(char16) to take an int.
    471     private static final boolean isSurrogate(int c) {
    472         return (c & 0xfffff800) == 0xd800;
    473     }
    474 
    475     private static final String[] positions = {
                // Rule-syntax names of the special reset positions, as written inside [...].
                // parseSpecialPosition() adds the array index of the matching name to POS_BASE,
                // so this table has to stay parallel to the Position enum.
    476         "first tertiary ignorable",
    477         "last tertiary ignorable",
    478         "first secondary ignorable",
    479         "last secondary ignorable",
    480         "first primary ignorable",
    481         "last primary ignorable",
    482         "first variable",
    483         "last variable",
    484         "first regular",
    485         "last regular",
    486         "first implicit",
    487         "last implicit",
    488         "first trailing",
    489         "last trailing"
    490     };
    491 
    492     /**
    493      * Sets str to a contraction of U+FFFE and (U+2800 + Position).
    494      * @return rule index after the special reset position
    495      * @throws ParseException
    496      */
    497     private int parseSpecialPosition(int i, StringBuilder str) throws ParseException {
    498         int j = readWords(i + 1, rawBuilder);
    499         if(j > i && rules.charAt(j) == 0x5d && rawBuilder.length() != 0) {  // words end with ]
    500             ++j;
    501             String raw = rawBuilder.toString();
    502             str.setLength(0);
    503             for(int pos = 0; pos < positions.length; ++pos) {
    504                 if(raw.equals(positions[pos])) {
    505                     str.append(POS_LEAD).append((char)(POS_BASE + pos));
    506                     return j;
    507                 }
    508             }
    509             if(raw.equals("top")) {
    510                 str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_REGULAR.ordinal()));
    511                 return j;
    512             }
    513             if(raw.equals("variable top")) {
    514                 str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_VARIABLE.ordinal()));
    515                 return j;
    516             }
    517         }
    518         setParseError("not a valid special reset position");
    519         return i;
    520     }
    521 
    522     private void parseSetting() throws ParseException {
    523         int i = ruleIndex + 1;
    524         int j = readWords(i, rawBuilder);
    525         if(j <= i || rawBuilder.length() == 0) {
    526             setParseError("expected a setting/option at '['");
    527         }
    528         // startsWith() etc. are available for String but not CharSequence/StringBuilder.
    529         String raw = rawBuilder.toString();
    530         if(rules.charAt(j) == 0x5d) {  // words end with ]
    531             ++j;
    532             if(raw.startsWith("reorder") &&
    533                     (raw.length() == 7 || raw.charAt(7) == 0x20)) {
    534                 parseReordering(raw);
    535                 ruleIndex = j;
    536                 return;
    537             }
    538             if(raw.equals("backwards 2")) {
    539                 settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
    540                 ruleIndex = j;
    541                 return;
    542             }
    543             String v;
    544             int valueIndex = raw.lastIndexOf(0x20);
    545             if(valueIndex >= 0) {
    546                 v = raw.substring(valueIndex + 1);
    547                 raw = raw.substring(0, valueIndex);
    548             } else {
    549                 v = "";
    550             }
    551             if(raw.equals("strength") && v.length() == 1) {
    552                 int value = UCOL_DEFAULT;
    553                 char c = v.charAt(0);
    554                 if(0x31 <= c && c <= 0x34) {  // 1..4
    555                     value = Collator.PRIMARY + (c - 0x31);
    556                 } else if(c == 0x49) {  // 'I'
    557                     value = Collator.IDENTICAL;
    558                 }
    559                 if(value != UCOL_DEFAULT) {
    560                     settings.setStrength(value);
    561                     ruleIndex = j;
    562                     return;
    563                 }
    564             } else if(raw.equals("alternate")) {
    565                 int value = UCOL_DEFAULT;
    566                 if(v.equals("non-ignorable")) {
    567                     value = 0;  // UCOL_NON_IGNORABLE
    568                 } else if(v.equals("shifted")) {
    569                     value = 1;  // UCOL_SHIFTED
    570                 }
    571                 if(value != UCOL_DEFAULT) {
    572                     settings.setAlternateHandlingShifted(value > 0);
    573                     ruleIndex = j;
    574                     return;
    575                 }
    576             } else if(raw.equals("maxVariable")) {
    577                 int value = UCOL_DEFAULT;
    578                 if(v.equals("space")) {
    579                     value = CollationSettings.MAX_VAR_SPACE;
    580                 } else if(v.equals("punct")) {
    581                     value = CollationSettings.MAX_VAR_PUNCT;
    582                 } else if(v.equals("symbol")) {
    583                     value = CollationSettings.MAX_VAR_SYMBOL;
    584                 } else if(v.equals("currency")) {
    585                     value = CollationSettings.MAX_VAR_CURRENCY;
    586                 }
    587                 if(value != UCOL_DEFAULT) {
    588                     settings.setMaxVariable(value, 0);
    589                     settings.variableTop = baseData.getLastPrimaryForGroup(
    590                         Collator.ReorderCodes.FIRST + value);
    591                     assert(settings.variableTop != 0);
    592                     ruleIndex = j;
    593                     return;
    594                 }
    595             } else if(raw.equals("caseFirst")) {
    596                 int value = UCOL_DEFAULT;
    597                 if(v.equals("off")) {
    598                     value = UCOL_OFF;
    599                 } else if(v.equals("lower")) {
    600                     value = CollationSettings.CASE_FIRST;  // UCOL_LOWER_FIRST
    601                 } else if(v.equals("upper")) {
    602                     value = CollationSettings.CASE_FIRST_AND_UPPER_MASK;  // UCOL_UPPER_FIRST
    603                 }
    604                 if(value != UCOL_DEFAULT) {
    605                     settings.setCaseFirst(value);
    606                     ruleIndex = j;
    607                     return;
    608                 }
    609             } else if(raw.equals("caseLevel")) {
    610                 int value = getOnOffValue(v);
    611                 if(value != UCOL_DEFAULT) {
    612                     settings.setFlag(CollationSettings.CASE_LEVEL, value > 0);
    613                     ruleIndex = j;
    614                     return;
    615                 }
    616             } else if(raw.equals("normalization")) {
    617                 int value = getOnOffValue(v);
    618                 if(value != UCOL_DEFAULT) {
    619                     settings.setFlag(CollationSettings.CHECK_FCD, value > 0);
    620                     ruleIndex = j;
    621                     return;
    622                 }
    623             } else if(raw.equals("numericOrdering")) {
    624                 int value = getOnOffValue(v);
    625                 if(value != UCOL_DEFAULT) {
    626                     settings.setFlag(CollationSettings.NUMERIC, value > 0);
    627                     ruleIndex = j;
    628                     return;
    629                 }
    630             } else if(raw.equals("hiraganaQ")) {
    631                 int value = getOnOffValue(v);
    632                 if(value != UCOL_DEFAULT) {
    633                     if(value == UCOL_ON) {
    634                         setParseError("[hiraganaQ on] is not supported");
    635                     }
    636                     ruleIndex = j;
    637                     return;
    638                 }
    639             } else if(raw.equals("import")) {
    640                 // BCP 47 language tag -> ICU locale ID
    641                 ULocale localeID;
    642                 try {
    643                     localeID = new ULocale.Builder().setLanguageTag(v).build();
    644                 } catch(Exception e) {
    645                     setParseError("expected language tag in [import langTag]", e);
    646                     return;
    647                 }
    648                 // localeID minus all keywords
    649                 String baseID = localeID.getBaseName();
    650                 // @collation=type, or length=0 if not specified
    651                 String collationType = localeID.getKeywordValue("collation");
    652                 if(importer == null) {
    653                     setParseError("[import langTag] is not supported");
    654                 } else {
    655                     String importedRules;
    656                     try {
    657                         importedRules =
    658                             importer.getRules(baseID,
    659                                     collationType != null ? collationType : "standard");
    660                     } catch(Exception e) {
    661                         setParseError("[import langTag] failed", e);
    662                         return;
    663                     }
    664                     String outerRules = rules;
    665                     int outerRuleIndex = ruleIndex;
    666                     try {
    667                         parse(importedRules);
    668                     } catch(Exception e) {
    669                         ruleIndex = outerRuleIndex;  // Restore the original index for error reporting.
    670                         setParseError("parsing imported rules failed", e);
    671                     }
    672                     rules = outerRules;
    673                     ruleIndex = j;
    674                 }
    675                 return;
    676             }
    677         } else if(rules.charAt(j) == 0x5b) {  // words end with [
    678             UnicodeSet set = new UnicodeSet();
    679             j = parseUnicodeSet(j, set);
    680             if(raw.equals("optimize")) {
    681                 try {
    682                     sink.optimize(set);
    683                 } catch(Exception e) {
    684                     setParseError("[optimize set] failed", e);
    685                 }
    686                 ruleIndex = j;
    687                 return;
    688             } else if(raw.equals("suppressContractions")) {
    689                 try {
    690                     sink.suppressContractions(set);
    691                 } catch(Exception e) {
    692                     setParseError("[suppressContractions set] failed", e);
    693                 }
    694                 ruleIndex = j;
    695                 return;
    696             }
    697         }
    698         setParseError("not a valid setting/option");
    699     }
    700 
    701     private void parseReordering(CharSequence raw) throws ParseException {
    702         int i = 7;  // after "reorder"
    703         if(i == raw.length()) {
    704             // empty [reorder] with no codes
    705             settings.resetReordering();
    706             return;
    707         }
    708         // Parse the codes in [reorder aa bb cc].
    709         ArrayList<Integer> reorderCodes = new ArrayList<Integer>();
    710         while(i < raw.length()) {
    711             ++i;  // skip the word-separating space
    712             int limit = i;
    713             while(limit < raw.length() && raw.charAt(limit) != ' ') { ++limit; }
    714             String word = raw.subSequence(i, limit).toString();
    715             int code = getReorderCode(word);
    716             if(code < 0) {
    717                 setParseError("unknown script or reorder code");
    718                 return;
    719             }
    720             reorderCodes.add(code);
    721             i = limit;
    722         }
    723         if(reorderCodes.isEmpty()) {
    724             settings.resetReordering();
    725         } else {
    726             int[] codes = new int[reorderCodes.size()];
    727             int j = 0;
    728             for(Integer code : reorderCodes) { codes[j++] = code; }
    729             settings.setReordering(baseData, codes);
    730         }
    731     }
    732 
    // Names of the special (non-script) reorder groups.
    // The order matters: getReorderCode() maps the name at index i
    // to Collator.ReorderCodes.FIRST + i.
    private static final String[] gSpecialReorderCodes = {
        "space", "punct", "symbol", "currency", "digit"
    };
    736 
    737     /**
    738      * Gets a script or reorder code from its string representation.
    739      * @return the script/reorder code, or
    740      * -1 if not recognized
    741      */
    742     public static int getReorderCode(String word) {
    743         for(int i = 0; i < gSpecialReorderCodes.length; ++i) {
    744             if(word.equalsIgnoreCase(gSpecialReorderCodes[i])) {
    745                 return Collator.ReorderCodes.FIRST + i;
    746             }
    747         }
    748         try {
    749             int script = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, word);
    750             if(script >= 0) {
    751                 return script;
    752             }
    753         } catch (IllegalIcuArgumentException e) {
    754             // fall through
    755         }
    756         if(word.equalsIgnoreCase("others")) {
    757             return Collator.ReorderCodes.OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
    758         }
    759         return -1;
    760     }
    761 
    762     private static int getOnOffValue(String s) {
    763         if(s.equals("on")) {
    764             return UCOL_ON;
    765         } else if(s.equals("off")) {
    766             return UCOL_OFF;
    767         } else {
    768             return UCOL_DEFAULT;
    769         }
    770     }
    771 
    772     private int parseUnicodeSet(int i, UnicodeSet set) throws ParseException {
    773         // Collect a UnicodeSet pattern between a balanced pair of [brackets].
    774         int level = 0;
    775         int j = i;
    776         for(;;) {
    777             if(j == rules.length()) {
    778                 setParseError("unbalanced UnicodeSet pattern brackets");
    779                 return j;
    780             }
    781             char c = rules.charAt(j++);
    782             if(c == 0x5b) {  // '['
    783                 ++level;
    784             } else if(c == 0x5d) {  // ']'
    785                 if(--level == 0) { break; }
    786             }
    787         }
    788         try {
    789             set.applyPattern(rules.substring(i, j));
    790         } catch(Exception e) {
    791             setParseError("not a valid UnicodeSet pattern: " + e.getMessage());
    792         }
    793         j = skipWhiteSpace(j);
    794         if(j == rules.length() || rules.charAt(j) != 0x5d) {
    795             setParseError("missing option-terminating ']' after UnicodeSet pattern");
    796             return j;
    797         }
    798         return ++j;
    799     }
    800 
    801     private int readWords(int i, StringBuilder raw) {
    802         raw.setLength(0);
    803         i = skipWhiteSpace(i);
    804         for(;;) {
    805             if(i >= rules.length()) { return 0; }
    806             char c = rules.charAt(i);
    807             if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
    808                 if(raw.length() == 0) { return i; }
    809                 int lastIndex = raw.length() - 1;
    810                 if(raw.charAt(lastIndex) == ' ') {  // remove trailing space
    811                     raw.setLength(lastIndex);
    812                 }
    813                 return i;
    814             }
    815             if(PatternProps.isWhiteSpace(c)) {
    816                 raw.append(' ');
    817                 i = skipWhiteSpace(i + 1);
    818             } else {
    819                 raw.append(c);
    820                 ++i;
    821             }
    822         }
    823     }
    824 
    825     private int skipComment(int i) {
    826         // skip to past the newline
    827         while(i < rules.length()) {
    828             char c = rules.charAt(i++);
    829             // LF or FF or CR or NEL or LS or PS
    830             if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
    831                 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
    832                 // NLF (new line function) = CR or LF or CR+LF or NEL.
    833                 // No need to collect all of CR+LF because a following LF will be ignored anyway.
    834                 break;
    835             }
    836         }
    837         return i;
    838     }
    839 
    840     private void setParseError(String reason) throws ParseException {
    841         throw makeParseException(reason);
    842     }
    843 
    844     private void setParseError(String reason, Exception e) throws ParseException {
    845         ParseException newExc = makeParseException(reason + ": " + e.getMessage());
    846         newExc.initCause(e);
    847         throw newExc;
    848     }
    849 
    850     private ParseException makeParseException(String reason) {
    851         return new ParseException(appendErrorContext(reason), ruleIndex);
    852     }
    853 
    // Bounds the rule text quoted in error messages: appendErrorContext() shows
    // at most U_PARSE_CONTEXT_LEN - 1 chars before and after the error index.
    private static final int U_PARSE_CONTEXT_LEN = 16;
    855 
    856     // C++ setErrorContext()
    857     private String appendErrorContext(String reason) {
    858         // Note: This relies on the calling code maintaining the ruleIndex
    859         // at a position that is useful for debugging.
    860         // For example, at the beginning of a reset or relation etc.
    861         StringBuilder msg = new StringBuilder(reason);
    862         msg.append(" at index ").append(ruleIndex);
    863         // We are not counting line numbers.
    864 
    865         msg.append(" near \"");
    866         // before ruleIndex
    867         int start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
    868         if(start < 0) {
    869             start = 0;
    870         } else if(start > 0 && Character.isLowSurrogate(rules.charAt(start))) {
    871             ++start;
    872         }
    873         msg.append(rules, start, ruleIndex);
    874 
    875         msg.append('!');
    876         // starting from ruleIndex
    877         int length = rules.length() - ruleIndex;
    878         if(length >= U_PARSE_CONTEXT_LEN) {
    879             length = U_PARSE_CONTEXT_LEN - 1;
    880             if(Character.isHighSurrogate(rules.charAt(ruleIndex + length - 1))) {
    881                 --length;
    882             }
    883         }
    884         msg.append(rules, ruleIndex, ruleIndex + length);
    885         return msg.append('\"').toString();
    886     }
    887 
    888     /**
    889      * ASCII [:P:] and [:S:]:
    890      * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
    891      */
    892     private static boolean isSyntaxChar(int c) {
    893         return 0x21 <= c && c <= 0x7e &&
    894                 (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
    895                 (0x5b <= c && c <= 0x60) || (0x7b <= c));
    896     }
    897 
    898     private int skipWhiteSpace(int i) {
    899         while(i < rules.length() && PatternProps.isWhiteSpace(rules.charAt(i))) {
    900             ++i;
    901         }
    902         return i;
    903     }
    904 
    // NFD and NFC normalizer instances, fetched once when the parser is created.
    private Normalizer2 nfd = Normalizer2.getNFDInstance();
    private Normalizer2 nfc = Normalizer2.getNFCInstance();

    // The rule string currently being parsed; the scanning helpers index into it.
    private String rules;
    // Passed along to settings.setReordering() when a [reorder ...] setting is parsed.
    private final CollationData baseData;
    // Receives the reordering parsed from [reorder ...].
    private CollationSettings settings;

    // Callbacks: the sink receives [optimize set] and [suppressContractions set],
    // and the importer supplies the rule text for [import langTag].
    private Sink sink;
    private Importer importer;

    // Current position in rules; used as the error offset and for the
    // context snippet when a ParseException is created.
    private int ruleIndex;
    916 }
    917