Home | History | Annotate | Download | only in text
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 **********************************************************************
      5 *   Copyright (c) 2001-2011, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 */
      9 package com.ibm.icu.text;
     10 
     11 import java.text.ParsePosition;
     12 import java.util.ArrayList;
     13 import java.util.HashMap;
     14 import java.util.List;
     15 import java.util.Map;
     16 
     17 import com.ibm.icu.impl.IllegalIcuArgumentException;
     18 import com.ibm.icu.impl.PatternProps;
     19 import com.ibm.icu.impl.Utility;
     20 import com.ibm.icu.lang.UCharacter;
     21 import com.ibm.icu.text.RuleBasedTransliterator.Data;
     22 
     23 class TransliteratorParser {
     24 
     25     //----------------------------------------------------------------------
     26     // Data members
     27     //----------------------------------------------------------------------
     28 
     29     /**
     30      * PUBLIC data member.
     * A list of RuleBasedTransliterator.Data objects, one for each discrete group
     * of rules in the rule set.
     33      */
     34     public List<Data> dataVector;
     35 
     36     /**
     37      * PUBLIC data member.
     * A list of Strings containing all of the ID blocks in the rule set.
     39      */
     40     public List<String> idBlockVector;
     41 
     42     /**
     43      * The current data object for which we are parsing rules
     44      */
     45     private Data curData;
     46 
     47     /**
     48      * PUBLIC data member containing the parsed compound filter, if any.
     49      */
     50     public UnicodeSet compoundFilter;
     51 
     52 
     53     private int direction;
     54 
     55     /**
     56      * Temporary symbol table used during parsing.
     57      */
     58     private ParseData parseData;
     59 
     60     /**
     61      * Temporary vector of set variables.  When parsing is complete, this
     62      * is copied into the array data.variables.  As with data.variables,
     63      * element 0 corresponds to character data.variablesBase.
     64      */
     65     private List<Object> variablesVector;
     66 
     67     /**
     68      * Temporary table of variable names.  When parsing is complete, this is
     69      * copied into data.variableNames.
     70      */
     71     private Map<String, char[]> variableNames;
     72 
     73     /**
     74      * String of standins for segments.  Used during the parsing of a single
     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.get(0), etc.
     77      */
     78     private StringBuffer segmentStandins;
     79 
     80     /**
     * List of StringMatcher objects for segments.  Used during the
     * parsing of a single rule.
     * segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.get(0), etc.
     85      */
     86     private List<StringMatcher> segmentObjects;
     87 
     88     /**
     89      * The next available stand-in for variables.  This starts at some point in
     90      * the private use area (discovered dynamically) and increments up toward
     91      * <code>variableLimit</code>.  At any point during parsing, available
     92      * variables are <code>variableNext..variableLimit-1</code>.
     93      */
     94     private char variableNext;
     95 
     96     /**
     97      * The last available stand-in for variables.  This is discovered
     98      * dynamically.  At any point during parsing, available variables are
     99      * <code>variableNext..variableLimit-1</code>.  During variable definition
    100      * we use the special value variableLimit-1 as a placeholder.
    101      */
    102     private char variableLimit;
    103 
    104     /**
    105      * When we encounter an undefined variable, we do not immediately signal
    106      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
    107      * Instead, we save the name of the undefined variable, and substitute
    108      * in the placeholder char variableLimit - 1, and decrement
    109      * variableLimit.
    110      */
    111     private String undefinedVariableName;
    112 
    113     /**
    114      * The stand-in character for the 'dot' set, represented by '.' in
    115      * patterns.  This is allocated the first time it is needed, and
    116      * reused thereafter.
    117      */
    118     private int dotStandIn = -1;
    119 
    120     //----------------------------------------------------------------------
    121     // Constants
    122     //----------------------------------------------------------------------
    123 
    124     // Indicator for ID blocks
    125     private static final String ID_TOKEN = "::";
    126     private static final int ID_TOKEN_LEN = 2;
    127 
    128 /*
    129 (reserved for future expansion)
    130     // markers for beginning and end of rule groups
    131     private static final String BEGIN_TOKEN = "BEGIN";
    132     private static final String END_TOKEN = "END";
    133 */
    134 
    135     // Operators
    136     private static final char VARIABLE_DEF_OP   = '=';
    137     private static final char FORWARD_RULE_OP   = '>';
    138     private static final char REVERSE_RULE_OP   = '<';
    139     private static final char FWDREV_RULE_OP    = '~'; // internal rep of <> op
    140 
    141     private static final String OPERATORS = "=><\u2190\u2192\u2194";
    142     private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";
    143 
    144     // Other special characters
    145     private static final char QUOTE               = '\'';
    146     private static final char ESCAPE              = '\\';
    147     private static final char END_OF_RULE         = ';';
    148     private static final char RULE_COMMENT_CHAR   = '#';
    149 
    150     private static final char CONTEXT_ANTE        = '{'; // ante{key
    151     private static final char CONTEXT_POST        = '}'; // key}post
    152     private static final char CURSOR_POS          = '|';
    153     private static final char CURSOR_OFFSET       = '@';
    154     private static final char ANCHOR_START        = '^';
    155 
    156     private static final char KLEENE_STAR         = '*';
    157     private static final char ONE_OR_MORE         = '+';
    158     private static final char ZERO_OR_ONE         = '?';
    159 
    160     private static final char DOT                 = '.';
    161     private static final String DOT_SET           = "[^[:Zp:][:Zl:]\\r\\n$]";
    162 
    163     // By definition, the ANCHOR_END special character is a
    164     // trailing SymbolTable.SYMBOL_REF character.
    165     // private static final char ANCHOR_END       = '$';
    166 
    167     // Segments of the input string are delimited by "(" and ")".  In the
    168     // output string these segments are referenced as "$1", "$2", etc.
    169     private static final char SEGMENT_OPEN        = '(';
    170     private static final char SEGMENT_CLOSE       = ')';
    171 
    172     // A function is denoted &Source-Target/Variant(text)
    173     private static final char FUNCTION            = '&';
    174 
    175     // Aliases for some of the syntax characters. These are provided so
    176     // transliteration rules can be expressed in XML without clashing with
    177     // XML syntax characters '<', '>', and '&'.
    178     private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow
    179     private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow
    180     private static final char ALT_FWDREV_RULE_OP  = '\u2194'; // Left Right Arrow
    181     private static final char ALT_FUNCTION        = '\u2206'; // Increment (~Greek Capital Delta)
    182 
    183     // Special characters disallowed at the top level
    184     private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");
    185 
    186     // Special characters disallowed within a segment
    187     private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]");
    188 
    189     // Special characters disallowed within a function argument
    190     private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]");
    191 
    192     //----------------------------------------------------------------------
    193     // class ParseData
    194     //----------------------------------------------------------------------
    195 
    196     /**
    197      * This class implements the SymbolTable interface.  It is used
    198      * during parsing to give UnicodeSet access to variables that
    199      * have been defined so far.  Note that it uses variablesVector,
    200      * _not_ data.variables.
    201      */
    202     private class ParseData implements SymbolTable {
    203 
    204         /**
    205          * Implement SymbolTable API.
    206          */
    207         @Override
    208         public char[] lookup(String name) {
    209             return variableNames.get(name);
    210         }
    211 
    212         /**
    213          * Implement SymbolTable API.
    214          */
    215         @Override
    216         public UnicodeMatcher lookupMatcher(int ch) {
    217             // Note that we cannot use data.lookup() because the
    218             // set array has not been constructed yet.
    219             int i = ch - curData.variablesBase;
    220             if (i >= 0 && i < variablesVector.size()) {
    221                 return (UnicodeMatcher) variablesVector.get(i);
    222             }
    223             return null;
    224         }
    225 
    226         /**
    227          * Implement SymbolTable API.  Parse out a symbol reference
    228          * name.
    229          */
    230         @Override
    231         public String parseReference(String text, ParsePosition pos, int limit) {
    232             int start = pos.getIndex();
    233             int i = start;
    234             while (i < limit) {
    235                 char c = text.charAt(i);
    236                 if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) ||
    237                     !UCharacter.isUnicodeIdentifierPart(c)) {
    238                     break;
    239                 }
    240                 ++i;
    241             }
    242             if (i == start) { // No valid name chars
    243                 return null;
    244             }
    245             pos.setIndex(i);
    246             return text.substring(start, i);
    247         }
    248 
    249         /**
    250          * Return true if the given character is a matcher standin or a plain
    251          * character (non standin).
    252          */
    253         public boolean isMatcher(int ch) {
    254             // Note that we cannot use data.lookup() because the
    255             // set array has not been constructed yet.
    256             int i = ch - curData.variablesBase;
    257             if (i >= 0 && i < variablesVector.size()) {
    258                 return variablesVector.get(i) instanceof UnicodeMatcher;
    259             }
    260             return true;
    261         }
    262 
    263         /**
    264          * Return true if the given character is a replacer standin or a plain
    265          * character (non standin).
    266          */
    267         public boolean isReplacer(int ch) {
    268             // Note that we cannot use data.lookup() because the
    269             // set array has not been constructed yet.
    270             int i = ch - curData.variablesBase;
    271             if (i >= 0 && i < variablesVector.size()) {
    272                 return variablesVector.get(i) instanceof UnicodeReplacer;
    273             }
    274             return true;
    275         }
    276     }
    277 
    278     //----------------------------------------------------------------------
    279     // classes RuleBody, RuleArray, and RuleReader
    280     //----------------------------------------------------------------------
    281 
    282     /**
    283      * A private abstract class representing the interface to rule
    284      * source code that is broken up into lines.  Handles the
    285      * folding of lines terminated by a backslash.  This folding
    286      * is limited; it does not account for comments, quotes, or
    287      * escapes, so its use to be limited.
    288      */
    289     private static abstract class RuleBody {
    290 
    291         /**
    292          * Retrieve the next line of the source, or return null if
    293          * none.  Folds lines terminated by a backslash into the
    294          * next line, without regard for comments, quotes, or
    295          * escapes.
    296          */
    297         String nextLine() {
    298             String s = handleNextLine();
    299             if (s != null &&
    300                 s.length() > 0 &&
    301                 s.charAt(s.length() - 1) == '\\') {
    302                 StringBuilder b = new StringBuilder(s);
    303                 do {
    304                     b.deleteCharAt(b.length()-1);
    305                     s = handleNextLine();
    306                     if (s == null) {
    307                         break;
    308                     }
    309                     b.append(s);
    310                 } while (s.length() > 0 &&
    311                          s.charAt(s.length() - 1) == '\\');
    312                 s = b.toString();
    313             }
    314             return s;
    315         }
    316 
    317         /**
    318          * Reset to the first line of the source.
    319          */
    320         abstract void reset();
    321 
    322         /**
    323          * Subclass method to return the next line of the source.
    324          */
    325         abstract String handleNextLine();
    326     }
    327 
    328     /**
    329      * RuleBody subclass for a String[] array.
    330      */
    331     private static class RuleArray extends RuleBody {
    332         String[] array;
    333         int i;
    334         public RuleArray(String[] array) { this.array = array; i = 0; }
    335         @Override
    336         public String handleNextLine() {
    337             return (i < array.length) ? array[i++] : null;
    338         }
    339         @Override
    340         public void reset() {
    341             i = 0;
    342         }
    343     }
    344 
    345     /*
    346      * RuleBody subclass for a ResourceReader.
    347      */
    348 /*    private static class RuleReader extends RuleBody {
    349         ResourceReader reader;
    350         public RuleReader(ResourceReader reader) { this.reader = reader; }
    351         public String handleNextLine() {
    352             try {
    353                 return reader.readLine();
    354             } catch (java.io.IOException e) {}
    355             return null;
    356         }
    357         public void reset() {
    358             reader.reset();
    359         }
    360     }*/
    361 
    362     //----------------------------------------------------------------------
    363     // class RuleHalf
    364     //----------------------------------------------------------------------
    365 
    366     /**
    367      * A class representing one side of a rule.  This class knows how to
    368      * parse half of a rule.  It is tightly coupled to the method
    369      * TransliteratorParser.parseRule().
    370      */
    371     private static class RuleHalf {
    372 
    373         public String text;
    374 
    375         public int cursor = -1; // position of cursor in text
    376         public int ante = -1;   // position of ante context marker '{' in text
    377         public int post = -1;   // position of post context marker '}' in text
    378 
    379         // Record the offset to the cursor either to the left or to the
    380         // right of the key.  This is indicated by characters on the output
    381         // side that allow the cursor to be positioned arbitrarily within
    382         // the matching text.  For example, abc{def} > | @@@ xyz; changes
    383         // def to xyz and moves the cursor to before abc.  Offset characters
    384         // must be at the start or end, and they cannot move the cursor past
    385         // the ante- or postcontext text.  Placeholders are only valid in
    386         // output text.  The length of the ante and post context is
    387         // determined at runtime, because of supplementals and quantifiers.
    388         public int cursorOffset = 0; // only nonzero on output side
    389 
    390         // Position of first CURSOR_OFFSET on _right_.  This will be -1
    391         // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
    392         private int cursorOffsetPos = 0;
    393 
    394         public boolean anchorStart = false;
    395         public boolean anchorEnd   = false;
    396 
    397         /**
    398          * The segment number from 1..n of the next '(' we see
    399          * during parsing; 1-based.
    400          */
    401         private int nextSegmentNumber = 1;
    402 
    403         /**
    404          * Parse one side of a rule, stopping at either the limit,
    405          * the END_OF_RULE character, or an operator.
    406          * @return the index after the terminating character, or
    407          * if limit was reached, limit
    408          */
    409         public int parse(String rule, int pos, int limit,
    410                          TransliteratorParser parser) {
    411             int start = pos;
    412             StringBuffer buf = new StringBuffer();
    413             pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);
    414             text = buf.toString();
    415 
    416             if (cursorOffset > 0 && cursor != cursorOffsetPos) {
    417                 syntaxError("Misplaced " + CURSOR_POS, rule, start);
    418             }
    419 
    420             return pos;
    421         }
    422 
    423         /**
    424          * Parse a section of one side of a rule, stopping at either
    425          * the limit, the END_OF_RULE character, an operator, or a
    426          * segment close character.  This method parses both a
    427          * top-level rule half and a segment within such a rule half.
    428          * It calls itself recursively to parse segments and nested
    429          * segments.
    430          * @param buf buffer into which to accumulate the rule pattern
    431          * characters, either literal characters from the rule or
    432          * standins for UnicodeMatcher objects including segments.
    433          * @param illegal the set of special characters that is illegal during
    434          * this parse.
    435          * @param isSegment if true, then we've already seen a '(' and
    436          * pos on entry points right after it.  Accumulate everything
    437          * up to the closing ')', put it in a segment matcher object,
    438          * generate a standin for it, and add the standin to buf.  As
    439          * a side effect, update the segments vector with a reference
    440          * to the segment matcher.  This works recursively for nested
    441          * segments.  If isSegment is false, just accumulate
    442          * characters into buf.
    443          * @return the index after the terminating character, or
    444          * if limit was reached, limit
    445          */
    446         private int parseSection(String rule, int pos, int limit,
    447                                  TransliteratorParser parser,
    448                                  StringBuffer buf,
    449                                  UnicodeSet illegal,
    450                                  boolean isSegment) {
    451             int start = pos;
    452             ParsePosition pp = null;
    453             int quoteStart = -1; // Most recent 'single quoted string'
    454             int quoteLimit = -1;
    455             int varStart = -1; // Most recent $variableReference
    456             int varLimit = -1;
    457             int[] iref = new int[1];
    458             int bufStart = buf.length();
    459 
    460         main:
    461             while (pos < limit) {
    462                 // Since all syntax characters are in the BMP, fetching
    463                 // 16-bit code units suffices here.
    464                 char c = rule.charAt(pos++);
    465                 if (PatternProps.isWhiteSpace(c)) {
    466                     continue;
    467                 }
    468                 // HALF_ENDERS is all chars that end a rule half: "<>=;"
    469                 if (HALF_ENDERS.indexOf(c) >= 0) {
    470                     ///CLOVER:OFF
    471                     // isSegment is always false
    472                     if (isSegment) {
    473                         syntaxError("Unclosed segment", rule, start);
    474                     }
    475                     ///CLOVER:ON
    476                     break main;
    477                 }
    478                 if (anchorEnd) {
    479                     // Text after a presumed end anchor is a syntax err
    480                     syntaxError("Malformed variable reference", rule, start);
    481                 }
    482                 if (UnicodeSet.resemblesPattern(rule, pos-1)) {
    483                     if (pp == null) {
    484                         pp = new ParsePosition(0);
    485                     }
    486                     pp.setIndex(pos-1); // Backup to opening '['
    487                     buf.append(parser.parseSet(rule, pp));
    488                     pos = pp.getIndex();
    489                     continue;
    490                 }
    491                 // Handle escapes
    492                 if (c == ESCAPE) {
    493                     if (pos == limit) {
    494                         syntaxError("Trailing backslash", rule, start);
    495                     }
    496                     iref[0] = pos;
    497                     int escaped = Utility.unescapeAt(rule, iref);
    498                     pos = iref[0];
    499                     if (escaped == -1) {
    500                         syntaxError("Malformed escape", rule, start);
    501                     }
    502                     parser.checkVariableRange(escaped, rule, start);
    503                     UTF16.append(buf, escaped);
    504                     continue;
    505                 }
    506                 // Handle quoted matter
    507                 if (c == QUOTE) {
    508                     int iq = rule.indexOf(QUOTE, pos);
    509                     if (iq == pos) {
    510                         buf.append(c); // Parse [''] outside quotes as [']
    511                         ++pos;
    512                     } else {
    513                         /* This loop picks up a run of quoted text of the
    514                          * form 'aaaa' each time through.  If this run
    515                          * hasn't really ended ('aaaa''bbbb') then it keeps
    516                          * looping, each time adding on a new run.  When it
    517                          * reaches the final quote it breaks.
    518                          */
    519                         quoteStart = buf.length();
    520                         for (;;) {
    521                             if (iq < 0) {
    522                                 syntaxError("Unterminated quote", rule, start);
    523                             }
    524                             buf.append(rule.substring(pos, iq));
    525                             pos = iq+1;
    526                             if (pos < limit && rule.charAt(pos) == QUOTE) {
    527                             // Parse [''] inside quotes as [']
    528                                 iq = rule.indexOf(QUOTE, pos+1);
    529                             // Continue looping
    530                             } else {
    531                                 break;
    532                             }
    533                         }
    534                         quoteLimit = buf.length();
    535 
    536                         for (iq=quoteStart; iq<quoteLimit; ++iq) {
    537                             parser.checkVariableRange(buf.charAt(iq), rule, start);
    538                         }
    539                     }
    540                     continue;
    541                 }
    542 
    543                 parser.checkVariableRange(c, rule, start);
    544 
    545                 if (illegal.contains(c)) {
    546                     syntaxError("Illegal character '" + c + '\'', rule, start);
    547                 }
    548 
    549                 switch (c) {
    550 
    551                 //------------------------------------------------------
    552                 // Elements allowed within and out of segments
    553                 //------------------------------------------------------
    554                 case ANCHOR_START:
    555                     if (buf.length() == 0 && !anchorStart) {
    556                         anchorStart = true;
    557                     } else {
    558                         syntaxError("Misplaced anchor start",
    559                                     rule, start);
    560                     }
    561                     break;
    562                 case SEGMENT_OPEN:
    563                     {
    564                         // bufSegStart is the offset in buf to the first
    565                         // character of the segment we are parsing.
    566                         int bufSegStart = buf.length();
    567 
    568                         // Record segment number now, since nextSegmentNumber
    569                         // will be incremented during the call to parseSection
    570                         // if there are nested segments.
    571                         int segmentNumber = nextSegmentNumber++; // 1-based
    572 
    573                         // Parse the segment
    574                         pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);
    575 
    576                         // After parsing a segment, the relevant characters are
    577                         // in buf, starting at offset bufSegStart.  Extract them
    578                         // into a string matcher, and replace them with a
    579                         // standin for that matcher.
    580                         StringMatcher m =
    581                             new StringMatcher(buf.substring(bufSegStart),
    582                                               segmentNumber, parser.curData);
    583 
    584                         // Record and associate object and segment number
    585                         parser.setSegmentObject(segmentNumber, m);
    586                         buf.setLength(bufSegStart);
    587                         buf.append(parser.getSegmentStandin(segmentNumber));
    588                     }
    589                     break;
    590                 case FUNCTION:
    591                 case ALT_FUNCTION:
    592                     {
    593                         iref[0] = pos;
    594                         TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
    595                         // The next character MUST be a segment open
    596                         if (single == null ||
    597                             !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
    598                             syntaxError("Invalid function", rule, start);
    599                         }
    600 
    601                         Transliterator t = single.getInstance();
    602                         if (t == null) {
    603                             syntaxError("Invalid function ID", rule, start);
    604                         }
    605 
    606                         // bufSegStart is the offset in buf to the first
    607                         // character of the segment we are parsing.
    608                         int bufSegStart = buf.length();
    609 
    610                         // Parse the segment
    611                         pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);
    612 
    613                         // After parsing a segment, the relevant characters are
    614                         // in buf, starting at offset bufSegStart.
    615                         FunctionReplacer r =
    616                             new FunctionReplacer(t,
    617                                 new StringReplacer(buf.substring(bufSegStart), parser.curData));
    618 
    619                         // Replace the buffer contents with a stand-in
    620                         buf.setLength(bufSegStart);
    621                         buf.append(parser.generateStandInFor(r));
    622                     }
    623                     break;
    624                 case SymbolTable.SYMBOL_REF:
    625                     // Handle variable references and segment references "$1" .. "$9"
    626                     {
    627                         // A variable reference must be followed immediately
    628                         // by a Unicode identifier start and zero or more
    629                         // Unicode identifier part characters, or by a digit
    630                         // 1..9 if it is a segment reference.
    631                         if (pos == limit) {
    632                             // A variable ref character at the end acts as
    633                             // an anchor to the context limit, as in perl.
    634                             anchorEnd = true;
    635                             break;
    636                         }
    637                         // Parse "$1" "$2" .. "$9" .. (no upper limit)
    638                         c = rule.charAt(pos);
    639                         int r = UCharacter.digit(c, 10);
    640                         if (r >= 1 && r <= 9) {
    641                             iref[0] = pos;
    642                             r = Utility.parseNumber(rule, iref, 10);
    643                             if (r < 0) {
    644                                 syntaxError("Undefined segment reference",
    645                                             rule, start);
    646                             }
    647                             pos = iref[0];
    648                             buf.append(parser.getSegmentStandin(r));
    649                         } else {
    650                             if (pp == null) { // Lazy create
    651                                 pp = new ParsePosition(0);
    652                             }
    653                             pp.setIndex(pos);
    654                             String name = parser.parseData.
    655                                 parseReference(rule, pp, limit);
    656                             if (name == null) {
    657                                 // This means the '$' was not followed by a
    658                                 // valid name.  Try to interpret it as an
    659                                 // end anchor then.  If this also doesn't work
    660                                 // (if we see a following character) then signal
    661                                 // an error.
    662                                 anchorEnd = true;
    663                                 break;
    664                             }
    665                             pos = pp.getIndex();
    666                             // If this is a variable definition statement,
    667                             // then the LHS variable will be undefined.  In
    668                             // that case appendVariableDef() will append the
    669                             // special placeholder char variableLimit-1.
    670                             varStart = buf.length();
    671                             parser.appendVariableDef(name, buf);
    672                             varLimit = buf.length();
    673                         }
    674                     }
    675                     break;
    676                 case DOT:
    677                     buf.append(parser.getDotStandIn());
    678                     break;
    679                 case KLEENE_STAR:
    680                 case ONE_OR_MORE:
    681                 case ZERO_OR_ONE:
    682                     // Quantifiers.  We handle single characters, quoted strings,
    683                     // variable references, and segments.
    684                     //  a+      matches  aaa
    685                     //  'foo'+  matches  foofoofoo
    686                     //  $v+     matches  xyxyxy if $v == xy
    687                     //  (seg)+  matches  segsegseg
    688                     {
    689                         ///CLOVER:OFF
    690                         // isSegment is always false
    691                         if (isSegment && buf.length() == bufStart) {
    692                             // The */+ immediately follows '('
    693                             syntaxError("Misplaced quantifier", rule, start);
    694                             break;
    695                         }
    696                         ///CLOVER:ON
    697 
    698                         int qstart, qlimit;
    699                         // The */+ follows an isolated character or quote
    700                         // or variable reference
    701                         if (buf.length() == quoteLimit) {
    702                             // The */+ follows a 'quoted string'
    703                             qstart = quoteStart;
    704                             qlimit = quoteLimit;
    705                         } else if (buf.length() == varLimit) {
    706                             // The */+ follows a $variableReference
    707                             qstart = varStart;
    708                             qlimit = varLimit;
    709                         } else {
    710                             // The */+ follows a single character, possibly
    711                             // a segment standin
    712                             qstart = buf.length() - 1;
    713                             qlimit = qstart + 1;
    714                         }
    715 
    716                         UnicodeMatcher m;
    717                         try {
    718                             m = new StringMatcher(buf.toString(), qstart, qlimit,
    719                                               0, parser.curData);
    720                         } catch (RuntimeException e) {
    721                             final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos);
    722                             final String postContext = limit-pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos+50) + "...";
    723                             throw new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$"
    724                                     + postContext).initCause(e);
    725                         }
    726                         int min = 0;
    727                         int max = Quantifier.MAX;
    728                         switch (c) {
    729                         case ONE_OR_MORE:
    730                             min = 1;
    731                             break;
    732                         case ZERO_OR_ONE:
    733                             min = 0;
    734                             max = 1;
    735                             break;
    736                             // case KLEENE_STAR:
    737                             //    do nothing -- min, max already set
    738                         }
    739                         m = new Quantifier(m, min, max);
    740                         buf.setLength(qstart);
    741                         buf.append(parser.generateStandInFor(m));
    742                     }
    743                     break;
    744 
    745                 //------------------------------------------------------
    746                 // Elements allowed ONLY WITHIN segments
    747                 //------------------------------------------------------
    748                 case SEGMENT_CLOSE:
    749                     // assert(isSegment);
    750                     // We're done parsing a segment.
    751                     break main;
    752 
    753                 //------------------------------------------------------
    754                 // Elements allowed ONLY OUTSIDE segments
    755                 //------------------------------------------------------
    756                 case CONTEXT_ANTE:
    757                     if (ante >= 0) {
    758                         syntaxError("Multiple ante contexts", rule, start);
    759                     }
    760                     ante = buf.length();
    761                     break;
    762                 case CONTEXT_POST:
    763                     if (post >= 0) {
    764                         syntaxError("Multiple post contexts", rule, start);
    765                     }
    766                     post = buf.length();
    767                     break;
    768                 case CURSOR_POS:
    769                     if (cursor >= 0) {
    770                         syntaxError("Multiple cursors", rule, start);
    771                     }
    772                     cursor = buf.length();
    773                     break;
    774                 case CURSOR_OFFSET:
    775                     if (cursorOffset < 0) {
    776                         if (buf.length() > 0) {
    777                             syntaxError("Misplaced " + c, rule, start);
    778                         }
    779                         --cursorOffset;
    780                     } else if (cursorOffset > 0) {
    781                         if (buf.length() != cursorOffsetPos || cursor >= 0) {
    782                             syntaxError("Misplaced " + c, rule, start);
    783                         }
    784                         ++cursorOffset;
    785                     } else {
    786                         if (cursor == 0 && buf.length() == 0) {
    787                             cursorOffset = -1;
    788                         } else if (cursor < 0) {
    789                             cursorOffsetPos = buf.length();
    790                             cursorOffset = 1;
    791                         } else {
    792                             syntaxError("Misplaced " + c, rule, start);
    793                         }
    794                     }
    795                     break;
    796 
    797                 //------------------------------------------------------
    798                 // Non-special characters
    799                 //------------------------------------------------------
    800                 default:
    801                     // Disallow unquoted characters other than [0-9A-Za-z]
    802                     // in the printable ASCII range.  These characters are
    803                     // reserved for possible future use.
    804                     if (c >= 0x0021 && c <= 0x007E &&
    805                         !((c >= '0' && c <= '9') ||
    806                           (c >= 'A' && c <= 'Z') ||
    807                           (c >= 'a' && c <= 'z'))) {
    808                         syntaxError("Unquoted " + c, rule, start);
    809                     }
    810                     buf.append(c);
    811                     break;
    812                 }
    813             }
    814             return pos;
    815         }
    816 
    817         /**
    818          * Remove context.
    819          */
    820         void removeContext() {
    821             text = text.substring(ante < 0 ? 0 : ante,
    822                                   post < 0 ? text.length() : post);
    823             ante = post = -1;
    824             anchorStart = anchorEnd = false;
    825         }
    826 
    827         /**
    828          * Return true if this half looks like valid output, that is, does not
    829          * contain quantifiers or other special input-only elements.
    830          */
    831         public boolean isValidOutput(TransliteratorParser parser) {
    832             for (int i=0; i<text.length(); ) {
    833                 int c = UTF16.charAt(text, i);
    834                 i += UTF16.getCharCount(c);
    835                 if (!parser.parseData.isReplacer(c)) {
    836                     return false;
    837                 }
    838             }
    839             return true;
    840         }
    841 
    842         /**
    843          * Return true if this half looks like valid input, that is, does not
    844          * contain functions or other special output-only elements.
    845          */
    846         public boolean isValidInput(TransliteratorParser parser) {
    847             for (int i=0; i<text.length(); ) {
    848                 int c = UTF16.charAt(text, i);
    849                 i += UTF16.getCharCount(c);
    850                 if (!parser.parseData.isMatcher(c)) {
    851                     return false;
    852                 }
    853             }
    854             return true;
    855         }
    856     }
    857 
    858     //----------------------------------------------------------------------
    859     // PUBLIC methods
    860     //----------------------------------------------------------------------
    861 
    862     /**
    863      * Constructor.
    864      */
    865     public TransliteratorParser() {
    866     }
    867 
    868     /**
    869      * Parse a set of rules.  After the parse completes, examine the public
    870      * data members for results.
    871      */
    872     public void parse(String rules, int dir) {
    873         parseRules(new RuleArray(new String[] { rules }), dir);
    874     }
    875 
    876     /*
    877      * Parse a set of rules.  After the parse completes, examine the public
    878      * data members for results.
    879      */
    880 /*    public void parse(ResourceReader rules, int direction) {
    881         parseRules(new RuleReader(rules), direction);
    882     }*/
    883 
    884     //----------------------------------------------------------------------
    885     // PRIVATE methods
    886     //----------------------------------------------------------------------
    887 
    888     /**
    889      * Parse an array of zero or more rules.  The strings in the array are
    890      * treated as if they were concatenated together, with rule terminators
    891      * inserted between array elements if not present already.
    892      *
    893      * Any previous rules are discarded.  Typically this method is called exactly
    894      * once, during construction.
    895      *
    896      * The member this.data will be set to null if there are no rules.
    897      *
    898      * @exception IllegalIcuArgumentException if there is a syntax error in the
    899      * rules
    900      */
    901     void parseRules(RuleBody ruleArray, int dir) {
    902         boolean parsingIDs = true;
    903         int ruleCount = 0;
    904 
    905         dataVector = new ArrayList<Data>();
    906         idBlockVector = new ArrayList<String>();
    907         curData = null;
    908         direction = dir;
    909         compoundFilter = null;
    910         variablesVector = new ArrayList<Object>();
    911         variableNames = new HashMap<String, char[]>();
    912         parseData = new ParseData();
    913 
    914         List<RuntimeException> errors = new ArrayList<RuntimeException>();
    915         int errorCount = 0;
    916 
    917         ruleArray.reset();
    918 
    919         StringBuilder idBlockResult = new StringBuilder();
    920 
    921         // The compound filter offset is an index into idBlockResult.
    922         // If it is 0, then the compound filter occurred at the start,
    923         // and it is the offset to the _start_ of the compound filter
    924         // pattern.  Otherwise it is the offset to the _limit_ of the
    925         // compound filter pattern within idBlockResult.
    926         this.compoundFilter = null;
    927         int compoundFilterOffset = -1;
    928 
    929     main:
    930         for (;;) {
    931             String rule = ruleArray.nextLine();
    932             if (rule == null) {
    933                 break;
    934             }
    935             int pos = 0;
    936             int limit = rule.length();
    937             while (pos < limit) {
    938                 char c = rule.charAt(pos++);
    939                 if (PatternProps.isWhiteSpace(c)) {
    940                     continue;
    941                 }
    942                 // Skip lines starting with the comment character
    943                 if (c == RULE_COMMENT_CHAR) {
    944                     pos = rule.indexOf("\n", pos) + 1;
    945                     if (pos == 0) {
    946                         break; // No "\n" found; rest of rule is a commnet
    947                     }
    948                     continue; // Either fall out or restart with next line
    949                 }
    950 
    951                 // skip empty rules
    952                 if (c == END_OF_RULE)
    953                     continue;
    954 
    955                 // Often a rule file contains multiple errors.  It's
    956                 // convenient to the rule author if these are all reported
    957                 // at once.  We keep parsing rules even after a failure, up
    958                 // to a specified limit, and report all errors at once.
    959                 try {
    960                     ++ruleCount;
    961 
    962                     // We've found the start of a rule or ID.  c is its first
    963                     // character, and pos points past c.
    964                     --pos;
    965                     // Look for an ID token.  Must have at least ID_TOKEN_LEN + 1
    966                     // chars left.
    967                     if ((pos + ID_TOKEN_LEN + 1) <= limit &&
    968                             rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
    969                         pos += ID_TOKEN_LEN;
    970                         c = rule.charAt(pos);
    971                         while (PatternProps.isWhiteSpace(c) && pos < limit) {
    972                             ++pos;
    973                             c = rule.charAt(pos);
    974                         }
    975                         int[] p = new int[] { pos };
    976 
    977                         if (!parsingIDs) {
    978                             if (curData != null) {
    979                                 if (direction == Transliterator.FORWARD)
    980                                     dataVector.add(curData);
    981                                 else
    982                                     dataVector.add(0, curData);
    983                                 curData = null;
    984                             }
    985                             parsingIDs = true;
    986                         }
    987 
    988                         TransliteratorIDParser.SingleID id =
    989                             TransliteratorIDParser.parseSingleID(
    990                                           rule, p, direction);
    991                         if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) {
    992                             // Successful ::ID parse.
    993 
    994                             if (direction == Transliterator.FORWARD) {
    995                                 idBlockResult.append(id.canonID).append(END_OF_RULE);
    996                             } else {
    997                                 idBlockResult.insert(0, id.canonID + END_OF_RULE);
    998                             }
    999 
   1000                         } else {
   1001                             // Couldn't parse an ID.  Try to parse a global filter
   1002                             int[] withParens = new int[] { -1 };
   1003                             UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null);
   1004                             if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) {
   1005                                 if ((direction == Transliterator.FORWARD) ==
   1006                                     (withParens[0] == 0)) {
   1007                                     if (compoundFilter != null) {
   1008                                         // Multiple compound filters
   1009                                         syntaxError("Multiple global filters", rule, pos);
   1010                                     }
   1011                                     compoundFilter = f;
   1012                                     compoundFilterOffset = ruleCount;
   1013                                }
   1014                             } else {
   1015                                 // Invalid ::id
   1016                                 // Can be parsed as neither an ID nor a global filter
   1017                                 syntaxError("Invalid ::ID", rule, pos);
   1018                             }
   1019                         }
   1020 
   1021                         pos = p[0];
   1022                     } else {
   1023                         if (parsingIDs) {
   1024                             if (direction == Transliterator.FORWARD)
   1025                                 idBlockVector.add(idBlockResult.toString());
   1026                             else
   1027                                 idBlockVector.add(0, idBlockResult.toString());
   1028                             idBlockResult.delete(0, idBlockResult.length());
   1029                             parsingIDs = false;
   1030                             curData = new RuleBasedTransliterator.Data();
   1031 
   1032                             // By default, rules use part of the private use area
   1033                             // E000..F8FF for variables and other stand-ins.  Currently
   1034                             // the range F000..F8FF is typically sufficient.  The 'use
   1035                             // variable range' pragma allows rule sets to modify this.
   1036                             setVariableRange(0xF000, 0xF8FF);
   1037                         }
   1038 
   1039                         if (resemblesPragma(rule, pos, limit)) {
   1040                             int ppp = parsePragma(rule, pos, limit);
   1041                             if (ppp < 0) {
   1042                                 syntaxError("Unrecognized pragma", rule, pos);
   1043                             }
   1044                             pos = ppp;
   1045                         // Parse a rule
   1046                         } else {
   1047                             pos = parseRule(rule, pos, limit);
   1048                         }
   1049                     }
   1050                 } catch (IllegalArgumentException e) {
   1051                     if (errorCount == 30) {
   1052                         IllegalIcuArgumentException icuEx = new IllegalIcuArgumentException("\nMore than 30 errors; further messages squelched");
   1053                         icuEx.initCause(e);
   1054                         errors.add(icuEx);
   1055                         break main;
   1056                     }
   1057                     e.fillInStackTrace();
   1058                     errors.add(e);
   1059                     ++errorCount;
   1060                     pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
   1061                 }
   1062             }
   1063         }
   1064         if (parsingIDs && idBlockResult.length() > 0) {
   1065             if (direction == Transliterator.FORWARD)
   1066                 idBlockVector.add(idBlockResult.toString());
   1067             else
   1068                 idBlockVector.add(0, idBlockResult.toString());
   1069         }
   1070         else if (!parsingIDs && curData != null) {
   1071             if (direction == Transliterator.FORWARD)
   1072                 dataVector.add(curData);
   1073             else
   1074                 dataVector.add(0, curData);
   1075         }
   1076 
   1077         // Convert the set vector to an array
   1078         for (int i = 0; i < dataVector.size(); i++) {
   1079             Data data = dataVector.get(i);
   1080             data.variables = new Object[variablesVector.size()];
   1081             variablesVector.toArray(data.variables);
   1082             data.variableNames = new HashMap<String, char[]>();
   1083             data.variableNames.putAll(variableNames);
   1084         }
   1085         variablesVector = null;
   1086 
   1087         // Do more syntax checking and index the rules
   1088         try {
   1089             if (compoundFilter != null) {
   1090                 if ((direction == Transliterator.FORWARD &&
   1091                      compoundFilterOffset != 1) ||
   1092                     (direction == Transliterator.REVERSE &&
   1093                      compoundFilterOffset != ruleCount)) {
   1094                     throw new IllegalIcuArgumentException("Compound filters misplaced");
   1095                 }
   1096             }
   1097 
   1098             for (int i = 0; i < dataVector.size(); i++) {
   1099                 Data data = dataVector.get(i);
   1100                 data.ruleSet.freeze();
   1101             }
   1102 
   1103             if (idBlockVector.size() == 1 && (idBlockVector.get(0)).length() == 0)
   1104                 idBlockVector.remove(0);
   1105 
   1106         } catch (IllegalArgumentException e) {
   1107             e.fillInStackTrace();
   1108             errors.add(e);
   1109         }
   1110 
   1111         if (errors.size() != 0) {
   1112             for (int i = errors.size()-1; i > 0; --i) {
   1113                 RuntimeException previous = errors.get(i-1);
   1114                 while (previous.getCause() != null) {
   1115                     previous = (RuntimeException) previous.getCause(); // chain specially
   1116                 }
   1117                 previous.initCause(errors.get(i));
   1118             }
   1119             throw errors.get(0);
   1120             // if initCause not supported: throw new IllegalArgumentException(errors.toString());
   1121         }
   1122     }
   1123 
   1124     /**
   1125      * MAIN PARSER.  Parse the next rule in the given rule string, starting
   1126      * at pos.  Return the index after the last character parsed.  Do not
   1127      * parse characters at or after limit.
   1128      *
   1129      * Important:  The character at pos must be a non-whitespace character
   1130      * that is not the comment character.
   1131      *
   1132      * This method handles quoting, escaping, and whitespace removal.  It
   1133      * parses the end-of-rule character.  It recognizes context and cursor
   1134      * indicators.  Once it does a lexical breakdown of the rule at pos, it
   1135      * creates a rule object and adds it to our rule list.
   1136      *
   1137      * This method is tightly coupled to the inner class RuleHalf.
   1138      */
   1139     private int parseRule(String rule, int pos, int limit) {
   1140         // Locate the left side, operator, and right side
   1141         int start = pos;
   1142         char operator = 0;
   1143 
   1144         // Set up segments data
   1145         segmentStandins = new StringBuffer();
   1146         segmentObjects = new ArrayList<StringMatcher>();
   1147 
   1148         RuleHalf left  = new RuleHalf();
   1149         RuleHalf right = new RuleHalf();
   1150 
   1151         undefinedVariableName = null;
   1152         pos = left.parse(rule, pos, limit, this);
   1153 
   1154         if (pos == limit ||
   1155             OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
   1156             syntaxError("No operator pos=" + pos, rule, start);
   1157         }
   1158         ++pos;
   1159 
   1160         // Found an operator char.  Check for forward-reverse operator.
   1161         if (operator == REVERSE_RULE_OP &&
   1162             (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
   1163             ++pos;
   1164             operator = FWDREV_RULE_OP;
   1165         }
   1166 
   1167         // Translate alternate op characters.
   1168         switch (operator) {
   1169         case ALT_FORWARD_RULE_OP:
   1170             operator = FORWARD_RULE_OP;
   1171             break;
   1172         case ALT_REVERSE_RULE_OP:
   1173             operator = REVERSE_RULE_OP;
   1174             break;
   1175         case ALT_FWDREV_RULE_OP:
   1176             operator = FWDREV_RULE_OP;
   1177             break;
   1178         }
   1179 
   1180         pos = right.parse(rule, pos, limit, this);
   1181 
   1182         if (pos < limit) {
   1183             if (rule.charAt(--pos) == END_OF_RULE) {
   1184                 ++pos;
   1185             } else {
   1186                 // RuleHalf parser must have terminated at an operator
   1187                 syntaxError("Unquoted operator", rule, start);
   1188             }
   1189         }
   1190 
   1191         if (operator == VARIABLE_DEF_OP) {
   1192             // LHS is the name.  RHS is a single character, either a literal
   1193             // or a set (already parsed).  If RHS is longer than one
   1194             // character, it is either a multi-character string, or multiple
   1195             // sets, or a mixture of chars and sets -- syntax error.
   1196 
   1197             // We expect to see a single undefined variable (the one being
   1198             // defined).
   1199             if (undefinedVariableName == null) {
   1200                 syntaxError("Missing '$' or duplicate definition", rule, start);
   1201             }
   1202             if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
   1203                 syntaxError("Malformed LHS", rule, start);
   1204             }
   1205             if (left.anchorStart || left.anchorEnd ||
   1206                 right.anchorStart || right.anchorEnd) {
   1207                 syntaxError("Malformed variable def", rule, start);
   1208             }
   1209             // We allow anything on the right, including an empty string.
   1210             int n = right.text.length();
   1211             char[] value = new char[n];
   1212             right.text.getChars(0, n, value, 0);
   1213             variableNames.put(undefinedVariableName, value);
   1214 
   1215             ++variableLimit;
   1216             return pos;
   1217         }
   1218 
   1219         // If this is not a variable definition rule, we shouldn't have
   1220         // any undefined variable names.
   1221         if (undefinedVariableName != null) {
   1222             syntaxError("Undefined variable $" + undefinedVariableName,
   1223                         rule, start);
   1224         }
   1225 
   1226         // Verify segments
   1227         if (segmentStandins.length() > segmentObjects.size()) {
   1228             syntaxError("Undefined segment reference", rule, start);
   1229         }
   1230         for (int i=0; i<segmentStandins.length(); ++i) {
   1231             if (segmentStandins.charAt(i) == 0) {
   1232                 syntaxError("Internal error", rule, start); // will never happen
   1233             }
   1234         }
   1235         for (int i=0; i<segmentObjects.size(); ++i) {
   1236             if (segmentObjects.get(i) == null) {
   1237                 syntaxError("Internal error", rule, start); // will never happen
   1238             }
   1239         }
   1240 
   1241         // If the direction we want doesn't match the rule
   1242         // direction, do nothing.
   1243         if (operator != FWDREV_RULE_OP &&
   1244             ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
   1245             return pos;
   1246         }
   1247 
   1248         // Transform the rule into a forward rule by swapping the
   1249         // sides if necessary.
   1250         if (direction == Transliterator.REVERSE) {
   1251             RuleHalf temp = left;
   1252             left = right;
   1253             right = temp;
   1254         }
   1255 
   1256         // Remove non-applicable elements in forward-reverse
   1257         // rules.  Bidirectional rules ignore elements that do not
   1258         // apply.
   1259         if (operator == FWDREV_RULE_OP) {
   1260             right.removeContext();
   1261             left.cursor = -1;
   1262             left.cursorOffset = 0;
   1263         }
   1264 
   1265         // Normalize context
   1266         if (left.ante < 0) {
   1267             left.ante = 0;
   1268         }
   1269         if (left.post < 0) {
   1270             left.post = left.text.length();
   1271         }
   1272 
   1273         // Context is only allowed on the input side.  Cursors are only
   1274         // allowed on the output side.  Segment delimiters can only appear
   1275         // on the left, and references on the right.  Cursor offset
   1276         // cannot appear without an explicit cursor.  Cursor offset
   1277         // cannot place the cursor outside the limits of the context.
   1278         // Anchors are only allowed on the input side.
   1279         if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
   1280             (right.cursorOffset != 0 && right.cursor < 0) ||
   1281             // - The following two checks were used to ensure that the
   1282             // - the cursor offset stayed within the ante- or postcontext.
   1283             // - However, with the addition of quantifiers, we have to
   1284             // - allow arbitrary cursor offsets and do runtime checking.
   1285             //(right.cursorOffset > (left.text.length() - left.post)) ||
   1286             //(-right.cursorOffset > left.ante) ||
   1287             right.anchorStart || right.anchorEnd ||
   1288             !left.isValidInput(this) || !right.isValidOutput(this) ||
   1289             left.ante > left.post) {
   1290             syntaxError("Malformed rule", rule, start);
   1291         }
   1292 
   1293         // Flatten segment objects vector to an array
   1294         UnicodeMatcher[] segmentsArray = null;
   1295         if (segmentObjects.size() > 0) {
   1296             segmentsArray = new UnicodeMatcher[segmentObjects.size()];
   1297             segmentObjects.toArray(segmentsArray);
   1298         }
   1299 
   1300         curData.ruleSet.addRule(new TransliterationRule(
   1301                                      left.text, left.ante, left.post,
   1302                                      right.text, right.cursor, right.cursorOffset,
   1303                                      segmentsArray,
   1304                                      left.anchorStart, left.anchorEnd,
   1305                                      curData));
   1306 
   1307         return pos;
   1308     }
   1309 
   1310     /**
   1311      * Set the variable range to [start, end] (inclusive).
   1312      */
   1313     private void setVariableRange(int start, int end) {
   1314         if (start > end || start < 0 || end > 0xFFFF) {
   1315             throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end);
   1316         }
   1317 
   1318         curData.variablesBase = (char) start; // first private use
   1319 
   1320         if (dataVector.size() == 0) {
   1321             variableNext = (char) start;
   1322             variableLimit = (char) (end + 1);
   1323         }
   1324     }
   1325 
   1326     /**
   1327      * Assert that the given character is NOT within the variable range.
   1328      * If it is, signal an error.  This is neccesary to ensure that the
   1329      * variable range does not overlap characters used in a rule.
   1330      */
   1331     private void checkVariableRange(int ch, String rule, int start) {
   1332         if (ch >= curData.variablesBase && ch < variableLimit) {
   1333             syntaxError("Variable range character in rule", rule, start);
   1334         }
   1335     }
   1336 
   1337     // (The following method is part of an unimplemented feature.
   1338     // Remove this clover pragma after the feature is implemented.
   1339     // 2003-06-11 ICU 2.6 Alan)
   1340     ///CLOVER:OFF
   1341     /**
   1342      * Set the maximum backup to 'backup', in response to a pragma
   1343      * statement.
   1344      */
   1345     private void pragmaMaximumBackup(int backup) {
   1346         //TODO Finish
   1347         throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet");
   1348     }
   1349     ///CLOVER:ON
   1350 
   1351     // (The following method is part of an unimplemented feature.
   1352     // Remove this clover pragma after the feature is implemented.
   1353     // 2003-06-11 ICU 2.6 Alan)
   1354     ///CLOVER:OFF
   1355     /**
   1356      * Begin normalizing all rules using the given mode, in response
   1357      * to a pragma statement.
   1358      */
   1359     private void pragmaNormalizeRules(Normalizer.Mode mode) {
   1360         //TODO Finish
   1361         throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet");
   1362     }
   1363     ///CLOVER:ON
   1364 
   1365     /**
   1366      * Return true if the given rule looks like a pragma.
   1367      * @param pos offset to the first non-whitespace character
   1368      * of the rule.
   1369      * @param limit pointer past the last character of the rule.
   1370      */
   1371     static boolean resemblesPragma(String rule, int pos, int limit) {
   1372         // Must start with /use\s/i
   1373         return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0;
   1374     }
   1375 
   1376     /**
   1377      * Parse a pragma.  This method assumes resemblesPragma() has
   1378      * already returned true.
   1379      * @param pos offset to the first non-whitespace character
   1380      * of the rule.
   1381      * @param limit pointer past the last character of the rule.
   1382      * @return the position index after the final ';' of the pragma,
   1383      * or -1 on failure.
   1384      */
   1385     private int parsePragma(String rule, int pos, int limit) {
   1386         int[] array = new int[2];
   1387 
   1388         // resemblesPragma() has already returned true, so we
   1389         // know that pos points to /use\s/i; we can skip 4 characters
   1390         // immediately
   1391         pos += 4;
   1392 
   1393         // Here are the pragmas we recognize:
   1394         // use variable range 0xE000 0xEFFF;
   1395         // use maximum backup 16;
   1396         // use nfd rules;
   1397         int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array);
   1398         if (p >= 0) {
   1399             setVariableRange(array[0], array[1]);
   1400             return p;
   1401         }
   1402 
   1403         p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array);
   1404         if (p >= 0) {
   1405             pragmaMaximumBackup(array[0]);
   1406             return p;
   1407         }
   1408 
   1409         p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null);
   1410         if (p >= 0) {
   1411             pragmaNormalizeRules(Normalizer.NFD);
   1412             return p;
   1413         }
   1414 
   1415         p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null);
   1416         if (p >= 0) {
   1417             pragmaNormalizeRules(Normalizer.NFC);
   1418             return p;
   1419         }
   1420 
   1421         // Syntax error: unable to parse pragma
   1422         return -1;
   1423     }
   1424 
   1425     /**
   1426      * Throw an exception indicating a syntax error.  Search the rule string
   1427      * for the probable end of the rule.  Of course, if the error is that
   1428      * the end of rule marker is missing, then the rule end will not be found.
   1429      * In any case the rule start will be correctly reported.
   1430      * @param msg error description
   1431      * @param rule pattern string
   1432      * @param start position of first character of current rule
   1433      */
   1434     static final void syntaxError(String msg, String rule, int start) {
   1435         int end = ruleEnd(rule, start, rule.length());
   1436         throw new IllegalIcuArgumentException(msg + " in \"" +
   1437                                            Utility.escape(rule.substring(start, end)) + '"');
   1438     }
   1439 
   1440     static final int ruleEnd(String rule, int start, int limit) {
   1441         int end = Utility.quotedIndexOf(rule, start, limit, ";");
   1442         if (end < 0) {
   1443             end = limit;
   1444         }
   1445         return end;
   1446     }
   1447 
   1448     /**
   1449      * Parse a UnicodeSet out, store it, and return the stand-in character
   1450      * used to represent it.
   1451      */
   1452     private final char parseSet(String rule, ParsePosition pos) {
   1453         UnicodeSet set = new UnicodeSet(rule, pos, parseData);
   1454         if (variableNext >= variableLimit) {
   1455             throw new RuntimeException("Private use variables exhausted");
   1456         }
   1457         set.compact();
   1458         return generateStandInFor(set);
   1459     }
   1460 
   1461     /**
   1462      * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer.
   1463      * Store the object.
   1464      */
   1465     char generateStandInFor(Object obj) {
   1466         // assert(obj != null);
   1467 
   1468         // Look up previous stand-in, if any.  This is a short list
   1469         // (typical n is 0, 1, or 2); linear search is optimal.
   1470         for (int i=0; i<variablesVector.size(); ++i) {
   1471             if (variablesVector.get(i) == obj) { // [sic] pointer comparison
   1472                 return (char) (curData.variablesBase + i);
   1473             }
   1474         }
   1475 
   1476         if (variableNext >= variableLimit) {
   1477             throw new RuntimeException("Variable range exhausted");
   1478         }
   1479         variablesVector.add(obj);
   1480         return variableNext++;
   1481     }
   1482 
   1483     /**
   1484      * Return the standin for segment seg (1-based).
   1485      */
   1486     public char getSegmentStandin(int seg) {
   1487         if (segmentStandins.length() < seg) {
   1488             segmentStandins.setLength(seg);
   1489         }
   1490         char c = segmentStandins.charAt(seg-1);
   1491         if (c == 0) {
   1492             if (variableNext >= variableLimit) {
   1493                 throw new RuntimeException("Variable range exhausted");
   1494             }
   1495             c = variableNext++;
   1496             // Set a placeholder in the master variables vector that will be
   1497             // filled in later by setSegmentObject().  We know that we will get
   1498             // called first because setSegmentObject() will call us.
   1499             variablesVector.add(null);
   1500             segmentStandins.setCharAt(seg-1, c);
   1501         }
   1502         return c;
   1503     }
   1504 
   1505     /**
   1506      * Set the object for segment seg (1-based).
   1507      */
   1508     public void setSegmentObject(int seg, StringMatcher obj) {
   1509         // Since we call parseSection() recursively, nested
   1510         // segments will result in segment i+1 getting parsed
   1511         // and stored before segment i; be careful with the
   1512         // vector handling here.
   1513         while (segmentObjects.size() < seg) {
   1514             segmentObjects.add(null);
   1515         }
   1516         int index = getSegmentStandin(seg) - curData.variablesBase;
   1517         if (segmentObjects.get(seg-1) != null ||
   1518             variablesVector.get(index) != null) {
   1519             throw new RuntimeException(); // should never happen
   1520         }
   1521         segmentObjects.set(seg-1, obj);
   1522         variablesVector.set(index, obj);
   1523     }
   1524 
   1525     /**
   1526      * Return the stand-in for the dot set.  It is allocated the first
   1527      * time and reused thereafter.
   1528      */
   1529     char getDotStandIn() {
   1530         if (dotStandIn == -1) {
   1531             dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
   1532         }
   1533         return (char) dotStandIn;
   1534     }
   1535 
   1536     /**
   1537      * Append the value of the given variable name to the given
   1538      * StringBuffer.
   1539      * @exception IllegalIcuArgumentException if the name is unknown.
   1540      */
   1541     private void appendVariableDef(String name, StringBuffer buf) {
   1542         char[] ch = variableNames.get(name);
   1543         if (ch == null) {
   1544             // We allow one undefined variable so that variable definition
   1545             // statements work.  For the first undefined variable we return
   1546             // the special placeholder variableLimit-1, and save the variable
   1547             // name.
   1548             if (undefinedVariableName == null) {
   1549                 undefinedVariableName = name;
   1550                 if (variableNext >= variableLimit) {
   1551                     throw new RuntimeException("Private use variables exhausted");
   1552                 }
   1553                 buf.append(--variableLimit);
   1554             } else {
   1555                 throw new IllegalIcuArgumentException("Undefined variable $"
   1556                                                    + name);
   1557             }
   1558         } else {
   1559             buf.append(ch);
   1560         }
   1561     }
   1562 }
   1563 
   1564 //eof
   1565