Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 
     10 package android.icu.text;
     11 
     12 import java.text.ParsePosition;
     13 import java.util.HashMap;
     14 
     15 import android.icu.impl.Assert;
     16 import android.icu.impl.Utility;
     17 import android.icu.lang.UCharacter;
     18 
     19 /**
     20   *  This class is part of the Rule Based Break Iterator rule compiler.
     21   *  It scans the rules and builds the parse tree.
     22   *  There is no public API here.
     23   */
     24 class RBBIRuleScanner {
     25 
     26     private final static int    kStackSize = 100;               // The size of the state stack for
     27     //   rules parsing.  Corresponds roughly
     28     //   to the depth of parentheses nesting
     29     //   that is allowed in the rules.
     30 
     31     static class RBBIRuleChar {
     32         int             fChar;
     33         boolean         fEscaped;
     34     }
     35 
     36 
     37 
     38     RBBIRuleBuilder               fRB;              // The rule builder that we are part of.
     39 
     40     int                       fScanIndex;        // Index of current character being processed
     41                                                      //   in the rule input string.
     42     int                       fNextIndex;        // Index of the next character, which
     43                                                      //   is the first character not yet scanned.
     44     boolean                  fQuoteMode;        // Scan is in a 'quoted region'
     45     int                       fLineNum;          // Line number in input file.
     46     int                       fCharNum;          // Char position within the line.
     47     int                       fLastChar;         // Previous char, needed to count CR-LF
     48                                                      //   as a single line, not two.
     49 
     50     RBBIRuleChar              fC = new RBBIRuleChar();    // Current char for parse state machine
     51                                                      //   processing.
     52 
     53 
     54     short  fStack[] = new short[kStackSize];  // State stack, holds state pushes
     55     int                       fStackPtr;           //  and pops as specified in the state
     56                                                        //  transition rules.
     57 
     58     RBBINode  fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
     59                                                            //  during the parse of a rule
     60     int                        fNodeStackPtr;
     61 
     62 
     63     boolean                    fReverseRule;         // True if the rule currently being scanned
     64                                                      //  is a reverse direction rule (if it
     65                                                      //  starts with a '!')
     66 
     67     boolean                    fLookAheadRule;       // True if the rule includes a '/'
     68                                                      //   somewhere within it.
     69 
     70     boolean                    fNoChainInRule;       // True if the current rule starts with a '^'.
     71 
     72 
     73     RBBISymbolTable            fSymbolTable;         // symbol table, holds definitions of
     74                                                      //   $variable symbols.
     75 
     76     HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
     77                                                                                        //   the sets created while parsing rules.
     78                                                                                        //   The key is the string used for creating
     79                                                                                        //   the set.
     80 
     81     UnicodeSet      fRuleSets[] = new UnicodeSet[10];    // Unicode Sets that are needed during
     82                                                      //  the scanning of RBBI rules.  The
     83                                                      //  Indices for these are assigned by the
     84                                                      //  perl script that builds the state tables.
     85                                                      //  See rbbirpt.h.
     86 
     87     int                        fRuleNum;         // Counts each rule as it is scanned.
     88 
     89     int                        fOptionStart;     // Input index of start of a !!option
     90                                                  //   keyword, while being scanned.
     91 
     92 
     93    // gRuleSet_rule_char_pattern is characters that may appear as literals in patterns without escaping or quoting.
     94    static private String gRuleSet_rule_char_pattern       = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
     95    static private String gRuleSet_name_char_pattern       = "[_\\p{L}\\p{N}]";
     96    static private String gRuleSet_digit_char_pattern      = "[0-9]";
     97    static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
     98    static private String gRuleSet_white_space_pattern     = "[\\p{Pattern_White_Space}]";
     99    static private String kAny =  "any";
    100 
    101 
    102 
    103 
    104     //----------------------------------------------------------------------------------------
    105     //
    106     //  Constructor.
    107     //
    108     //----------------------------------------------------------------------------------------
    109     RBBIRuleScanner(RBBIRuleBuilder rb) {
    110         fRB = rb;
    111         fLineNum = 1;
    112 
    113         //
    114         //  Set up the constant Unicode Sets.
    115         //     Note: These could be made static and shared among
    116         //            all instances of RBBIRuleScanners.
    117         fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern);
    118         fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern);
    119         fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern);
    120         fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
    121         fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern);
    122 
    123         fSymbolTable = new RBBISymbolTable(this);
    124     }
    125 
    126     //----------------------------------------------------------------------------------------
    127     //
    128     //  doParseAction Do some action during rule parsing.
    129     //                       Called by the parse state machine.
    130     //                       Actions build the parse tree and Unicode Sets,
    131     //                       and maintain the parse stack for nested expressions.
    132     //
    133     //----------------------------------------------------------------------------------------
    134     boolean doParseActions(int action) {
    135         RBBINode n = null;
    136 
    137         boolean returnVal = true;
    138 
    139         switch (action) {
    140 
    141         case RBBIRuleParseTable.doExprStart:
    142             pushNewNode(RBBINode.opStart);
    143             fRuleNum++;
    144             break;
    145 
    146         case RBBIRuleParseTable.doNoChain:
    147             // Scanned a '^' while on the rule start state.
    148             fNoChainInRule = true;
    149             break;
    150 
    151 
    152         case RBBIRuleParseTable.doExprOrOperator: {
    153             fixOpStack(RBBINode.precOpCat);
    154             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
    155             RBBINode orNode = pushNewNode(RBBINode.opOr);
    156             orNode.fLeftChild = operandNode;
    157             operandNode.fParent = orNode;
    158         }
    159             break;
    160 
    161         case RBBIRuleParseTable.doExprCatOperator:
    162         // concatenation operator.
    163         // For the implicit concatenation of adjacent terms in an expression
    164         // that are
    165         //   not separated by any other operator. Action is invoked between the
    166         //   actions for the two terms.
    167         {
    168             fixOpStack(RBBINode.precOpCat);
    169             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
    170             RBBINode catNode = pushNewNode(RBBINode.opCat);
    171             catNode.fLeftChild = operandNode;
    172             operandNode.fParent = catNode;
    173         }
    174             break;
    175 
    176         case RBBIRuleParseTable.doLParen:
    177             // Open Paren.
    178             //   The openParen node is a dummy operation type with a low
    179             // precedence,
    180             //     which has the affect of ensuring that any real binary op that
    181             //     follows within the parens binds more tightly to the operands than
    182             //     stuff outside of the parens.
    183             pushNewNode(RBBINode.opLParen);
    184             break;
    185 
    186         case RBBIRuleParseTable.doExprRParen:
    187             fixOpStack(RBBINode.precLParen);
    188             break;
    189 
    190         case RBBIRuleParseTable.doNOP:
    191             break;
    192 
    193         case RBBIRuleParseTable.doStartAssign:
    194             // We've just scanned "$variable = "
    195             // The top of the node stack has the $variable ref node.
    196 
    197             // Save the start position of the RHS text in the StartExpression
    198             // node
    199             //   that precedes the $variableReference node on the stack.
    200             //   This will eventually be used when saving the full $variable
    201             // replacement
    202             //   text as a string.
    203             n = fNodeStack[fNodeStackPtr - 1];
    204             n.fFirstPos = fNextIndex; // move past the '='
    205 
    206             // Push a new start-of-expression node; needed to keep parse of the
    207             //   RHS expression happy.
    208             pushNewNode(RBBINode.opStart);
    209             break;
    210 
    211         case RBBIRuleParseTable.doEndAssign: {
    212             // We have reached the end of an assignement statement.
    213             //   Current scan char is the ';' that terminates the assignment.
    214 
    215             // Terminate expression, leaves expression parse tree rooted in TOS
    216             // node.
    217             fixOpStack(RBBINode.precStart);
    218 
    219             RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
    220             RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
    221             RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
    222 
    223             // Save original text of right side of assignment, excluding the
    224             // terminating ';'
    225             //  in the root of the node for the right-hand-side expression.
    226             RHSExprNode.fFirstPos = startExprNode.fFirstPos;
    227             RHSExprNode.fLastPos = fScanIndex;
    228             // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
    229             // RHSExprNode.fLastPos, RHSExprNode.fText);
    230             RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos,
    231                     RHSExprNode.fLastPos);
    232 
    233             // Expression parse tree becomes l. child of the $variable reference
    234             // node.
    235             varRefNode.fLeftChild = RHSExprNode;
    236             RHSExprNode.fParent = varRefNode;
    237 
    238             // Make a symbol table entry for the $variableRef node.
    239             fSymbolTable.addEntry(varRefNode.fText, varRefNode);
    240 
    241             // Clean up the stack.
    242             fNodeStackPtr -= 3;
    243             break;
    244         }
    245 
    246         case RBBIRuleParseTable.doEndOfRule: {
    247             fixOpStack(RBBINode.precStart); // Terminate expression, leaves
    248                                             // expression
    249 
    250             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
    251                 printNodeStack("end of rule");
    252             }
    253             Assert.assrt(fNodeStackPtr == 1);
    254             RBBINode thisRule = fNodeStack[fNodeStackPtr];
    255 
    256             // If this rule includes a look-ahead '/', add a endMark node to the
    257             //   expression tree.
    258             if (fLookAheadRule) {
    259                 RBBINode endNode = pushNewNode(RBBINode.endMark);
    260                 RBBINode catNode = pushNewNode(RBBINode.opCat);
    261                 fNodeStackPtr -= 2;
    262                 catNode.fLeftChild = thisRule;
    263                 catNode.fRightChild = endNode;
    264                 fNodeStack[fNodeStackPtr] = catNode;
    265                 endNode.fVal = fRuleNum;
    266                 endNode.fLookAheadEnd = true;
    267                 thisRule = catNode;
    268 
    269                 // TODO: Disable chaining out of look-ahead (hard break) rules.
    270                 //   The break on rule match is forced, so there is no point in building up
    271                 //   the state table to chain into another rule for a longer match.
    272             }
    273 
    274             // Mark this node as being the root of a rule.
    275             thisRule.fRuleRoot = true;
    276 
    277             // Flag if chaining into this rule is wanted.
    278             //
    279             if (fRB.fChainRules &&          // If rule chaining is enabled globally via !!chain
    280                     !fNoChainInRule) {      //     and no '^' chain-in inhibit was on this rule
    281                 thisRule.fChainIn = true;
    282             }
    283 
    284 
    285             // All rule expressions are ORed together.
    286             // The ';' that terminates an expression really just functions as a
    287             // '|' with
    288             //   a low operator prededence.
    289             //
    290             // Each of the four sets of rules are collected separately.
    291             //  (forward, reverse, safe_forward, safe_reverse)
    292             //  OR this rule into the appropriate group of them.
    293             //
    294 
    295             int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
    296 
    297             if (fRB.fTreeRoots[destRules] != null) {
    298                 // This is not the first rule encountered.
    299                 // OR previous stuff (from *destRules)
    300                 // with the current rule expression (on the Node Stack)
    301                 //  with the resulting OR expression going to *destRules
    302                 //
    303                 thisRule = fNodeStack[fNodeStackPtr];
    304                 RBBINode prevRules = fRB.fTreeRoots[destRules];
    305                 RBBINode orNode = pushNewNode(RBBINode.opOr);
    306                 orNode.fLeftChild = prevRules;
    307                 prevRules.fParent = orNode;
    308                 orNode.fRightChild = thisRule;
    309                 thisRule.fParent = orNode;
    310                 fRB.fTreeRoots[destRules] = orNode;
    311             } else {
    312                 // This is the first rule encountered (for this direction).
    313                 // Just move its parse tree from the stack to *destRules.
    314                 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
    315             }
    316             fReverseRule = false; // in preparation for the next rule.
    317             fLookAheadRule = false;
    318             fNoChainInRule = false;
    319             fNodeStackPtr = 0;
    320         }
    321             break;
    322 
    323         case RBBIRuleParseTable.doRuleError:
    324             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
    325             returnVal = false;
    326             break;
    327 
    328         case RBBIRuleParseTable.doVariableNameExpectedErr:
    329             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
    330             break;
    331 
    332         //
    333         //  Unary operands + ? *
    334         //    These all appear after the operand to which they apply.
    335         //    When we hit one, the operand (may be a whole sub expression)
    336         //    will be on the top of the stack.
    337         //    Unary Operator becomes TOS, with the old TOS as its one child.
    338         case RBBIRuleParseTable.doUnaryOpPlus: {
    339             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
    340             RBBINode plusNode = pushNewNode(RBBINode.opPlus);
    341             plusNode.fLeftChild = operandNode;
    342             operandNode.fParent = plusNode;
    343         }
    344             break;
    345 
    346         case RBBIRuleParseTable.doUnaryOpQuestion: {
    347             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
    348             RBBINode qNode = pushNewNode(RBBINode.opQuestion);
    349             qNode.fLeftChild = operandNode;
    350             operandNode.fParent = qNode;
    351         }
    352             break;
    353 
    354         case RBBIRuleParseTable.doUnaryOpStar: {
    355             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
    356             RBBINode starNode = pushNewNode(RBBINode.opStar);
    357             starNode.fLeftChild = operandNode;
    358             operandNode.fParent = starNode;
    359         }
    360             break;
    361 
    362         case RBBIRuleParseTable.doRuleChar:
    363         // A "Rule Character" is any single character that is a literal part
    364         // of the regular expression. Like a, b and c in the expression "(abc*)
    365         // | [:L:]"
    366         // These are pretty uncommon in break rules; the terms are more commonly
    367         //  sets. To keep things uniform, treat these characters like as
    368         // sets that just happen to contain only one character.
    369         {
    370             n = pushNewNode(RBBINode.setRef);
    371             String s = String.valueOf((char)fC.fChar);
    372             findSetFor(s, n, null);
    373             n.fFirstPos = fScanIndex;
    374             n.fLastPos = fNextIndex;
    375             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
    376             break;
    377         }
    378 
    379         case RBBIRuleParseTable.doDotAny:
    380         // scanned a ".", meaning match any single character.
    381         {
    382             n = pushNewNode(RBBINode.setRef);
    383             findSetFor(kAny, n, null);
    384             n.fFirstPos = fScanIndex;
    385             n.fLastPos = fNextIndex;
    386             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
    387             break;
    388         }
    389 
    390         case RBBIRuleParseTable.doSlash:
    391             // Scanned a '/', which identifies a look-ahead break position in a
    392             // rule.
    393             n = pushNewNode(RBBINode.lookAhead);
    394             n.fVal = fRuleNum;
    395             n.fFirstPos = fScanIndex;
    396             n.fLastPos = fNextIndex;
    397             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
    398             fLookAheadRule = true;
    399             break;
    400 
    401         case RBBIRuleParseTable.doStartTagValue:
    402             // Scanned a '{', the opening delimiter for a tag value within a
    403             // rule.
    404             n = pushNewNode(RBBINode.tag);
    405             n.fVal = 0;
    406             n.fFirstPos = fScanIndex;
    407             n.fLastPos = fNextIndex;
    408             break;
    409 
    410         case RBBIRuleParseTable.doTagDigit:
    411         // Just scanned a decimal digit that's part of a tag value
    412         {
    413             n = fNodeStack[fNodeStackPtr];
    414             int v = UCharacter.digit((char) fC.fChar, 10);
    415             n.fVal = n.fVal * 10 + v;
    416             break;
    417         }
    418 
    419         case RBBIRuleParseTable.doTagValue:
    420             n = fNodeStack[fNodeStackPtr];
    421             n.fLastPos = fNextIndex;
    422             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
    423             break;
    424 
    425         case RBBIRuleParseTable.doTagExpectedError:
    426             error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
    427             returnVal = false;
    428             break;
    429 
    430         case RBBIRuleParseTable.doOptionStart:
    431             // Scanning a !!option. At the start of string.
    432             fOptionStart = fScanIndex;
    433             break;
    434 
    435         case RBBIRuleParseTable.doOptionEnd: {
    436             String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
    437             if (opt.equals("chain")) {
    438                 fRB.fChainRules = true;
    439             } else if (opt.equals("LBCMNoChain")) {
    440                 fRB.fLBCMNoChain = true;
    441             } else if (opt.equals("forward")) {
    442                 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
    443             } else if (opt.equals("reverse")) {
    444                 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
    445             } else if (opt.equals("safe_forward")) {
    446                 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
    447             } else if (opt.equals("safe_reverse")) {
    448                 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
    449             } else if (opt.equals("lookAheadHardBreak")) {
    450                 fRB.fLookAheadHardBreak = true;
    451             } else if (opt.equals("quoted_literals_only")) {
    452                 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].clear();
    453             } else if (opt.equals("unquoted_literals")) {
    454                 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].applyPattern(gRuleSet_rule_char_pattern);
    455             } else {
    456                 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
    457             }
    458             break;
    459         }
    460 
    461         case RBBIRuleParseTable.doReverseDir:
    462             fReverseRule = true;
    463             break;
    464 
    465         case RBBIRuleParseTable.doStartVariableName:
    466             n = pushNewNode(RBBINode.varRef);
    467             n.fFirstPos = fScanIndex;
    468             break;
    469 
    470         case RBBIRuleParseTable.doEndVariableName:
    471             n = fNodeStack[fNodeStackPtr];
    472             if (n == null || n.fType != RBBINode.varRef) {
    473                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
    474                 break;
    475             }
    476             n.fLastPos = fScanIndex;
    477             n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
    478             // Look the newly scanned name up in the symbol table
    479             //   If there's an entry, set the l. child of the var ref to the
    480             // replacement expression.
    481             //   (We also pass through here when scanning assignments, but no harm
    482             // is done, other
    483             //    than a slight wasted effort that seems hard to avoid. Lookup will
    484             // be null)
    485             n.fLeftChild = fSymbolTable.lookupNode(n.fText);
    486             break;
    487 
    488         case RBBIRuleParseTable.doCheckVarDef:
    489             n = fNodeStack[fNodeStackPtr];
    490             if (n.fLeftChild == null) {
    491                 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
    492                 returnVal = false;
    493             }
    494             break;
    495 
    496         case RBBIRuleParseTable.doExprFinished:
    497             break;
    498 
    499         case RBBIRuleParseTable.doRuleErrorAssignExpr:
    500             error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
    501             returnVal = false;
    502             break;
    503 
    504         case RBBIRuleParseTable.doExit:
    505             returnVal = false;
    506             break;
    507 
    508         case RBBIRuleParseTable.doScanUnicodeSet:
    509             scanSet();
    510             break;
    511 
    512         default:
    513             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
    514             returnVal = false;
    515             break;
    516         }
    517         return returnVal;
    518     }
    519 
    520     //----------------------------------------------------------------------------------------
    521     //
    522     //  Error Throw and IllegalArgumentException in response to a rule parse
    523     // error.
    524     //
    525     //----------------------------------------------------------------------------------------
    526     void error(int e) {
    527         String s = "Error " + e + " at line " + fLineNum + " column "
    528                 + fCharNum;
    529         IllegalArgumentException ex = new IllegalArgumentException(s);
    530         throw ex;
    531 
    532     }
    533 
    534     //----------------------------------------------------------------------------------------
    535     //
    536     //  fixOpStack The parse stack holds partially assembled chunks of the parse
    537     // tree.
    538     //               An entry on the stack may be as small as a single setRef node,
    539     //               or as large as the parse tree
    540     //               for an entire expression (this will be the one item left on the stack
    541     //               when the parsing of an RBBI rule completes.
    542     //
    543     //               This function is called when a binary operator is encountered.
    544     //               It looks back up the stack for operators that are not yet associated
    545     //               with a right operand, and if the precedence of the stacked operator >=
    546     //               the precedence of the current operator, binds the operand left,
    547     //               to the previously encountered operator.
    548     //
    549     //----------------------------------------------------------------------------------------
    550     void fixOpStack(int p) {
    551         RBBINode n;
    552         // printNodeStack("entering fixOpStack()");
    553         for (;;) {
    554             n = fNodeStack[fNodeStackPtr - 1]; // an operator node
    555             if (n.fPrecedence == 0) {
    556                 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
    557                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
    558                 return;
    559             }
    560 
    561             if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
    562                 // The most recent operand goes with the current operator,
    563                 //   not with the previously stacked one.
    564                 break;
    565             }
    566             // Stack operator is a binary op ( '|' or concatenation)
    567             //   TOS operand becomes right child of this operator.
    568             //   Resulting subexpression becomes the TOS operand.
    569             n.fRightChild = fNodeStack[fNodeStackPtr];
    570             fNodeStack[fNodeStackPtr].fParent = n;
    571             fNodeStackPtr--;
    572             // printNodeStack("looping in fixOpStack() ");
    573         }
    574 
    575         if (p <= RBBINode.precLParen) {
    576             // Scan is at a right paren or end of expression.
    577             //  The scanned item must match the stack, or else there was an
    578             // error.
    579             //  Discard the left paren (or start expr) node from the stack,
    580             //  leaving the completed (sub)expression as TOS.
    581             if (n.fPrecedence != p) {
    582                 // Right paren encountered matched start of expression node, or
    583                 // end of expression matched with a left paren node.
    584                 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
    585             }
    586             fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
    587             fNodeStackPtr--;
    588             // Delete the now-discarded LParen or Start node.
    589             // delete n;
    590         }
    591         // printNodeStack("leaving fixOpStack()");
    592     }
    593 
    594     //----------------------------------------------------------------------------
    595     //
    596     //       RBBISetTableEl is an entry in the hash table of UnicodeSets that have
    597     //                        been encountered. The val Node will be of nodetype uset
    598     //                        and contain pointers to the actual UnicodeSets.
    599     //                        The Key is the source string for initializing the set.
    600     //
    601     //                        The hash table is used to avoid creating duplicate
    602     //                        unnamed (not $var references) UnicodeSets.
    603     //
    604     //----------------------------------------------------------------------------
    605     static class RBBISetTableEl {
    606         String key;
    607 
    608         RBBINode val;
    609     }
    610 
    611 
    612     //----------------------------------------------------------------------------------------
    613     //
    614     //   findSetFor given a String,
    615     //                  - find the corresponding Unicode Set (uset node)
    616     //                         (create one if necessary)
    617     //                  - Set fLeftChild of the caller's node (should be a setRef node)
    618     //                         to the uset node
    619     //                 Maintain a hash table of uset nodes, so the same one is always used
    620     //                    for the same string.
    621     //                 If a "to adopt" set is provided and we haven't seen this key before,
    622     //                    add the provided set to the hash table.
    623     //                 If the string is one (32 bit) char in length, the set contains
    624     //                    just one element which is the char in question.
    625     //                 If the string is "any", return a set containing all chars.
    626     //
    627     //----------------------------------------------------------------------------------------
    628     void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
    629 
    630         RBBISetTableEl el;
    631 
    632         // First check whether we've already cached a set for this string.
    633         // If so, just use the cached set in the new node.
    634         //   delete any set provided by the caller, since we own it.
    635         el = fSetTable.get(s);
    636         if (el != null) {
    637             node.fLeftChild = el.val;
    638             Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
    639             return;
    640         }
    641 
    642         // Haven't seen this set before.
    643         // If the caller didn't provide us with a prebuilt set,
    644         //   create a new UnicodeSet now.
    645         if (setToAdopt == null) {
    646             if (s.equals(kAny)) {
    647                 setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
    648             } else {
    649                 int c;
    650                 c = UTF16.charAt(s, 0);
    651                 setToAdopt = new UnicodeSet(c, c);
    652             }
    653         }
    654 
    655         //
    656         // Make a new uset node to refer to this UnicodeSet
    657         // This new uset node becomes the child of the caller's setReference
    658         // node.
    659         //
    660         RBBINode usetNode = new RBBINode(RBBINode.uset);
    661         usetNode.fInputSet = setToAdopt;
    662         usetNode.fParent = node;
    663         node.fLeftChild = usetNode;
    664         usetNode.fText = s;
    665 
    666         //
    667         // Add the new uset node to the list of all uset nodes.
    668         //
    669         fRB.fUSetNodes.add(usetNode);
    670 
    671         //
    672         // Add the new set to the set hash table.
    673         //
    674         el = new RBBISetTableEl();
    675         el.key = s;
    676         el.val = usetNode;
    677         fSetTable.put(el.key, el);
    678 
    679         return;
    680     }
    681 
    682     //
    683     //  Assorted Unicode character constants.
    684     //     Numeric because there is no portable way to enter them as literals.
    685     //     (Think EBCDIC).
    686     //
    687     static final int chNEL = 0x85; //    NEL newline variant
    688 
    689     static final int chLS = 0x2028; //    Unicode Line Separator
    690 
    691     //----------------------------------------------------------------------------------------
    692     //
    693     //  stripRules    Return a rules string without unnecessary
    694     //                characters.
    695     //
    696     //----------------------------------------------------------------------------------------
    697     static String stripRules(String rules) {
    698         StringBuilder strippedRules = new StringBuilder();
    699         int rulesLength = rules.length();
    700         for (int idx = 0; idx < rulesLength;) {
    701             char ch = rules.charAt(idx++);
    702             if (ch == '#') {
    703                 while (idx < rulesLength
    704                         && ch != '\r' && ch != '\n' && ch != chNEL) {
    705                     ch = rules.charAt(idx++);
    706                 }
    707             }
    708             if (!UCharacter.isISOControl(ch)) {
    709                 strippedRules.append(ch);
    710             }
    711         }
    712         return strippedRules.toString();
    713     }
    714 
    715     //----------------------------------------------------------------------------------------
    716     //
    717     //  nextCharLL    Low Level Next Char from rule input source.
    718     //                Get a char from the input character iterator,
    719     //                keep track of input position for error reporting.
    720     //
    721     //----------------------------------------------------------------------------------------
    722     int nextCharLL() {
    723         int ch;
    724 
    725         if (fNextIndex >= fRB.fRules.length()) {
    726             return -1;
    727         }
    728         ch = UTF16.charAt(fRB.fRules, fNextIndex);
    729         fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
    730 
    731         if (ch == '\r' ||
    732             ch == chNEL ||
    733             ch == chLS ||
    734             ch == '\n' && fLastChar != '\r') {
    735             // Character is starting a new line.  Bump up the line number, and
    736             //  reset the column to 0.
    737             fLineNum++;
    738             fCharNum = 0;
    739             if (fQuoteMode) {
    740                 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
    741                 fQuoteMode = false;
    742             }
    743         } else {
    744             // Character is not starting a new line.  Except in the case of a
    745             //   LF following a CR, increment the column position.
    746             if (ch != '\n') {
    747                 fCharNum++;
    748             }
    749         }
    750         fLastChar = ch;
    751         return ch;
    752     }
    753 
    754     //---------------------------------------------------------------------------------
    755     //
    756     //   nextChar     for rules scanning.  At this level, we handle stripping
    757     //                out comments and processing backslash character escapes.
    758     //                The rest of the rules grammar is handled at the next level up.
    759     //
    760     //---------------------------------------------------------------------------------
    761     void nextChar(RBBIRuleChar c) {
    762 
    763         // Unicode Character constants needed for the processing done by nextChar(),
    764         //   in hex because literals wont work on EBCDIC machines.
    765 
    766         fScanIndex = fNextIndex;
    767         c.fChar = nextCharLL();
    768         c.fEscaped = false;
    769 
    770         //
    771         //  check for '' sequence.
    772         //  These are recognized in all contexts, whether in quoted text or not.
    773         //
    774         if (c.fChar == '\'') {
    775             if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
    776                 c.fChar = nextCharLL(); // get nextChar officially so character counts
    777                 c.fEscaped = true; //   stay correct.
    778             } else {
    779                 // Single quote, by itself.
    780                 //   Toggle quoting mode.
    781                 //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
    782                 fQuoteMode = !fQuoteMode;
    783                 if (fQuoteMode == true) {
    784                     c.fChar = '(';
    785                 } else {
    786                     c.fChar = ')';
    787                 }
    788                 c.fEscaped = false; // The paren that we return is not escaped.
    789                 return;
    790             }
    791         }
    792 
    793         if (fQuoteMode) {
    794             c.fEscaped = true;
    795         } else {
    796             // We are not in a 'quoted region' of the source.
    797             //
    798             if (c.fChar == '#') {
    799                 // Start of a comment.  Consume the rest of it.
    800                 //  The new-line char that terminates the comment is always returned.
    801                 //  It will be treated as white-space, and serves to break up anything
    802                 //    that might otherwise incorrectly clump together with a comment in
    803                 //    the middle (a variable name, for example.)
    804                 for (;;) {
    805                     c.fChar = nextCharLL();
    806                     if (c.fChar == -1 || // EOF
    807                         c.fChar == '\r' ||
    808                         c.fChar == '\n' ||
    809                         c.fChar == chNEL ||
    810                         c.fChar == chLS)
    811                     {
    812                         break;
    813                     }
    814                 }
    815             }
    816             if (c.fChar == -1) {
    817                 return;
    818             }
    819 
    820             //
    821             //  check for backslash escaped characters.
    822             //  Use String.unescapeAt() to handle them.
    823             //
    824             if (c.fChar == '\\') {
    825                 c.fEscaped = true;
    826                 int[] unescapeIndex = new int[1];
    827                 unescapeIndex[0] = fNextIndex;
    828                 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
    829                 if (unescapeIndex[0] == fNextIndex) {
    830                     error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
    831                 }
    832 
    833                 fCharNum += unescapeIndex[0] - fNextIndex;
    834                 fNextIndex = unescapeIndex[0];
    835             }
    836         }
    837         // putc(c.fChar, stdout);
    838     }
    839 
    840     //---------------------------------------------------------------------------------
    841     //
    842     //  Parse RBBI rules.   The state machine for rules parsing is here.
    843     //                      The state tables are hand-written in the file rbbirpt.txt,
    844     //                      and converted to the form used here by a perl
    845     //                      script rbbicst.pl
    846     //
    847     //---------------------------------------------------------------------------------
    848     void parse() {
    849         int state;
    850         RBBIRuleParseTable.RBBIRuleTableElement tableEl;
    851 
    852         state = 1;
    853         nextChar(fC);
    854         //
    855         // Main loop for the rule parsing state machine.
    856         //   Runs once per state transition.
    857         //   Each time through optionally performs, depending on the state table,
    858         //      - an advance to the the next input char
    859         //      - an action to be performed.
    860         //      - pushing or popping a state to/from the local state return stack.
    861         //
    862         for (;;) {
    863             // Quit if state == 0.  This is the normal way to exit the state machine.
    864             //
    865             if (state == 0) {
    866                 break;
    867             }
    868 
    869             // Find the state table element that matches the input char from the rule, or the
    870             //    class of the input character.  Start with the first table row for this
    871             //    state, then linearly scan forward until we find a row that matches the
    872             //    character.  The last row for each state always matches all characters, so
    873             //    the search will stop there, if not before.
    874             //
    875             tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
    876             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
    877                 System.out.println("char, line, col = (\'" + (char) fC.fChar
    878                         + "\', " + fLineNum + ", " + fCharNum + "    state = "
    879                         + tableEl.fStateName);
    880             }
    881 
    882             for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
    883                 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
    884                 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
    885                     System.out.print(".");
    886                 }
    887                 if (tableEl.fCharClass < 127 && fC.fEscaped == false
    888                         && tableEl.fCharClass == fC.fChar) {
    889                     // Table row specified an individual character, not a set, and
    890                     //   the input character is not escaped, and
    891                     //   the input character matched it.
    892                     break;
    893                 }
    894                 if (tableEl.fCharClass == 255) {
    895                     // Table row specified default, match anything character class.
    896                     break;
    897                 }
    898                 if (tableEl.fCharClass == 254 && fC.fEscaped) {
    899                     // Table row specified "escaped" and the char was escaped.
    900                     break;
    901                 }
    902                 if (tableEl.fCharClass == 253 && fC.fEscaped
    903                         && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
    904                     // Table row specified "escaped P" and the char is either 'p' or 'P'.
    905                     break;
    906                 }
    907                 if (tableEl.fCharClass == 252 && fC.fChar == -1) {
    908                     // Table row specified eof and we hit eof on the input.
    909                     break;
    910                 }
    911 
    912                 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
    913                         fC.fEscaped == false && //   char is not escaped &&
    914                         fC.fChar != -1) { //   char is not EOF
    915                     UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
    916                     if (uniset.contains(fC.fChar)) {
    917                         // Table row specified a character class, or set of characters,
    918                         //   and the current char matches it.
    919                         break;
    920                     }
    921                 }
    922             }
    923 
    924             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
    925                 System.out.println("");
    926             }
    927             //
    928             // We've found the row of the state table that matches the current input
    929             //   character from the rules string.
    930             // Perform any action specified  by this row in the state table.
    931             if (doParseActions(tableEl.fAction) == false) {
    932                 // Break out of the state machine loop if the
    933                 //   the action signalled some kind of error, or
    934                 //   the action was to exit, occurs on normal end-of-rules-input.
    935                 break;
    936             }
    937 
    938             if (tableEl.fPushState != 0) {
    939                 fStackPtr++;
    940                 if (fStackPtr >= kStackSize) {
    941                     System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
    942                     error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
    943                 }
    944                 fStack[fStackPtr] = tableEl.fPushState;
    945             }
    946 
    947             if (tableEl.fNextChar) {
    948                 nextChar(fC);
    949             }
    950 
    951             // Get the next state from the table entry, or from the
    952             //   state stack if the next state was specified as "pop".
    953             if (tableEl.fNextState != 255) {
    954                 state = tableEl.fNextState;
    955             } else {
    956                 state = fStack[fStackPtr];
    957                 fStackPtr--;
    958                 if (fStackPtr < 0) {
    959                     System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
    960                     error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
    961                 }
    962             }
    963 
    964         }
    965 
    966         // If there are no forward rules throw an error.
    967         //
    968         if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) {
    969             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
    970         }
    971 
    972         //
    973         // If there were NO user specified reverse rules, set up the equivalent of ".*;"
    974         //
    975         if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
    976             fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
    977             RBBINode operand = pushNewNode(RBBINode.setRef);
    978             findSetFor(kAny, operand, null);
    979             fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
    980             operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
    981             fNodeStackPtr -= 2;
    982         }
    983 
    984         //
    985         // Parsing of the input RBBI rules is complete.
    986         // We now have a parse tree for the rule expressions
    987         // and a list of all UnicodeSets that are referenced.
    988         //
    989         if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
    990             fSymbolTable.rbbiSymtablePrint();
    991         }
    992         if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
    993             System.out.println("Completed Forward Rules Parse Tree...");
    994             fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
    995             System.out.println("\nCompleted Reverse Rules Parse Tree...");
    996             fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
    997             System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
    998             if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
    999                 System.out.println("  -- null -- ");
   1000             } else {
   1001                 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
   1002             }
   1003             System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
   1004             if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
   1005                 System.out.println("  -- null -- ");
   1006             } else {
   1007                 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
   1008             }
   1009         }
   1010     }
   1011 
   1012     //---------------------------------------------------------------------------------
   1013     //
   1014     //  printNodeStack     for debugging...
   1015     //
   1016     //---------------------------------------------------------------------------------
   1017     ///CLOVER:OFF
   1018     void printNodeStack(String title) {
   1019         int i;
   1020         System.out.println(title + ".  Dumping node stack...\n");
   1021         for (i = fNodeStackPtr; i > 0; i--) {
   1022             fNodeStack[i].printTree(true);
   1023         }
   1024     }
   1025     ///CLOVER:ON
   1026 
   1027     //---------------------------------------------------------------------------------
   1028     //
   1029     //  pushNewNode   create a new RBBINode of the specified type and push it
   1030     //                onto the stack of nodes.
   1031     //
   1032     //---------------------------------------------------------------------------------
   1033     RBBINode pushNewNode(int nodeType) {
   1034         fNodeStackPtr++;
   1035         if (fNodeStackPtr >= kStackSize) {
   1036             System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
   1037             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
   1038         }
   1039         fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
   1040         return fNodeStack[fNodeStackPtr];
   1041     }
   1042 
   1043     //---------------------------------------------------------------------------------
   1044     //
   1045     //  scanSet    Construct a UnicodeSet from the text at the current scan
   1046     //             position.  Advance the scan position to the first character
   1047     //             after the set.
   1048     //
   1049     //             A new RBBI setref node referring to the set is pushed onto the node
   1050     //             stack.
   1051     //
   1052     //             The scan position is normally under the control of the state machine
   1053     //             that controls rule parsing.  UnicodeSets, however, are parsed by
   1054     //             the UnicodeSet constructor, not by the RBBI rule parser.
   1055     //
   1056     //---------------------------------------------------------------------------------
   1057     void scanSet() {
   1058         UnicodeSet uset = null;
   1059         int startPos;
   1060         ParsePosition pos = new ParsePosition(fScanIndex);
   1061         int i;
   1062 
   1063         startPos = fScanIndex;
   1064         try {
   1065             uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
   1066         } catch (Exception e) { // TODO:  catch fewer exception types.
   1067             // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
   1068             error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
   1069         }
   1070 
   1071         // Verify that the set contains at least one code point.
   1072         //
   1073         if (uset.isEmpty()) {
   1074             // This set is empty.
   1075             //  Make it an error, because it almost certainly is not what the user wanted.
   1076             //  Also, avoids having to think about corner cases in the tree manipulation code
   1077             //   that occurs later on.
   1078             //  TODO:  this shouldn't be an error; it does happen.
   1079             error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
   1080         }
   1081 
   1082         // Advance the RBBI parse postion over the UnicodeSet pattern.
   1083         //   Don't just set fScanIndex because the line/char positions maintained
   1084         //   for error reporting would be thrown off.
   1085         i = pos.getIndex();
   1086         for (;;) {
   1087             if (fNextIndex >= i) {
   1088                 break;
   1089             }
   1090             nextCharLL();
   1091         }
   1092 
   1093         RBBINode n;
   1094 
   1095         n = pushNewNode(RBBINode.setRef);
   1096         n.fFirstPos = startPos;
   1097         n.fLastPos = fNextIndex;
   1098         n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
   1099         //  findSetFor() serves several purposes here:
   1100         //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
   1101         //     - Mantains collection of all sets in use, needed later for establishing
   1102         //          character categories for run time engine.
   1103         //     - Eliminates mulitiple instances of the same set.
   1104         //     - Creates a new uset node if necessary (if this isn't a duplicate.)
   1105         findSetFor(n.fText, n, uset);
   1106     }
   1107 
   1108 }
   1109 
   1110