Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 /*
      3  *******************************************************************************
      4  * Copyright (C) 2003-2011, International Business Machines Corporation and others. All Rights Reserved.
      5  *******************************************************************************
      6  */
      7 
      8 package android.icu.text;
      9 
     10 import java.text.ParsePosition;
     11 import java.util.HashMap;
     12 
     13 import android.icu.impl.Assert;
     14 import android.icu.impl.Utility;
     15 import android.icu.lang.UCharacter;
     16 
     17 /**
     18   *  This class is part of the Rule Based Break Iterator rule compiler.
     19   *  It scans the rules and builds the parse tree.
     20   *  There is no public API here.
     21   */
     22 class RBBIRuleScanner {
     23 
     24     private final static int    kStackSize = 100;               // The size of the state stack for
     25     //   rules parsing.  Corresponds roughly
     26     //   to the depth of parentheses nesting
     27     //   that is allowed in the rules.
     28 
     29     static class RBBIRuleChar {
     30         int             fChar;
     31         boolean         fEscaped;
     32     }
     33 
     34 
     35 
     36     RBBIRuleBuilder               fRB;              // The rule builder that we are part of.
     37 
     38     int                       fScanIndex;        // Index of current character being processed
     39                                                      //   in the rule input string.
     40     int                       fNextIndex;        // Index of the next character, which
     41                                                      //   is the first character not yet scanned.
     42     boolean                  fQuoteMode;        // Scan is in a 'quoted region'
     43     int                       fLineNum;          // Line number in input file.
     44     int                       fCharNum;          // Char position within the line.
     45     int                       fLastChar;         // Previous char, needed to count CR-LF
     46                                                      //   as a single line, not two.
     47 
     48     RBBIRuleChar              fC = new RBBIRuleChar();    // Current char for parse state machine
     49                                                      //   processing.
     50     String                    fVarName;          // $variableName, valid when we've just
     51                                                      //   scanned one.
     52 
     53 
     54     short  fStack[] = new short[kStackSize];  // State stack, holds state pushes
     55     int                       fStackPtr;           //  and pops as specified in the state
     56                                                        //  transition rules.
     57 
     58     RBBINode  fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
     59                                                            //  during the parse of a rule
     60     int                        fNodeStackPtr;
     61 
     62 
     63     boolean                          fReverseRule;     // True if the rule currently being scanned
     64                                                      //  is a reverse direction rule (if it
     65                                                      //  starts with a '!')
     66 
     67     boolean                          fLookAheadRule;   // True if the rule includes a '/'
     68                                                      //   somewhere within it.
     69 
     70     RBBISymbolTable              fSymbolTable;     // symbol table, holds definitions of
     71                                                      //   $variable symbols.
     72 
     73     HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
     74                                                                                        //   the sets created while parsing rules.
     75                                                                                        //   The key is the string used for creating
     76                                                                                        //   the set.
     77 
     78     UnicodeSet      fRuleSets[] = new UnicodeSet[10];    // Unicode Sets that are needed during
     79                                                      //  the scanning of RBBI rules.  The
     80                                                      //  indicies for these are assigned by the
     81                                                      //  perl script that builds the state tables.
     82                                                      //  See rbbirpt.h.
     83 
     84     int                        fRuleNum;         // Counts each rule as it is scanned.
     85 
     86     int                        fOptionStart;     // Input index of start of a !!option
     87                                                  //   keyword, while being scanned.
     88 
     89 
     90 
     91    static private String gRuleSet_rule_char_pattern       = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
     92    static private String gRuleSet_name_char_pattern       = "[_\\p{L}\\p{N}]";
     93    static private String gRuleSet_digit_char_pattern      = "[0-9]";
     94    static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
     95    static private String gRuleSet_white_space_pattern     = "[\\p{Pattern_White_Space}]";
     96    static private String kAny =  "any";
     97 
     98 
     99 
    100 
    101     //----------------------------------------------------------------------------------------
    102     //
    103     //  Constructor.
    104     //
    105     //----------------------------------------------------------------------------------------
    106     RBBIRuleScanner(RBBIRuleBuilder rb) {
    107         fRB = rb;
    108         fLineNum = 1;
    109 
    110         //
    111         //  Set up the constant Unicode Sets.
    112         //     Note: These could be made static and shared among
    113         //            all instances of RBBIRuleScanners.
    114         fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern);
    115         fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern);
    116         fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern);
    117         fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
    118         fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern);
    119 
    120         fSymbolTable = new RBBISymbolTable(this, rb.fRules);
    121     }
    122 
    123     //----------------------------------------------------------------------------------------
    124     //
    125     //  doParseAction Do some action during rule parsing.
    126     //                       Called by the parse state machine.
    127     //                       Actions build the parse tree and Unicode Sets,
    128     //                       and maintain the parse stack for nested expressions.
    129     //
    130     //----------------------------------------------------------------------------------------
    131     boolean doParseActions(int action) {
    132         RBBINode n = null;
    133 
    134         boolean returnVal = true;
    135 
    136         switch (action) {
    137 
    138         case RBBIRuleParseTable.doExprStart:
    139             pushNewNode(RBBINode.opStart);
    140             fRuleNum++;
    141             break;
    142 
    143         case RBBIRuleParseTable.doExprOrOperator: {
    144             fixOpStack(RBBINode.precOpCat);
    145             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
    146             RBBINode orNode = pushNewNode(RBBINode.opOr);
    147             orNode.fLeftChild = operandNode;
    148             operandNode.fParent = orNode;
    149         }
    150             break;
    151 
    152         case RBBIRuleParseTable.doExprCatOperator:
    153         // concatenation operator.
    154         // For the implicit concatenation of adjacent terms in an expression
    155         // that are
    156         //   not separated by any other operator. Action is invoked between the
    157         //   actions for the two terms.
    158         {
    159             fixOpStack(RBBINode.precOpCat);
    160             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
    161             RBBINode catNode = pushNewNode(RBBINode.opCat);
    162             catNode.fLeftChild = operandNode;
    163             operandNode.fParent = catNode;
    164         }
    165             break;
    166 
    167         case RBBIRuleParseTable.doLParen:
    168             // Open Paren.
    169             //   The openParen node is a dummy operation type with a low
    170             // precedence,
    171             //     which has the affect of ensuring that any real binary op that
    172             //     follows within the parens binds more tightly to the operands than
    173             //     stuff outside of the parens.
    174             pushNewNode(RBBINode.opLParen);
    175             break;
    176 
    177         case RBBIRuleParseTable.doExprRParen:
    178             fixOpStack(RBBINode.precLParen);
    179             break;
    180 
    181         case RBBIRuleParseTable.doNOP:
    182             break;
    183 
    184         case RBBIRuleParseTable.doStartAssign:
    185             // We've just scanned "$variable = "
    186             // The top of the node stack has the $variable ref node.
    187 
    188             // Save the start position of the RHS text in the StartExpression
    189             // node
    190             //   that precedes the $variableReference node on the stack.
    191             //   This will eventually be used when saving the full $variable
    192             // replacement
    193             //   text as a string.
    194             n = fNodeStack[fNodeStackPtr - 1];
    195             n.fFirstPos = fNextIndex; // move past the '='
    196 
    197             // Push a new start-of-expression node; needed to keep parse of the
    198             //   RHS expression happy.
    199             pushNewNode(RBBINode.opStart);
    200             break;
    201 
    202         case RBBIRuleParseTable.doEndAssign: {
    203             // We have reached the end of an assignement statement.
    204             //   Current scan char is the ';' that terminates the assignment.
    205 
    206             // Terminate expression, leaves expression parse tree rooted in TOS
    207             // node.
    208             fixOpStack(RBBINode.precStart);
    209 
    210             RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
    211             RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
    212             RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
    213 
    214             // Save original text of right side of assignment, excluding the
    215             // terminating ';'
    216             //  in the root of the node for the right-hand-side expression.
    217             RHSExprNode.fFirstPos = startExprNode.fFirstPos;
    218             RHSExprNode.fLastPos = fScanIndex;
    219             // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
    220             // RHSExprNode.fLastPos, RHSExprNode.fText);
    221             RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos,
    222                     RHSExprNode.fLastPos);
    223 
    224             // Expression parse tree becomes l. child of the $variable reference
    225             // node.
    226             varRefNode.fLeftChild = RHSExprNode;
    227             RHSExprNode.fParent = varRefNode;
    228 
    229             // Make a symbol table entry for the $variableRef node.
    230             fSymbolTable.addEntry(varRefNode.fText, varRefNode);
    231 
    232             // Clean up the stack.
    233             fNodeStackPtr -= 3;
    234             break;
    235         }
    236 
    237         case RBBIRuleParseTable.doEndOfRule: {
    238             fixOpStack(RBBINode.precStart); // Terminate expression, leaves
    239                                             // expression
    240 
    241             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
    242                 printNodeStack("end of rule");
    243             }
    244             Assert.assrt(fNodeStackPtr == 1);
    245 
    246             // If this rule includes a look-ahead '/', add a endMark node to the
    247             //   expression tree.
    248             if (fLookAheadRule) {
    249                 RBBINode thisRule = fNodeStack[fNodeStackPtr];
    250                 RBBINode endNode = pushNewNode(RBBINode.endMark);
    251                 RBBINode catNode = pushNewNode(RBBINode.opCat);
    252                 fNodeStackPtr -= 2;
    253                 catNode.fLeftChild = thisRule;
    254                 catNode.fRightChild = endNode;
    255                 fNodeStack[fNodeStackPtr] = catNode;
    256                 endNode.fVal = fRuleNum;
    257                 endNode.fLookAheadEnd = true;
    258             }
    259 
    260             // All rule expressions are ORed together.
    261             // The ';' that terminates an expression really just functions as a
    262             // '|' with
    263             //   a low operator prededence.
    264             //
    265             // Each of the four sets of rules are collected separately.
    266             //  (forward, reverse, safe_forward, safe_reverse)
    267             //  OR this rule into the appropriate group of them.
    268             //
    269 
    270             int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
    271 
    272             if (fRB.fTreeRoots[destRules] != null) {
    273                 // This is not the first rule encounted.
    274                 // OR previous stuff (from *destRules)
    275                 // with the current rule expression (on the Node Stack)
    276                 //  with the resulting OR expression going to *destRules
    277                 //
    278                 RBBINode thisRule = fNodeStack[fNodeStackPtr];
    279                 RBBINode prevRules = fRB.fTreeRoots[destRules];
    280                 RBBINode orNode = pushNewNode(RBBINode.opOr);
    281                 orNode.fLeftChild = prevRules;
    282                 prevRules.fParent = orNode;
    283                 orNode.fRightChild = thisRule;
    284                 thisRule.fParent = orNode;
    285                 fRB.fTreeRoots[destRules] = orNode;
    286             } else {
    287                 // This is the first rule encountered (for this direction).
    288                 // Just move its parse tree from the stack to *destRules.
    289                 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
    290             }
    291             fReverseRule = false; // in preparation for the next rule.
    292             fLookAheadRule = false;
    293             fNodeStackPtr = 0;
    294         }
    295             break;
    296 
    297         case RBBIRuleParseTable.doRuleError:
    298             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
    299             returnVal = false;
    300             break;
    301 
    302         case RBBIRuleParseTable.doVariableNameExpectedErr:
    303             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
    304             break;
    305 
    306         //
    307         //  Unary operands + ? *
    308         //    These all appear after the operand to which they apply.
    309         //    When we hit one, the operand (may be a whole sub expression)
    310         //    will be on the top of the stack.
    311         //    Unary Operator becomes TOS, with the old TOS as its one child.
    312         case RBBIRuleParseTable.doUnaryOpPlus: {
    313             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
    314             RBBINode plusNode = pushNewNode(RBBINode.opPlus);
    315             plusNode.fLeftChild = operandNode;
    316             operandNode.fParent = plusNode;
    317         }
    318             break;
    319 
    320         case RBBIRuleParseTable.doUnaryOpQuestion: {
    321             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
    322             RBBINode qNode = pushNewNode(RBBINode.opQuestion);
    323             qNode.fLeftChild = operandNode;
    324             operandNode.fParent = qNode;
    325         }
    326             break;
    327 
    328         case RBBIRuleParseTable.doUnaryOpStar: {
    329             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
    330             RBBINode starNode = pushNewNode(RBBINode.opStar);
    331             starNode.fLeftChild = operandNode;
    332             operandNode.fParent = starNode;
    333         }
    334             break;
    335 
    336         case RBBIRuleParseTable.doRuleChar:
    337         // A "Rule Character" is any single character that is a literal part
    338         // of the regular expression. Like a, b and c in the expression "(abc*)
    339         // | [:L:]"
    340         // These are pretty uncommon in break rules; the terms are more commonly
    341         //  sets. To keep things uniform, treat these characters like as
    342         // sets that just happen to contain only one character.
    343         {
    344             n = pushNewNode(RBBINode.setRef);
    345             String s = String.valueOf((char)fC.fChar);
    346             findSetFor(s, n, null);
    347             n.fFirstPos = fScanIndex;
    348             n.fLastPos = fNextIndex;
    349             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
    350             break;
    351         }
    352 
    353         case RBBIRuleParseTable.doDotAny:
    354         // scanned a ".", meaning match any single character.
    355         {
    356             n = pushNewNode(RBBINode.setRef);
    357             findSetFor(kAny, n, null);
    358             n.fFirstPos = fScanIndex;
    359             n.fLastPos = fNextIndex;
    360             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
    361             break;
    362         }
    363 
    364         case RBBIRuleParseTable.doSlash:
    365             // Scanned a '/', which identifies a look-ahead break position in a
    366             // rule.
    367             n = pushNewNode(RBBINode.lookAhead);
    368             n.fVal = fRuleNum;
    369             n.fFirstPos = fScanIndex;
    370             n.fLastPos = fNextIndex;
    371             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
    372             fLookAheadRule = true;
    373             break;
    374 
    375         case RBBIRuleParseTable.doStartTagValue:
    376             // Scanned a '{', the opening delimiter for a tag value within a
    377             // rule.
    378             n = pushNewNode(RBBINode.tag);
    379             n.fVal = 0;
    380             n.fFirstPos = fScanIndex;
    381             n.fLastPos = fNextIndex;
    382             break;
    383 
    384         case RBBIRuleParseTable.doTagDigit:
    385         // Just scanned a decimal digit that's part of a tag value
    386         {
    387             n = fNodeStack[fNodeStackPtr];
    388             int v = UCharacter.digit((char) fC.fChar, 10);
    389             n.fVal = n.fVal * 10 + v;
    390             break;
    391         }
    392 
    393         case RBBIRuleParseTable.doTagValue:
    394             n = fNodeStack[fNodeStackPtr];
    395             n.fLastPos = fNextIndex;
    396             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
    397             break;
    398 
    399         case RBBIRuleParseTable.doTagExpectedError:
    400             error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
    401             returnVal = false;
    402             break;
    403 
    404         case RBBIRuleParseTable.doOptionStart:
    405             // Scanning a !!option. At the start of string.
    406             fOptionStart = fScanIndex;
    407             break;
    408 
    409         case RBBIRuleParseTable.doOptionEnd: {
    410             String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
    411             if (opt.equals("chain")) {
    412                 fRB.fChainRules = true;
    413             } else if (opt.equals("LBCMNoChain")) {
    414                 fRB.fLBCMNoChain = true;
    415             } else if (opt.equals("forward")) {
    416                 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
    417             } else if (opt.equals("reverse")) {
    418                 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
    419             } else if (opt.equals("safe_forward")) {
    420                 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
    421             } else if (opt.equals("safe_reverse")) {
    422                 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
    423             } else if (opt.equals("lookAheadHardBreak")) {
    424                 fRB.fLookAheadHardBreak = true;
    425             } else {
    426                 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
    427             }
    428             break;
    429         }
    430 
    431         case RBBIRuleParseTable.doReverseDir:
    432             fReverseRule = true;
    433             break;
    434 
    435         case RBBIRuleParseTable.doStartVariableName:
    436             n = pushNewNode(RBBINode.varRef);
    437             n.fFirstPos = fScanIndex;
    438             break;
    439 
    440         case RBBIRuleParseTable.doEndVariableName:
    441             n = fNodeStack[fNodeStackPtr];
    442             if (n == null || n.fType != RBBINode.varRef) {
    443                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
    444                 break;
    445             }
    446             n.fLastPos = fScanIndex;
    447             n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
    448             // Look the newly scanned name up in the symbol table
    449             //   If there's an entry, set the l. child of the var ref to the
    450             // replacement expression.
    451             //   (We also pass through here when scanning assignments, but no harm
    452             // is done, other
    453             //    than a slight wasted effort that seems hard to avoid. Lookup will
    454             // be null)
    455             n.fLeftChild = fSymbolTable.lookupNode(n.fText);
    456             break;
    457 
    458         case RBBIRuleParseTable.doCheckVarDef:
    459             n = fNodeStack[fNodeStackPtr];
    460             if (n.fLeftChild == null) {
    461                 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
    462                 returnVal = false;
    463             }
    464             break;
    465 
    466         case RBBIRuleParseTable.doExprFinished:
    467             break;
    468 
    469         case RBBIRuleParseTable.doRuleErrorAssignExpr:
    470             error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
    471             returnVal = false;
    472             break;
    473 
    474         case RBBIRuleParseTable.doExit:
    475             returnVal = false;
    476             break;
    477 
    478         case RBBIRuleParseTable.doScanUnicodeSet:
    479             scanSet();
    480             break;
    481 
    482         default:
    483             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
    484             returnVal = false;
    485             break;
    486         }
    487         return returnVal;
    488     }
    489 
    490     //----------------------------------------------------------------------------------------
    491     //
    492     //  Error Throw and IllegalArgumentException in response to a rule parse
    493     // error.
    494     //
    495     //----------------------------------------------------------------------------------------
    496     void error(int e) {
    497         String s = "Error " + e + " at line " + fLineNum + " column "
    498                 + fCharNum;
    499         IllegalArgumentException ex = new IllegalArgumentException(s);
    500         throw ex;
    501 
    502     }
    503 
    504     //----------------------------------------------------------------------------------------
    505     //
    506     //  fixOpStack The parse stack holds partially assembled chunks of the parse
    507     // tree.
    508     //               An entry on the stack may be as small as a single setRef node,
    509     //               or as large as the parse tree
    510     //               for an entire expression (this will be the one item left on the stack
    511     //               when the parsing of an RBBI rule completes.
    512     //
    513     //               This function is called when a binary operator is encountered.
    514     //               It looks back up the stack for operators that are not yet associated
    515     //               with a right operand, and if the precedence of the stacked operator >=
    516     //               the precedence of the current operator, binds the operand left,
    517     //               to the previously encountered operator.
    518     //
    519     //----------------------------------------------------------------------------------------
    520     void fixOpStack(int p) {
    521         RBBINode n;
    522         // printNodeStack("entering fixOpStack()");
    523         for (;;) {
    524             n = fNodeStack[fNodeStackPtr - 1]; // an operator node
    525             if (n.fPrecedence == 0) {
    526                 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
    527                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
    528                 return;
    529             }
    530 
    531             if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
    532                 // The most recent operand goes with the current operator,
    533                 //   not with the previously stacked one.
    534                 break;
    535             }
    536             // Stack operator is a binary op ( '|' or concatenation)
    537             //   TOS operand becomes right child of this operator.
    538             //   Resulting subexpression becomes the TOS operand.
    539             n.fRightChild = fNodeStack[fNodeStackPtr];
    540             fNodeStack[fNodeStackPtr].fParent = n;
    541             fNodeStackPtr--;
    542             // printNodeStack("looping in fixOpStack() ");
    543         }
    544 
    545         if (p <= RBBINode.precLParen) {
    546             // Scan is at a right paren or end of expression.
    547             //  The scanned item must match the stack, or else there was an
    548             // error.
    549             //  Discard the left paren (or start expr) node from the stack,
    550             //  leaving the completed (sub)expression as TOS.
    551             if (n.fPrecedence != p) {
    552                 // Right paren encountered matched start of expression node, or
    553                 // end of expression matched with a left paren node.
    554                 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
    555             }
    556             fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
    557             fNodeStackPtr--;
    558             // Delete the now-discarded LParen or Start node.
    559             // delete n;
    560         }
    561         // printNodeStack("leaving fixOpStack()");
    562     }
    563 
    564     //----------------------------------------------------------------------------
    565     //
    566     //       RBBISetTableEl is an entry in the hash table of UnicodeSets that have
    567     //                        been encountered. The val Node will be of nodetype uset
    568     //                        and contain pointers to the actual UnicodeSets.
    569     //                        The Key is the source string for initializing the set.
    570     //
    571     //                        The hash table is used to avoid creating duplicate
    572     //                        unnamed (not $var references) UnicodeSets.
    573     //
    574     //----------------------------------------------------------------------------
    575     static class RBBISetTableEl {
    576         String key;
    577 
    578         RBBINode val;
    579     }
    580 
    581 
    582     //----------------------------------------------------------------------------------------
    583     //
    584     //   findSetFor given a String,
    585     //                  - find the corresponding Unicode Set (uset node)
    586     //                         (create one if necessary)
    587     //                  - Set fLeftChild of the caller's node (should be a setRef node)
    588     //                         to the uset node
    589     //                 Maintain a hash table of uset nodes, so the same one is always used
    590     //                    for the same string.
    591     //                 If a "to adopt" set is provided and we haven't seen this key before,
    592     //                    add the provided set to the hash table.
    593     //                 If the string is one (32 bit) char in length, the set contains
    594     //                    just one element which is the char in question.
    595     //                 If the string is "any", return a set containing all chars.
    596     //
    597     //----------------------------------------------------------------------------------------
    598     void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
    599 
    600         RBBISetTableEl el;
    601 
    602         // First check whether we've already cached a set for this string.
    603         // If so, just use the cached set in the new node.
    604         //   delete any set provided by the caller, since we own it.
    605         el = fSetTable.get(s);
    606         if (el != null) {
    607             node.fLeftChild = el.val;
    608             Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
    609             return;
    610         }
    611 
    612         // Haven't seen this set before.
    613         // If the caller didn't provide us with a prebuilt set,
    614         //   create a new UnicodeSet now.
    615         if (setToAdopt == null) {
    616             if (s.equals(kAny)) {
    617                 setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
    618             } else {
    619                 int c;
    620                 c = UTF16.charAt(s, 0);
    621                 setToAdopt = new UnicodeSet(c, c);
    622             }
    623         }
    624 
    625         //
    626         // Make a new uset node to refer to this UnicodeSet
    627         // This new uset node becomes the child of the caller's setReference
    628         // node.
    629         //
    630         RBBINode usetNode = new RBBINode(RBBINode.uset);
    631         usetNode.fInputSet = setToAdopt;
    632         usetNode.fParent = node;
    633         node.fLeftChild = usetNode;
    634         usetNode.fText = s;
    635 
    636         //
    637         // Add the new uset node to the list of all uset nodes.
    638         //
    639         fRB.fUSetNodes.add(usetNode);
    640 
    641         //
    642         // Add the new set to the set hash table.
    643         //
    644         el = new RBBISetTableEl();
    645         el.key = s;
    646         el.val = usetNode;
    647         fSetTable.put(el.key, el);
    648 
    649         return;
    650     }
    651 
    652     //
    653     //  Assorted Unicode character constants.
    654     //     Numeric because there is no portable way to enter them as literals.
    655     //     (Think EBCDIC).
    656     //
    657     static final int chNEL = 0x85; //    NEL newline variant
    658 
    659     static final int chLS = 0x2028; //    Unicode Line Separator
    660 
    661     //----------------------------------------------------------------------------------------
    662     //
    663     //  stripRules    Return a rules string without unnecessary
    664     //                characters.
    665     //
    666     //----------------------------------------------------------------------------------------
    667     static String stripRules(String rules) {
    668         StringBuilder strippedRules = new StringBuilder();
    669         int rulesLength = rules.length();
    670         for (int idx = 0; idx < rulesLength;) {
    671             char ch = rules.charAt(idx++);
    672             if (ch == '#') {
    673                 while (idx < rulesLength
    674                         && ch != '\r' && ch != '\n' && ch != chNEL) {
    675                     ch = rules.charAt(idx++);
    676                 }
    677             }
    678             if (!UCharacter.isISOControl(ch)) {
    679                 strippedRules.append(ch);
    680             }
    681         }
    682         return strippedRules.toString();
    683     }
    684 
    685     //----------------------------------------------------------------------------------------
    686     //
    687     //  nextCharLL    Low Level Next Char from rule input source.
    688     //                Get a char from the input character iterator,
    689     //                keep track of input position for error reporting.
    690     //
    691     //----------------------------------------------------------------------------------------
    692     int nextCharLL() {
    693         int ch;
    694 
    695         if (fNextIndex >= fRB.fRules.length()) {
    696             return -1;
    697         }
    698         ch = UTF16.charAt(fRB.fRules, fNextIndex);
    699         fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
    700 
    701         if (ch == '\r' ||
    702             ch == chNEL ||
    703             ch == chLS ||
    704             ch == '\n' && fLastChar != '\r') {
    705             // Character is starting a new line.  Bump up the line number, and
    706             //  reset the column to 0.
    707             fLineNum++;
    708             fCharNum = 0;
    709             if (fQuoteMode) {
    710                 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
    711                 fQuoteMode = false;
    712             }
    713         } else {
    714             // Character is not starting a new line.  Except in the case of a
    715             //   LF following a CR, increment the column position.
    716             if (ch != '\n') {
    717                 fCharNum++;
    718             }
    719         }
    720         fLastChar = ch;
    721         return ch;
    722     }
    723 
    724     //---------------------------------------------------------------------------------
    725     //
    726     //   nextChar     for rules scanning.  At this level, we handle stripping
    727     //                out comments and processing backslash character escapes.
    728     //                The rest of the rules grammar is handled at the next level up.
    729     //
    730     //---------------------------------------------------------------------------------
    731     void nextChar(RBBIRuleChar c) {
    732 
    733         // Unicode Character constants needed for the processing done by nextChar(),
    734         //   in hex because literals wont work on EBCDIC machines.
    735 
    736         fScanIndex = fNextIndex;
    737         c.fChar = nextCharLL();
    738         c.fEscaped = false;
    739 
    740         //
    741         //  check for '' sequence.
    742         //  These are recognized in all contexts, whether in quoted text or not.
    743         //
    744         if (c.fChar == '\'') {
    745             if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
    746                 c.fChar = nextCharLL(); // get nextChar officially so character counts
    747                 c.fEscaped = true; //   stay correct.
    748             } else {
    749                 // Single quote, by itself.
    750                 //   Toggle quoting mode.
    751                 //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
    752                 fQuoteMode = !fQuoteMode;
    753                 if (fQuoteMode == true) {
    754                     c.fChar = '(';
    755                 } else {
    756                     c.fChar = ')';
    757                 }
    758                 c.fEscaped = false; // The paren that we return is not escaped.
    759                 return;
    760             }
    761         }
    762 
    763         if (fQuoteMode) {
    764             c.fEscaped = true;
    765         } else {
    766             // We are not in a 'quoted region' of the source.
    767             //
    768             if (c.fChar == '#') {
    769                 // Start of a comment.  Consume the rest of it.
    770                 //  The new-line char that terminates the comment is always returned.
    771                 //  It will be treated as white-space, and serves to break up anything
    772                 //    that might otherwise incorrectly clump together with a comment in
    773                 //    the middle (a variable name, for example.)
    774                 for (;;) {
    775                     c.fChar = nextCharLL();
    776                     if (c.fChar == -1 || // EOF
    777                         c.fChar == '\r' ||
    778                         c.fChar == '\n' ||
    779                         c.fChar == chNEL ||
    780                         c.fChar == chLS)
    781                     {
    782                         break;
    783                     }
    784                 }
    785             }
    786             if (c.fChar == -1) {
    787                 return;
    788             }
    789 
    790             //
    791             //  check for backslash escaped characters.
    792             //  Use String.unescapeAt() to handle them.
    793             //
    794             if (c.fChar == '\\') {
    795                 c.fEscaped = true;
    796                 int[] unescapeIndex = new int[1];
    797                 unescapeIndex[0] = fNextIndex;
    798                 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
    799                 if (unescapeIndex[0] == fNextIndex) {
    800                     error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
    801                 }
    802 
    803                 fCharNum += unescapeIndex[0] - fNextIndex;
    804                 fNextIndex = unescapeIndex[0];
    805             }
    806         }
    807         // putc(c.fChar, stdout);
    808     }
    809 
    810     //---------------------------------------------------------------------------------
    811     //
    812     //  Parse RBBI rules.   The state machine for rules parsing is here.
    813     //                      The state tables are hand-written in the file rbbirpt.txt,
    814     //                      and converted to the form used here by a perl
    815     //                      script rbbicst.pl
    816     //
    817     //---------------------------------------------------------------------------------
    818     void parse() {
    819         int state;
    820         RBBIRuleParseTable.RBBIRuleTableElement tableEl;
    821 
    822         state = 1;
    823         nextChar(fC);
    824         //
    825         // Main loop for the rule parsing state machine.
    826         //   Runs once per state transition.
    827         //   Each time through optionally performs, depending on the state table,
    828         //      - an advance to the the next input char
    829         //      - an action to be performed.
    830         //      - pushing or popping a state to/from the local state return stack.
    831         //
    832         for (;;) {
    833             // Quit if state == 0.  This is the normal way to exit the state machine.
    834             //
    835             if (state == 0) {
    836                 break;
    837             }
    838 
    839             // Find the state table element that matches the input char from the rule, or the
    840             //    class of the input character.  Start with the first table row for this
    841             //    state, then linearly scan forward until we find a row that matches the
    842             //    character.  The last row for each state always matches all characters, so
    843             //    the search will stop there, if not before.
    844             //
    845             tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
    846             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
    847                 System.out.println("char, line, col = (\'" + (char) fC.fChar
    848                         + "\', " + fLineNum + ", " + fCharNum + "    state = "
    849                         + tableEl.fStateName);
    850             }
    851 
    852             for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
    853                 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
    854                 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
    855                     System.out.print(".");
    856                 }
    857                 if (tableEl.fCharClass < 127 && fC.fEscaped == false
    858                         && tableEl.fCharClass == fC.fChar) {
    859                     // Table row specified an individual character, not a set, and
    860                     //   the input character is not escaped, and
    861                     //   the input character matched it.
    862                     break;
    863                 }
    864                 if (tableEl.fCharClass == 255) {
    865                     // Table row specified default, match anything character class.
    866                     break;
    867                 }
    868                 if (tableEl.fCharClass == 254 && fC.fEscaped) {
    869                     // Table row specified "escaped" and the char was escaped.
    870                     break;
    871                 }
    872                 if (tableEl.fCharClass == 253 && fC.fEscaped
    873                         && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
    874                     // Table row specified "escaped P" and the char is either 'p' or 'P'.
    875                     break;
    876                 }
    877                 if (tableEl.fCharClass == 252 && fC.fChar == -1) {
    878                     // Table row specified eof and we hit eof on the input.
    879                     break;
    880                 }
    881 
    882                 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
    883                         fC.fEscaped == false && //   char is not escaped &&
    884                         fC.fChar != -1) { //   char is not EOF
    885                     UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
    886                     if (uniset.contains(fC.fChar)) {
    887                         // Table row specified a character class, or set of characters,
    888                         //   and the current char matches it.
    889                         break;
    890                     }
    891                 }
    892             }
    893 
    894             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
    895                 System.out.println("");
    896             }
    897             //
    898             // We've found the row of the state table that matches the current input
    899             //   character from the rules string.
    900             // Perform any action specified  by this row in the state table.
    901             if (doParseActions(tableEl.fAction) == false) {
    902                 // Break out of the state machine loop if the
    903                 //   the action signalled some kind of error, or
    904                 //   the action was to exit, occurs on normal end-of-rules-input.
    905                 break;
    906             }
    907 
    908             if (tableEl.fPushState != 0) {
    909                 fStackPtr++;
    910                 if (fStackPtr >= kStackSize) {
    911                     System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
    912                     error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
    913                 }
    914                 fStack[fStackPtr] = tableEl.fPushState;
    915             }
    916 
    917             if (tableEl.fNextChar) {
    918                 nextChar(fC);
    919             }
    920 
    921             // Get the next state from the table entry, or from the
    922             //   state stack if the next state was specified as "pop".
    923             if (tableEl.fNextState != 255) {
    924                 state = tableEl.fNextState;
    925             } else {
    926                 state = fStack[fStackPtr];
    927                 fStackPtr--;
    928                 if (fStackPtr < 0) {
    929                     System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
    930                     error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
    931                 }
    932             }
    933 
    934         }
    935 
    936         //
    937         // If there were NO user specified reverse rules, set up the equivalent of ".*;"
    938         //
    939         if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
    940             fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
    941             RBBINode operand = pushNewNode(RBBINode.setRef);
    942             findSetFor(kAny, operand, null);
    943             fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
    944             operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
    945             fNodeStackPtr -= 2;
    946         }
    947 
    948         //
    949         // Parsing of the input RBBI rules is complete.
    950         // We now have a parse tree for the rule expressions
    951         // and a list of all UnicodeSets that are referenced.
    952         //
    953         if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
    954             fSymbolTable.rbbiSymtablePrint();
    955         }
    956         if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
    957             System.out.println("Completed Forward Rules Parse Tree...");
    958             fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
    959             System.out.println("\nCompleted Reverse Rules Parse Tree...");
    960             fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
    961             System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
    962             if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
    963                 System.out.println("  -- null -- ");
    964             } else {
    965                 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
    966             }
    967             System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
    968             if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
    969                 System.out.println("  -- null -- ");
    970             } else {
    971                 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
    972             }
    973         }
    974     }
    975 
    976     //---------------------------------------------------------------------------------
    977     //
    978     //  printNodeStack     for debugging...
    979     //
    980     //---------------------------------------------------------------------------------
    981     ///CLOVER:OFF
    982     void printNodeStack(String title) {
    983         int i;
    984         System.out.println(title + ".  Dumping node stack...\n");
    985         for (i = fNodeStackPtr; i > 0; i--) {
    986             fNodeStack[i].printTree(true);
    987         }
    988     }
    989     ///CLOVER:ON
    990 
    991     //---------------------------------------------------------------------------------
    992     //
    993     //  pushNewNode   create a new RBBINode of the specified type and push it
    994     //                onto the stack of nodes.
    995     //
    996     //---------------------------------------------------------------------------------
    997     RBBINode pushNewNode(int nodeType) {
    998         fNodeStackPtr++;
    999         if (fNodeStackPtr >= kStackSize) {
   1000             System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
   1001             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
   1002         }
   1003         fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
   1004         return fNodeStack[fNodeStackPtr];
   1005     }
   1006 
   1007     //---------------------------------------------------------------------------------
   1008     //
   1009     //  scanSet    Construct a UnicodeSet from the text at the current scan
   1010     //             position.  Advance the scan position to the first character
   1011     //             after the set.
   1012     //
   1013     //             A new RBBI setref node referring to the set is pushed onto the node
   1014     //             stack.
   1015     //
   1016     //             The scan position is normally under the control of the state machine
   1017     //             that controls rule parsing.  UnicodeSets, however, are parsed by
   1018     //             the UnicodeSet constructor, not by the RBBI rule parser.
   1019     //
   1020     //---------------------------------------------------------------------------------
   1021     void scanSet() {
   1022         UnicodeSet uset = null;
   1023         int startPos;
   1024         ParsePosition pos = new ParsePosition(fScanIndex);
   1025         int i;
   1026 
   1027         startPos = fScanIndex;
   1028         try {
   1029             uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
   1030         } catch (Exception e) { // TODO:  catch fewer exception types.
   1031             // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
   1032             error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
   1033         }
   1034 
   1035         // Verify that the set contains at least one code point.
   1036         //
   1037         if (uset.isEmpty()) {
   1038             // This set is empty.
   1039             //  Make it an error, because it almost certainly is not what the user wanted.
   1040             //  Also, avoids having to think about corner cases in the tree manipulation code
   1041             //   that occurs later on.
   1042             //  TODO:  this shouldn't be an error; it does happen.
   1043             error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
   1044         }
   1045 
   1046         // Advance the RBBI parse postion over the UnicodeSet pattern.
   1047         //   Don't just set fScanIndex because the line/char positions maintained
   1048         //   for error reporting would be thrown off.
   1049         i = pos.getIndex();
   1050         for (;;) {
   1051             if (fNextIndex >= i) {
   1052                 break;
   1053             }
   1054             nextCharLL();
   1055         }
   1056 
   1057         RBBINode n;
   1058 
   1059         n = pushNewNode(RBBINode.setRef);
   1060         n.fFirstPos = startPos;
   1061         n.fLastPos = fNextIndex;
   1062         n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
   1063         //  findSetFor() serves several purposes here:
   1064         //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
   1065         //     - Mantains collection of all sets in use, needed later for establishing
   1066         //          character categories for run time engine.
   1067         //     - Eliminates mulitiple instances of the same set.
   1068         //     - Creates a new uset node if necessary (if this isn't a duplicate.)
   1069         findSetFor(n.fText, n, uset);
   1070     }
   1071 
   1072 }
   1073 
   1074