Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 **********************************************************************
      6 *   Copyright (c) 2002-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 **********************************************************************
      9 */
     10 
     11 package android.icu.text;
     12 
     13 import java.util.ArrayList;
     14 import java.util.Collection;
     15 import java.util.HashSet;
     16 import java.util.List;
     17 import java.util.Set;
     18 import java.util.SortedSet;
     19 import java.util.TreeSet;
     20 
     21 import android.icu.impl.Assert;
     22 import android.icu.lang.UCharacter;
     23 import android.icu.lang.UProperty;
     24 
     25 //
     26 //  class RBBITableBuilder is part of the RBBI rule compiler.
     27 //                         It builds the state transition table used by the RBBI runtime
     28 //                         from the expression syntax tree generated by the rule scanner.
     29 //
     30 //                         This class is part of the RBBI implementation only.
     31 //                         There is no user-visible public API here.
     32 //
     33 class RBBITableBuilder {
     34 
     35 
     36 
     37     //
     38     //  RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
     39     //                        one for each state.
     40     static class RBBIStateDescriptor {
     41         boolean      fMarked;
     42         int          fAccepting;
     43         int          fLookAhead;
     44         SortedSet<Integer> fTagVals;
     45         int          fTagsIdx;
     46         Set<RBBINode> fPositions;                 // Set of parse tree positions associated
     47                                                   //   with this state.  Unordered (it's a set).
     48                                                   //   UVector contents are RBBINode *
     49 
     50         int[]        fDtran;                      // Transitions out of this state.
     51                                                    //   indexed by input character
     52                                                    //   contents is int index of dest state
     53                                                    //   in RBBITableBuilder.fDStates
     54 
     55         RBBIStateDescriptor(int maxInputSymbol) {
     56             fTagVals = new TreeSet<Integer>();
     57             fPositions = new HashSet<RBBINode>();
     58             fDtran = new int[maxInputSymbol+1];    // fDtran needs to be pre-sized.
     59                                                     //   It is indexed by input symbols, and will
     60                                                     //   hold  the next state number for each
     61                                                     //   symbol.
     62         }
     63     }
     64 
     65 
     66     private  RBBIRuleBuilder  fRB;
     67     private  int             fRootIx;             // The array index into RBBIRuleBuilder.fTreeRoots
     68                                                    //   for the parse tree to operate on.
     69                                                    //   Too bad Java can't do indirection more easily!
     70 
     71     private  List<RBBIStateDescriptor> fDStates;    //  D states (Aho's terminology)
     72                                                     //  Index is state number
     73                                                     //  Contents are RBBIStateDescriptor pointers.
     74 
     75     //-----------------------------------------------------------------------------
     76     //
     77     //  Constructor    for RBBITableBuilder.
     78     //
     79     //                 rootNode is an index into the array of root nodes that is held by
     80     //                          the overall RBBIRuleBuilder.
     81     //-----------------------------------------------------------------------------
     82     RBBITableBuilder(RBBIRuleBuilder rb,  int rootNodeIx)  {
     83            fRootIx     = rootNodeIx;
     84            fRB         = rb;
     85            fDStates    = new ArrayList<RBBIStateDescriptor>();
     86         }
     87 
     88 
     89 
     90 
     91        //-----------------------------------------------------------------------------
     92        //
     93        //   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
     94        //                               table from the RBBI rules parse tree.
     95        //
     96        //-----------------------------------------------------------------------------
     97        void  build() {
     98            // If there were no rules, just return.  This situation can easily arise
     99            //   for the reverse rules.
    100            if (fRB.fTreeRoots[fRootIx]==null) {
    101                return;
    102            }
    103 
    104            //
    105            // Walk through the tree, replacing any references to $variables with a copy of the
    106            //   parse tree for the substition expression.
    107            //
    108            fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables();
    109            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) {
    110                System.out.println("Parse tree after flattening variable references.");
    111                fRB.fTreeRoots[fRootIx].printTree(true);
    112            }
    113 
    114            //
    115            // If the rules contained any references to {bof}
    116            //   add a {bof} <cat> <former root of tree> to the
    117            //   tree.  Means that all matches must start out with the
    118            //   {bof} fake character.
    119            //
    120            if (fRB.fSetBuilder.sawBOF()) {
    121                RBBINode bofTop    = new RBBINode(RBBINode.opCat);
    122                RBBINode bofLeaf   = new RBBINode(RBBINode.leafChar);
    123                bofTop.fLeftChild  = bofLeaf;
    124                bofTop.fRightChild = fRB.fTreeRoots[fRootIx];
    125                bofLeaf.fParent    = bofTop;
    126                bofLeaf.fVal       = 2;      // Reserved value for {bof}.
    127                fRB.fTreeRoots[fRootIx] = bofTop;
    128            }
    129 
    130            //
    131            // Add a unique right-end marker to the expression.
    132            //   Appears as a cat-node, left child being the original tree,
    133            //   right child being the end marker.
    134            //
    135            RBBINode cn = new RBBINode(RBBINode.opCat);
    136            cn.fLeftChild = fRB.fTreeRoots[fRootIx];
    137            fRB.fTreeRoots[fRootIx].fParent = cn;
    138            cn.fRightChild = new RBBINode(RBBINode.endMark);
    139            cn.fRightChild.fParent = cn;
    140            fRB.fTreeRoots[fRootIx] = cn;
    141 
    142            //
    143            //  Replace all references to UnicodeSets with the tree for the equivalent
    144            //      expression.
    145            //
    146            fRB.fTreeRoots[fRootIx].flattenSets();
    147            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) {
    148                System.out.println("Parse tree after flattening Unicode Set references.");
    149                fRB.fTreeRoots[fRootIx].printTree(true);
    150            }
    151 
    152 
    153            //
    154            // calculate the functions nullable, firstpos, lastpos and followpos on
    155            // nodes in the parse tree.
    156            //    See the alogrithm description in Aho.
    157            //    Understanding how this works by looking at the code alone will be
    158            //       nearly impossible.
    159            //
    160            calcNullable(fRB.fTreeRoots[fRootIx]);
    161            calcFirstPos(fRB.fTreeRoots[fRootIx]);
    162            calcLastPos(fRB.fTreeRoots[fRootIx]);
    163            calcFollowPos(fRB.fTreeRoots[fRootIx]);
    164            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) {
    165                System.out.print("\n");
    166                printPosSets(fRB.fTreeRoots[fRootIx]);
    167            }
    168 
    169            //
    170            //  For "chained" rules, modify the followPos sets
    171            //
    172            if (fRB.fChainRules) {
    173                calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
    174            }
    175 
    176            //
    177            //  BOF (start of input) test fixup.
    178            //
    179            if (fRB.fSetBuilder.sawBOF()) {
    180                bofFixup();
    181            }
    182 
    183            //
    184            // Build the DFA state transition tables.
    185            //
    186            buildStateTable();
    187            flagAcceptingStates();
    188            flagLookAheadStates();
    189            flagTaggedStates();
    190 
    191            //
    192            // Update the global table of rule status {tag} values
    193            // The rule builder has a global vector of status values that are common
    194            //    for all tables.  Merge the ones from this table into the global set.
    195            //
    196            mergeRuleStatusVals();
    197 
    198            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();}
    199        }
    200 
    201 
    202 
    203        //-----------------------------------------------------------------------------
    204        //
    205        //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
    206        //
    207        //-----------------------------------------------------------------------------
    208        void calcNullable(RBBINode n) {
    209            if (n == null) {
    210                return;
    211            }
    212            if (n.fType == RBBINode.setRef ||
    213                n.fType == RBBINode.endMark ) {
    214                // These are non-empty leaf node types.
    215                n.fNullable = false;
    216                return;
    217            }
    218 
    219            if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) {
    220                // Lookahead marker node.  It's a leaf, so no recursion on children.
    221                // It's nullable because it does not match any literal text from the input stream.
    222                n.fNullable = true;
    223                return;
    224            }
    225 
    226 
    227            // The node is not a leaf.
    228            //  Calculate nullable on its children.
    229            calcNullable(n.fLeftChild);
    230            calcNullable(n.fRightChild);
    231 
    232            // Apply functions from table 3.40 in Aho
    233            if (n.fType == RBBINode.opOr) {
    234                n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable;
    235            }
    236            else if (n.fType == RBBINode.opCat) {
    237                n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable;
    238            }
    239            else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) {
    240                n.fNullable = true;
    241            }
    242            else {
    243                n.fNullable = false;
    244            }
    245        }
    246 
    247 
    248 
    249 
    250        //-----------------------------------------------------------------------------
    251        //
    252        //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
    253        //
    254        //-----------------------------------------------------------------------------
    255        void calcFirstPos(RBBINode n) {
    256            if (n == null) {
    257                return;
    258            }
    259            if (n.fType == RBBINode.leafChar  ||
    260                n.fType == RBBINode.endMark   ||
    261                n.fType == RBBINode.lookAhead ||
    262                n.fType == RBBINode.tag) {
    263                // These are non-empty leaf node types.
    264                n.fFirstPosSet.add(n);
    265                return;
    266            }
    267 
    268            // The node is not a leaf.
    269            //  Calculate firstPos on its children.
    270            calcFirstPos(n.fLeftChild);
    271            calcFirstPos(n.fRightChild);
    272 
    273            // Apply functions from table 3.40 in Aho
    274            if (n.fType == RBBINode.opOr) {
    275                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
    276                n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
    277            }
    278            else if (n.fType == RBBINode.opCat) {
    279                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
    280                if (n.fLeftChild.fNullable) {
    281                    n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
    282                }
    283            }
    284            else if (n.fType == RBBINode.opStar ||
    285                     n.fType == RBBINode.opQuestion ||
    286                     n.fType == RBBINode.opPlus) {
    287                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
    288            }
    289        }
    290 
    291 
    292 
    293        //-----------------------------------------------------------------------------
    294        //
    295        //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
    296        //
    297        //-----------------------------------------------------------------------------
    298        void calcLastPos(RBBINode n) {
    299            if (n == null) {
    300                return;
    301            }
    302            if (n.fType == RBBINode.leafChar  ||
    303                n.fType == RBBINode.endMark   ||
    304                n.fType == RBBINode.lookAhead ||
    305                n.fType == RBBINode.tag) {
    306                // These are non-empty leaf node types.
    307                n.fLastPosSet.add(n);
    308                return;
    309            }
    310 
    311            // The node is not a leaf.
    312            //  Calculate lastPos on its children.
    313            calcLastPos(n.fLeftChild);
    314            calcLastPos(n.fRightChild);
    315 
    316            // Apply functions from table 3.40 in Aho
    317            if (n.fType == RBBINode.opOr) {
    318                n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
    319                n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
    320            }
    321            else if (n.fType == RBBINode.opCat) {
    322                n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
    323                if (n.fRightChild.fNullable) {
    324                    n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
    325                }
    326            }
    327            else if (n.fType == RBBINode.opStar     ||
    328                     n.fType == RBBINode.opQuestion ||
    329                     n.fType == RBBINode.opPlus) {
    330                n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
    331            }
    332        }
    333 
    334 
    335 
    336        //-----------------------------------------------------------------------------
    337        //
    338        //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
    339        //
    340        //-----------------------------------------------------------------------------
    341        void calcFollowPos(RBBINode n) {
    342            if (n == null ||
    343                n.fType == RBBINode.leafChar ||
    344                n.fType == RBBINode.endMark) {
    345                return;
    346            }
    347 
    348            calcFollowPos(n.fLeftChild);
    349            calcFollowPos(n.fRightChild);
    350 
    351            // Aho rule #1
    352            if (n.fType == RBBINode.opCat) {
    353                for (RBBINode i /* is 'i' in Aho's description */ : n.fLeftChild.fLastPosSet) {
    354                    i.fFollowPos.addAll(n.fRightChild.fFirstPosSet);
    355                }
    356            }
    357 
    358            // Aho rule #2
    359            if (n.fType == RBBINode.opStar ||
    360                n.fType == RBBINode.opPlus) {
    361                for (RBBINode i /* again, n and i are the names from Aho's description */ : n.fLastPosSet) {
    362                    i.fFollowPos.addAll(n.fFirstPosSet);
    363                }
    364            }
    365        }
    366 
    367        //-----------------------------------------------------------------------------
    368        //
    369        //           addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
    370        //                               as roots of a rule to a destination vector.
    371        //
    372        //-----------------------------------------------------------------------------
    373        void addRuleRootNodes(List<RBBINode> dest, RBBINode node) {
    374            if (node == null) {
    375                return;
    376            }
    377            if (node.fRuleRoot) {
    378                dest.add(node);
    379                // Note: rules cannot nest. If we found a rule start node,
    380                //       no child node can also be a start node.
    381                return;
    382            }
    383            addRuleRootNodes(dest, node.fLeftChild);
    384            addRuleRootNodes(dest, node.fRightChild);
    385        }
    386 
    387        //-----------------------------------------------------------------------------
    388        //
    389        //   calcChainedFollowPos.    Modify the previously calculated followPos sets
    390        //                            to implement rule chaining.  NOT described by Aho
    391        //
    392        //-----------------------------------------------------------------------------
    393        void calcChainedFollowPos(RBBINode tree) {
    394 
    395            List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
    396            List<RBBINode> leafNodes      = new ArrayList<RBBINode>();
    397 
    398             // get a list of all endmarker nodes.
    399            tree.findNodes(endMarkerNodes, RBBINode.endMark);
    400 
    401            // get a list all leaf nodes
    402            tree.findNodes(leafNodes, RBBINode.leafChar);
    403 
    404            // Collect all leaf nodes that can start matches for rules
    405            // with inbound chaining enabled, which is the union of the
    406            // firstPosition sets from each of the rule root nodes.
    407 
    408            List<RBBINode> ruleRootNodes = new ArrayList<RBBINode>();
    409            addRuleRootNodes(ruleRootNodes, tree);
    410 
    411            Set<RBBINode> matchStartNodes = new HashSet<RBBINode>();
    412            for (RBBINode node: ruleRootNodes) {
    413                if (node.fChainIn) {
    414                    matchStartNodes.addAll(node.fFirstPosSet);
    415                }
    416            }
    417 
    418            // Iterate over all leaf nodes,
    419            //
    420            for (RBBINode tNode : leafNodes) {
    421                RBBINode endNode = null;
    422 
    423                // Identify leaf nodes that correspond to overall rule match positions.
    424                //   These include an endMarkerNode in their followPos sets.
    425                for (RBBINode endMarkerNode : endMarkerNodes) {
    426                    if (tNode.fFollowPos.contains(endMarkerNode)) {
    427                        endNode = tNode;
    428                        break;
    429                    }
    430                }
    431                if (endNode == null) {
    432                    // node wasn't an end node.  Try again with the next.
    433                    continue;
    434                }
    435 
    436                // We've got a node that can end a match.
    437 
    438                // Line Break Specific hack:  If this node's val correspond to the $CM char class,
    439                //                            don't chain from it.
    440                // TODO:  Add rule syntax for this behavior, get specifics out of here and
    441                //        into the rule file.
    442                if (fRB.fLBCMNoChain) {
    443                    int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal);
    444                    if (c != -1) {
    445                        // c == -1 occurs with sets containing only the {eof} marker string.
    446                        int cLBProp = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
    447                        if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) {
    448                            continue;
    449                        }
    450                    }
    451                }
    452 
    453 
    454                // Now iterate over the nodes that can start a match, looking for ones
    455                //   with the same char class as our ending node.
    456                for (RBBINode startNode : matchStartNodes) {
    457                    if (startNode.fType != RBBINode.leafChar) {
    458                        continue;
    459                    }
    460 
    461                    if (endNode.fVal == startNode.fVal) {
    462                        // The end val (character class) of one possible match is the
    463                        //   same as the start of another.
    464 
    465                        // Add all nodes from the followPos of the start node to the
    466                        //  followPos set of the end node, which will have the effect of
    467                        //  letting matches transition from a match state at endNode
    468                        //  to the second char of a match starting with startNode.
    469                        endNode.fFollowPos.addAll(startNode.fFollowPos);
    470                    }
    471                }
    472            }
    473        }
    474 
    475 
    476        //-----------------------------------------------------------------------------
    477        //
    478        //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
    479        //                Do an swizzle similar to chaining, modifying the followPos set of
    480        //                the bofNode to include the followPos nodes from other {bot} nodes
    481        //                scattered through the tree.
    482        //
    483        //                This function has much in common with calcChainedFollowPos().
    484        //
    485        //-----------------------------------------------------------------------------
    486        void bofFixup() {
    487            //
    488            //   The parse tree looks like this ...
    489            //         fTree root  --.       <cat>
    490            //                               /     \
    491            //                            <cat>   <#end node>
    492            //                           /     \
    493            //                     <bofNode>   rest
    494            //                               of tree
    495            //
    496            //    We will be adding things to the followPos set of the <bofNode>
    497            //
    498            RBBINode  bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild;
    499            Assert.assrt(bofNode.fType == RBBINode.leafChar);
    500            Assert.assrt(bofNode.fVal == 2);
    501 
    502            // Get all nodes that can be the start a match of the user-written rules
    503            //  (excluding the fake bofNode)
    504            //  We want the nodes that can start a match in the
    505            //     part labeled "rest of tree"
    506            //
    507            Set<RBBINode> matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
    508            for (RBBINode startNode : matchStartNodes) {
    509                if (startNode.fType != RBBINode.leafChar) {
    510                    continue;
    511                }
    512 
    513                if (startNode.fVal == bofNode.fVal) {
    514                    //  We found a leaf node corresponding to a {bof} that was
    515                    //    explicitly written into a rule.
    516                    //  Add everything from the followPos set of this node to the
    517                    //    followPos set of the fake bofNode at the start of the tree.
    518                    //
    519                    bofNode.fFollowPos.addAll(startNode.fFollowPos);
    520                }
    521            }
    522        }
    523 
    524        //-----------------------------------------------------------------------------
    525        //
    526        //   buildStateTable()    Determine the set of runtime DFA states and the
    527        //                        transition tables for these states, by the algorithm
    528        //                        of fig. 3.44 in Aho.
    529        //
    530        //                        Most of the comments are quotes of Aho's psuedo-code.
    531        //
    532        //-----------------------------------------------------------------------------
    533        void buildStateTable() {
    534            //
    535            // Add a dummy state 0 - the stop state.  Not from Aho.
    536            int      lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1;
    537            RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol);
    538            fDStates.add(failState);
    539 
    540            // initially, the only unmarked state in Dstates is firstpos(root),
    541            //       where toot is the root of the syntax tree for (r)#;
    542            RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol);
    543            initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet);
    544            fDStates.add(initialState);
    545 
    546            // while there is an unmarked state T in Dstates do begin
    547            for (;;) {
    548                RBBIStateDescriptor T = null;
    549                int              tx;
    550                for (tx=1; tx<fDStates.size(); tx++) {
    551                    RBBIStateDescriptor temp  = fDStates.get(tx);
    552                    if (temp.fMarked == false) {
    553                        T = temp;
    554                        break;
    555                    }
    556                }
    557                if (T == null) {
    558                    break;
    559                }
    560 
    561                // mark T;
    562                T.fMarked = true;
    563 
    564                // for each input symbol a do begin
    565                int  a;
    566                for (a = 1; a<=lastInputSymbol; a++) {
    567                    // let U be the set of positions that are in followpos(p)
    568                    //    for some position p in T
    569                    //    such that the symbol at position p is a;
    570                    Set<RBBINode> U = null;
    571                    for (RBBINode p : T.fPositions) {
    572                        if ((p.fType == RBBINode.leafChar) &&  (p.fVal == a)) {
    573                            if (U == null) {
    574                                U = new HashSet<RBBINode>();
    575                            }
    576                            U.addAll(p.fFollowPos);
    577                        }
    578                    }
    579 
    580                    // if U is not empty and not in DStates then
    581                    int  ux = 0;
    582                    boolean    UinDstates = false;
    583                    if (U != null) {
    584                        Assert.assrt(U.size() > 0);
    585                        int  ix;
    586                        for (ix=0; ix<fDStates.size(); ix++) {
    587                            RBBIStateDescriptor temp2;
    588                            temp2 = fDStates.get(ix);
    589                            if (U.equals(temp2.fPositions)) {
    590                                U  = temp2.fPositions;
    591                                ux = ix;
    592                                UinDstates = true;
    593                                break;
    594                            }
    595                        }
    596 
    597                        // Add U as an unmarked state to Dstates
    598                        if (!UinDstates)
    599                        {
    600                            RBBIStateDescriptor newState = new RBBIStateDescriptor(lastInputSymbol);
    601                            newState.fPositions = U;
    602                            fDStates.add(newState);
    603                            ux = fDStates.size()-1;
    604                        }
    605 
    606                        // Dtran[T, a] := U;
    607                        T.fDtran[a] = ux;
    608                    }
    609                }
    610            }
    611        }
    612 
    613 
    614 
    615        //-----------------------------------------------------------------------------
    616        //
    617        //   flagAcceptingStates    Identify accepting states.
    618        //                          First get a list of all of the end marker nodes.
    619        //                          Then, for each state s,
    620        //                              if s contains one of the end marker nodes in its list of tree positions then
    621        //                                  s is an accepting state.
    622        //
    623        //-----------------------------------------------------------------------------
    624        void     flagAcceptingStates() {
    625            List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
    626            RBBINode    endMarker;
    627            int     i;
    628            int     n;
    629 
    630            fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark);
    631 
    632            for (i=0; i<endMarkerNodes.size(); i++) {
    633                endMarker = endMarkerNodes.get(i);
    634                for (n=0; n<fDStates.size(); n++) {
    635                    RBBIStateDescriptor sd = fDStates.get(n);
    636                    //if (sd.fPositions.indexOf(endMarker) >= 0) {
    637                    if (sd.fPositions.contains(endMarker)) {
    638                        // Any non-zero value for fAccepting means this is an accepting node.
    639                        // The value is what will be returned to the user as the break status.
    640                        // If no other value was specified, force it to -1.
    641 
    642                        if (sd.fAccepting==0) {
    643                         // State hasn't been marked as accepting yet.  Do it now.
    644                            sd.fAccepting = endMarker.fVal;
    645                            if (sd.fAccepting == 0) {
    646                                sd.fAccepting = -1;
    647                         }
    648                        }
    649                        if (sd.fAccepting==-1 && endMarker.fVal != 0) {
    650                         // Both lookahead and non-lookahead accepting for this state.
    651                         // Favor the look-ahead.  Expedient for line break.
    652                         // TODO:  need a more elegant resolution for conflicting rules.
    653                         sd.fAccepting = endMarker.fVal;
    654                     }
    655                         // implicit else:
    656                         // if sd.fAccepting already had a value other than 0 or -1, leave it be.
    657 
    658                        // If the end marker node is from a look-ahead rule, set
    659                        //   the fLookAhead field or this state also.
    660                        if (endMarker.fLookAheadEnd) {
    661                         // TODO:  don't change value if already set?
    662                         // TODO:  allow for more than one active look-ahead rule in engine.
    663                         //        Make value here an index to a side array in engine?
    664                            sd.fLookAhead = sd.fAccepting;
    665                        }
    666                    }
    667                }
    668            }
    669        }
    670 
    671 
    672        //-----------------------------------------------------------------------------
    673        //
    674        //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
    675        //
    676        //-----------------------------------------------------------------------------
    677        void     flagLookAheadStates() {
    678            List<RBBINode> lookAheadNodes = new ArrayList<RBBINode>();
    679            RBBINode    lookAheadNode;
    680            int     i;
    681            int     n;
    682 
    683            fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);
    684            for (i=0; i<lookAheadNodes.size(); i++) {
    685                lookAheadNode = lookAheadNodes.get(i);
    686 
    687                for (n=0; n<fDStates.size(); n++) {
    688                    RBBIStateDescriptor sd = fDStates.get(n);
    689                    if (sd.fPositions.contains(lookAheadNode)) {
    690                        sd.fLookAhead = lookAheadNode.fVal;
    691                    }
    692                }
    693            }
    694        }
    695 
    696 
    697 
    698 
    699        //-----------------------------------------------------------------------------
    700        //
    701        //    flagTaggedStates
    702        //
    703        //-----------------------------------------------------------------------------
    704        void     flagTaggedStates() {
    705            List<RBBINode> tagNodes = new ArrayList<RBBINode>();
    706            RBBINode    tagNode;
    707            int     i;
    708            int     n;
    709 
    710            fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag);
    711            for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
    712                tagNode = tagNodes.get(i);
    713 
    714                for (n=0; n<fDStates.size(); n++) {              //    For each state  s (row in the state table)
    715                    RBBIStateDescriptor sd = fDStates.get(n);
    716                    if (sd.fPositions.contains(tagNode)) {       //       if  s include the tag node t
    717                        sd.fTagVals.add(Integer.valueOf(tagNode.fVal));
    718                    }
    719                }
    720            }
    721        }
    722 
    723 
    724 
    725        //-----------------------------------------------------------------------------
    726        //
    727        //  mergeRuleStatusVals
    728        //
    729        //      Allocate positions in the  global array of rule status {tag} values
    730        //
    731        //      The RBBI runtime uses an array of {sets of status values} that can
    732        //      be returned for boundaries.  Each accepting state that has non-zero
    733        //      status includes an index into this array.  The format of the array
    734        //      is
    735        //           Num of status values in group 1
    736        //              status val
    737        //              status val
    738        //              ...
    739        //           Num of status vals in group 2
    740        //              status val
    741        //              status val
    742        //              ...
    743        //           etc.
    744        //
    745        //
    746        //-----------------------------------------------------------------------------
    747 
    748        void  mergeRuleStatusVals() {
    749            //
    750            //  The basic outline of what happens here is this...
    751            //
    752            //    for each state in this state table
    753            //       if the status tag list for this state is in the global statuses list
    754            //           record where and
    755            //           continue with the next state
    756            //       else
    757            //           add the tag list for this state to the global list.
    758            //
    759            int n;
    760 
    761            // Pre-load a single tag of {0} into the table.
    762            //   We will need this as a default, for rule sets with no explicit tagging,
    763            //   or with explicit tagging of {0}.
    764            if (fRB.fRuleStatusVals.size() == 0) {
    765                fRB.fRuleStatusVals.add(Integer.valueOf(1));    // Num of statuses in group
    766                fRB.fRuleStatusVals.add(Integer.valueOf(0));    //   and our single status of zero
    767 
    768                SortedSet<Integer> s0 = new TreeSet<Integer>();
    769                Integer izero = Integer.valueOf(0);
    770                fRB.fStatusSets.put(s0, izero);
    771                SortedSet<Integer> s1 = new TreeSet<Integer>();
    772                s1.add(izero);
    773                fRB.fStatusSets.put(s0, izero);
    774            }
    775 
    776            //    For each state, check whether the state's status tag values are
    777            //       already entered into the status values array, and add them if not.
    778            for (n=0; n<fDStates.size(); n++) {
    779                RBBIStateDescriptor sd = fDStates.get(n);
    780                Set<Integer> statusVals = sd.fTagVals;
    781                Integer arrayIndexI = fRB.fStatusSets.get(statusVals);
    782                if (arrayIndexI == null) {
    783                    // This is the first encounter of this set of status values.
    784                    //   Add them to the statusSets map, This map associates
    785                    //   the set of status values with an index in the runtime status
    786                    //   values array.
    787                    arrayIndexI = Integer.valueOf(fRB.fRuleStatusVals.size());
    788                    fRB.fStatusSets.put(statusVals, arrayIndexI);
    789 
    790                    // Add the new set of status values to the vector of values that
    791                    //   will eventually become the array used by the runtime engine.
    792                    fRB.fRuleStatusVals.add(Integer.valueOf(statusVals.size()));
    793                    fRB.fRuleStatusVals.addAll(statusVals);
    794                }
    795 
    796                // Save the runtime array index back into the state descriptor.
    797                sd.fTagsIdx = arrayIndexI.intValue();
    798            }
    799        }
    800 
    801 
    802 
    803 
    804 
    805 
    806 
    807        //-----------------------------------------------------------------------------
    808        //
    809        //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
    810        //                 for each node in the tree.
    811        //
    812        //-----------------------------------------------------------------------------
    813 
    814        void printPosSets(RBBINode n) {
    815            if (n==null) {
    816                return;
    817            }
    818            RBBINode.printNode(n);
    819            System.out.print("         Nullable:  " + n.fNullable);
    820 
    821            System.out.print("         firstpos:  ");
    822            printSet(n.fFirstPosSet);
    823 
    824            System.out.print("         lastpos:   ");
    825            printSet(n.fLastPosSet);
    826 
    827            System.out.print("         followpos: ");
    828            printSet(n.fFollowPos);
    829 
    830            printPosSets(n.fLeftChild);
    831            printPosSets(n.fRightChild);
    832        }
    833 
    834 
    835 
    836 
    837        //-----------------------------------------------------------------------------
    838        //
    839        //   getTableSize()    Calculate the size in bytes of the runtime form of this
    840        //                     state transition table.
    841        //
    842        //          Note:  Refer to common/rbbidata.h from ICU4C for the declarations
    843        //                 of the structures being matched by this calculation.
    844        //
    845        //-----------------------------------------------------------------------------
    846        int  getTableSize()  {
    847            int    size = 0;
    848            int    numRows;
    849            int    numCols;
    850            int    rowSize;
    851 
    852            if (fRB.fTreeRoots[fRootIx] == null) {
    853                return 0;
    854            }
    855 
    856            size    = /*sizeof(RBBIStateTable) - 4 */ 16;    // The header, with no rows to the table.
    857 
    858            numRows = fDStates.size();
    859            numCols = fRB.fSetBuilder.getNumCharCategories();
    860 
    861            //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
    862            //        Therefore we subtract two from numCols when determining
    863            //        how much storage to add to a row for the total columns.
    864            // rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
    865            rowSize = 8 + 2*numCols;
    866            size   += numRows * rowSize;
    867            while (size % 8 > 0) {    // Size must be multiple of 8 bytes in size.
    868                size++;
    869            }
    870 
    871            return size;
    872        }
    873 
    874 
    875 
    876        //-----------------------------------------------------------------------------
    877        //
    878        //   exportTable()    export the state transition table in the ICU4C format.
    879        //
    880        //                    Most of the table is 16 bit shorts.  This function exports
    881        //                    the whole thing as an array of shorts.
    882        //
    883        //                    The size of the array must be rounded up to a multiple of
    884        //                    8 bytes.
    885        //
    886        //                    See struct RBBIStateTable in ICU4C, common/rbbidata.h
    887        //
    888        //-----------------------------------------------------------------------------
    889 
    890        short [] exportTable() {
    891            int                state;
    892            int                col;
    893 
    894            if (fRB.fTreeRoots[fRootIx] == null) {
    895                return new short[0];
    896            }
    897 
    898            Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
    899                fDStates.size() < 0x7fff);
    900 
    901            int numStates = fDStates.size();
    902 
    903            // Size of table size in shorts.
    904            //  the "4" is the size of struct RBBIStateTableRow, the row header part only.
    905            int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories();
    906            int tableSize = getTableSize() / 2;
    907 
    908 
    909            short [] table = new short[tableSize];
    910 
    911            //
    912            // Fill in the header fields.
    913            //      Annoying because they really want to be ints, not shorts.
    914            //
    915            // RBBIStateTable.fNumStates
    916            table[RBBIDataWrapper.NUMSTATES]   = (short)(numStates >>> 16);
    917            table[RBBIDataWrapper.NUMSTATES+1] = (short)(numStates & 0x0000ffff);
    918 
    919            // RBBIStateTable.fRowLen
    920            table[RBBIDataWrapper.ROWLEN]   = (short)(rowLen >>> 16);
    921            table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff);
    922 
    923            // RBBIStateTable.fFlags
    924            int flags = 0;
    925            if (fRB.fLookAheadHardBreak) {
    926                flags  |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
    927            }
    928            if (fRB.fSetBuilder.sawBOF()) {
    929                flags  |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
    930            }
    931            table[RBBIDataWrapper.FLAGS]   = (short)(flags >>> 16);
    932            table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff);
    933 
    934            int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
    935            for (state=0; state<numStates; state++) {
    936                RBBIStateDescriptor sd = fDStates.get(state);
    937                int                row = 8 + state*rowLen;
    938                Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767);
    939                Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767);
    940                table[row + RBBIDataWrapper.ACCEPTING] = (short)sd.fAccepting;
    941                table[row + RBBIDataWrapper.LOOKAHEAD] = (short)sd.fLookAhead;
    942                table[row + RBBIDataWrapper.TAGIDX]    = (short)sd.fTagsIdx;
    943                for (col=0; col<numCharCategories; col++) {
    944                    table[row + RBBIDataWrapper.NEXTSTATES + col] = (short)sd.fDtran[col];
    945                }
    946            }
    947            return table;
    948        }
    949 
    950 
    951 
    952        //-----------------------------------------------------------------------------
    953        //
    954        //   printSet    Debug function.   Print the contents of a set of Nodes
    955        //
    956        //-----------------------------------------------------------------------------
    957 
    958        void printSet(Collection<RBBINode> s) {
    959            for (RBBINode n : s) {
    960                RBBINode.printInt(n.fSerialNum, 8);
    961            }
    962            System.out.println();
    963        }
    964 
    965 
    966 
    967        //-----------------------------------------------------------------------------
    968        //
    969        //   printStates    Debug Function.  Dump the fully constructed state transition table.
    970        //
    971        //-----------------------------------------------------------------------------
    972 
    973        void printStates() {
    974            int     c;    // input "character"
    975            int     n;    // state number
    976 
    977            System.out.print("state |           i n p u t     s y m b o l s \n");
    978            System.out.print("      | Acc  LA    Tag");
    979            for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
    980                RBBINode.printInt(c, 3);
    981            }
    982            System.out.print("\n");
    983            System.out.print("      |---------------");
    984            for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
    985                System.out.print("---");
    986            }
    987            System.out.print("\n");
    988 
    989            for (n=0; n<fDStates.size(); n++) {
    990                RBBIStateDescriptor sd = fDStates.get(n);
    991                RBBINode.printInt(n, 5);
    992                System.out.print(" | ");
    993 
    994                RBBINode.printInt(sd.fAccepting, 3);
    995                RBBINode.printInt(sd.fLookAhead, 4);
    996                RBBINode.printInt(sd.fTagsIdx, 6);
    997                System.out.print(" ");
    998                for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
    999                    RBBINode.printInt(sd.fDtran[c], 3);
   1000                }
   1001                System.out.print("\n");
   1002            }
   1003            System.out.print("\n\n");
   1004        }
   1005 
   1006 
   1007 
   1008 
   1009        //-----------------------------------------------------------------------------
   1010        //
   1011        //   printRuleStatusTable    Debug Function.  Dump the common rule status table
   1012        //
   1013        //-----------------------------------------------------------------------------
   1014 
   1015        void printRuleStatusTable() {
   1016            int  thisRecord = 0;
   1017            int  nextRecord = 0;
   1018            int      i;
   1019            List<Integer> tbl = fRB.fRuleStatusVals;
   1020 
   1021            System.out.print("index |  tags \n");
   1022            System.out.print("-------------------\n");
   1023 
   1024            while (nextRecord < tbl.size()) {
   1025                thisRecord = nextRecord;
   1026                nextRecord = thisRecord + tbl.get(thisRecord).intValue() + 1;
   1027                RBBINode.printInt(thisRecord, 7);
   1028                for (i=thisRecord+1; i<nextRecord; i++) {
   1029                    int val = tbl.get(i).intValue();
   1030                    RBBINode.printInt(val, 7);
   1031                }
   1032                System.out.print("\n");
   1033            }
   1034            System.out.print("\n\n");
   1035        }
   1036 
   1037 
   1038 
   1039 }
   1040