Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (c) 2002-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 */
      9 //
     10 //  rbbitblb.cpp
     11 //
     12 
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/unistr.h"
     19 #include "rbbitblb.h"
     20 #include "rbbirb.h"
     21 #include "rbbisetb.h"
     22 #include "rbbidata.h"
     23 #include "cstring.h"
     24 #include "uassert.h"
     25 #include "uvectr32.h"
     26 #include "cmemory.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
     31         fRB(rb),
     32         fTree(*rootNode),
     33         fStatus(&status),
     34         fDStates(nullptr),
     35         fSafeTable(nullptr) {
     36     if (U_FAILURE(status)) {
     37         return;
     38     }
     39     // fDStates is UVector<RBBIStateDescriptor *>
     40     fDStates = new UVector(status);
     41     if (U_SUCCESS(status) && fDStates == nullptr ) {
     42         status = U_MEMORY_ALLOCATION_ERROR;
     43     }
     44 }
     45 
     46 
     47 
     48 RBBITableBuilder::~RBBITableBuilder() {
     49     int i;
     50     for (i=0; i<fDStates->size(); i++) {
     51         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
     52     }
     53     delete fDStates;
     54     delete fSafeTable;
     55 }
     56 
     57 
     58 //-----------------------------------------------------------------------------
     59 //
     60 //   RBBITableBuilder::buildForwardTable  -  This is the main function for building
     61 //                               the DFA state transition table from the RBBI rules parse tree.
     62 //
     63 //-----------------------------------------------------------------------------
     64 void  RBBITableBuilder::buildForwardTable() {
     65 
     66     if (U_FAILURE(*fStatus)) {
     67         return;
     68     }
     69 
     70     // If there were no rules, just return.  This situation can easily arise
     71     //   for the reverse rules.
     72     if (fTree==NULL) {
     73         return;
     74     }
     75 
     76     //
     77     // Walk through the tree, replacing any references to $variables with a copy of the
     78     //   parse tree for the substition expression.
     79     //
     80     fTree = fTree->flattenVariables();
     81 #ifdef RBBI_DEBUG
     82     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
     83         RBBIDebugPuts("\nParse tree after flattening variable references.");
     84         RBBINode::printTree(fTree, TRUE);
     85     }
     86 #endif
     87 
     88     //
     89     // If the rules contained any references to {bof}
     90     //   add a {bof} <cat> <former root of tree> to the
     91     //   tree.  Means that all matches must start out with the
     92     //   {bof} fake character.
     93     //
     94     if (fRB->fSetBuilder->sawBOF()) {
     95         RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
     96         RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
     97         // Delete and exit if memory allocation failed.
     98         if (bofTop == NULL || bofLeaf == NULL) {
     99             *fStatus = U_MEMORY_ALLOCATION_ERROR;
    100             delete bofTop;
    101             delete bofLeaf;
    102             return;
    103         }
    104         bofTop->fLeftChild  = bofLeaf;
    105         bofTop->fRightChild = fTree;
    106         bofLeaf->fParent    = bofTop;
    107         bofLeaf->fVal       = 2;      // Reserved value for {bof}.
    108         fTree               = bofTop;
    109     }
    110 
    111     //
    112     // Add a unique right-end marker to the expression.
    113     //   Appears as a cat-node, left child being the original tree,
    114     //   right child being the end marker.
    115     //
    116     RBBINode *cn = new RBBINode(RBBINode::opCat);
    117     // Exit if memory allocation failed.
    118     if (cn == NULL) {
    119         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    120         return;
    121     }
    122     cn->fLeftChild = fTree;
    123     fTree->fParent = cn;
    124     cn->fRightChild = new RBBINode(RBBINode::endMark);
    125     // Delete and exit if memory allocation failed.
    126     if (cn->fRightChild == NULL) {
    127         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    128         delete cn;
    129         return;
    130     }
    131     cn->fRightChild->fParent = cn;
    132     fTree = cn;
    133 
    134     //
    135     //  Replace all references to UnicodeSets with the tree for the equivalent
    136     //      expression.
    137     //
    138     fTree->flattenSets();
    139 #ifdef RBBI_DEBUG
    140     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
    141         RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
    142         RBBINode::printTree(fTree, TRUE);
    143     }
    144 #endif
    145 
    146 
    147     //
    148     // calculate the functions nullable, firstpos, lastpos and followpos on
    149     // nodes in the parse tree.
    150     //    See the alogrithm description in Aho.
    151     //    Understanding how this works by looking at the code alone will be
    152     //       nearly impossible.
    153     //
    154     calcNullable(fTree);
    155     calcFirstPos(fTree);
    156     calcLastPos(fTree);
    157     calcFollowPos(fTree);
    158     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
    159         RBBIDebugPuts("\n");
    160         printPosSets(fTree);
    161     }
    162 
    163     //
    164     //  For "chained" rules, modify the followPos sets
    165     //
    166     if (fRB->fChainRules) {
    167         calcChainedFollowPos(fTree);
    168     }
    169 
    170     //
    171     //  BOF (start of input) test fixup.
    172     //
    173     if (fRB->fSetBuilder->sawBOF()) {
    174         bofFixup();
    175     }
    176 
    177     //
    178     // Build the DFA state transition tables.
    179     //
    180     buildStateTable();
    181     flagAcceptingStates();
    182     flagLookAheadStates();
    183     flagTaggedStates();
    184 
    185     //
    186     // Update the global table of rule status {tag} values
    187     // The rule builder has a global vector of status values that are common
    188     //    for all tables.  Merge the ones from this table into the global set.
    189     //
    190     mergeRuleStatusVals();
    191 }
    192 
    193 
    194 
    195 //-----------------------------------------------------------------------------
    196 //
    197 //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
    198 //
    199 //-----------------------------------------------------------------------------
    200 void RBBITableBuilder::calcNullable(RBBINode *n) {
    201     if (n == NULL) {
    202         return;
    203     }
    204     if (n->fType == RBBINode::setRef ||
    205         n->fType == RBBINode::endMark ) {
    206         // These are non-empty leaf node types.
    207         n->fNullable = FALSE;
    208         return;
    209     }
    210 
    211     if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
    212         // Lookahead marker node.  It's a leaf, so no recursion on children.
    213         // It's nullable because it does not match any literal text from the input stream.
    214         n->fNullable = TRUE;
    215         return;
    216     }
    217 
    218 
    219     // The node is not a leaf.
    220     //  Calculate nullable on its children.
    221     calcNullable(n->fLeftChild);
    222     calcNullable(n->fRightChild);
    223 
    224     // Apply functions from table 3.40 in Aho
    225     if (n->fType == RBBINode::opOr) {
    226         n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
    227     }
    228     else if (n->fType == RBBINode::opCat) {
    229         n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
    230     }
    231     else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
    232         n->fNullable = TRUE;
    233     }
    234     else {
    235         n->fNullable = FALSE;
    236     }
    237 }
    238 
    239 
    240 
    241 
    242 //-----------------------------------------------------------------------------
    243 //
    244 //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
    245 //
    246 //-----------------------------------------------------------------------------
    247 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
    248     if (n == NULL) {
    249         return;
    250     }
    251     if (n->fType == RBBINode::leafChar  ||
    252         n->fType == RBBINode::endMark   ||
    253         n->fType == RBBINode::lookAhead ||
    254         n->fType == RBBINode::tag) {
    255         // These are non-empty leaf node types.
    256         // Note: In order to maintain the sort invariant on the set,
    257         // this function should only be called on a node whose set is
    258         // empty to start with.
    259         n->fFirstPosSet->addElement(n, *fStatus);
    260         return;
    261     }
    262 
    263     // The node is not a leaf.
    264     //  Calculate firstPos on its children.
    265     calcFirstPos(n->fLeftChild);
    266     calcFirstPos(n->fRightChild);
    267 
    268     // Apply functions from table 3.40 in Aho
    269     if (n->fType == RBBINode::opOr) {
    270         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
    271         setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
    272     }
    273     else if (n->fType == RBBINode::opCat) {
    274         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
    275         if (n->fLeftChild->fNullable) {
    276             setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
    277         }
    278     }
    279     else if (n->fType == RBBINode::opStar ||
    280              n->fType == RBBINode::opQuestion ||
    281              n->fType == RBBINode::opPlus) {
    282         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
    283     }
    284 }
    285 
    286 
    287 
    288 //-----------------------------------------------------------------------------
    289 //
    290 //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
    291 //
    292 //-----------------------------------------------------------------------------
    293 void RBBITableBuilder::calcLastPos(RBBINode *n) {
    294     if (n == NULL) {
    295         return;
    296     }
    297     if (n->fType == RBBINode::leafChar  ||
    298         n->fType == RBBINode::endMark   ||
    299         n->fType == RBBINode::lookAhead ||
    300         n->fType == RBBINode::tag) {
    301         // These are non-empty leaf node types.
    302         // Note: In order to maintain the sort invariant on the set,
    303         // this function should only be called on a node whose set is
    304         // empty to start with.
    305         n->fLastPosSet->addElement(n, *fStatus);
    306         return;
    307     }
    308 
    309     // The node is not a leaf.
    310     //  Calculate lastPos on its children.
    311     calcLastPos(n->fLeftChild);
    312     calcLastPos(n->fRightChild);
    313 
    314     // Apply functions from table 3.40 in Aho
    315     if (n->fType == RBBINode::opOr) {
    316         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
    317         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
    318     }
    319     else if (n->fType == RBBINode::opCat) {
    320         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
    321         if (n->fRightChild->fNullable) {
    322             setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
    323         }
    324     }
    325     else if (n->fType == RBBINode::opStar     ||
    326              n->fType == RBBINode::opQuestion ||
    327              n->fType == RBBINode::opPlus) {
    328         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
    329     }
    330 }
    331 
    332 
    333 
    334 //-----------------------------------------------------------------------------
    335 //
    336 //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
    337 //
    338 //-----------------------------------------------------------------------------
    339 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
    340     if (n == NULL ||
    341         n->fType == RBBINode::leafChar ||
    342         n->fType == RBBINode::endMark) {
    343         return;
    344     }
    345 
    346     calcFollowPos(n->fLeftChild);
    347     calcFollowPos(n->fRightChild);
    348 
    349     // Aho rule #1
    350     if (n->fType == RBBINode::opCat) {
    351         RBBINode *i;   // is 'i' in Aho's description
    352         uint32_t     ix;
    353 
    354         UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
    355 
    356         for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
    357             i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
    358             setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
    359         }
    360     }
    361 
    362     // Aho rule #2
    363     if (n->fType == RBBINode::opStar ||
    364         n->fType == RBBINode::opPlus) {
    365         RBBINode   *i;  // again, n and i are the names from Aho's description.
    366         uint32_t    ix;
    367 
    368         for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
    369             i = (RBBINode *)n->fLastPosSet->elementAt(ix);
    370             setAdd(i->fFollowPos, n->fFirstPosSet);
    371         }
    372     }
    373 
    374 
    375 
    376 }
    377 
    378 //-----------------------------------------------------------------------------
    379 //
    380 //    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
    381 //                        as roots of a rule to a destination vector.
    382 //
    383 //-----------------------------------------------------------------------------
    384 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
    385     if (node == NULL || U_FAILURE(*fStatus)) {
    386         return;
    387     }
    388     if (node->fRuleRoot) {
    389         dest->addElement(node, *fStatus);
    390         // Note: rules cannot nest. If we found a rule start node,
    391         //       no child node can also be a start node.
    392         return;
    393     }
    394     addRuleRootNodes(dest, node->fLeftChild);
    395     addRuleRootNodes(dest, node->fRightChild);
    396 }
    397 
    398 //-----------------------------------------------------------------------------
    399 //
    400 //   calcChainedFollowPos.    Modify the previously calculated followPos sets
    401 //                            to implement rule chaining.  NOT described by Aho
    402 //
    403 //-----------------------------------------------------------------------------
    404 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
    405 
    406     UVector         endMarkerNodes(*fStatus);
    407     UVector         leafNodes(*fStatus);
    408     int32_t         i;
    409 
    410     if (U_FAILURE(*fStatus)) {
    411         return;
    412     }
    413 
    414     // get a list of all endmarker nodes.
    415     tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
    416 
    417     // get a list all leaf nodes
    418     tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
    419     if (U_FAILURE(*fStatus)) {
    420         return;
    421     }
    422 
    423     // Collect all leaf nodes that can start matches for rules
    424     // with inbound chaining enabled, which is the union of the
    425     // firstPosition sets from each of the rule root nodes.
    426 
    427     UVector ruleRootNodes(*fStatus);
    428     addRuleRootNodes(&ruleRootNodes, tree);
    429 
    430     UVector matchStartNodes(*fStatus);
    431     for (int j=0; j<ruleRootNodes.size(); ++j) {
    432         RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
    433         if (node->fChainIn) {
    434             setAdd(&matchStartNodes, node->fFirstPosSet);
    435         }
    436     }
    437     if (U_FAILURE(*fStatus)) {
    438         return;
    439     }
    440 
    441     int32_t  endNodeIx;
    442     int32_t  startNodeIx;
    443 
    444     for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
    445         RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
    446         RBBINode *endNode = NULL;
    447 
    448         // Identify leaf nodes that correspond to overall rule match positions.
    449         //   These include an endMarkerNode in their followPos sets.
    450         for (i=0; i<endMarkerNodes.size(); i++) {
    451             if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
    452                 endNode = tNode;
    453                 break;
    454             }
    455         }
    456         if (endNode == NULL) {
    457             // node wasn't an end node.  Try again with the next.
    458             continue;
    459         }
    460 
    461         // We've got a node that can end a match.
    462 
    463         // Line Break Specific hack:  If this node's val correspond to the $CM char class,
    464         //                            don't chain from it.
    465         // TODO:  Add rule syntax for this behavior, get specifics out of here and
    466         //        into the rule file.
    467         if (fRB->fLBCMNoChain) {
    468             UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
    469             if (c != -1) {
    470                 // c == -1 occurs with sets containing only the {eof} marker string.
    471                 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
    472                 if (cLBProp == U_LB_COMBINING_MARK) {
    473                     continue;
    474                 }
    475             }
    476         }
    477 
    478 
    479         // Now iterate over the nodes that can start a match, looking for ones
    480         //   with the same char class as our ending node.
    481         RBBINode *startNode;
    482         for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
    483             startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
    484             if (startNode->fType != RBBINode::leafChar) {
    485                 continue;
    486             }
    487 
    488             if (endNode->fVal == startNode->fVal) {
    489                 // The end val (character class) of one possible match is the
    490                 //   same as the start of another.
    491 
    492                 // Add all nodes from the followPos of the start node to the
    493                 //  followPos set of the end node, which will have the effect of
    494                 //  letting matches transition from a match state at endNode
    495                 //  to the second char of a match starting with startNode.
    496                 setAdd(endNode->fFollowPos, startNode->fFollowPos);
    497             }
    498         }
    499     }
    500 }
    501 
    502 
    503 //-----------------------------------------------------------------------------
    504 //
    505 //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
    506 //                Do an swizzle similar to chaining, modifying the followPos set of
    507 //                the bofNode to include the followPos nodes from other {bot} nodes
    508 //                scattered through the tree.
    509 //
    510 //                This function has much in common with calcChainedFollowPos().
    511 //
    512 //-----------------------------------------------------------------------------
    513 void RBBITableBuilder::bofFixup() {
    514 
    515     if (U_FAILURE(*fStatus)) {
    516         return;
    517     }
    518 
    519     //   The parse tree looks like this ...
    520     //         fTree root  --->       <cat>
    521     //                               /     \       .
    522     //                            <cat>   <#end node>
    523     //                           /     \  .
    524     //                     <bofNode>   rest
    525     //                               of tree
    526     //
    527     //    We will be adding things to the followPos set of the <bofNode>
    528     //
    529     RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
    530     U_ASSERT(bofNode->fType == RBBINode::leafChar);
    531     U_ASSERT(bofNode->fVal == 2);
    532 
    533     // Get all nodes that can be the start a match of the user-written rules
    534     //  (excluding the fake bofNode)
    535     //  We want the nodes that can start a match in the
    536     //     part labeled "rest of tree"
    537     //
    538     UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
    539 
    540     RBBINode *startNode;
    541     int       startNodeIx;
    542     for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
    543         startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
    544         if (startNode->fType != RBBINode::leafChar) {
    545             continue;
    546         }
    547 
    548         if (startNode->fVal == bofNode->fVal) {
    549             //  We found a leaf node corresponding to a {bof} that was
    550             //    explicitly written into a rule.
    551             //  Add everything from the followPos set of this node to the
    552             //    followPos set of the fake bofNode at the start of the tree.
    553             //
    554             setAdd(bofNode->fFollowPos, startNode->fFollowPos);
    555         }
    556     }
    557 }
    558 
    559 //-----------------------------------------------------------------------------
    560 //
    561 //   buildStateTable()    Determine the set of runtime DFA states and the
    562 //                        transition tables for these states, by the algorithm
    563 //                        of fig. 3.44 in Aho.
    564 //
    565 //                        Most of the comments are quotes of Aho's psuedo-code.
    566 //
    567 //-----------------------------------------------------------------------------
    568 void RBBITableBuilder::buildStateTable() {
    569     if (U_FAILURE(*fStatus)) {
    570         return;
    571     }
    572     RBBIStateDescriptor *failState;
    573     // Set it to NULL to avoid uninitialized warning
    574     RBBIStateDescriptor *initialState = NULL;
    575     //
    576     // Add a dummy state 0 - the stop state.  Not from Aho.
    577     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
    578     failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
    579     if (failState == NULL) {
    580         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    581         goto ExitBuildSTdeleteall;
    582     }
    583     failState->fPositions = new UVector(*fStatus);
    584     if (failState->fPositions == NULL) {
    585         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    586     }
    587     if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
    588         goto ExitBuildSTdeleteall;
    589     }
    590     fDStates->addElement(failState, *fStatus);
    591     if (U_FAILURE(*fStatus)) {
    592         goto ExitBuildSTdeleteall;
    593     }
    594 
    595     // initially, the only unmarked state in Dstates is firstpos(root),
    596     //       where toot is the root of the syntax tree for (r)#;
    597     initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
    598     if (initialState == NULL) {
    599         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    600     }
    601     if (U_FAILURE(*fStatus)) {
    602         goto ExitBuildSTdeleteall;
    603     }
    604     initialState->fPositions = new UVector(*fStatus);
    605     if (initialState->fPositions == NULL) {
    606         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    607     }
    608     if (U_FAILURE(*fStatus)) {
    609         goto ExitBuildSTdeleteall;
    610     }
    611     setAdd(initialState->fPositions, fTree->fFirstPosSet);
    612     fDStates->addElement(initialState, *fStatus);
    613     if (U_FAILURE(*fStatus)) {
    614         goto ExitBuildSTdeleteall;
    615     }
    616 
    617     // while there is an unmarked state T in Dstates do begin
    618     for (;;) {
    619         RBBIStateDescriptor *T = NULL;
    620         int32_t              tx;
    621         for (tx=1; tx<fDStates->size(); tx++) {
    622             RBBIStateDescriptor *temp;
    623             temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
    624             if (temp->fMarked == FALSE) {
    625                 T = temp;
    626                 break;
    627             }
    628         }
    629         if (T == NULL) {
    630             break;
    631         }
    632 
    633         // mark T;
    634         T->fMarked = TRUE;
    635 
    636         // for each input symbol a do begin
    637         int32_t  a;
    638         for (a = 1; a<=lastInputSymbol; a++) {
    639             // let U be the set of positions that are in followpos(p)
    640             //    for some position p in T
    641             //    such that the symbol at position p is a;
    642             UVector    *U = NULL;
    643             RBBINode   *p;
    644             int32_t     px;
    645             for (px=0; px<T->fPositions->size(); px++) {
    646                 p = (RBBINode *)T->fPositions->elementAt(px);
    647                 if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
    648                     if (U == NULL) {
    649                         U = new UVector(*fStatus);
    650                         if (U == NULL) {
    651                         	*fStatus = U_MEMORY_ALLOCATION_ERROR;
    652                         	goto ExitBuildSTdeleteall;
    653                         }
    654                     }
    655                     setAdd(U, p->fFollowPos);
    656                 }
    657             }
    658 
    659             // if U is not empty and not in DStates then
    660             int32_t  ux = 0;
    661             UBool    UinDstates = FALSE;
    662             if (U != NULL) {
    663                 U_ASSERT(U->size() > 0);
    664                 int  ix;
    665                 for (ix=0; ix<fDStates->size(); ix++) {
    666                     RBBIStateDescriptor *temp2;
    667                     temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
    668                     if (setEquals(U, temp2->fPositions)) {
    669                         delete U;
    670                         U  = temp2->fPositions;
    671                         ux = ix;
    672                         UinDstates = TRUE;
    673                         break;
    674                     }
    675                 }
    676 
    677                 // Add U as an unmarked state to Dstates
    678                 if (!UinDstates)
    679                 {
    680                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
    681                     if (newState == NULL) {
    682                     	*fStatus = U_MEMORY_ALLOCATION_ERROR;
    683                     }
    684                     if (U_FAILURE(*fStatus)) {
    685                         goto ExitBuildSTdeleteall;
    686                     }
    687                     newState->fPositions = U;
    688                     fDStates->addElement(newState, *fStatus);
    689                     if (U_FAILURE(*fStatus)) {
    690                         return;
    691                     }
    692                     ux = fDStates->size()-1;
    693                 }
    694 
    695                 // Dtran[T, a] := U;
    696                 T->fDtran->setElementAt(ux, a);
    697             }
    698         }
    699     }
    700     return;
    701     // delete local pointers only if error occured.
    702 ExitBuildSTdeleteall:
    703     delete initialState;
    704     delete failState;
    705 }
    706 
    707 
    708 
    709 //-----------------------------------------------------------------------------
    710 //
    711 //   flagAcceptingStates    Identify accepting states.
    712 //                          First get a list of all of the end marker nodes.
    713 //                          Then, for each state s,
    714 //                              if s contains one of the end marker nodes in its list of tree positions then
    715 //                                  s is an accepting state.
    716 //
    717 //-----------------------------------------------------------------------------
    718 void     RBBITableBuilder::flagAcceptingStates() {
    719     if (U_FAILURE(*fStatus)) {
    720         return;
    721     }
    722     UVector     endMarkerNodes(*fStatus);
    723     RBBINode    *endMarker;
    724     int32_t     i;
    725     int32_t     n;
    726 
    727     if (U_FAILURE(*fStatus)) {
    728         return;
    729     }
    730 
    731     fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
    732     if (U_FAILURE(*fStatus)) {
    733         return;
    734     }
    735 
    736     for (i=0; i<endMarkerNodes.size(); i++) {
    737         endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
    738         for (n=0; n<fDStates->size(); n++) {
    739             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
    740             if (sd->fPositions->indexOf(endMarker) >= 0) {
    741                 // Any non-zero value for fAccepting means this is an accepting node.
    742                 // The value is what will be returned to the user as the break status.
    743                 // If no other value was specified, force it to -1.
    744 
    745                 if (sd->fAccepting==0) {
    746                     // State hasn't been marked as accepting yet.  Do it now.
    747                     sd->fAccepting = endMarker->fVal;
    748                     if (sd->fAccepting == 0) {
    749                         sd->fAccepting = -1;
    750                     }
    751                 }
    752                 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
    753                     // Both lookahead and non-lookahead accepting for this state.
    754                     // Favor the look-ahead.  Expedient for line break.
    755                     // TODO:  need a more elegant resolution for conflicting rules.
    756                     sd->fAccepting = endMarker->fVal;
    757                 }
    758                 // implicit else:
    759                 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
    760 
    761                 // If the end marker node is from a look-ahead rule, set
    762                 //   the fLookAhead field for this state also.
    763                 if (endMarker->fLookAheadEnd) {
    764                     // TODO:  don't change value if already set?
    765                     // TODO:  allow for more than one active look-ahead rule in engine.
    766                     //        Make value here an index to a side array in engine?
    767                     sd->fLookAhead = sd->fAccepting;
    768                 }
    769             }
    770         }
    771     }
    772 }
    773 
    774 
    775 //-----------------------------------------------------------------------------
    776 //
    777 //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
    778 //
    779 //-----------------------------------------------------------------------------
    780 void     RBBITableBuilder::flagLookAheadStates() {
    781     if (U_FAILURE(*fStatus)) {
    782         return;
    783     }
    784     UVector     lookAheadNodes(*fStatus);
    785     RBBINode    *lookAheadNode;
    786     int32_t     i;
    787     int32_t     n;
    788 
    789     fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
    790     if (U_FAILURE(*fStatus)) {
    791         return;
    792     }
    793     for (i=0; i<lookAheadNodes.size(); i++) {
    794         lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
    795 
    796         for (n=0; n<fDStates->size(); n++) {
    797             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
    798             if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
    799                 sd->fLookAhead = lookAheadNode->fVal;
    800             }
    801         }
    802     }
    803 }
    804 
    805 
    806 
    807 
    808 //-----------------------------------------------------------------------------
    809 //
    810 //    flagTaggedStates
    811 //
    812 //-----------------------------------------------------------------------------
    813 void     RBBITableBuilder::flagTaggedStates() {
    814     if (U_FAILURE(*fStatus)) {
    815         return;
    816     }
    817     UVector     tagNodes(*fStatus);
    818     RBBINode    *tagNode;
    819     int32_t     i;
    820     int32_t     n;
    821 
    822     if (U_FAILURE(*fStatus)) {
    823         return;
    824     }
    825     fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
    826     if (U_FAILURE(*fStatus)) {
    827         return;
    828     }
    829     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
    830         tagNode = (RBBINode *)tagNodes.elementAt(i);
    831 
    832         for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
    833             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
    834             if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
    835                 sortedAdd(&sd->fTagVals, tagNode->fVal);
    836             }
    837         }
    838     }
    839 }
    840 
    841 
    842 
    843 
    844 //-----------------------------------------------------------------------------
    845 //
    846 //  mergeRuleStatusVals
    847 //
    848 //      Update the global table of rule status {tag} values
    849 //      The rule builder has a global vector of status values that are common
    850 //      for all tables.  Merge the ones from this table into the global set.
    851 //
    852 //-----------------------------------------------------------------------------
    853 void  RBBITableBuilder::mergeRuleStatusVals() {
    854     //
    855     //  The basic outline of what happens here is this...
    856     //
    857     //    for each state in this state table
    858     //       if the status tag list for this state is in the global statuses list
    859     //           record where and
    860     //           continue with the next state
    861     //       else
    862     //           add the tag list for this state to the global list.
    863     //
    864     int i;
    865     int n;
    866 
    867     // Pre-set a single tag of {0} into the table.
    868     //   We will need this as a default, for rule sets with no explicit tagging.
    869     if (fRB->fRuleStatusVals->size() == 0) {
    870         fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
    871         fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
    872     }
    873 
    874     //    For each state
    875     for (n=0; n<fDStates->size(); n++) {
    876         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
    877         UVector *thisStatesTagValues = sd->fTagVals;
    878         if (thisStatesTagValues == NULL) {
    879             // No tag values are explicitly associated with this state.
    880             //   Set the default tag value.
    881             sd->fTagsIdx = 0;
    882             continue;
    883         }
    884 
    885         // There are tag(s) associated with this state.
    886         //   fTagsIdx will be the index into the global tag list for this state's tag values.
    887         //   Initial value of -1 flags that we haven't got it set yet.
    888         sd->fTagsIdx = -1;
    889         int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
    890         int32_t  nextTagGroupStart = 0;
    891 
    892         // Loop runs once per group of tags in the global list
    893         while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
    894             thisTagGroupStart = nextTagGroupStart;
    895             nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
    896             if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
    897                 // The number of tags for this state is different from
    898                 //    the number of tags in this group from the global list.
    899                 //    Continue with the next group from the global list.
    900                 continue;
    901             }
    902             // The lengths match, go ahead and compare the actual tag values
    903             //    between this state and the group from the global list.
    904             for (i=0; i<thisStatesTagValues->size(); i++) {
    905                 if (thisStatesTagValues->elementAti(i) !=
    906                     fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
    907                     // Mismatch.
    908                     break;
    909                 }
    910             }
    911 
    912             if (i == thisStatesTagValues->size()) {
    913                 // We found a set of tag values in the global list that match
    914                 //   those for this state.  Use them.
    915                 sd->fTagsIdx = thisTagGroupStart;
    916                 break;
    917             }
    918         }
    919 
    920         if (sd->fTagsIdx == -1) {
    921             // No suitable entry in the global tag list already.  Add one
    922             sd->fTagsIdx = fRB->fRuleStatusVals->size();
    923             fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
    924             for (i=0; i<thisStatesTagValues->size(); i++) {
    925                 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
    926             }
    927         }
    928     }
    929 }
    930 
    931 
    932 
    933 
    934 
    935 
    936 
    937 //-----------------------------------------------------------------------------
    938 //
    939 //  sortedAdd  Add a value to a vector of sorted values (ints).
    940 //             Do not replicate entries; if the value is already there, do not
    941 //                add a second one.
    942 //             Lazily create the vector if it does not already exist.
    943 //
    944 //-----------------------------------------------------------------------------
    945 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
    946     int32_t i;
    947 
    948     if (*vector == NULL) {
    949         *vector = new UVector(*fStatus);
    950     }
    951     if (*vector == NULL || U_FAILURE(*fStatus)) {
    952         return;
    953     }
    954     UVector *vec = *vector;
    955     int32_t  vSize = vec->size();
    956     for (i=0; i<vSize; i++) {
    957         int32_t valAtI = vec->elementAti(i);
    958         if (valAtI == val) {
    959             // The value is already in the vector.  Don't add it again.
    960             return;
    961         }
    962         if (valAtI > val) {
    963             break;
    964         }
    965     }
    966     vec->insertElementAt(val, i, *fStatus);
    967 }
    968 
    969 
    970 
    971 //-----------------------------------------------------------------------------
    972 //
    973 //  setAdd     Set operation on UVector
    974 //             dest = dest union source
    975 //             Elements may only appear once and must be sorted.
    976 //
    977 //-----------------------------------------------------------------------------
    978 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
    979     int32_t destOriginalSize = dest->size();
    980     int32_t sourceSize       = source->size();
    981     int32_t di           = 0;
    982     MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
    983     void **destPtr, **sourcePtr;
    984     void **destLim, **sourceLim;
    985 
    986     if (destOriginalSize > destArray.getCapacity()) {
    987         if (destArray.resize(destOriginalSize) == NULL) {
    988             return;
    989         }
    990     }
    991     destPtr = destArray.getAlias();
    992     destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
    993 
    994     if (sourceSize > sourceArray.getCapacity()) {
    995         if (sourceArray.resize(sourceSize) == NULL) {
    996             return;
    997         }
    998     }
    999     sourcePtr = sourceArray.getAlias();
   1000     sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
   1001 
   1002     // Avoid multiple "get element" calls by getting the contents into arrays
   1003     (void) dest->toArray(destPtr);
   1004     (void) source->toArray(sourcePtr);
   1005 
   1006     dest->setSize(sourceSize+destOriginalSize, *fStatus);
   1007 
   1008     while (sourcePtr < sourceLim && destPtr < destLim) {
   1009         if (*destPtr == *sourcePtr) {
   1010             dest->setElementAt(*sourcePtr++, di++);
   1011             destPtr++;
   1012         }
   1013         // This check is required for machines with segmented memory, like i5/OS.
   1014         // Direct pointer comparison is not recommended.
   1015         else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
   1016             dest->setElementAt(*destPtr++, di++);
   1017         }
   1018         else { /* *sourcePtr < *destPtr */
   1019             dest->setElementAt(*sourcePtr++, di++);
   1020         }
   1021     }
   1022 
   1023     // At most one of these two cleanup loops will execute
   1024     while (destPtr < destLim) {
   1025         dest->setElementAt(*destPtr++, di++);
   1026     }
   1027     while (sourcePtr < sourceLim) {
   1028         dest->setElementAt(*sourcePtr++, di++);
   1029     }
   1030 
   1031     dest->setSize(di, *fStatus);
   1032 }
   1033 
   1034 
   1035 
   1036 //-----------------------------------------------------------------------------
   1037 //
   1038 //  setEqual    Set operation on UVector.
   1039 //              Compare for equality.
   1040 //              Elements must be sorted.
   1041 //
   1042 //-----------------------------------------------------------------------------
   1043 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
   1044     return a->equals(*b);
   1045 }
   1046 
   1047 
   1048 //-----------------------------------------------------------------------------
   1049 //
   1050 //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
   1051 //                 for each node in the tree.
   1052 //
   1053 //-----------------------------------------------------------------------------
   1054 #ifdef RBBI_DEBUG
   1055 void RBBITableBuilder::printPosSets(RBBINode *n) {
   1056     if (n==NULL) {
   1057         return;
   1058     }
   1059     printf("\n");
   1060     RBBINode::printNodeHeader();
   1061     RBBINode::printNode(n);
   1062     RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
   1063 
   1064     RBBIDebugPrintf("         firstpos:  ");
   1065     printSet(n->fFirstPosSet);
   1066 
   1067     RBBIDebugPrintf("         lastpos:   ");
   1068     printSet(n->fLastPosSet);
   1069 
   1070     RBBIDebugPrintf("         followpos: ");
   1071     printSet(n->fFollowPos);
   1072 
   1073     printPosSets(n->fLeftChild);
   1074     printPosSets(n->fRightChild);
   1075 }
   1076 #endif
   1077 
   1078 //
   1079 //    findDuplCharClassFrom()
   1080 //
   1081 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
   1082     int32_t numStates = fDStates->size();
   1083     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
   1084 
   1085     for (; categories->first < numCols-1; categories->first++) {
   1086         for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
   1087             // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
   1088             uint16_t table_base = 0;
   1089             uint16_t table_dupl = 1;
   1090             for (int32_t state=0; state<numStates; state++) {
   1091                 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
   1092                 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
   1093                 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
   1094                 if (table_base != table_dupl) {
   1095                     break;
   1096                 }
   1097             }
   1098             if (table_base == table_dupl) {
   1099                 return true;
   1100             }
   1101         }
   1102     }
   1103     return false;
   1104 }
   1105 
   1106 
   1107 //
   1108 //    removeColumn()
   1109 //
   1110 void RBBITableBuilder::removeColumn(int32_t column) {
   1111     int32_t numStates = fDStates->size();
   1112     for (int32_t state=0; state<numStates; state++) {
   1113         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
   1114         U_ASSERT(column < sd->fDtran->size());
   1115         sd->fDtran->removeElementAt(column);
   1116     }
   1117 }
   1118 
   1119 /*
   1120  * findDuplicateState
   1121  */
   1122 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
   1123     int32_t numStates = fDStates->size();
   1124     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
   1125 
   1126     for (; states->first<numStates-1; states->first++) {
   1127         RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
   1128         for (states->second=states->first+1; states->second<numStates; states->second++) {
   1129             RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
   1130             if (firstSD->fAccepting != duplSD->fAccepting ||
   1131                 firstSD->fLookAhead != duplSD->fLookAhead ||
   1132                 firstSD->fTagsIdx   != duplSD->fTagsIdx) {
   1133                 continue;
   1134             }
   1135             bool rowsMatch = true;
   1136             for (int32_t col=0; col < numCols; ++col) {
   1137                 int32_t firstVal = firstSD->fDtran->elementAti(col);
   1138                 int32_t duplVal = duplSD->fDtran->elementAti(col);
   1139                 if (!((firstVal == duplVal) ||
   1140                         ((firstVal == states->first || firstVal == states->second) &&
   1141                         (duplVal  == states->first || duplVal  == states->second)))) {
   1142                     rowsMatch = false;
   1143                     break;
   1144                 }
   1145             }
   1146             if (rowsMatch) {
   1147                 return true;
   1148             }
   1149         }
   1150     }
   1151     return false;
   1152 }
   1153 
   1154 
   1155 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
   1156     int32_t numStates = fSafeTable->size();
   1157 
   1158     for (; states->first<numStates-1; states->first++) {
   1159         UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
   1160         for (states->second=states->first+1; states->second<numStates; states->second++) {
   1161             UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
   1162             bool rowsMatch = true;
   1163             int32_t numCols = firstRow->length();
   1164             for (int32_t col=0; col < numCols; ++col) {
   1165                 int32_t firstVal = firstRow->charAt(col);
   1166                 int32_t duplVal = duplRow->charAt(col);
   1167                 if (!((firstVal == duplVal) ||
   1168                         ((firstVal == states->first || firstVal == states->second) &&
   1169                         (duplVal  == states->first || duplVal  == states->second)))) {
   1170                     rowsMatch = false;
   1171                     break;
   1172                 }
   1173             }
   1174             if (rowsMatch) {
   1175                 return true;
   1176             }
   1177         }
   1178     }
   1179     return false;
   1180 }
   1181 
   1182 
   1183 void RBBITableBuilder::removeState(IntPair duplStates) {
   1184     const int32_t keepState = duplStates.first;
   1185     const int32_t duplState = duplStates.second;
   1186     U_ASSERT(keepState < duplState);
   1187     U_ASSERT(duplState < fDStates->size());
   1188 
   1189     RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
   1190     fDStates->removeElementAt(duplState);
   1191     delete duplSD;
   1192 
   1193     int32_t numStates = fDStates->size();
   1194     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
   1195     for (int32_t state=0; state<numStates; ++state) {
   1196         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
   1197         for (int32_t col=0; col<numCols; col++) {
   1198             int32_t existingVal = sd->fDtran->elementAti(col);
   1199             int32_t newVal = existingVal;
   1200             if (existingVal == duplState) {
   1201                 newVal = keepState;
   1202             } else if (existingVal > duplState) {
   1203                 newVal = existingVal - 1;
   1204             }
   1205             sd->fDtran->setElementAt(newVal, col);
   1206         }
   1207         if (sd->fAccepting == duplState) {
   1208             sd->fAccepting = keepState;
   1209         } else if (sd->fAccepting > duplState) {
   1210             sd->fAccepting--;
   1211         }
   1212         if (sd->fLookAhead == duplState) {
   1213             sd->fLookAhead = keepState;
   1214         } else if (sd->fLookAhead > duplState) {
   1215             sd->fLookAhead--;
   1216         }
   1217     }
   1218 }
   1219 
   1220 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
   1221     const int32_t keepState = duplStates.first;
   1222     const int32_t duplState = duplStates.second;
   1223     U_ASSERT(keepState < duplState);
   1224     U_ASSERT(duplState < fSafeTable->size());
   1225 
   1226     fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
   1227                                               // and will auto-delete the removed element.
   1228     int32_t numStates = fSafeTable->size();
   1229     for (int32_t state=0; state<numStates; ++state) {
   1230         UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
   1231         int32_t numCols = sd->length();
   1232         for (int32_t col=0; col<numCols; col++) {
   1233             int32_t existingVal = sd->charAt(col);
   1234             int32_t newVal = existingVal;
   1235             if (existingVal == duplState) {
   1236                 newVal = keepState;
   1237             } else if (existingVal > duplState) {
   1238                 newVal = existingVal - 1;
   1239             }
   1240             sd->setCharAt(col, static_cast<char16_t>(newVal));
   1241         }
   1242     }
   1243 }
   1244 
   1245 
   1246 /*
   1247  * RemoveDuplicateStates
   1248  */
   1249 int32_t RBBITableBuilder::removeDuplicateStates() {
   1250     IntPair dupls = {3, 0};
   1251     int32_t numStatesRemoved = 0;
   1252 
   1253     while (findDuplicateState(&dupls)) {
   1254         // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
   1255         removeState(dupls);
   1256         ++numStatesRemoved;
   1257     }
   1258     return numStatesRemoved;
   1259 }
   1260 
   1261 
   1262 //-----------------------------------------------------------------------------
   1263 //
   1264 //   getTableSize()    Calculate the size of the runtime form of this
   1265 //                     state transition table.
   1266 //
   1267 //-----------------------------------------------------------------------------
   1268 int32_t  RBBITableBuilder::getTableSize() const {
   1269     int32_t    size = 0;
   1270     int32_t    numRows;
   1271     int32_t    numCols;
   1272     int32_t    rowSize;
   1273 
   1274     if (fTree == NULL) {
   1275         return 0;
   1276     }
   1277 
   1278     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
   1279 
   1280     numRows = fDStates->size();
   1281     numCols = fRB->fSetBuilder->getNumCharCategories();
   1282 
   1283     rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
   1284     size   += numRows * rowSize;
   1285     return size;
   1286 }
   1287 
   1288 
   1289 //-----------------------------------------------------------------------------
   1290 //
   1291 //   exportTable()    export the state transition table in the format required
   1292 //                    by the runtime engine.  getTableSize() bytes of memory
   1293 //                    must be available at the output address "where".
   1294 //
   1295 //-----------------------------------------------------------------------------
   1296 void RBBITableBuilder::exportTable(void *where) {
   1297     RBBIStateTable    *table = (RBBIStateTable *)where;
   1298     uint32_t           state;
   1299     int                col;
   1300 
   1301     if (U_FAILURE(*fStatus) || fTree == NULL) {
   1302         return;
   1303     }
   1304 
   1305     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
   1306     if (catCount > 0x7fff ||
   1307         fDStates->size() > 0x7fff) {
   1308         *fStatus = U_BRK_INTERNAL_ERROR;
   1309         return;
   1310     }
   1311 
   1312     table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
   1313     table->fNumStates = fDStates->size();
   1314     table->fFlags     = 0;
   1315     if (fRB->fLookAheadHardBreak) {
   1316         table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
   1317     }
   1318     if (fRB->fSetBuilder->sawBOF()) {
   1319         table->fFlags  |= RBBI_BOF_REQUIRED;
   1320     }
   1321     table->fReserved  = 0;
   1322 
   1323     for (state=0; state<table->fNumStates; state++) {
   1324         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
   1325         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
   1326         U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
   1327         U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
   1328         row->fAccepting = (int16_t)sd->fAccepting;
   1329         row->fLookAhead = (int16_t)sd->fLookAhead;
   1330         row->fTagIdx    = (int16_t)sd->fTagsIdx;
   1331         for (col=0; col<catCount; col++) {
   1332             row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
   1333         }
   1334     }
   1335 }
   1336 
   1337 
   1338 /**
   1339  *   Synthesize a safe state table from the main state table.
   1340  */
   1341 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
   1342     // The safe table creation has three steps:
   1343 
   1344     // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
   1345     // following the pair do not depend on context or state before the pair. To test
   1346     // whether a pair is safe, run it through the main forward state table, starting
   1347     // from each state. If the the final state is the same, no matter what the starting state,
   1348     // the pair is safe.
   1349     //
   1350     // 2. Build a state table that recognizes the safe pairs. It's similar to their
   1351     // forward table, with a column for each input character [class], and a row for
   1352     // each state. Row 1 is the start state, and row 0 is the stop state. Initially
   1353     // create an additional state for each input character category; being in
   1354     // one of these states means that the character has been seen, and is potentially
   1355     // the first of a pair. In each of these rows, the entry for the second character
   1356     // of a safe pair is set to the stop state (0), indicating that a match was found.
   1357     // All other table entries are set to the state corresponding the current input
   1358     // character, allowing that charcter to be the of a start following pair.
   1359     //
   1360     // Because the safe rules are to be run in reverse, moving backwards in the text,
   1361     // the first and second pair categories are swapped when building the table.
   1362     //
   1363     // 3. Compress the table. There are typically many rows (states) that are
   1364     // equivalent - that have zeroes (match completed) in the same columns -
   1365     // and can be folded together.
   1366 
   1367     // Each safe pair is stored as two UChars in the safePair string.
   1368     UnicodeString safePairs;
   1369 
   1370     int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
   1371     int32_t numStates = fDStates->size();
   1372 
   1373     for (int32_t c1=0; c1<numCharClasses; ++c1) {
   1374         for (int32_t c2=0; c2 < numCharClasses; ++c2) {
   1375             int32_t wantedEndState = -1;
   1376             int32_t endState = 0;
   1377             for (int32_t startState = 1; startState < numStates; ++startState) {
   1378                 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
   1379                 int32_t s2 = startStateD->fDtran->elementAti(c1);
   1380                 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
   1381                 endState = s2StateD->fDtran->elementAti(c2);
   1382                 if (wantedEndState < 0) {
   1383                     wantedEndState = endState;
   1384                 } else {
   1385                     if (wantedEndState != endState) {
   1386                         break;
   1387                     }
   1388                 }
   1389             }
   1390             if (wantedEndState == endState) {
   1391                 safePairs.append((char16_t)c1);
   1392                 safePairs.append((char16_t)c2);
   1393                 // printf("(%d, %d) ", c1, c2);
   1394             }
   1395         }
   1396         // printf("\n");
   1397     }
   1398 
   1399     // Populate the initial safe table.
   1400     // The table as a whole is UVector<UnicodeString>
   1401     // Each row is represented by a UnicodeString, being used as a Vector<int16>.
   1402     // Row 0 is the stop state.
   1403     // Row 1 is the start sate.
   1404     // Row 2 and beyond are other states, initially one per char class, but
   1405     //   after initial construction, many of the states will be combined, compacting the table.
   1406     // The String holds the nextState data only. The four leading fields of a row, fAccepting,
   1407     // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
   1408 
   1409     U_ASSERT(fSafeTable == nullptr);
   1410     fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
   1411     for (int32_t row=0; row<numCharClasses + 2; ++row) {
   1412         fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
   1413     }
   1414 
   1415     // From the start state, each input char class transitions to the state for that input.
   1416     UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
   1417     for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
   1418         // Note: +2 for the start & stop state.
   1419         startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
   1420     }
   1421 
   1422     // Initially make every other state table row look like the start state row,
   1423     for (int32_t row=2; row<numCharClasses+2; ++row) {
   1424         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
   1425         rowState = startState;   // UnicodeString assignment, copies contents.
   1426     }
   1427 
   1428     // Run through the safe pairs, set the next state to zero when pair has been seen.
   1429     // Zero being the stop state, meaning we found a safe point.
   1430     for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
   1431         int32_t c1 = safePairs.charAt(pairIdx);
   1432         int32_t c2 = safePairs.charAt(pairIdx + 1);
   1433 
   1434         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
   1435         rowState.setCharAt(c1, 0);
   1436     }
   1437 
   1438     // Remove duplicate or redundant rows from the table.
   1439     IntPair states = {1, 0};
   1440     while (findDuplicateSafeState(&states)) {
   1441         // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
   1442         removeSafeState(states);
   1443     }
   1444 }
   1445 
   1446 
   1447 //-----------------------------------------------------------------------------
   1448 //
   1449 //   getSafeTableSize()    Calculate the size of the runtime form of this
   1450 //                         safe state table.
   1451 //
   1452 //-----------------------------------------------------------------------------
   1453 int32_t  RBBITableBuilder::getSafeTableSize() const {
   1454     int32_t    size = 0;
   1455     int32_t    numRows;
   1456     int32_t    numCols;
   1457     int32_t    rowSize;
   1458 
   1459     if (fSafeTable == nullptr) {
   1460         return 0;
   1461     }
   1462 
   1463     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
   1464 
   1465     numRows = fSafeTable->size();
   1466     numCols = fRB->fSetBuilder->getNumCharCategories();
   1467 
   1468     rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
   1469     size   += numRows * rowSize;
   1470     return size;
   1471 }
   1472 
   1473 
   1474 //-----------------------------------------------------------------------------
   1475 //
   1476 //   exportSafeTable()   export the state transition table in the format required
   1477 //                       by the runtime engine.  getTableSize() bytes of memory
   1478 //                       must be available at the output address "where".
   1479 //
   1480 //-----------------------------------------------------------------------------
   1481 void RBBITableBuilder::exportSafeTable(void *where) {
   1482     RBBIStateTable    *table = (RBBIStateTable *)where;
   1483     uint32_t           state;
   1484     int                col;
   1485 
   1486     if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
   1487         return;
   1488     }
   1489 
   1490     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
   1491     if (catCount > 0x7fff ||
   1492             fSafeTable->size() > 0x7fff) {
   1493         *fStatus = U_BRK_INTERNAL_ERROR;
   1494         return;
   1495     }
   1496 
   1497     table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
   1498     table->fNumStates = fSafeTable->size();
   1499     table->fFlags     = 0;
   1500     table->fReserved  = 0;
   1501 
   1502     for (state=0; state<table->fNumStates; state++) {
   1503         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
   1504         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
   1505         row->fAccepting = 0;
   1506         row->fLookAhead = 0;
   1507         row->fTagIdx    = 0;
   1508         row->fReserved  = 0;
   1509         for (col=0; col<catCount; col++) {
   1510             row->fNextState[col] = rowString->charAt(col);
   1511         }
   1512     }
   1513 }
   1514 
   1515 
   1516 
   1517 
   1518 //-----------------------------------------------------------------------------
   1519 //
   1520 //   printSet    Debug function.   Print the contents of a UVector
   1521 //
   1522 //-----------------------------------------------------------------------------
   1523 #ifdef RBBI_DEBUG
   1524 void RBBITableBuilder::printSet(UVector *s) {
   1525     int32_t  i;
   1526     for (i=0; i<s->size(); i++) {
   1527         const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
   1528         RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
   1529     }
   1530     RBBIDebugPrintf("\n");
   1531 }
   1532 #endif
   1533 
   1534 
   1535 //-----------------------------------------------------------------------------
   1536 //
   1537 //   printStates    Debug Function.  Dump the fully constructed state transition table.
   1538 //
   1539 //-----------------------------------------------------------------------------
   1540 #ifdef RBBI_DEBUG
   1541 void RBBITableBuilder::printStates() {
   1542     int     c;    // input "character"
   1543     int     n;    // state number
   1544 
   1545     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
   1546     RBBIDebugPrintf("      | Acc  LA    Tag");
   1547     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
   1548         RBBIDebugPrintf(" %2d", c);
   1549     }
   1550     RBBIDebugPrintf("\n");
   1551     RBBIDebugPrintf("      |---------------");
   1552     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
   1553         RBBIDebugPrintf("---");
   1554     }
   1555     RBBIDebugPrintf("\n");
   1556 
   1557     for (n=0; n<fDStates->size(); n++) {
   1558         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
   1559         RBBIDebugPrintf("  %3d | " , n);
   1560         RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
   1561         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
   1562             RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
   1563         }
   1564         RBBIDebugPrintf("\n");
   1565     }
   1566     RBBIDebugPrintf("\n\n");
   1567 }
   1568 #endif
   1569 
   1570 
   1571 //-----------------------------------------------------------------------------
   1572 //
   1573 //   printSafeTable    Debug Function.  Dump the fully constructed safe table.
   1574 //
   1575 //-----------------------------------------------------------------------------
   1576 #ifdef RBBI_DEBUG
   1577 void RBBITableBuilder::printReverseTable() {
   1578     int     c;    // input "character"
   1579     int     n;    // state number
   1580 
   1581     RBBIDebugPrintf("    Safe Reverse Table \n");
   1582     if (fSafeTable == nullptr) {
   1583         RBBIDebugPrintf("   --- nullptr ---\n");
   1584         return;
   1585     }
   1586     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
   1587     RBBIDebugPrintf("      | Acc  LA    Tag");
   1588     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
   1589         RBBIDebugPrintf(" %2d", c);
   1590     }
   1591     RBBIDebugPrintf("\n");
   1592     RBBIDebugPrintf("      |---------------");
   1593     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
   1594         RBBIDebugPrintf("---");
   1595     }
   1596     RBBIDebugPrintf("\n");
   1597 
   1598     for (n=0; n<fSafeTable->size(); n++) {
   1599         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
   1600         RBBIDebugPrintf("  %3d | " , n);
   1601         RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
   1602         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
   1603             RBBIDebugPrintf(" %2d", rowString->charAt(c));
   1604         }
   1605         RBBIDebugPrintf("\n");
   1606     }
   1607     RBBIDebugPrintf("\n\n");
   1608 }
   1609 #endif
   1610 
   1611 
   1612 
   1613 //-----------------------------------------------------------------------------
   1614 //
   1615 //   printRuleStatusTable    Debug Function.  Dump the common rule status table
   1616 //
   1617 //-----------------------------------------------------------------------------
   1618 #ifdef RBBI_DEBUG
   1619 void RBBITableBuilder::printRuleStatusTable() {
   1620     int32_t  thisRecord = 0;
   1621     int32_t  nextRecord = 0;
   1622     int      i;
   1623     UVector  *tbl = fRB->fRuleStatusVals;
   1624 
   1625     RBBIDebugPrintf("index |  tags \n");
   1626     RBBIDebugPrintf("-------------------\n");
   1627 
   1628     while (nextRecord < tbl->size()) {
   1629         thisRecord = nextRecord;
   1630         nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
   1631         RBBIDebugPrintf("%4d   ", thisRecord);
   1632         for (i=thisRecord+1; i<nextRecord; i++) {
   1633             RBBIDebugPrintf("  %5d", tbl->elementAti(i));
   1634         }
   1635         RBBIDebugPrintf("\n");
   1636     }
   1637     RBBIDebugPrintf("\n\n");
   1638 }
   1639 #endif
   1640 
   1641 
   1642 //-----------------------------------------------------------------------------
   1643 //
   1644 //   RBBIStateDescriptor     Methods.  This is a very struct-like class
   1645 //                           Most access is directly to the fields.
   1646 //
   1647 //-----------------------------------------------------------------------------
   1648 
   1649 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
   1650     fMarked    = FALSE;
   1651     fAccepting = 0;
   1652     fLookAhead = 0;
   1653     fTagsIdx   = 0;
   1654     fTagVals   = NULL;
   1655     fPositions = NULL;
   1656     fDtran     = NULL;
   1657 
   1658     fDtran     = new UVector32(lastInputSymbol+1, *fStatus);
   1659     if (U_FAILURE(*fStatus)) {
   1660         return;
   1661     }
   1662     if (fDtran == NULL) {
   1663         *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1664         return;
   1665     }
   1666     fDtran->setSize(lastInputSymbol+1);    // fDtran needs to be pre-sized.
   1667                                            //   It is indexed by input symbols, and will
   1668                                            //   hold  the next state number for each
   1669                                            //   symbol.
   1670 }
   1671 
   1672 
   1673 RBBIStateDescriptor::~RBBIStateDescriptor() {
   1674     delete       fPositions;
   1675     delete       fDtran;
   1676     delete       fTagVals;
   1677     fPositions = NULL;
   1678     fDtran     = NULL;
   1679     fTagVals   = NULL;
   1680 }
   1681 
   1682 U_NAMESPACE_END
   1683 
   1684 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1685