Home | History | Annotate | Download | only in common
      1 /*
      2 **********************************************************************
      3 *   Copyright (c) 2002-2009, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 */
      7 //
      8 //  rbbitblb.cpp
      9 //
     10 
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_BREAK_ITERATION
     15 
     16 #include "unicode/unistr.h"
     17 #include "rbbitblb.h"
     18 #include "rbbirb.h"
     19 #include "rbbisetb.h"
     20 #include "rbbidata.h"
     21 #include "cstring.h"
     22 #include "uassert.h"
     23 #include "cmemory.h"
     24 
     25 U_NAMESPACE_BEGIN
     26 
     27 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
     28  fTree(*rootNode) {
     29     fRB                 = rb;
     30     fStatus             = fRB->fStatus;
     31     UErrorCode status   = U_ZERO_ERROR;
     32     fDStates            = new UVector(status);
     33     if (U_FAILURE(*fStatus)) {
     34         return;
     35     }
     36     if (U_FAILURE(status)) {
     37         *fStatus = status;
     38         return;
     39     }
     40     if (fDStates == NULL) {
     41         *fStatus = U_MEMORY_ALLOCATION_ERROR;;
     42     }
     43 }
     44 
     45 
     46 
     47 RBBITableBuilder::~RBBITableBuilder() {
     48     int i;
     49     for (i=0; i<fDStates->size(); i++) {
     50         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
     51     }
     52     delete   fDStates;
     53 }
     54 
     55 
     56 //-----------------------------------------------------------------------------
     57 //
     58 //   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
     59 //                               table from the RBBI rules parse tree.
     60 //
     61 //-----------------------------------------------------------------------------
     62 void  RBBITableBuilder::build() {
     63 
     64     if (U_FAILURE(*fStatus)) {
     65         return;
     66     }
     67 
     68     // If there were no rules, just return.  This situation can easily arise
     69     //   for the reverse rules.
     70     if (fTree==NULL) {
     71         return;
     72     }
     73 
     74     //
     75     // Walk through the tree, replacing any references to $variables with a copy of the
     76     //   parse tree for the substition expression.
     77     //
     78     fTree = fTree->flattenVariables();
     79 #ifdef RBBI_DEBUG
     80     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
     81         RBBIDebugPuts("Parse tree after flattening variable references.");
     82         fTree->printTree(TRUE);
     83     }
     84 #endif
     85 
     86     //
     87     // If the rules contained any references to {bof}
     88     //   add a {bof} <cat> <former root of tree> to the
     89     //   tree.  Means that all matches must start out with the
     90     //   {bof} fake character.
     91     //
     92     if (fRB->fSetBuilder->sawBOF()) {
     93         RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
     94         RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
     95         // Delete and exit if memory allocation failed.
     96         if (bofTop == NULL || bofLeaf == NULL) {
     97             *fStatus = U_MEMORY_ALLOCATION_ERROR;
     98             delete bofTop;
     99             delete bofLeaf;
    100             return;
    101         }
    102         bofTop->fLeftChild  = bofLeaf;
    103         bofTop->fRightChild = fTree;
    104         bofLeaf->fParent    = bofTop;
    105         bofLeaf->fVal       = 2;      // Reserved value for {bof}.
    106         fTree               = bofTop;
    107     }
    108 
    109     //
    110     // Add a unique right-end marker to the expression.
    111     //   Appears as a cat-node, left child being the original tree,
    112     //   right child being the end marker.
    113     //
    114     RBBINode *cn = new RBBINode(RBBINode::opCat);
    115     // Exit if memory allocation failed.
    116     if (cn == NULL) {
    117         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    118         return;
    119     }
    120     cn->fLeftChild = fTree;
    121     fTree->fParent = cn;
    122     cn->fRightChild = new RBBINode(RBBINode::endMark);
    123     // Delete and exit if memory allocation failed.
    124     if (cn->fRightChild == NULL) {
    125         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    126         delete cn;
    127         return;
    128     }
    129     cn->fRightChild->fParent = cn;
    130     fTree = cn;
    131 
    132     //
    133     //  Replace all references to UnicodeSets with the tree for the equivalent
    134     //      expression.
    135     //
    136     fTree->flattenSets();
    137 #ifdef RBBI_DEBUG
    138     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
    139         RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
    140         fTree->printTree(TRUE);
    141     }
    142 #endif
    143 
    144 
    145     //
    146     // calculate the functions nullable, firstpos, lastpos and followpos on
    147     // nodes in the parse tree.
    148     //    See the alogrithm description in Aho.
    149     //    Understanding how this works by looking at the code alone will be
    150     //       nearly impossible.
    151     //
    152     calcNullable(fTree);
    153     calcFirstPos(fTree);
    154     calcLastPos(fTree);
    155     calcFollowPos(fTree);
    156     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
    157         RBBIDebugPuts("\n");
    158         printPosSets(fTree);
    159     }
    160 
    161     //
    162     //  For "chained" rules, modify the followPos sets
    163     //
    164     if (fRB->fChainRules) {
    165         calcChainedFollowPos(fTree);
    166     }
    167 
    168     //
    169     //  BOF (start of input) test fixup.
    170     //
    171     if (fRB->fSetBuilder->sawBOF()) {
    172         bofFixup();
    173     }
    174 
    175     //
    176     // Build the DFA state transition tables.
    177     //
    178     buildStateTable();
    179     flagAcceptingStates();
    180     flagLookAheadStates();
    181     flagTaggedStates();
    182 
    183     //
    184     // Update the global table of rule status {tag} values
    185     // The rule builder has a global vector of status values that are common
    186     //    for all tables.  Merge the ones from this table into the global set.
    187     //
    188     mergeRuleStatusVals();
    189 
    190     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
    191 }
    192 
    193 
    194 
    195 //-----------------------------------------------------------------------------
    196 //
    197 //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
    198 //
    199 //-----------------------------------------------------------------------------
    200 void RBBITableBuilder::calcNullable(RBBINode *n) {
    201     if (n == NULL) {
    202         return;
    203     }
    204     if (n->fType == RBBINode::setRef ||
    205         n->fType == RBBINode::endMark ) {
    206         // These are non-empty leaf node types.
    207         n->fNullable = FALSE;
    208         return;
    209     }
    210 
    211     if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
    212         // Lookahead marker node.  It's a leaf, so no recursion on children.
    213         // It's nullable because it does not match any literal text from the input stream.
    214         n->fNullable = TRUE;
    215         return;
    216     }
    217 
    218 
    219     // The node is not a leaf.
    220     //  Calculate nullable on its children.
    221     calcNullable(n->fLeftChild);
    222     calcNullable(n->fRightChild);
    223 
    224     // Apply functions from table 3.40 in Aho
    225     if (n->fType == RBBINode::opOr) {
    226         n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
    227     }
    228     else if (n->fType == RBBINode::opCat) {
    229         n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
    230     }
    231     else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
    232         n->fNullable = TRUE;
    233     }
    234     else {
    235         n->fNullable = FALSE;
    236     }
    237 }
    238 
    239 
    240 
    241 
    242 //-----------------------------------------------------------------------------
    243 //
    244 //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
    245 //
    246 //-----------------------------------------------------------------------------
    247 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
    248     if (n == NULL) {
    249         return;
    250     }
    251     if (n->fType == RBBINode::leafChar  ||
    252         n->fType == RBBINode::endMark   ||
    253         n->fType == RBBINode::lookAhead ||
    254         n->fType == RBBINode::tag) {
    255         // These are non-empty leaf node types.
    256         // Note: In order to maintain the sort invariant on the set,
    257         // this function should only be called on a node whose set is
    258         // empty to start with.
    259         n->fFirstPosSet->addElement(n, *fStatus);
    260         return;
    261     }
    262 
    263     // The node is not a leaf.
    264     //  Calculate firstPos on its children.
    265     calcFirstPos(n->fLeftChild);
    266     calcFirstPos(n->fRightChild);
    267 
    268     // Apply functions from table 3.40 in Aho
    269     if (n->fType == RBBINode::opOr) {
    270         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
    271         setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
    272     }
    273     else if (n->fType == RBBINode::opCat) {
    274         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
    275         if (n->fLeftChild->fNullable) {
    276             setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
    277         }
    278     }
    279     else if (n->fType == RBBINode::opStar ||
    280              n->fType == RBBINode::opQuestion ||
    281              n->fType == RBBINode::opPlus) {
    282         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
    283     }
    284 }
    285 
    286 
    287 
    288 //-----------------------------------------------------------------------------
    289 //
    290 //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
    291 //
    292 //-----------------------------------------------------------------------------
    293 void RBBITableBuilder::calcLastPos(RBBINode *n) {
    294     if (n == NULL) {
    295         return;
    296     }
    297     if (n->fType == RBBINode::leafChar  ||
    298         n->fType == RBBINode::endMark   ||
    299         n->fType == RBBINode::lookAhead ||
    300         n->fType == RBBINode::tag) {
    301         // These are non-empty leaf node types.
    302         // Note: In order to maintain the sort invariant on the set,
    303         // this function should only be called on a node whose set is
    304         // empty to start with.
    305         n->fLastPosSet->addElement(n, *fStatus);
    306         return;
    307     }
    308 
    309     // The node is not a leaf.
    310     //  Calculate lastPos on its children.
    311     calcLastPos(n->fLeftChild);
    312     calcLastPos(n->fRightChild);
    313 
    314     // Apply functions from table 3.40 in Aho
    315     if (n->fType == RBBINode::opOr) {
    316         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
    317         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
    318     }
    319     else if (n->fType == RBBINode::opCat) {
    320         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
    321         if (n->fRightChild->fNullable) {
    322             setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
    323         }
    324     }
    325     else if (n->fType == RBBINode::opStar     ||
    326              n->fType == RBBINode::opQuestion ||
    327              n->fType == RBBINode::opPlus) {
    328         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
    329     }
    330 }
    331 
    332 
    333 
    334 //-----------------------------------------------------------------------------
    335 //
    336 //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
    337 //
    338 //-----------------------------------------------------------------------------
    339 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
    340     if (n == NULL ||
    341         n->fType == RBBINode::leafChar ||
    342         n->fType == RBBINode::endMark) {
    343         return;
    344     }
    345 
    346     calcFollowPos(n->fLeftChild);
    347     calcFollowPos(n->fRightChild);
    348 
    349     // Aho rule #1
    350     if (n->fType == RBBINode::opCat) {
    351         RBBINode *i;   // is 'i' in Aho's description
    352         uint32_t     ix;
    353 
    354         UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
    355 
    356         for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
    357             i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
    358             setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
    359         }
    360     }
    361 
    362     // Aho rule #2
    363     if (n->fType == RBBINode::opStar ||
    364         n->fType == RBBINode::opPlus) {
    365         RBBINode   *i;  // again, n and i are the names from Aho's description.
    366         uint32_t    ix;
    367 
    368         for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
    369             i = (RBBINode *)n->fLastPosSet->elementAt(ix);
    370             setAdd(i->fFollowPos, n->fFirstPosSet);
    371         }
    372     }
    373 
    374 
    375 
    376 }
    377 
    378 
    379 //-----------------------------------------------------------------------------
    380 //
    381 //   calcChainedFollowPos.    Modify the previously calculated followPos sets
    382 //                            to implement rule chaining.  NOT described by Aho
    383 //
    384 //-----------------------------------------------------------------------------
    385 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
    386 
    387     UVector         endMarkerNodes(*fStatus);
    388     UVector         leafNodes(*fStatus);
    389     int32_t         i;
    390 
    391     if (U_FAILURE(*fStatus)) {
    392         return;
    393     }
    394 
    395     // get a list of all endmarker nodes.
    396     tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
    397 
    398     // get a list all leaf nodes
    399     tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
    400     if (U_FAILURE(*fStatus)) {
    401         return;
    402     }
    403 
    404     // Get all nodes that can be the start a match, which is FirstPosition()
    405     // of the portion of the tree corresponding to user-written rules.
    406     // See the tree description in bofFixup().
    407     RBBINode *userRuleRoot = tree;
    408     if (fRB->fSetBuilder->sawBOF()) {
    409         userRuleRoot = tree->fLeftChild->fRightChild;
    410     }
    411     U_ASSERT(userRuleRoot != NULL);
    412     UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
    413 
    414 
    415     // Iteratate over all leaf nodes,
    416     //
    417     int32_t  endNodeIx;
    418     int32_t  startNodeIx;
    419 
    420     for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
    421         RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
    422         RBBINode *endNode = NULL;
    423 
    424         // Identify leaf nodes that correspond to overall rule match positions.
    425         //   These include an endMarkerNode in their followPos sets.
    426         for (i=0; i<endMarkerNodes.size(); i++) {
    427             if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
    428                 endNode = tNode;
    429                 break;
    430             }
    431         }
    432         if (endNode == NULL) {
    433             // node wasn't an end node.  Try again with the next.
    434             continue;
    435         }
    436 
    437         // We've got a node that can end a match.
    438 
    439         // Line Break Specific hack:  If this node's val correspond to the $CM char class,
    440         //                            don't chain from it.
    441         // TODO:  Add rule syntax for this behavior, get specifics out of here and
    442         //        into the rule file.
    443         if (fRB->fLBCMNoChain) {
    444             UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
    445             if (c != -1) {
    446                 // c == -1 occurs with sets containing only the {eof} marker string.
    447                 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
    448                 if (cLBProp == U_LB_COMBINING_MARK) {
    449                     continue;
    450                 }
    451             }
    452         }
    453 
    454 
    455         // Now iterate over the nodes that can start a match, looking for ones
    456         //   with the same char class as our ending node.
    457         RBBINode *startNode;
    458         for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
    459             startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
    460             if (startNode->fType != RBBINode::leafChar) {
    461                 continue;
    462             }
    463 
    464             if (endNode->fVal == startNode->fVal) {
    465                 // The end val (character class) of one possible match is the
    466                 //   same as the start of another.
    467 
    468                 // Add all nodes from the followPos of the start node to the
    469                 //  followPos set of the end node, which will have the effect of
    470                 //  letting matches transition from a match state at endNode
    471                 //  to the second char of a match starting with startNode.
    472                 setAdd(endNode->fFollowPos, startNode->fFollowPos);
    473             }
    474         }
    475     }
    476 }
    477 
    478 
    479 //-----------------------------------------------------------------------------
    480 //
    481 //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
    482 //                Do an swizzle similar to chaining, modifying the followPos set of
    483 //                the bofNode to include the followPos nodes from other {bot} nodes
    484 //                scattered through the tree.
    485 //
    486 //                This function has much in common with calcChainedFollowPos().
    487 //
    488 //-----------------------------------------------------------------------------
    489 void RBBITableBuilder::bofFixup() {
    490 
    491     if (U_FAILURE(*fStatus)) {
    492         return;
    493     }
    494 
    495     //   The parse tree looks like this ...
    496     //         fTree root  --->       <cat>
    497     //                               /     \       .
    498     //                            <cat>   <#end node>
    499     //                           /     \  .
    500     //                     <bofNode>   rest
    501     //                               of tree
    502     //
    503     //    We will be adding things to the followPos set of the <bofNode>
    504     //
    505     RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
    506     U_ASSERT(bofNode->fType == RBBINode::leafChar);
    507     U_ASSERT(bofNode->fVal == 2);
    508 
    509     // Get all nodes that can be the start a match of the user-written rules
    510     //  (excluding the fake bofNode)
    511     //  We want the nodes that can start a match in the
    512     //     part labeled "rest of tree"
    513     //
    514     UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
    515 
    516     RBBINode *startNode;
    517     int       startNodeIx;
    518     for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
    519         startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
    520         if (startNode->fType != RBBINode::leafChar) {
    521             continue;
    522         }
    523 
    524         if (startNode->fVal == bofNode->fVal) {
    525             //  We found a leaf node corresponding to a {bof} that was
    526             //    explicitly written into a rule.
    527             //  Add everything from the followPos set of this node to the
    528             //    followPos set of the fake bofNode at the start of the tree.
    529             //
    530             setAdd(bofNode->fFollowPos, startNode->fFollowPos);
    531         }
    532     }
    533 }
    534 
    535 //-----------------------------------------------------------------------------
    536 //
    537 //   buildStateTable()    Determine the set of runtime DFA states and the
    538 //                        transition tables for these states, by the algorithm
    539 //                        of fig. 3.44 in Aho.
    540 //
    541 //                        Most of the comments are quotes of Aho's psuedo-code.
    542 //
    543 //-----------------------------------------------------------------------------
    544 void RBBITableBuilder::buildStateTable() {
    545     if (U_FAILURE(*fStatus)) {
    546         return;
    547     }
    548     RBBIStateDescriptor *failState;
    549     // Set it to NULL to avoid uninitialized warning
    550     RBBIStateDescriptor *initialState = NULL;
    551     //
    552     // Add a dummy state 0 - the stop state.  Not from Aho.
    553     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
    554     failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
    555     if (failState == NULL) {
    556         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    557         goto ExitBuildSTdeleteall;
    558     }
    559     failState->fPositions = new UVector(*fStatus);
    560     if (failState->fPositions == NULL) {
    561         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    562     }
    563     if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
    564         goto ExitBuildSTdeleteall;
    565     }
    566     fDStates->addElement(failState, *fStatus);
    567     if (U_FAILURE(*fStatus)) {
    568         goto ExitBuildSTdeleteall;
    569     }
    570 
    571     // initially, the only unmarked state in Dstates is firstpos(root),
    572     //       where toot is the root of the syntax tree for (r)#;
    573     initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
    574     if (initialState == NULL) {
    575         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    576     }
    577     if (U_FAILURE(*fStatus)) {
    578         goto ExitBuildSTdeleteall;
    579     }
    580     initialState->fPositions = new UVector(*fStatus);
    581     if (initialState->fPositions == NULL) {
    582         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    583     }
    584     if (U_FAILURE(*fStatus)) {
    585         goto ExitBuildSTdeleteall;
    586     }
    587     setAdd(initialState->fPositions, fTree->fFirstPosSet);
    588     fDStates->addElement(initialState, *fStatus);
    589     if (U_FAILURE(*fStatus)) {
    590         goto ExitBuildSTdeleteall;
    591     }
    592 
    593     // while there is an unmarked state T in Dstates do begin
    594     for (;;) {
    595         RBBIStateDescriptor *T = NULL;
    596         int32_t              tx;
    597         for (tx=1; tx<fDStates->size(); tx++) {
    598             RBBIStateDescriptor *temp;
    599             temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
    600             if (temp->fMarked == FALSE) {
    601                 T = temp;
    602                 break;
    603             }
    604         }
    605         if (T == NULL) {
    606             break;
    607         }
    608 
    609         // mark T;
    610         T->fMarked = TRUE;
    611 
    612         // for each input symbol a do begin
    613         int32_t  a;
    614         for (a = 1; a<=lastInputSymbol; a++) {
    615             // let U be the set of positions that are in followpos(p)
    616             //    for some position p in T
    617             //    such that the symbol at position p is a;
    618             UVector    *U = NULL;
    619             RBBINode   *p;
    620             int32_t     px;
    621             for (px=0; px<T->fPositions->size(); px++) {
    622                 p = (RBBINode *)T->fPositions->elementAt(px);
    623                 if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
    624                     if (U == NULL) {
    625                         U = new UVector(*fStatus);
    626                         if (U == NULL) {
    627                         	*fStatus = U_MEMORY_ALLOCATION_ERROR;
    628                         	goto ExitBuildSTdeleteall;
    629                         }
    630                     }
    631                     setAdd(U, p->fFollowPos);
    632                 }
    633             }
    634 
    635             // if U is not empty and not in DStates then
    636             int32_t  ux = 0;
    637             UBool    UinDstates = FALSE;
    638             if (U != NULL) {
    639                 U_ASSERT(U->size() > 0);
    640                 int  ix;
    641                 for (ix=0; ix<fDStates->size(); ix++) {
    642                     RBBIStateDescriptor *temp2;
    643                     temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
    644                     if (setEquals(U, temp2->fPositions)) {
    645                         delete U;
    646                         U  = temp2->fPositions;
    647                         ux = ix;
    648                         UinDstates = TRUE;
    649                         break;
    650                     }
    651                 }
    652 
    653                 // Add U as an unmarked state to Dstates
    654                 if (!UinDstates)
    655                 {
    656                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
    657                     if (newState == NULL) {
    658                     	*fStatus = U_MEMORY_ALLOCATION_ERROR;
    659                     }
    660                     if (U_FAILURE(*fStatus)) {
    661                         goto ExitBuildSTdeleteall;
    662                     }
    663                     newState->fPositions = U;
    664                     fDStates->addElement(newState, *fStatus);
    665                     if (U_FAILURE(*fStatus)) {
    666                         return;
    667                     }
    668                     ux = fDStates->size()-1;
    669                 }
    670 
    671                 // Dtran[T, a] := U;
    672                 T->fDtran->setElementAt(ux, a);
    673             }
    674         }
    675     }
    676     return;
    677     // delete local pointers only if error occured.
    678 ExitBuildSTdeleteall:
    679     delete initialState;
    680     delete failState;
    681 }
    682 
    683 
    684 
    685 //-----------------------------------------------------------------------------
    686 //
    687 //   flagAcceptingStates    Identify accepting states.
    688 //                          First get a list of all of the end marker nodes.
    689 //                          Then, for each state s,
    690 //                              if s contains one of the end marker nodes in its list of tree positions then
    691 //                                  s is an accepting state.
    692 //
    693 //-----------------------------------------------------------------------------
    694 void     RBBITableBuilder::flagAcceptingStates() {
    695     if (U_FAILURE(*fStatus)) {
    696         return;
    697     }
    698     UVector     endMarkerNodes(*fStatus);
    699     RBBINode    *endMarker;
    700     int32_t     i;
    701     int32_t     n;
    702 
    703     if (U_FAILURE(*fStatus)) {
    704         return;
    705     }
    706 
    707     fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
    708     if (U_FAILURE(*fStatus)) {
    709         return;
    710     }
    711 
    712     for (i=0; i<endMarkerNodes.size(); i++) {
    713         endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
    714         for (n=0; n<fDStates->size(); n++) {
    715             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
    716             if (sd->fPositions->indexOf(endMarker) >= 0) {
    717                 // Any non-zero value for fAccepting means this is an accepting node.
    718                 // The value is what will be returned to the user as the break status.
    719                 // If no other value was specified, force it to -1.
    720 
    721                 if (sd->fAccepting==0) {
    722                     // State hasn't been marked as accepting yet.  Do it now.
    723                     sd->fAccepting = endMarker->fVal;
    724                     if (sd->fAccepting == 0) {
    725                         sd->fAccepting = -1;
    726                     }
    727                 }
    728                 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
    729                     // Both lookahead and non-lookahead accepting for this state.
    730                     // Favor the look-ahead.  Expedient for line break.
    731                     // TODO:  need a more elegant resolution for conflicting rules.
    732                     sd->fAccepting = endMarker->fVal;
    733                 }
    734                 // implicit else:
    735                 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
    736 
    737                 // If the end marker node is from a look-ahead rule, set
    738                 //   the fLookAhead field or this state also.
    739                 if (endMarker->fLookAheadEnd) {
    740                     // TODO:  don't change value if already set?
    741                     // TODO:  allow for more than one active look-ahead rule in engine.
    742                     //        Make value here an index to a side array in engine?
    743                     sd->fLookAhead = sd->fAccepting;
    744                 }
    745             }
    746         }
    747     }
    748 }
    749 
    750 
    751 //-----------------------------------------------------------------------------
    752 //
    753 //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
    754 //
    755 //-----------------------------------------------------------------------------
    756 void     RBBITableBuilder::flagLookAheadStates() {
    757     if (U_FAILURE(*fStatus)) {
    758         return;
    759     }
    760     UVector     lookAheadNodes(*fStatus);
    761     RBBINode    *lookAheadNode;
    762     int32_t     i;
    763     int32_t     n;
    764 
    765     fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
    766     if (U_FAILURE(*fStatus)) {
    767         return;
    768     }
    769     for (i=0; i<lookAheadNodes.size(); i++) {
    770         lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
    771 
    772         for (n=0; n<fDStates->size(); n++) {
    773             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
    774             if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
    775                 sd->fLookAhead = lookAheadNode->fVal;
    776             }
    777         }
    778     }
    779 }
    780 
    781 
    782 
    783 
    784 //-----------------------------------------------------------------------------
    785 //
    786 //    flagTaggedStates
    787 //
    788 //-----------------------------------------------------------------------------
    789 void     RBBITableBuilder::flagTaggedStates() {
    790     if (U_FAILURE(*fStatus)) {
    791         return;
    792     }
    793     UVector     tagNodes(*fStatus);
    794     RBBINode    *tagNode;
    795     int32_t     i;
    796     int32_t     n;
    797 
    798     if (U_FAILURE(*fStatus)) {
    799         return;
    800     }
    801     fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
    802     if (U_FAILURE(*fStatus)) {
    803         return;
    804     }
    805     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
    806         tagNode = (RBBINode *)tagNodes.elementAt(i);
    807 
    808         for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
    809             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
    810             if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
    811                 sortedAdd(&sd->fTagVals, tagNode->fVal);
    812             }
    813         }
    814     }
    815 }
    816 
    817 
    818 
    819 
    820 //-----------------------------------------------------------------------------
    821 //
    822 //  mergeRuleStatusVals
    823 //
    824 //      Update the global table of rule status {tag} values
    825 //      The rule builder has a global vector of status values that are common
    826 //      for all tables.  Merge the ones from this table into the global set.
    827 //
    828 //-----------------------------------------------------------------------------
    829 void  RBBITableBuilder::mergeRuleStatusVals() {
    830     //
    831     //  The basic outline of what happens here is this...
    832     //
    833     //    for each state in this state table
    834     //       if the status tag list for this state is in the global statuses list
    835     //           record where and
    836     //           continue with the next state
    837     //       else
    838     //           add the tag list for this state to the global list.
    839     //
    840     int i;
    841     int n;
    842 
    843     // Pre-set a single tag of {0} into the table.
    844     //   We will need this as a default, for rule sets with no explicit tagging.
    845     if (fRB->fRuleStatusVals->size() == 0) {
    846         fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
    847         fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
    848     }
    849 
    850     //    For each state
    851     for (n=0; n<fDStates->size(); n++) {
    852         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
    853         UVector *thisStatesTagValues = sd->fTagVals;
    854         if (thisStatesTagValues == NULL) {
    855             // No tag values are explicitly associated with this state.
    856             //   Set the default tag value.
    857             sd->fTagsIdx = 0;
    858             continue;
    859         }
    860 
    861         // There are tag(s) associated with this state.
    862         //   fTagsIdx will be the index into the global tag list for this state's tag values.
    863         //   Initial value of -1 flags that we haven't got it set yet.
    864         sd->fTagsIdx = -1;
    865         int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
    866         int32_t  nextTagGroupStart = 0;
    867 
    868         // Loop runs once per group of tags in the global list
    869         while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
    870             thisTagGroupStart = nextTagGroupStart;
    871             nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
    872             if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
    873                 // The number of tags for this state is different from
    874                 //    the number of tags in this group from the global list.
    875                 //    Continue with the next group from the global list.
    876                 continue;
    877             }
    878             // The lengths match, go ahead and compare the actual tag values
    879             //    between this state and the group from the global list.
    880             for (i=0; i<thisStatesTagValues->size(); i++) {
    881                 if (thisStatesTagValues->elementAti(i) !=
    882                     fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
    883                     // Mismatch.
    884                     break;
    885                 }
    886             }
    887 
    888             if (i == thisStatesTagValues->size()) {
    889                 // We found a set of tag values in the global list that match
    890                 //   those for this state.  Use them.
    891                 sd->fTagsIdx = thisTagGroupStart;
    892                 break;
    893             }
    894         }
    895 
    896         if (sd->fTagsIdx == -1) {
    897             // No suitable entry in the global tag list already.  Add one
    898             sd->fTagsIdx = fRB->fRuleStatusVals->size();
    899             fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
    900             for (i=0; i<thisStatesTagValues->size(); i++) {
    901                 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
    902             }
    903         }
    904     }
    905 }
    906 
    907 
    908 
    909 
    910 
    911 
    912 
    913 //-----------------------------------------------------------------------------
    914 //
    915 //  sortedAdd  Add a value to a vector of sorted values (ints).
    916 //             Do not replicate entries; if the value is already there, do not
    917 //                add a second one.
    918 //             Lazily create the vector if it does not already exist.
    919 //
    920 //-----------------------------------------------------------------------------
    921 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
    922     int32_t i;
    923 
    924     if (*vector == NULL) {
    925         *vector = new UVector(*fStatus);
    926     }
    927     if (*vector == NULL || U_FAILURE(*fStatus)) {
    928         return;
    929     }
    930     UVector *vec = *vector;
    931     int32_t  vSize = vec->size();
    932     for (i=0; i<vSize; i++) {
    933         int32_t valAtI = vec->elementAti(i);
    934         if (valAtI == val) {
    935             // The value is already in the vector.  Don't add it again.
    936             return;
    937         }
    938         if (valAtI > val) {
    939             break;
    940         }
    941     }
    942     vec->insertElementAt(val, i, *fStatus);
    943 }
    944 
    945 
    946 
    947 //-----------------------------------------------------------------------------
    948 //
    949 //  setAdd     Set operation on UVector
    950 //             dest = dest union source
    951 //             Elements may only appear once and must be sorted.
    952 //
    953 //-----------------------------------------------------------------------------
    954 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
    955     int32_t destOriginalSize = dest->size();
    956     int32_t sourceSize       = source->size();
    957     int32_t di           = 0;
    958     MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
    959     void **destPtr, **sourcePtr;
    960     void **destLim, **sourceLim;
    961 
    962     if (destOriginalSize > destArray.getCapacity()) {
    963         if (destArray.resize(destOriginalSize) == NULL) {
    964             return;
    965         }
    966     }
    967     destPtr = destArray.getAlias();
    968     destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
    969 
    970     if (sourceSize > sourceArray.getCapacity()) {
    971         if (sourceArray.resize(sourceSize) == NULL) {
    972             return;
    973         }
    974     }
    975     sourcePtr = sourceArray.getAlias();
    976     sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
    977 
    978     // Avoid multiple "get element" calls by getting the contents into arrays
    979     (void) dest->toArray(destPtr);
    980     (void) source->toArray(sourcePtr);
    981 
    982     dest->setSize(sourceSize+destOriginalSize, *fStatus);
    983 
    984     while (sourcePtr < sourceLim && destPtr < destLim) {
    985         if (*destPtr == *sourcePtr) {
    986             dest->setElementAt(*sourcePtr++, di++);
    987             destPtr++;
    988         }
    989         // This check is required for machines with segmented memory, like i5/OS.
    990         // Direct pointer comparison is not recommended.
    991         else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
    992             dest->setElementAt(*destPtr++, di++);
    993         }
    994         else { /* *sourcePtr < *destPtr */
    995             dest->setElementAt(*sourcePtr++, di++);
    996         }
    997     }
    998 
    999     // At most one of these two cleanup loops will execute
   1000     while (destPtr < destLim) {
   1001         dest->setElementAt(*destPtr++, di++);
   1002     }
   1003     while (sourcePtr < sourceLim) {
   1004         dest->setElementAt(*sourcePtr++, di++);
   1005     }
   1006 
   1007     dest->setSize(di, *fStatus);
   1008 }
   1009 
   1010 
   1011 
   1012 //-----------------------------------------------------------------------------
   1013 //
   1014 //  setEqual    Set operation on UVector.
   1015 //              Compare for equality.
   1016 //              Elements must be sorted.
   1017 //
   1018 //-----------------------------------------------------------------------------
   1019 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
   1020     return a->equals(*b);
   1021 }
   1022 
   1023 
   1024 //-----------------------------------------------------------------------------
   1025 //
   1026 //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
   1027 //                 for each node in the tree.
   1028 //
   1029 //-----------------------------------------------------------------------------
   1030 #ifdef RBBI_DEBUG
   1031 void RBBITableBuilder::printPosSets(RBBINode *n) {
   1032     if (n==NULL) {
   1033         return;
   1034     }
   1035     n->printNode();
   1036     RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
   1037 
   1038     RBBIDebugPrintf("         firstpos:  ");
   1039     printSet(n->fFirstPosSet);
   1040 
   1041     RBBIDebugPrintf("         lastpos:   ");
   1042     printSet(n->fLastPosSet);
   1043 
   1044     RBBIDebugPrintf("         followpos: ");
   1045     printSet(n->fFollowPos);
   1046 
   1047     printPosSets(n->fLeftChild);
   1048     printPosSets(n->fRightChild);
   1049 }
   1050 #endif
   1051 
   1052 
   1053 
   1054 //-----------------------------------------------------------------------------
   1055 //
   1056 //   getTableSize()    Calculate the size of the runtime form of this
   1057 //                     state transition table.
   1058 //
   1059 //-----------------------------------------------------------------------------
   1060 int32_t  RBBITableBuilder::getTableSize() const {
   1061     int32_t    size = 0;
   1062     int32_t    numRows;
   1063     int32_t    numCols;
   1064     int32_t    rowSize;
   1065 
   1066     if (fTree == NULL) {
   1067         return 0;
   1068     }
   1069 
   1070     size    = sizeof(RBBIStateTable) - 4;    // The header, with no rows to the table.
   1071 
   1072     numRows = fDStates->size();
   1073     numCols = fRB->fSetBuilder->getNumCharCategories();
   1074 
   1075     //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
   1076     //        Therefore we subtract two from numCols when determining
   1077     //        how much storage to add to a row for the total columns.
   1078     rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
   1079     size   += numRows * rowSize;
   1080     return size;
   1081 }
   1082 
   1083 
   1084 
   1085 //-----------------------------------------------------------------------------
   1086 //
   1087 //   exportTable()    export the state transition table in the format required
   1088 //                    by the runtime engine.  getTableSize() bytes of memory
   1089 //                    must be available at the output address "where".
   1090 //
   1091 //-----------------------------------------------------------------------------
   1092 void RBBITableBuilder::exportTable(void *where) {
   1093     RBBIStateTable    *table = (RBBIStateTable *)where;
   1094     uint32_t           state;
   1095     int                col;
   1096 
   1097     if (U_FAILURE(*fStatus) || fTree == NULL) {
   1098         return;
   1099     }
   1100 
   1101     if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
   1102         fDStates->size() > 0x7fff) {
   1103         *fStatus = U_BRK_INTERNAL_ERROR;
   1104         return;
   1105     }
   1106 
   1107     table->fRowLen    = sizeof(RBBIStateTableRow) +
   1108                             sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
   1109     table->fNumStates = fDStates->size();
   1110     table->fFlags     = 0;
   1111     if (fRB->fLookAheadHardBreak) {
   1112         table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
   1113     }
   1114     if (fRB->fSetBuilder->sawBOF()) {
   1115         table->fFlags  |= RBBI_BOF_REQUIRED;
   1116     }
   1117     table->fReserved  = 0;
   1118 
   1119     for (state=0; state<table->fNumStates; state++) {
   1120         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
   1121         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
   1122         U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
   1123         U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
   1124         row->fAccepting = (int16_t)sd->fAccepting;
   1125         row->fLookAhead = (int16_t)sd->fLookAhead;
   1126         row->fTagIdx    = (int16_t)sd->fTagsIdx;
   1127         for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
   1128             row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
   1129         }
   1130     }
   1131 }
   1132 
   1133 
   1134 
   1135 //-----------------------------------------------------------------------------
   1136 //
   1137 //   printSet    Debug function.   Print the contents of a UVector
   1138 //
   1139 //-----------------------------------------------------------------------------
   1140 #ifdef RBBI_DEBUG
   1141 void RBBITableBuilder::printSet(UVector *s) {
   1142     int32_t  i;
   1143     for (i=0; i<s->size(); i++) {
   1144         void *v = s->elementAt(i);
   1145         RBBIDebugPrintf("%10p", v);
   1146     }
   1147     RBBIDebugPrintf("\n");
   1148 }
   1149 #endif
   1150 
   1151 
   1152 //-----------------------------------------------------------------------------
   1153 //
   1154 //   printStates    Debug Function.  Dump the fully constructed state transition table.
   1155 //
   1156 //-----------------------------------------------------------------------------
   1157 #ifdef RBBI_DEBUG
   1158 void RBBITableBuilder::printStates() {
   1159     int     c;    // input "character"
   1160     int     n;    // state number
   1161 
   1162     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
   1163     RBBIDebugPrintf("      | Acc  LA    Tag");
   1164     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
   1165         RBBIDebugPrintf(" %2d", c);
   1166     }
   1167     RBBIDebugPrintf("\n");
   1168     RBBIDebugPrintf("      |---------------");
   1169     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
   1170         RBBIDebugPrintf("---");
   1171     }
   1172     RBBIDebugPrintf("\n");
   1173 
   1174     for (n=0; n<fDStates->size(); n++) {
   1175         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
   1176         RBBIDebugPrintf("  %3d | " , n);
   1177         RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
   1178         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
   1179             RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
   1180         }
   1181         RBBIDebugPrintf("\n");
   1182     }
   1183     RBBIDebugPrintf("\n\n");
   1184 }
   1185 #endif
   1186 
   1187 
   1188 
   1189 //-----------------------------------------------------------------------------
   1190 //
   1191 //   printRuleStatusTable    Debug Function.  Dump the common rule status table
   1192 //
   1193 //-----------------------------------------------------------------------------
   1194 #ifdef RBBI_DEBUG
   1195 void RBBITableBuilder::printRuleStatusTable() {
   1196     int32_t  thisRecord = 0;
   1197     int32_t  nextRecord = 0;
   1198     int      i;
   1199     UVector  *tbl = fRB->fRuleStatusVals;
   1200 
   1201     RBBIDebugPrintf("index |  tags \n");
   1202     RBBIDebugPrintf("-------------------\n");
   1203 
   1204     while (nextRecord < tbl->size()) {
   1205         thisRecord = nextRecord;
   1206         nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
   1207         RBBIDebugPrintf("%4d   ", thisRecord);
   1208         for (i=thisRecord+1; i<nextRecord; i++) {
   1209             RBBIDebugPrintf("  %5d", tbl->elementAti(i));
   1210         }
   1211         RBBIDebugPrintf("\n");
   1212     }
   1213     RBBIDebugPrintf("\n\n");
   1214 }
   1215 #endif
   1216 
   1217 
   1218 //-----------------------------------------------------------------------------
   1219 //
   1220 //   RBBIStateDescriptor     Methods.  This is a very struct-like class
   1221 //                           Most access is directly to the fields.
   1222 //
   1223 //-----------------------------------------------------------------------------
   1224 
   1225 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
   1226     fMarked    = FALSE;
   1227     fAccepting = 0;
   1228     fLookAhead = 0;
   1229     fTagsIdx   = 0;
   1230     fTagVals   = NULL;
   1231     fPositions = NULL;
   1232     fDtran     = NULL;
   1233 
   1234     fDtran     = new UVector(lastInputSymbol+1, *fStatus);
   1235     if (U_FAILURE(*fStatus)) {
   1236         return;
   1237     }
   1238     if (fDtran == NULL) {
   1239         *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1240         return;
   1241     }
   1242     fDtran->setSize(lastInputSymbol+1, *fStatus);    // fDtran needs to be pre-sized.
   1243                                            //   It is indexed by input symbols, and will
   1244                                            //   hold  the next state number for each
   1245                                            //   symbol.
   1246 }
   1247 
   1248 
   1249 RBBIStateDescriptor::~RBBIStateDescriptor() {
   1250     delete       fPositions;
   1251     delete       fDtran;
   1252     delete       fTagVals;
   1253     fPositions = NULL;
   1254     fDtran     = NULL;
   1255     fTagVals   = NULL;
   1256 }
   1257 
   1258 U_NAMESPACE_END
   1259 
   1260 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   1261