1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (c) 2002-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 // 10 // rbbitblb.cpp 11 // 12 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_BREAK_ITERATION 17 18 #include "unicode/unistr.h" 19 #include "rbbitblb.h" 20 #include "rbbirb.h" 21 #include "rbbisetb.h" 22 #include "rbbidata.h" 23 #include "cstring.h" 24 #include "uassert.h" 25 #include "uvectr32.h" 26 #include "cmemory.h" 27 28 U_NAMESPACE_BEGIN 29 30 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) : 31 fRB(rb), 32 fTree(*rootNode), 33 fStatus(&status), 34 fDStates(nullptr), 35 fSafeTable(nullptr) { 36 if (U_FAILURE(status)) { 37 return; 38 } 39 // fDStates is UVector<RBBIStateDescriptor *> 40 fDStates = new UVector(status); 41 if (U_SUCCESS(status) && fDStates == nullptr ) { 42 status = U_MEMORY_ALLOCATION_ERROR; 43 } 44 } 45 46 47 48 RBBITableBuilder::~RBBITableBuilder() { 49 int i; 50 for (i=0; i<fDStates->size(); i++) { 51 delete (RBBIStateDescriptor *)fDStates->elementAt(i); 52 } 53 delete fDStates; 54 delete fSafeTable; 55 } 56 57 58 //----------------------------------------------------------------------------- 59 // 60 // RBBITableBuilder::buildForwardTable - This is the main function for building 61 // the DFA state transition table from the RBBI rules parse tree. 62 // 63 //----------------------------------------------------------------------------- 64 void RBBITableBuilder::buildForwardTable() { 65 66 if (U_FAILURE(*fStatus)) { 67 return; 68 } 69 70 // If there were no rules, just return. This situation can easily arise 71 // for the reverse rules. 72 if (fTree==NULL) { 73 return; 74 } 75 76 // 77 // Walk through the tree, replacing any references to $variables with a copy of the 78 // parse tree for the substition expression. 79 // 80 fTree = fTree->flattenVariables(); 81 #ifdef RBBI_DEBUG 82 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) { 83 RBBIDebugPuts("\nParse tree after flattening variable references."); 84 RBBINode::printTree(fTree, TRUE); 85 } 86 #endif 87 88 // 89 // If the rules contained any references to {bof} 90 // add a {bof} <cat> <former root of tree> to the 91 // tree. Means that all matches must start out with the 92 // {bof} fake character. 93 // 94 if (fRB->fSetBuilder->sawBOF()) { 95 RBBINode *bofTop = new RBBINode(RBBINode::opCat); 96 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar); 97 // Delete and exit if memory allocation failed. 98 if (bofTop == NULL || bofLeaf == NULL) { 99 *fStatus = U_MEMORY_ALLOCATION_ERROR; 100 delete bofTop; 101 delete bofLeaf; 102 return; 103 } 104 bofTop->fLeftChild = bofLeaf; 105 bofTop->fRightChild = fTree; 106 bofLeaf->fParent = bofTop; 107 bofLeaf->fVal = 2; // Reserved value for {bof}. 108 fTree = bofTop; 109 } 110 111 // 112 // Add a unique right-end marker to the expression. 113 // Appears as a cat-node, left child being the original tree, 114 // right child being the end marker. 115 // 116 RBBINode *cn = new RBBINode(RBBINode::opCat); 117 // Exit if memory allocation failed. 118 if (cn == NULL) { 119 *fStatus = U_MEMORY_ALLOCATION_ERROR; 120 return; 121 } 122 cn->fLeftChild = fTree; 123 fTree->fParent = cn; 124 cn->fRightChild = new RBBINode(RBBINode::endMark); 125 // Delete and exit if memory allocation failed. 126 if (cn->fRightChild == NULL) { 127 *fStatus = U_MEMORY_ALLOCATION_ERROR; 128 delete cn; 129 return; 130 } 131 cn->fRightChild->fParent = cn; 132 fTree = cn; 133 134 // 135 // Replace all references to UnicodeSets with the tree for the equivalent 136 // expression. 137 // 138 fTree->flattenSets(); 139 #ifdef RBBI_DEBUG 140 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) { 141 RBBIDebugPuts("\nParse tree after flattening Unicode Set references."); 142 RBBINode::printTree(fTree, TRUE); 143 } 144 #endif 145 146 147 // 148 // calculate the functions nullable, firstpos, lastpos and followpos on 149 // nodes in the parse tree. 150 // See the alogrithm description in Aho. 151 // Understanding how this works by looking at the code alone will be 152 // nearly impossible. 153 // 154 calcNullable(fTree); 155 calcFirstPos(fTree); 156 calcLastPos(fTree); 157 calcFollowPos(fTree); 158 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) { 159 RBBIDebugPuts("\n"); 160 printPosSets(fTree); 161 } 162 163 // 164 // For "chained" rules, modify the followPos sets 165 // 166 if (fRB->fChainRules) { 167 calcChainedFollowPos(fTree); 168 } 169 170 // 171 // BOF (start of input) test fixup. 172 // 173 if (fRB->fSetBuilder->sawBOF()) { 174 bofFixup(); 175 } 176 177 // 178 // Build the DFA state transition tables. 179 // 180 buildStateTable(); 181 flagAcceptingStates(); 182 flagLookAheadStates(); 183 flagTaggedStates(); 184 185 // 186 // Update the global table of rule status {tag} values 187 // The rule builder has a global vector of status values that are common 188 // for all tables. Merge the ones from this table into the global set. 189 // 190 mergeRuleStatusVals(); 191 } 192 193 194 195 //----------------------------------------------------------------------------- 196 // 197 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9 198 // 199 //----------------------------------------------------------------------------- 200 void RBBITableBuilder::calcNullable(RBBINode *n) { 201 if (n == NULL) { 202 return; 203 } 204 if (n->fType == RBBINode::setRef || 205 n->fType == RBBINode::endMark ) { 206 // These are non-empty leaf node types. 207 n->fNullable = FALSE; 208 return; 209 } 210 211 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) { 212 // Lookahead marker node. It's a leaf, so no recursion on children. 213 // It's nullable because it does not match any literal text from the input stream. 214 n->fNullable = TRUE; 215 return; 216 } 217 218 219 // The node is not a leaf. 220 // Calculate nullable on its children. 221 calcNullable(n->fLeftChild); 222 calcNullable(n->fRightChild); 223 224 // Apply functions from table 3.40 in Aho 225 if (n->fType == RBBINode::opOr) { 226 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable; 227 } 228 else if (n->fType == RBBINode::opCat) { 229 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable; 230 } 231 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) { 232 n->fNullable = TRUE; 233 } 234 else { 235 n->fNullable = FALSE; 236 } 237 } 238 239 240 241 242 //----------------------------------------------------------------------------- 243 // 244 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9 245 // 246 //----------------------------------------------------------------------------- 247 void RBBITableBuilder::calcFirstPos(RBBINode *n) { 248 if (n == NULL) { 249 return; 250 } 251 if (n->fType == RBBINode::leafChar || 252 n->fType == RBBINode::endMark || 253 n->fType == RBBINode::lookAhead || 254 n->fType == RBBINode::tag) { 255 // These are non-empty leaf node types. 256 // Note: In order to maintain the sort invariant on the set, 257 // this function should only be called on a node whose set is 258 // empty to start with. 259 n->fFirstPosSet->addElement(n, *fStatus); 260 return; 261 } 262 263 // The node is not a leaf. 264 // Calculate firstPos on its children. 265 calcFirstPos(n->fLeftChild); 266 calcFirstPos(n->fRightChild); 267 268 // Apply functions from table 3.40 in Aho 269 if (n->fType == RBBINode::opOr) { 270 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 271 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); 272 } 273 else if (n->fType == RBBINode::opCat) { 274 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 275 if (n->fLeftChild->fNullable) { 276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); 277 } 278 } 279 else if (n->fType == RBBINode::opStar || 280 n->fType == RBBINode::opQuestion || 281 n->fType == RBBINode::opPlus) { 282 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 283 } 284 } 285 286 287 288 //----------------------------------------------------------------------------- 289 // 290 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9 291 // 292 //----------------------------------------------------------------------------- 293 void RBBITableBuilder::calcLastPos(RBBINode *n) { 294 if (n == NULL) { 295 return; 296 } 297 if (n->fType == RBBINode::leafChar || 298 n->fType == RBBINode::endMark || 299 n->fType == RBBINode::lookAhead || 300 n->fType == RBBINode::tag) { 301 // These are non-empty leaf node types. 302 // Note: In order to maintain the sort invariant on the set, 303 // this function should only be called on a node whose set is 304 // empty to start with. 305 n->fLastPosSet->addElement(n, *fStatus); 306 return; 307 } 308 309 // The node is not a leaf. 310 // Calculate lastPos on its children. 311 calcLastPos(n->fLeftChild); 312 calcLastPos(n->fRightChild); 313 314 // Apply functions from table 3.40 in Aho 315 if (n->fType == RBBINode::opOr) { 316 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 317 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); 318 } 319 else if (n->fType == RBBINode::opCat) { 320 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); 321 if (n->fRightChild->fNullable) { 322 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 323 } 324 } 325 else if (n->fType == RBBINode::opStar || 326 n->fType == RBBINode::opQuestion || 327 n->fType == RBBINode::opPlus) { 328 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 329 } 330 } 331 332 333 334 //----------------------------------------------------------------------------- 335 // 336 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9 337 // 338 //----------------------------------------------------------------------------- 339 void RBBITableBuilder::calcFollowPos(RBBINode *n) { 340 if (n == NULL || 341 n->fType == RBBINode::leafChar || 342 n->fType == RBBINode::endMark) { 343 return; 344 } 345 346 calcFollowPos(n->fLeftChild); 347 calcFollowPos(n->fRightChild); 348 349 // Aho rule #1 350 if (n->fType == RBBINode::opCat) { 351 RBBINode *i; // is 'i' in Aho's description 352 uint32_t ix; 353 354 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet; 355 356 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) { 357 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix); 358 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet); 359 } 360 } 361 362 // Aho rule #2 363 if (n->fType == RBBINode::opStar || 364 n->fType == RBBINode::opPlus) { 365 RBBINode *i; // again, n and i are the names from Aho's description. 366 uint32_t ix; 367 368 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) { 369 i = (RBBINode *)n->fLastPosSet->elementAt(ix); 370 setAdd(i->fFollowPos, n->fFirstPosSet); 371 } 372 } 373 374 375 376 } 377 378 //----------------------------------------------------------------------------- 379 // 380 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged 381 // as roots of a rule to a destination vector. 382 // 383 //----------------------------------------------------------------------------- 384 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) { 385 if (node == NULL || U_FAILURE(*fStatus)) { 386 return; 387 } 388 if (node->fRuleRoot) { 389 dest->addElement(node, *fStatus); 390 // Note: rules cannot nest. If we found a rule start node, 391 // no child node can also be a start node. 392 return; 393 } 394 addRuleRootNodes(dest, node->fLeftChild); 395 addRuleRootNodes(dest, node->fRightChild); 396 } 397 398 //----------------------------------------------------------------------------- 399 // 400 // calcChainedFollowPos. Modify the previously calculated followPos sets 401 // to implement rule chaining. NOT described by Aho 402 // 403 //----------------------------------------------------------------------------- 404 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { 405 406 UVector endMarkerNodes(*fStatus); 407 UVector leafNodes(*fStatus); 408 int32_t i; 409 410 if (U_FAILURE(*fStatus)) { 411 return; 412 } 413 414 // get a list of all endmarker nodes. 415 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); 416 417 // get a list all leaf nodes 418 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus); 419 if (U_FAILURE(*fStatus)) { 420 return; 421 } 422 423 // Collect all leaf nodes that can start matches for rules 424 // with inbound chaining enabled, which is the union of the 425 // firstPosition sets from each of the rule root nodes. 426 427 UVector ruleRootNodes(*fStatus); 428 addRuleRootNodes(&ruleRootNodes, tree); 429 430 UVector matchStartNodes(*fStatus); 431 for (int j=0; j<ruleRootNodes.size(); ++j) { 432 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j)); 433 if (node->fChainIn) { 434 setAdd(&matchStartNodes, node->fFirstPosSet); 435 } 436 } 437 if (U_FAILURE(*fStatus)) { 438 return; 439 } 440 441 int32_t endNodeIx; 442 int32_t startNodeIx; 443 444 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) { 445 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx); 446 RBBINode *endNode = NULL; 447 448 // Identify leaf nodes that correspond to overall rule match positions. 449 // These include an endMarkerNode in their followPos sets. 450 for (i=0; i<endMarkerNodes.size(); i++) { 451 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) { 452 endNode = tNode; 453 break; 454 } 455 } 456 if (endNode == NULL) { 457 // node wasn't an end node. Try again with the next. 458 continue; 459 } 460 461 // We've got a node that can end a match. 462 463 // Line Break Specific hack: If this node's val correspond to the $CM char class, 464 // don't chain from it. 465 // TODO: Add rule syntax for this behavior, get specifics out of here and 466 // into the rule file. 467 if (fRB->fLBCMNoChain) { 468 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal); 469 if (c != -1) { 470 // c == -1 occurs with sets containing only the {eof} marker string. 471 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 472 if (cLBProp == U_LB_COMBINING_MARK) { 473 continue; 474 } 475 } 476 } 477 478 479 // Now iterate over the nodes that can start a match, looking for ones 480 // with the same char class as our ending node. 481 RBBINode *startNode; 482 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) { 483 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx); 484 if (startNode->fType != RBBINode::leafChar) { 485 continue; 486 } 487 488 if (endNode->fVal == startNode->fVal) { 489 // The end val (character class) of one possible match is the 490 // same as the start of another. 491 492 // Add all nodes from the followPos of the start node to the 493 // followPos set of the end node, which will have the effect of 494 // letting matches transition from a match state at endNode 495 // to the second char of a match starting with startNode. 496 setAdd(endNode->fFollowPos, startNode->fFollowPos); 497 } 498 } 499 } 500 } 501 502 503 //----------------------------------------------------------------------------- 504 // 505 // bofFixup. Fixup for state tables that include {bof} beginning of input testing. 506 // Do an swizzle similar to chaining, modifying the followPos set of 507 // the bofNode to include the followPos nodes from other {bot} nodes 508 // scattered through the tree. 509 // 510 // This function has much in common with calcChainedFollowPos(). 511 // 512 //----------------------------------------------------------------------------- 513 void RBBITableBuilder::bofFixup() { 514 515 if (U_FAILURE(*fStatus)) { 516 return; 517 } 518 519 // The parse tree looks like this ... 520 // fTree root ---> <cat> 521 // / \ . 522 // <cat> <#end node> 523 // / \ . 524 // <bofNode> rest 525 // of tree 526 // 527 // We will be adding things to the followPos set of the <bofNode> 528 // 529 RBBINode *bofNode = fTree->fLeftChild->fLeftChild; 530 U_ASSERT(bofNode->fType == RBBINode::leafChar); 531 U_ASSERT(bofNode->fVal == 2); 532 533 // Get all nodes that can be the start a match of the user-written rules 534 // (excluding the fake bofNode) 535 // We want the nodes that can start a match in the 536 // part labeled "rest of tree" 537 // 538 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet; 539 540 RBBINode *startNode; 541 int startNodeIx; 542 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) { 543 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); 544 if (startNode->fType != RBBINode::leafChar) { 545 continue; 546 } 547 548 if (startNode->fVal == bofNode->fVal) { 549 // We found a leaf node corresponding to a {bof} that was 550 // explicitly written into a rule. 551 // Add everything from the followPos set of this node to the 552 // followPos set of the fake bofNode at the start of the tree. 553 // 554 setAdd(bofNode->fFollowPos, startNode->fFollowPos); 555 } 556 } 557 } 558 559 //----------------------------------------------------------------------------- 560 // 561 // buildStateTable() Determine the set of runtime DFA states and the 562 // transition tables for these states, by the algorithm 563 // of fig. 3.44 in Aho. 564 // 565 // Most of the comments are quotes of Aho's psuedo-code. 566 // 567 //----------------------------------------------------------------------------- 568 void RBBITableBuilder::buildStateTable() { 569 if (U_FAILURE(*fStatus)) { 570 return; 571 } 572 RBBIStateDescriptor *failState; 573 // Set it to NULL to avoid uninitialized warning 574 RBBIStateDescriptor *initialState = NULL; 575 // 576 // Add a dummy state 0 - the stop state. Not from Aho. 577 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1; 578 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 579 if (failState == NULL) { 580 *fStatus = U_MEMORY_ALLOCATION_ERROR; 581 goto ExitBuildSTdeleteall; 582 } 583 failState->fPositions = new UVector(*fStatus); 584 if (failState->fPositions == NULL) { 585 *fStatus = U_MEMORY_ALLOCATION_ERROR; 586 } 587 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) { 588 goto ExitBuildSTdeleteall; 589 } 590 fDStates->addElement(failState, *fStatus); 591 if (U_FAILURE(*fStatus)) { 592 goto ExitBuildSTdeleteall; 593 } 594 595 // initially, the only unmarked state in Dstates is firstpos(root), 596 // where toot is the root of the syntax tree for (r)#; 597 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 598 if (initialState == NULL) { 599 *fStatus = U_MEMORY_ALLOCATION_ERROR; 600 } 601 if (U_FAILURE(*fStatus)) { 602 goto ExitBuildSTdeleteall; 603 } 604 initialState->fPositions = new UVector(*fStatus); 605 if (initialState->fPositions == NULL) { 606 *fStatus = U_MEMORY_ALLOCATION_ERROR; 607 } 608 if (U_FAILURE(*fStatus)) { 609 goto ExitBuildSTdeleteall; 610 } 611 setAdd(initialState->fPositions, fTree->fFirstPosSet); 612 fDStates->addElement(initialState, *fStatus); 613 if (U_FAILURE(*fStatus)) { 614 goto ExitBuildSTdeleteall; 615 } 616 617 // while there is an unmarked state T in Dstates do begin 618 for (;;) { 619 RBBIStateDescriptor *T = NULL; 620 int32_t tx; 621 for (tx=1; tx<fDStates->size(); tx++) { 622 RBBIStateDescriptor *temp; 623 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx); 624 if (temp->fMarked == FALSE) { 625 T = temp; 626 break; 627 } 628 } 629 if (T == NULL) { 630 break; 631 } 632 633 // mark T; 634 T->fMarked = TRUE; 635 636 // for each input symbol a do begin 637 int32_t a; 638 for (a = 1; a<=lastInputSymbol; a++) { 639 // let U be the set of positions that are in followpos(p) 640 // for some position p in T 641 // such that the symbol at position p is a; 642 UVector *U = NULL; 643 RBBINode *p; 644 int32_t px; 645 for (px=0; px<T->fPositions->size(); px++) { 646 p = (RBBINode *)T->fPositions->elementAt(px); 647 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) { 648 if (U == NULL) { 649 U = new UVector(*fStatus); 650 if (U == NULL) { 651 *fStatus = U_MEMORY_ALLOCATION_ERROR; 652 goto ExitBuildSTdeleteall; 653 } 654 } 655 setAdd(U, p->fFollowPos); 656 } 657 } 658 659 // if U is not empty and not in DStates then 660 int32_t ux = 0; 661 UBool UinDstates = FALSE; 662 if (U != NULL) { 663 U_ASSERT(U->size() > 0); 664 int ix; 665 for (ix=0; ix<fDStates->size(); ix++) { 666 RBBIStateDescriptor *temp2; 667 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix); 668 if (setEquals(U, temp2->fPositions)) { 669 delete U; 670 U = temp2->fPositions; 671 ux = ix; 672 UinDstates = TRUE; 673 break; 674 } 675 } 676 677 // Add U as an unmarked state to Dstates 678 if (!UinDstates) 679 { 680 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 681 if (newState == NULL) { 682 *fStatus = U_MEMORY_ALLOCATION_ERROR; 683 } 684 if (U_FAILURE(*fStatus)) { 685 goto ExitBuildSTdeleteall; 686 } 687 newState->fPositions = U; 688 fDStates->addElement(newState, *fStatus); 689 if (U_FAILURE(*fStatus)) { 690 return; 691 } 692 ux = fDStates->size()-1; 693 } 694 695 // Dtran[T, a] := U; 696 T->fDtran->setElementAt(ux, a); 697 } 698 } 699 } 700 return; 701 // delete local pointers only if error occured. 702 ExitBuildSTdeleteall: 703 delete initialState; 704 delete failState; 705 } 706 707 708 709 //----------------------------------------------------------------------------- 710 // 711 // flagAcceptingStates Identify accepting states. 712 // First get a list of all of the end marker nodes. 713 // Then, for each state s, 714 // if s contains one of the end marker nodes in its list of tree positions then 715 // s is an accepting state. 716 // 717 //----------------------------------------------------------------------------- 718 void RBBITableBuilder::flagAcceptingStates() { 719 if (U_FAILURE(*fStatus)) { 720 return; 721 } 722 UVector endMarkerNodes(*fStatus); 723 RBBINode *endMarker; 724 int32_t i; 725 int32_t n; 726 727 if (U_FAILURE(*fStatus)) { 728 return; 729 } 730 731 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); 732 if (U_FAILURE(*fStatus)) { 733 return; 734 } 735 736 for (i=0; i<endMarkerNodes.size(); i++) { 737 endMarker = (RBBINode *)endMarkerNodes.elementAt(i); 738 for (n=0; n<fDStates->size(); n++) { 739 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 740 if (sd->fPositions->indexOf(endMarker) >= 0) { 741 // Any non-zero value for fAccepting means this is an accepting node. 742 // The value is what will be returned to the user as the break status. 743 // If no other value was specified, force it to -1. 744 745 if (sd->fAccepting==0) { 746 // State hasn't been marked as accepting yet. Do it now. 747 sd->fAccepting = endMarker->fVal; 748 if (sd->fAccepting == 0) { 749 sd->fAccepting = -1; 750 } 751 } 752 if (sd->fAccepting==-1 && endMarker->fVal != 0) { 753 // Both lookahead and non-lookahead accepting for this state. 754 // Favor the look-ahead. Expedient for line break. 755 // TODO: need a more elegant resolution for conflicting rules. 756 sd->fAccepting = endMarker->fVal; 757 } 758 // implicit else: 759 // if sd->fAccepting already had a value other than 0 or -1, leave it be. 760 761 // If the end marker node is from a look-ahead rule, set 762 // the fLookAhead field for this state also. 763 if (endMarker->fLookAheadEnd) { 764 // TODO: don't change value if already set? 765 // TODO: allow for more than one active look-ahead rule in engine. 766 // Make value here an index to a side array in engine? 767 sd->fLookAhead = sd->fAccepting; 768 } 769 } 770 } 771 } 772 } 773 774 775 //----------------------------------------------------------------------------- 776 // 777 // flagLookAheadStates Very similar to flagAcceptingStates, above. 778 // 779 //----------------------------------------------------------------------------- 780 void RBBITableBuilder::flagLookAheadStates() { 781 if (U_FAILURE(*fStatus)) { 782 return; 783 } 784 UVector lookAheadNodes(*fStatus); 785 RBBINode *lookAheadNode; 786 int32_t i; 787 int32_t n; 788 789 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus); 790 if (U_FAILURE(*fStatus)) { 791 return; 792 } 793 for (i=0; i<lookAheadNodes.size(); i++) { 794 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i); 795 796 for (n=0; n<fDStates->size(); n++) { 797 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 798 if (sd->fPositions->indexOf(lookAheadNode) >= 0) { 799 sd->fLookAhead = lookAheadNode->fVal; 800 } 801 } 802 } 803 } 804 805 806 807 808 //----------------------------------------------------------------------------- 809 // 810 // flagTaggedStates 811 // 812 //----------------------------------------------------------------------------- 813 void RBBITableBuilder::flagTaggedStates() { 814 if (U_FAILURE(*fStatus)) { 815 return; 816 } 817 UVector tagNodes(*fStatus); 818 RBBINode *tagNode; 819 int32_t i; 820 int32_t n; 821 822 if (U_FAILURE(*fStatus)) { 823 return; 824 } 825 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus); 826 if (U_FAILURE(*fStatus)) { 827 return; 828 } 829 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em) 830 tagNode = (RBBINode *)tagNodes.elementAt(i); 831 832 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table) 833 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 834 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t 835 sortedAdd(&sd->fTagVals, tagNode->fVal); 836 } 837 } 838 } 839 } 840 841 842 843 844 //----------------------------------------------------------------------------- 845 // 846 // mergeRuleStatusVals 847 // 848 // Update the global table of rule status {tag} values 849 // The rule builder has a global vector of status values that are common 850 // for all tables. Merge the ones from this table into the global set. 851 // 852 //----------------------------------------------------------------------------- 853 void RBBITableBuilder::mergeRuleStatusVals() { 854 // 855 // The basic outline of what happens here is this... 856 // 857 // for each state in this state table 858 // if the status tag list for this state is in the global statuses list 859 // record where and 860 // continue with the next state 861 // else 862 // add the tag list for this state to the global list. 863 // 864 int i; 865 int n; 866 867 // Pre-set a single tag of {0} into the table. 868 // We will need this as a default, for rule sets with no explicit tagging. 869 if (fRB->fRuleStatusVals->size() == 0) { 870 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group 871 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero 872 } 873 874 // For each state 875 for (n=0; n<fDStates->size(); n++) { 876 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 877 UVector *thisStatesTagValues = sd->fTagVals; 878 if (thisStatesTagValues == NULL) { 879 // No tag values are explicitly associated with this state. 880 // Set the default tag value. 881 sd->fTagsIdx = 0; 882 continue; 883 } 884 885 // There are tag(s) associated with this state. 886 // fTagsIdx will be the index into the global tag list for this state's tag values. 887 // Initial value of -1 flags that we haven't got it set yet. 888 sd->fTagsIdx = -1; 889 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list 890 int32_t nextTagGroupStart = 0; 891 892 // Loop runs once per group of tags in the global list 893 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) { 894 thisTagGroupStart = nextTagGroupStart; 895 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1; 896 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) { 897 // The number of tags for this state is different from 898 // the number of tags in this group from the global list. 899 // Continue with the next group from the global list. 900 continue; 901 } 902 // The lengths match, go ahead and compare the actual tag values 903 // between this state and the group from the global list. 904 for (i=0; i<thisStatesTagValues->size(); i++) { 905 if (thisStatesTagValues->elementAti(i) != 906 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) { 907 // Mismatch. 908 break; 909 } 910 } 911 912 if (i == thisStatesTagValues->size()) { 913 // We found a set of tag values in the global list that match 914 // those for this state. Use them. 915 sd->fTagsIdx = thisTagGroupStart; 916 break; 917 } 918 } 919 920 if (sd->fTagsIdx == -1) { 921 // No suitable entry in the global tag list already. Add one 922 sd->fTagsIdx = fRB->fRuleStatusVals->size(); 923 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus); 924 for (i=0; i<thisStatesTagValues->size(); i++) { 925 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus); 926 } 927 } 928 } 929 } 930 931 932 933 934 935 936 937 //----------------------------------------------------------------------------- 938 // 939 // sortedAdd Add a value to a vector of sorted values (ints). 940 // Do not replicate entries; if the value is already there, do not 941 // add a second one. 942 // Lazily create the vector if it does not already exist. 943 // 944 //----------------------------------------------------------------------------- 945 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) { 946 int32_t i; 947 948 if (*vector == NULL) { 949 *vector = new UVector(*fStatus); 950 } 951 if (*vector == NULL || U_FAILURE(*fStatus)) { 952 return; 953 } 954 UVector *vec = *vector; 955 int32_t vSize = vec->size(); 956 for (i=0; i<vSize; i++) { 957 int32_t valAtI = vec->elementAti(i); 958 if (valAtI == val) { 959 // The value is already in the vector. Don't add it again. 960 return; 961 } 962 if (valAtI > val) { 963 break; 964 } 965 } 966 vec->insertElementAt(val, i, *fStatus); 967 } 968 969 970 971 //----------------------------------------------------------------------------- 972 // 973 // setAdd Set operation on UVector 974 // dest = dest union source 975 // Elements may only appear once and must be sorted. 976 // 977 //----------------------------------------------------------------------------- 978 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) { 979 int32_t destOriginalSize = dest->size(); 980 int32_t sourceSize = source->size(); 981 int32_t di = 0; 982 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc 983 void **destPtr, **sourcePtr; 984 void **destLim, **sourceLim; 985 986 if (destOriginalSize > destArray.getCapacity()) { 987 if (destArray.resize(destOriginalSize) == NULL) { 988 return; 989 } 990 } 991 destPtr = destArray.getAlias(); 992 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()? 993 994 if (sourceSize > sourceArray.getCapacity()) { 995 if (sourceArray.resize(sourceSize) == NULL) { 996 return; 997 } 998 } 999 sourcePtr = sourceArray.getAlias(); 1000 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()? 1001 1002 // Avoid multiple "get element" calls by getting the contents into arrays 1003 (void) dest->toArray(destPtr); 1004 (void) source->toArray(sourcePtr); 1005 1006 dest->setSize(sourceSize+destOriginalSize, *fStatus); 1007 1008 while (sourcePtr < sourceLim && destPtr < destLim) { 1009 if (*destPtr == *sourcePtr) { 1010 dest->setElementAt(*sourcePtr++, di++); 1011 destPtr++; 1012 } 1013 // This check is required for machines with segmented memory, like i5/OS. 1014 // Direct pointer comparison is not recommended. 1015 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) { 1016 dest->setElementAt(*destPtr++, di++); 1017 } 1018 else { /* *sourcePtr < *destPtr */ 1019 dest->setElementAt(*sourcePtr++, di++); 1020 } 1021 } 1022 1023 // At most one of these two cleanup loops will execute 1024 while (destPtr < destLim) { 1025 dest->setElementAt(*destPtr++, di++); 1026 } 1027 while (sourcePtr < sourceLim) { 1028 dest->setElementAt(*sourcePtr++, di++); 1029 } 1030 1031 dest->setSize(di, *fStatus); 1032 } 1033 1034 1035 1036 //----------------------------------------------------------------------------- 1037 // 1038 // setEqual Set operation on UVector. 1039 // Compare for equality. 1040 // Elements must be sorted. 1041 // 1042 //----------------------------------------------------------------------------- 1043 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) { 1044 return a->equals(*b); 1045 } 1046 1047 1048 //----------------------------------------------------------------------------- 1049 // 1050 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos 1051 // for each node in the tree. 1052 // 1053 //----------------------------------------------------------------------------- 1054 #ifdef RBBI_DEBUG 1055 void RBBITableBuilder::printPosSets(RBBINode *n) { 1056 if (n==NULL) { 1057 return; 1058 } 1059 printf("\n"); 1060 RBBINode::printNodeHeader(); 1061 RBBINode::printNode(n); 1062 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE"); 1063 1064 RBBIDebugPrintf(" firstpos: "); 1065 printSet(n->fFirstPosSet); 1066 1067 RBBIDebugPrintf(" lastpos: "); 1068 printSet(n->fLastPosSet); 1069 1070 RBBIDebugPrintf(" followpos: "); 1071 printSet(n->fFollowPos); 1072 1073 printPosSets(n->fLeftChild); 1074 printPosSets(n->fRightChild); 1075 } 1076 #endif 1077 1078 // 1079 // findDuplCharClassFrom() 1080 // 1081 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) { 1082 int32_t numStates = fDStates->size(); 1083 int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); 1084 1085 for (; categories->first < numCols-1; categories->first++) { 1086 for (categories->second=categories->first+1; categories->second < numCols; categories->second++) { 1087 // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates). 1088 uint16_t table_base = 0; 1089 uint16_t table_dupl = 1; 1090 for (int32_t state=0; state<numStates; state++) { 1091 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); 1092 table_base = (uint16_t)sd->fDtran->elementAti(categories->first); 1093 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second); 1094 if (table_base != table_dupl) { 1095 break; 1096 } 1097 } 1098 if (table_base == table_dupl) { 1099 return true; 1100 } 1101 } 1102 } 1103 return false; 1104 } 1105 1106 1107 // 1108 // removeColumn() 1109 // 1110 void RBBITableBuilder::removeColumn(int32_t column) { 1111 int32_t numStates = fDStates->size(); 1112 for (int32_t state=0; state<numStates; state++) { 1113 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); 1114 U_ASSERT(column < sd->fDtran->size()); 1115 sd->fDtran->removeElementAt(column); 1116 } 1117 } 1118 1119 /* 1120 * findDuplicateState 1121 */ 1122 bool RBBITableBuilder::findDuplicateState(IntPair *states) { 1123 int32_t numStates = fDStates->size(); 1124 int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); 1125 1126 for (; states->first<numStates-1; states->first++) { 1127 RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first); 1128 for (states->second=states->first+1; states->second<numStates; states->second++) { 1129 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second); 1130 if (firstSD->fAccepting != duplSD->fAccepting || 1131 firstSD->fLookAhead != duplSD->fLookAhead || 1132 firstSD->fTagsIdx != duplSD->fTagsIdx) { 1133 continue; 1134 } 1135 bool rowsMatch = true; 1136 for (int32_t col=0; col < numCols; ++col) { 1137 int32_t firstVal = firstSD->fDtran->elementAti(col); 1138 int32_t duplVal = duplSD->fDtran->elementAti(col); 1139 if (!((firstVal == duplVal) || 1140 ((firstVal == states->first || firstVal == states->second) && 1141 (duplVal == states->first || duplVal == states->second)))) { 1142 rowsMatch = false; 1143 break; 1144 } 1145 } 1146 if (rowsMatch) { 1147 return true; 1148 } 1149 } 1150 } 1151 return false; 1152 } 1153 1154 1155 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) { 1156 int32_t numStates = fSafeTable->size(); 1157 1158 for (; states->first<numStates-1; states->first++) { 1159 UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first)); 1160 for (states->second=states->first+1; states->second<numStates; states->second++) { 1161 UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second)); 1162 bool rowsMatch = true; 1163 int32_t numCols = firstRow->length(); 1164 for (int32_t col=0; col < numCols; ++col) { 1165 int32_t firstVal = firstRow->charAt(col); 1166 int32_t duplVal = duplRow->charAt(col); 1167 if (!((firstVal == duplVal) || 1168 ((firstVal == states->first || firstVal == states->second) && 1169 (duplVal == states->first || duplVal == states->second)))) { 1170 rowsMatch = false; 1171 break; 1172 } 1173 } 1174 if (rowsMatch) { 1175 return true; 1176 } 1177 } 1178 } 1179 return false; 1180 } 1181 1182 1183 void RBBITableBuilder::removeState(IntPair duplStates) { 1184 const int32_t keepState = duplStates.first; 1185 const int32_t duplState = duplStates.second; 1186 U_ASSERT(keepState < duplState); 1187 U_ASSERT(duplState < fDStates->size()); 1188 1189 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState); 1190 fDStates->removeElementAt(duplState); 1191 delete duplSD; 1192 1193 int32_t numStates = fDStates->size(); 1194 int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); 1195 for (int32_t state=0; state<numStates; ++state) { 1196 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); 1197 for (int32_t col=0; col<numCols; col++) { 1198 int32_t existingVal = sd->fDtran->elementAti(col); 1199 int32_t newVal = existingVal; 1200 if (existingVal == duplState) { 1201 newVal = keepState; 1202 } else if (existingVal > duplState) { 1203 newVal = existingVal - 1; 1204 } 1205 sd->fDtran->setElementAt(newVal, col); 1206 } 1207 if (sd->fAccepting == duplState) { 1208 sd->fAccepting = keepState; 1209 } else if (sd->fAccepting > duplState) { 1210 sd->fAccepting--; 1211 } 1212 if (sd->fLookAhead == duplState) { 1213 sd->fLookAhead = keepState; 1214 } else if (sd->fLookAhead > duplState) { 1215 sd->fLookAhead--; 1216 } 1217 } 1218 } 1219 1220 void RBBITableBuilder::removeSafeState(IntPair duplStates) { 1221 const int32_t keepState = duplStates.first; 1222 const int32_t duplState = duplStates.second; 1223 U_ASSERT(keepState < duplState); 1224 U_ASSERT(duplState < fSafeTable->size()); 1225 1226 fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function 1227 // and will auto-delete the removed element. 1228 int32_t numStates = fSafeTable->size(); 1229 for (int32_t state=0; state<numStates; ++state) { 1230 UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state); 1231 int32_t numCols = sd->length(); 1232 for (int32_t col=0; col<numCols; col++) { 1233 int32_t existingVal = sd->charAt(col); 1234 int32_t newVal = existingVal; 1235 if (existingVal == duplState) { 1236 newVal = keepState; 1237 } else if (existingVal > duplState) { 1238 newVal = existingVal - 1; 1239 } 1240 sd->setCharAt(col, static_cast<char16_t>(newVal)); 1241 } 1242 } 1243 } 1244 1245 1246 /* 1247 * RemoveDuplicateStates 1248 */ 1249 int32_t RBBITableBuilder::removeDuplicateStates() { 1250 IntPair dupls = {3, 0}; 1251 int32_t numStatesRemoved = 0; 1252 1253 while (findDuplicateState(&dupls)) { 1254 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second); 1255 removeState(dupls); 1256 ++numStatesRemoved; 1257 } 1258 return numStatesRemoved; 1259 } 1260 1261 1262 //----------------------------------------------------------------------------- 1263 // 1264 // getTableSize() Calculate the size of the runtime form of this 1265 // state transition table. 1266 // 1267 //----------------------------------------------------------------------------- 1268 int32_t RBBITableBuilder::getTableSize() const { 1269 int32_t size = 0; 1270 int32_t numRows; 1271 int32_t numCols; 1272 int32_t rowSize; 1273 1274 if (fTree == NULL) { 1275 return 0; 1276 } 1277 1278 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table. 1279 1280 numRows = fDStates->size(); 1281 numCols = fRB->fSetBuilder->getNumCharCategories(); 1282 1283 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols; 1284 size += numRows * rowSize; 1285 return size; 1286 } 1287 1288 1289 //----------------------------------------------------------------------------- 1290 // 1291 // exportTable() export the state transition table in the format required 1292 // by the runtime engine. getTableSize() bytes of memory 1293 // must be available at the output address "where". 1294 // 1295 //----------------------------------------------------------------------------- 1296 void RBBITableBuilder::exportTable(void *where) { 1297 RBBIStateTable *table = (RBBIStateTable *)where; 1298 uint32_t state; 1299 int col; 1300 1301 if (U_FAILURE(*fStatus) || fTree == NULL) { 1302 return; 1303 } 1304 1305 int32_t catCount = fRB->fSetBuilder->getNumCharCategories(); 1306 if (catCount > 0x7fff || 1307 fDStates->size() > 0x7fff) { 1308 *fStatus = U_BRK_INTERNAL_ERROR; 1309 return; 1310 } 1311 1312 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount; 1313 table->fNumStates = fDStates->size(); 1314 table->fFlags = 0; 1315 if (fRB->fLookAheadHardBreak) { 1316 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK; 1317 } 1318 if (fRB->fSetBuilder->sawBOF()) { 1319 table->fFlags |= RBBI_BOF_REQUIRED; 1320 } 1321 table->fReserved = 0; 1322 1323 for (state=0; state<table->fNumStates; state++) { 1324 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); 1325 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); 1326 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767); 1327 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767); 1328 row->fAccepting = (int16_t)sd->fAccepting; 1329 row->fLookAhead = (int16_t)sd->fLookAhead; 1330 row->fTagIdx = (int16_t)sd->fTagsIdx; 1331 for (col=0; col<catCount; col++) { 1332 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col); 1333 } 1334 } 1335 } 1336 1337 1338 /** 1339 * Synthesize a safe state table from the main state table. 1340 */ 1341 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) { 1342 // The safe table creation has three steps: 1343 1344 // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries 1345 // following the pair do not depend on context or state before the pair. To test 1346 // whether a pair is safe, run it through the main forward state table, starting 1347 // from each state. If the the final state is the same, no matter what the starting state, 1348 // the pair is safe. 1349 // 1350 // 2. Build a state table that recognizes the safe pairs. It's similar to their 1351 // forward table, with a column for each input character [class], and a row for 1352 // each state. Row 1 is the start state, and row 0 is the stop state. Initially 1353 // create an additional state for each input character category; being in 1354 // one of these states means that the character has been seen, and is potentially 1355 // the first of a pair. In each of these rows, the entry for the second character 1356 // of a safe pair is set to the stop state (0), indicating that a match was found. 1357 // All other table entries are set to the state corresponding the current input 1358 // character, allowing that charcter to be the of a start following pair. 1359 // 1360 // Because the safe rules are to be run in reverse, moving backwards in the text, 1361 // the first and second pair categories are swapped when building the table. 1362 // 1363 // 3. Compress the table. There are typically many rows (states) that are 1364 // equivalent - that have zeroes (match completed) in the same columns - 1365 // and can be folded together. 1366 1367 // Each safe pair is stored as two UChars in the safePair string. 1368 UnicodeString safePairs; 1369 1370 int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories(); 1371 int32_t numStates = fDStates->size(); 1372 1373 for (int32_t c1=0; c1<numCharClasses; ++c1) { 1374 for (int32_t c2=0; c2 < numCharClasses; ++c2) { 1375 int32_t wantedEndState = -1; 1376 int32_t endState = 0; 1377 for (int32_t startState = 1; startState < numStates; ++startState) { 1378 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState)); 1379 int32_t s2 = startStateD->fDtran->elementAti(c1); 1380 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2)); 1381 endState = s2StateD->fDtran->elementAti(c2); 1382 if (wantedEndState < 0) { 1383 wantedEndState = endState; 1384 } else { 1385 if (wantedEndState != endState) { 1386 break; 1387 } 1388 } 1389 } 1390 if (wantedEndState == endState) { 1391 safePairs.append((char16_t)c1); 1392 safePairs.append((char16_t)c2); 1393 // printf("(%d, %d) ", c1, c2); 1394 } 1395 } 1396 // printf("\n"); 1397 } 1398 1399 // Populate the initial safe table. 1400 // The table as a whole is UVector<UnicodeString> 1401 // Each row is represented by a UnicodeString, being used as a Vector<int16>. 1402 // Row 0 is the stop state. 1403 // Row 1 is the start sate. 1404 // Row 2 and beyond are other states, initially one per char class, but 1405 // after initial construction, many of the states will be combined, compacting the table. 1406 // The String holds the nextState data only. The four leading fields of a row, fAccepting, 1407 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building. 1408 1409 U_ASSERT(fSafeTable == nullptr); 1410 fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status); 1411 for (int32_t row=0; row<numCharClasses + 2; ++row) { 1412 fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status); 1413 } 1414 1415 // From the start state, each input char class transitions to the state for that input. 1416 UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1)); 1417 for (int32_t charClass=0; charClass < numCharClasses; ++charClass) { 1418 // Note: +2 for the start & stop state. 1419 startState.setCharAt(charClass, static_cast<char16_t>(charClass+2)); 1420 } 1421 1422 // Initially make every other state table row look like the start state row, 1423 for (int32_t row=2; row<numCharClasses+2; ++row) { 1424 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row)); 1425 rowState = startState; // UnicodeString assignment, copies contents. 1426 } 1427 1428 // Run through the safe pairs, set the next state to zero when pair has been seen. 1429 // Zero being the stop state, meaning we found a safe point. 1430 for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) { 1431 int32_t c1 = safePairs.charAt(pairIdx); 1432 int32_t c2 = safePairs.charAt(pairIdx + 1); 1433 1434 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2)); 1435 rowState.setCharAt(c1, 0); 1436 } 1437 1438 // Remove duplicate or redundant rows from the table. 1439 IntPair states = {1, 0}; 1440 while (findDuplicateSafeState(&states)) { 1441 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second); 1442 removeSafeState(states); 1443 } 1444 } 1445 1446 1447 //----------------------------------------------------------------------------- 1448 // 1449 // getSafeTableSize() Calculate the size of the runtime form of this 1450 // safe state table. 1451 // 1452 //----------------------------------------------------------------------------- 1453 int32_t RBBITableBuilder::getSafeTableSize() const { 1454 int32_t size = 0; 1455 int32_t numRows; 1456 int32_t numCols; 1457 int32_t rowSize; 1458 1459 if (fSafeTable == nullptr) { 1460 return 0; 1461 } 1462 1463 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table. 1464 1465 numRows = fSafeTable->size(); 1466 numCols = fRB->fSetBuilder->getNumCharCategories(); 1467 1468 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols; 1469 size += numRows * rowSize; 1470 return size; 1471 } 1472 1473 1474 //----------------------------------------------------------------------------- 1475 // 1476 // exportSafeTable() export the state transition table in the format required 1477 // by the runtime engine. getTableSize() bytes of memory 1478 // must be available at the output address "where". 1479 // 1480 //----------------------------------------------------------------------------- 1481 void RBBITableBuilder::exportSafeTable(void *where) { 1482 RBBIStateTable *table = (RBBIStateTable *)where; 1483 uint32_t state; 1484 int col; 1485 1486 if (U_FAILURE(*fStatus) || fSafeTable == nullptr) { 1487 return; 1488 } 1489 1490 int32_t catCount = fRB->fSetBuilder->getNumCharCategories(); 1491 if (catCount > 0x7fff || 1492 fSafeTable->size() > 0x7fff) { 1493 *fStatus = U_BRK_INTERNAL_ERROR; 1494 return; 1495 } 1496 1497 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount; 1498 table->fNumStates = fSafeTable->size(); 1499 table->fFlags = 0; 1500 table->fReserved = 0; 1501 1502 for (state=0; state<table->fNumStates; state++) { 1503 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state); 1504 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); 1505 row->fAccepting = 0; 1506 row->fLookAhead = 0; 1507 row->fTagIdx = 0; 1508 row->fReserved = 0; 1509 for (col=0; col<catCount; col++) { 1510 row->fNextState[col] = rowString->charAt(col); 1511 } 1512 } 1513 } 1514 1515 1516 1517 1518 //----------------------------------------------------------------------------- 1519 // 1520 // printSet Debug function. Print the contents of a UVector 1521 // 1522 //----------------------------------------------------------------------------- 1523 #ifdef RBBI_DEBUG 1524 void RBBITableBuilder::printSet(UVector *s) { 1525 int32_t i; 1526 for (i=0; i<s->size(); i++) { 1527 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i)); 1528 RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum); 1529 } 1530 RBBIDebugPrintf("\n"); 1531 } 1532 #endif 1533 1534 1535 //----------------------------------------------------------------------------- 1536 // 1537 // printStates Debug Function. Dump the fully constructed state transition table. 1538 // 1539 //----------------------------------------------------------------------------- 1540 #ifdef RBBI_DEBUG 1541 void RBBITableBuilder::printStates() { 1542 int c; // input "character" 1543 int n; // state number 1544 1545 RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); 1546 RBBIDebugPrintf(" | Acc LA Tag"); 1547 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1548 RBBIDebugPrintf(" %2d", c); 1549 } 1550 RBBIDebugPrintf("\n"); 1551 RBBIDebugPrintf(" |---------------"); 1552 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1553 RBBIDebugPrintf("---"); 1554 } 1555 RBBIDebugPrintf("\n"); 1556 1557 for (n=0; n<fDStates->size(); n++) { 1558 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 1559 RBBIDebugPrintf(" %3d | " , n); 1560 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx); 1561 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1562 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c)); 1563 } 1564 RBBIDebugPrintf("\n"); 1565 } 1566 RBBIDebugPrintf("\n\n"); 1567 } 1568 #endif 1569 1570 1571 //----------------------------------------------------------------------------- 1572 // 1573 // printSafeTable Debug Function. Dump the fully constructed safe table. 1574 // 1575 //----------------------------------------------------------------------------- 1576 #ifdef RBBI_DEBUG 1577 void RBBITableBuilder::printReverseTable() { 1578 int c; // input "character" 1579 int n; // state number 1580 1581 RBBIDebugPrintf(" Safe Reverse Table \n"); 1582 if (fSafeTable == nullptr) { 1583 RBBIDebugPrintf(" --- nullptr ---\n"); 1584 return; 1585 } 1586 RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); 1587 RBBIDebugPrintf(" | Acc LA Tag"); 1588 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1589 RBBIDebugPrintf(" %2d", c); 1590 } 1591 RBBIDebugPrintf("\n"); 1592 RBBIDebugPrintf(" |---------------"); 1593 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1594 RBBIDebugPrintf("---"); 1595 } 1596 RBBIDebugPrintf("\n"); 1597 1598 for (n=0; n<fSafeTable->size(); n++) { 1599 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n); 1600 RBBIDebugPrintf(" %3d | " , n); 1601 RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags 1602 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1603 RBBIDebugPrintf(" %2d", rowString->charAt(c)); 1604 } 1605 RBBIDebugPrintf("\n"); 1606 } 1607 RBBIDebugPrintf("\n\n"); 1608 } 1609 #endif 1610 1611 1612 1613 //----------------------------------------------------------------------------- 1614 // 1615 // printRuleStatusTable Debug Function. Dump the common rule status table 1616 // 1617 //----------------------------------------------------------------------------- 1618 #ifdef RBBI_DEBUG 1619 void RBBITableBuilder::printRuleStatusTable() { 1620 int32_t thisRecord = 0; 1621 int32_t nextRecord = 0; 1622 int i; 1623 UVector *tbl = fRB->fRuleStatusVals; 1624 1625 RBBIDebugPrintf("index | tags \n"); 1626 RBBIDebugPrintf("-------------------\n"); 1627 1628 while (nextRecord < tbl->size()) { 1629 thisRecord = nextRecord; 1630 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1; 1631 RBBIDebugPrintf("%4d ", thisRecord); 1632 for (i=thisRecord+1; i<nextRecord; i++) { 1633 RBBIDebugPrintf(" %5d", tbl->elementAti(i)); 1634 } 1635 RBBIDebugPrintf("\n"); 1636 } 1637 RBBIDebugPrintf("\n\n"); 1638 } 1639 #endif 1640 1641 1642 //----------------------------------------------------------------------------- 1643 // 1644 // RBBIStateDescriptor Methods. This is a very struct-like class 1645 // Most access is directly to the fields. 1646 // 1647 //----------------------------------------------------------------------------- 1648 1649 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) { 1650 fMarked = FALSE; 1651 fAccepting = 0; 1652 fLookAhead = 0; 1653 fTagsIdx = 0; 1654 fTagVals = NULL; 1655 fPositions = NULL; 1656 fDtran = NULL; 1657 1658 fDtran = new UVector32(lastInputSymbol+1, *fStatus); 1659 if (U_FAILURE(*fStatus)) { 1660 return; 1661 } 1662 if (fDtran == NULL) { 1663 *fStatus = U_MEMORY_ALLOCATION_ERROR; 1664 return; 1665 } 1666 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized. 1667 // It is indexed by input symbols, and will 1668 // hold the next state number for each 1669 // symbol. 1670 } 1671 1672 1673 RBBIStateDescriptor::~RBBIStateDescriptor() { 1674 delete fPositions; 1675 delete fDtran; 1676 delete fTagVals; 1677 fPositions = NULL; 1678 fDtran = NULL; 1679 fTagVals = NULL; 1680 } 1681 1682 U_NAMESPACE_END 1683 1684 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1685