1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ********************************************************************** 6 * Copyright (c) 2002-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ********************************************************************** 9 */ 10 11 package android.icu.text; 12 13 import java.util.ArrayList; 14 import java.util.Collection; 15 import java.util.HashSet; 16 import java.util.List; 17 import java.util.Set; 18 import java.util.SortedSet; 19 import java.util.TreeSet; 20 21 import android.icu.impl.Assert; 22 import android.icu.lang.UCharacter; 23 import android.icu.lang.UProperty; 24 25 // 26 // class RBBITableBuilder is part of the RBBI rule compiler. 27 // It builds the state transition table used by the RBBI runtime 28 // from the expression syntax tree generated by the rule scanner. 29 // 30 // This class is part of the RBBI implementation only. 31 // There is no user-visible public API here. 32 // 33 class RBBITableBuilder { 34 35 36 37 // 38 // RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors, 39 // one for each state. 40 static class RBBIStateDescriptor { 41 boolean fMarked; 42 int fAccepting; 43 int fLookAhead; 44 SortedSet<Integer> fTagVals; 45 int fTagsIdx; 46 Set<RBBINode> fPositions; // Set of parse tree positions associated 47 // with this state. Unordered (it's a set). 48 // UVector contents are RBBINode * 49 50 int[] fDtran; // Transitions out of this state. 51 // indexed by input character 52 // contents is int index of dest state 53 // in RBBITableBuilder.fDStates 54 55 RBBIStateDescriptor(int maxInputSymbol) { 56 fTagVals = new TreeSet<Integer>(); 57 fPositions = new HashSet<RBBINode>(); 58 fDtran = new int[maxInputSymbol+1]; // fDtran needs to be pre-sized. 59 // It is indexed by input symbols, and will 60 // hold the next state number for each 61 // symbol. 62 } 63 } 64 65 66 private RBBIRuleBuilder fRB; 67 private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots 68 // for the parse tree to operate on. 69 // Too bad Java can't do indirection more easily! 70 71 private List<RBBIStateDescriptor> fDStates; // D states (Aho's terminology) 72 // Index is state number 73 // Contents are RBBIStateDescriptor pointers. 74 75 //----------------------------------------------------------------------------- 76 // 77 // Constructor for RBBITableBuilder. 78 // 79 // rootNode is an index into the array of root nodes that is held by 80 // the overall RBBIRuleBuilder. 81 //----------------------------------------------------------------------------- 82 RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) { 83 fRootIx = rootNodeIx; 84 fRB = rb; 85 fDStates = new ArrayList<RBBIStateDescriptor>(); 86 } 87 88 89 90 91 //----------------------------------------------------------------------------- 92 // 93 // RBBITableBuilder::build - This is the main function for building the DFA state transtion 94 // table from the RBBI rules parse tree. 95 // 96 //----------------------------------------------------------------------------- 97 void build() { 98 // If there were no rules, just return. This situation can easily arise 99 // for the reverse rules. 100 if (fRB.fTreeRoots[fRootIx]==null) { 101 return; 102 } 103 104 // 105 // Walk through the tree, replacing any references to $variables with a copy of the 106 // parse tree for the substition expression. 107 // 108 fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables(); 109 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) { 110 System.out.println("Parse tree after flattening variable references."); 111 fRB.fTreeRoots[fRootIx].printTree(true); 112 } 113 114 // 115 // If the rules contained any references to {bof} 116 // add a {bof} <cat> <former root of tree> to the 117 // tree. Means that all matches must start out with the 118 // {bof} fake character. 119 // 120 if (fRB.fSetBuilder.sawBOF()) { 121 RBBINode bofTop = new RBBINode(RBBINode.opCat); 122 RBBINode bofLeaf = new RBBINode(RBBINode.leafChar); 123 bofTop.fLeftChild = bofLeaf; 124 bofTop.fRightChild = fRB.fTreeRoots[fRootIx]; 125 bofLeaf.fParent = bofTop; 126 bofLeaf.fVal = 2; // Reserved value for {bof}. 127 fRB.fTreeRoots[fRootIx] = bofTop; 128 } 129 130 // 131 // Add a unique right-end marker to the expression. 132 // Appears as a cat-node, left child being the original tree, 133 // right child being the end marker. 134 // 135 RBBINode cn = new RBBINode(RBBINode.opCat); 136 cn.fLeftChild = fRB.fTreeRoots[fRootIx]; 137 fRB.fTreeRoots[fRootIx].fParent = cn; 138 cn.fRightChild = new RBBINode(RBBINode.endMark); 139 cn.fRightChild.fParent = cn; 140 fRB.fTreeRoots[fRootIx] = cn; 141 142 // 143 // Replace all references to UnicodeSets with the tree for the equivalent 144 // expression. 145 // 146 fRB.fTreeRoots[fRootIx].flattenSets(); 147 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) { 148 System.out.println("Parse tree after flattening Unicode Set references."); 149 fRB.fTreeRoots[fRootIx].printTree(true); 150 } 151 152 153 // 154 // calculate the functions nullable, firstpos, lastpos and followpos on 155 // nodes in the parse tree. 156 // See the alogrithm description in Aho. 157 // Understanding how this works by looking at the code alone will be 158 // nearly impossible. 159 // 160 calcNullable(fRB.fTreeRoots[fRootIx]); 161 calcFirstPos(fRB.fTreeRoots[fRootIx]); 162 calcLastPos(fRB.fTreeRoots[fRootIx]); 163 calcFollowPos(fRB.fTreeRoots[fRootIx]); 164 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) { 165 System.out.print("\n"); 166 printPosSets(fRB.fTreeRoots[fRootIx]); 167 } 168 169 // 170 // For "chained" rules, modify the followPos sets 171 // 172 if (fRB.fChainRules) { 173 calcChainedFollowPos(fRB.fTreeRoots[fRootIx]); 174 } 175 176 // 177 // BOF (start of input) test fixup. 178 // 179 if (fRB.fSetBuilder.sawBOF()) { 180 bofFixup(); 181 } 182 183 // 184 // Build the DFA state transition tables. 185 // 186 buildStateTable(); 187 flagAcceptingStates(); 188 flagLookAheadStates(); 189 flagTaggedStates(); 190 191 // 192 // Update the global table of rule status {tag} values 193 // The rule builder has a global vector of status values that are common 194 // for all tables. Merge the ones from this table into the global set. 195 // 196 mergeRuleStatusVals(); 197 198 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();} 199 } 200 201 202 203 //----------------------------------------------------------------------------- 204 // 205 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9 206 // 207 //----------------------------------------------------------------------------- 208 void calcNullable(RBBINode n) { 209 if (n == null) { 210 return; 211 } 212 if (n.fType == RBBINode.setRef || 213 n.fType == RBBINode.endMark ) { 214 // These are non-empty leaf node types. 215 n.fNullable = false; 216 return; 217 } 218 219 if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) { 220 // Lookahead marker node. It's a leaf, so no recursion on children. 221 // It's nullable because it does not match any literal text from the input stream. 222 n.fNullable = true; 223 return; 224 } 225 226 227 // The node is not a leaf. 228 // Calculate nullable on its children. 229 calcNullable(n.fLeftChild); 230 calcNullable(n.fRightChild); 231 232 // Apply functions from table 3.40 in Aho 233 if (n.fType == RBBINode.opOr) { 234 n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable; 235 } 236 else if (n.fType == RBBINode.opCat) { 237 n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable; 238 } 239 else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) { 240 n.fNullable = true; 241 } 242 else { 243 n.fNullable = false; 244 } 245 } 246 247 248 249 250 //----------------------------------------------------------------------------- 251 // 252 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9 253 // 254 //----------------------------------------------------------------------------- 255 void calcFirstPos(RBBINode n) { 256 if (n == null) { 257 return; 258 } 259 if (n.fType == RBBINode.leafChar || 260 n.fType == RBBINode.endMark || 261 n.fType == RBBINode.lookAhead || 262 n.fType == RBBINode.tag) { 263 // These are non-empty leaf node types. 264 n.fFirstPosSet.add(n); 265 return; 266 } 267 268 // The node is not a leaf. 269 // Calculate firstPos on its children. 270 calcFirstPos(n.fLeftChild); 271 calcFirstPos(n.fRightChild); 272 273 // Apply functions from table 3.40 in Aho 274 if (n.fType == RBBINode.opOr) { 275 n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet); 276 n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet); 277 } 278 else if (n.fType == RBBINode.opCat) { 279 n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet); 280 if (n.fLeftChild.fNullable) { 281 n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet); 282 } 283 } 284 else if (n.fType == RBBINode.opStar || 285 n.fType == RBBINode.opQuestion || 286 n.fType == RBBINode.opPlus) { 287 n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet); 288 } 289 } 290 291 292 293 //----------------------------------------------------------------------------- 294 // 295 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9 296 // 297 //----------------------------------------------------------------------------- 298 void calcLastPos(RBBINode n) { 299 if (n == null) { 300 return; 301 } 302 if (n.fType == RBBINode.leafChar || 303 n.fType == RBBINode.endMark || 304 n.fType == RBBINode.lookAhead || 305 n.fType == RBBINode.tag) { 306 // These are non-empty leaf node types. 307 n.fLastPosSet.add(n); 308 return; 309 } 310 311 // The node is not a leaf. 312 // Calculate lastPos on its children. 313 calcLastPos(n.fLeftChild); 314 calcLastPos(n.fRightChild); 315 316 // Apply functions from table 3.40 in Aho 317 if (n.fType == RBBINode.opOr) { 318 n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet); 319 n.fLastPosSet.addAll(n.fRightChild.fLastPosSet); 320 } 321 else if (n.fType == RBBINode.opCat) { 322 n.fLastPosSet.addAll(n.fRightChild.fLastPosSet); 323 if (n.fRightChild.fNullable) { 324 n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet); 325 } 326 } 327 else if (n.fType == RBBINode.opStar || 328 n.fType == RBBINode.opQuestion || 329 n.fType == RBBINode.opPlus) { 330 n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet); 331 } 332 } 333 334 335 336 //----------------------------------------------------------------------------- 337 // 338 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9 339 // 340 //----------------------------------------------------------------------------- 341 void calcFollowPos(RBBINode n) { 342 if (n == null || 343 n.fType == RBBINode.leafChar || 344 n.fType == RBBINode.endMark) { 345 return; 346 } 347 348 calcFollowPos(n.fLeftChild); 349 calcFollowPos(n.fRightChild); 350 351 // Aho rule #1 352 if (n.fType == RBBINode.opCat) { 353 for (RBBINode i /* is 'i' in Aho's description */ : n.fLeftChild.fLastPosSet) { 354 i.fFollowPos.addAll(n.fRightChild.fFirstPosSet); 355 } 356 } 357 358 // Aho rule #2 359 if (n.fType == RBBINode.opStar || 360 n.fType == RBBINode.opPlus) { 361 for (RBBINode i /* again, n and i are the names from Aho's description */ : n.fLastPosSet) { 362 i.fFollowPos.addAll(n.fFirstPosSet); 363 } 364 } 365 } 366 367 //----------------------------------------------------------------------------- 368 // 369 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged 370 // as roots of a rule to a destination vector. 371 // 372 //----------------------------------------------------------------------------- 373 void addRuleRootNodes(List<RBBINode> dest, RBBINode node) { 374 if (node == null) { 375 return; 376 } 377 if (node.fRuleRoot) { 378 dest.add(node); 379 // Note: rules cannot nest. If we found a rule start node, 380 // no child node can also be a start node. 381 return; 382 } 383 addRuleRootNodes(dest, node.fLeftChild); 384 addRuleRootNodes(dest, node.fRightChild); 385 } 386 387 //----------------------------------------------------------------------------- 388 // 389 // calcChainedFollowPos. Modify the previously calculated followPos sets 390 // to implement rule chaining. NOT described by Aho 391 // 392 //----------------------------------------------------------------------------- 393 void calcChainedFollowPos(RBBINode tree) { 394 395 List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>(); 396 List<RBBINode> leafNodes = new ArrayList<RBBINode>(); 397 398 // get a list of all endmarker nodes. 399 tree.findNodes(endMarkerNodes, RBBINode.endMark); 400 401 // get a list all leaf nodes 402 tree.findNodes(leafNodes, RBBINode.leafChar); 403 404 // Collect all leaf nodes that can start matches for rules 405 // with inbound chaining enabled, which is the union of the 406 // firstPosition sets from each of the rule root nodes. 407 408 List<RBBINode> ruleRootNodes = new ArrayList<RBBINode>(); 409 addRuleRootNodes(ruleRootNodes, tree); 410 411 Set<RBBINode> matchStartNodes = new HashSet<RBBINode>(); 412 for (RBBINode node: ruleRootNodes) { 413 if (node.fChainIn) { 414 matchStartNodes.addAll(node.fFirstPosSet); 415 } 416 } 417 418 // Iterate over all leaf nodes, 419 // 420 for (RBBINode tNode : leafNodes) { 421 RBBINode endNode = null; 422 423 // Identify leaf nodes that correspond to overall rule match positions. 424 // These include an endMarkerNode in their followPos sets. 425 for (RBBINode endMarkerNode : endMarkerNodes) { 426 if (tNode.fFollowPos.contains(endMarkerNode)) { 427 endNode = tNode; 428 break; 429 } 430 } 431 if (endNode == null) { 432 // node wasn't an end node. Try again with the next. 433 continue; 434 } 435 436 // We've got a node that can end a match. 437 438 // Line Break Specific hack: If this node's val correspond to the $CM char class, 439 // don't chain from it. 440 // TODO: Add rule syntax for this behavior, get specifics out of here and 441 // into the rule file. 442 if (fRB.fLBCMNoChain) { 443 int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal); 444 if (c != -1) { 445 // c == -1 occurs with sets containing only the {eof} marker string. 446 int cLBProp = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK); 447 if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) { 448 continue; 449 } 450 } 451 } 452 453 454 // Now iterate over the nodes that can start a match, looking for ones 455 // with the same char class as our ending node. 456 for (RBBINode startNode : matchStartNodes) { 457 if (startNode.fType != RBBINode.leafChar) { 458 continue; 459 } 460 461 if (endNode.fVal == startNode.fVal) { 462 // The end val (character class) of one possible match is the 463 // same as the start of another. 464 465 // Add all nodes from the followPos of the start node to the 466 // followPos set of the end node, which will have the effect of 467 // letting matches transition from a match state at endNode 468 // to the second char of a match starting with startNode. 469 endNode.fFollowPos.addAll(startNode.fFollowPos); 470 } 471 } 472 } 473 } 474 475 476 //----------------------------------------------------------------------------- 477 // 478 // bofFixup. Fixup for state tables that include {bof} beginning of input testing. 479 // Do an swizzle similar to chaining, modifying the followPos set of 480 // the bofNode to include the followPos nodes from other {bot} nodes 481 // scattered through the tree. 482 // 483 // This function has much in common with calcChainedFollowPos(). 484 // 485 //----------------------------------------------------------------------------- 486 void bofFixup() { 487 // 488 // The parse tree looks like this ... 489 // fTree root --. <cat> 490 // / \ 491 // <cat> <#end node> 492 // / \ 493 // <bofNode> rest 494 // of tree 495 // 496 // We will be adding things to the followPos set of the <bofNode> 497 // 498 RBBINode bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild; 499 Assert.assrt(bofNode.fType == RBBINode.leafChar); 500 Assert.assrt(bofNode.fVal == 2); 501 502 // Get all nodes that can be the start a match of the user-written rules 503 // (excluding the fake bofNode) 504 // We want the nodes that can start a match in the 505 // part labeled "rest of tree" 506 // 507 Set<RBBINode> matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet; 508 for (RBBINode startNode : matchStartNodes) { 509 if (startNode.fType != RBBINode.leafChar) { 510 continue; 511 } 512 513 if (startNode.fVal == bofNode.fVal) { 514 // We found a leaf node corresponding to a {bof} that was 515 // explicitly written into a rule. 516 // Add everything from the followPos set of this node to the 517 // followPos set of the fake bofNode at the start of the tree. 518 // 519 bofNode.fFollowPos.addAll(startNode.fFollowPos); 520 } 521 } 522 } 523 524 //----------------------------------------------------------------------------- 525 // 526 // buildStateTable() Determine the set of runtime DFA states and the 527 // transition tables for these states, by the algorithm 528 // of fig. 3.44 in Aho. 529 // 530 // Most of the comments are quotes of Aho's psuedo-code. 531 // 532 //----------------------------------------------------------------------------- 533 void buildStateTable() { 534 // 535 // Add a dummy state 0 - the stop state. Not from Aho. 536 int lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1; 537 RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol); 538 fDStates.add(failState); 539 540 // initially, the only unmarked state in Dstates is firstpos(root), 541 // where toot is the root of the syntax tree for (r)#; 542 RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol); 543 initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet); 544 fDStates.add(initialState); 545 546 // while there is an unmarked state T in Dstates do begin 547 for (;;) { 548 RBBIStateDescriptor T = null; 549 int tx; 550 for (tx=1; tx<fDStates.size(); tx++) { 551 RBBIStateDescriptor temp = fDStates.get(tx); 552 if (temp.fMarked == false) { 553 T = temp; 554 break; 555 } 556 } 557 if (T == null) { 558 break; 559 } 560 561 // mark T; 562 T.fMarked = true; 563 564 // for each input symbol a do begin 565 int a; 566 for (a = 1; a<=lastInputSymbol; a++) { 567 // let U be the set of positions that are in followpos(p) 568 // for some position p in T 569 // such that the symbol at position p is a; 570 Set<RBBINode> U = null; 571 for (RBBINode p : T.fPositions) { 572 if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) { 573 if (U == null) { 574 U = new HashSet<RBBINode>(); 575 } 576 U.addAll(p.fFollowPos); 577 } 578 } 579 580 // if U is not empty and not in DStates then 581 int ux = 0; 582 boolean UinDstates = false; 583 if (U != null) { 584 Assert.assrt(U.size() > 0); 585 int ix; 586 for (ix=0; ix<fDStates.size(); ix++) { 587 RBBIStateDescriptor temp2; 588 temp2 = fDStates.get(ix); 589 if (U.equals(temp2.fPositions)) { 590 U = temp2.fPositions; 591 ux = ix; 592 UinDstates = true; 593 break; 594 } 595 } 596 597 // Add U as an unmarked state to Dstates 598 if (!UinDstates) 599 { 600 RBBIStateDescriptor newState = new RBBIStateDescriptor(lastInputSymbol); 601 newState.fPositions = U; 602 fDStates.add(newState); 603 ux = fDStates.size()-1; 604 } 605 606 // Dtran[T, a] := U; 607 T.fDtran[a] = ux; 608 } 609 } 610 } 611 } 612 613 614 615 //----------------------------------------------------------------------------- 616 // 617 // flagAcceptingStates Identify accepting states. 618 // First get a list of all of the end marker nodes. 619 // Then, for each state s, 620 // if s contains one of the end marker nodes in its list of tree positions then 621 // s is an accepting state. 622 // 623 //----------------------------------------------------------------------------- 624 void flagAcceptingStates() { 625 List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>(); 626 RBBINode endMarker; 627 int i; 628 int n; 629 630 fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark); 631 632 for (i=0; i<endMarkerNodes.size(); i++) { 633 endMarker = endMarkerNodes.get(i); 634 for (n=0; n<fDStates.size(); n++) { 635 RBBIStateDescriptor sd = fDStates.get(n); 636 //if (sd.fPositions.indexOf(endMarker) >= 0) { 637 if (sd.fPositions.contains(endMarker)) { 638 // Any non-zero value for fAccepting means this is an accepting node. 639 // The value is what will be returned to the user as the break status. 640 // If no other value was specified, force it to -1. 641 642 if (sd.fAccepting==0) { 643 // State hasn't been marked as accepting yet. Do it now. 644 sd.fAccepting = endMarker.fVal; 645 if (sd.fAccepting == 0) { 646 sd.fAccepting = -1; 647 } 648 } 649 if (sd.fAccepting==-1 && endMarker.fVal != 0) { 650 // Both lookahead and non-lookahead accepting for this state. 651 // Favor the look-ahead. Expedient for line break. 652 // TODO: need a more elegant resolution for conflicting rules. 653 sd.fAccepting = endMarker.fVal; 654 } 655 // implicit else: 656 // if sd.fAccepting already had a value other than 0 or -1, leave it be. 657 658 // If the end marker node is from a look-ahead rule, set 659 // the fLookAhead field or this state also. 660 if (endMarker.fLookAheadEnd) { 661 // TODO: don't change value if already set? 662 // TODO: allow for more than one active look-ahead rule in engine. 663 // Make value here an index to a side array in engine? 664 sd.fLookAhead = sd.fAccepting; 665 } 666 } 667 } 668 } 669 } 670 671 672 //----------------------------------------------------------------------------- 673 // 674 // flagLookAheadStates Very similar to flagAcceptingStates, above. 675 // 676 //----------------------------------------------------------------------------- 677 void flagLookAheadStates() { 678 List<RBBINode> lookAheadNodes = new ArrayList<RBBINode>(); 679 RBBINode lookAheadNode; 680 int i; 681 int n; 682 683 fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead); 684 for (i=0; i<lookAheadNodes.size(); i++) { 685 lookAheadNode = lookAheadNodes.get(i); 686 687 for (n=0; n<fDStates.size(); n++) { 688 RBBIStateDescriptor sd = fDStates.get(n); 689 if (sd.fPositions.contains(lookAheadNode)) { 690 sd.fLookAhead = lookAheadNode.fVal; 691 } 692 } 693 } 694 } 695 696 697 698 699 //----------------------------------------------------------------------------- 700 // 701 // flagTaggedStates 702 // 703 //----------------------------------------------------------------------------- 704 void flagTaggedStates() { 705 List<RBBINode> tagNodes = new ArrayList<RBBINode>(); 706 RBBINode tagNode; 707 int i; 708 int n; 709 710 fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag); 711 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em) 712 tagNode = tagNodes.get(i); 713 714 for (n=0; n<fDStates.size(); n++) { // For each state s (row in the state table) 715 RBBIStateDescriptor sd = fDStates.get(n); 716 if (sd.fPositions.contains(tagNode)) { // if s include the tag node t 717 sd.fTagVals.add(Integer.valueOf(tagNode.fVal)); 718 } 719 } 720 } 721 } 722 723 724 725 //----------------------------------------------------------------------------- 726 // 727 // mergeRuleStatusVals 728 // 729 // Allocate positions in the global array of rule status {tag} values 730 // 731 // The RBBI runtime uses an array of {sets of status values} that can 732 // be returned for boundaries. Each accepting state that has non-zero 733 // status includes an index into this array. The format of the array 734 // is 735 // Num of status values in group 1 736 // status val 737 // status val 738 // ... 739 // Num of status vals in group 2 740 // status val 741 // status val 742 // ... 743 // etc. 744 // 745 // 746 //----------------------------------------------------------------------------- 747 748 void mergeRuleStatusVals() { 749 // 750 // The basic outline of what happens here is this... 751 // 752 // for each state in this state table 753 // if the status tag list for this state is in the global statuses list 754 // record where and 755 // continue with the next state 756 // else 757 // add the tag list for this state to the global list. 758 // 759 int n; 760 761 // Pre-load a single tag of {0} into the table. 762 // We will need this as a default, for rule sets with no explicit tagging, 763 // or with explicit tagging of {0}. 764 if (fRB.fRuleStatusVals.size() == 0) { 765 fRB.fRuleStatusVals.add(Integer.valueOf(1)); // Num of statuses in group 766 fRB.fRuleStatusVals.add(Integer.valueOf(0)); // and our single status of zero 767 768 SortedSet<Integer> s0 = new TreeSet<Integer>(); 769 Integer izero = Integer.valueOf(0); 770 fRB.fStatusSets.put(s0, izero); 771 SortedSet<Integer> s1 = new TreeSet<Integer>(); 772 s1.add(izero); 773 fRB.fStatusSets.put(s0, izero); 774 } 775 776 // For each state, check whether the state's status tag values are 777 // already entered into the status values array, and add them if not. 778 for (n=0; n<fDStates.size(); n++) { 779 RBBIStateDescriptor sd = fDStates.get(n); 780 Set<Integer> statusVals = sd.fTagVals; 781 Integer arrayIndexI = fRB.fStatusSets.get(statusVals); 782 if (arrayIndexI == null) { 783 // This is the first encounter of this set of status values. 784 // Add them to the statusSets map, This map associates 785 // the set of status values with an index in the runtime status 786 // values array. 787 arrayIndexI = Integer.valueOf(fRB.fRuleStatusVals.size()); 788 fRB.fStatusSets.put(statusVals, arrayIndexI); 789 790 // Add the new set of status values to the vector of values that 791 // will eventually become the array used by the runtime engine. 792 fRB.fRuleStatusVals.add(Integer.valueOf(statusVals.size())); 793 fRB.fRuleStatusVals.addAll(statusVals); 794 } 795 796 // Save the runtime array index back into the state descriptor. 797 sd.fTagsIdx = arrayIndexI.intValue(); 798 } 799 } 800 801 802 803 804 805 806 807 //----------------------------------------------------------------------------- 808 // 809 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos 810 // for each node in the tree. 811 // 812 //----------------------------------------------------------------------------- 813 814 void printPosSets(RBBINode n) { 815 if (n==null) { 816 return; 817 } 818 RBBINode.printNode(n); 819 System.out.print(" Nullable: " + n.fNullable); 820 821 System.out.print(" firstpos: "); 822 printSet(n.fFirstPosSet); 823 824 System.out.print(" lastpos: "); 825 printSet(n.fLastPosSet); 826 827 System.out.print(" followpos: "); 828 printSet(n.fFollowPos); 829 830 printPosSets(n.fLeftChild); 831 printPosSets(n.fRightChild); 832 } 833 834 835 836 837 //----------------------------------------------------------------------------- 838 // 839 // getTableSize() Calculate the size in bytes of the runtime form of this 840 // state transition table. 841 // 842 // Note: Refer to common/rbbidata.h from ICU4C for the declarations 843 // of the structures being matched by this calculation. 844 // 845 //----------------------------------------------------------------------------- 846 int getTableSize() { 847 int size = 0; 848 int numRows; 849 int numCols; 850 int rowSize; 851 852 if (fRB.fTreeRoots[fRootIx] == null) { 853 return 0; 854 } 855 856 size = /*sizeof(RBBIStateTable) - 4 */ 16; // The header, with no rows to the table. 857 858 numRows = fDStates.size(); 859 numCols = fRB.fSetBuilder.getNumCharCategories(); 860 861 // Note The declaration of RBBIStateTableRow is for a table of two columns. 862 // Therefore we subtract two from numCols when determining 863 // how much storage to add to a row for the total columns. 864 // rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2); 865 rowSize = 8 + 2*numCols; 866 size += numRows * rowSize; 867 while (size % 8 > 0) { // Size must be multiple of 8 bytes in size. 868 size++; 869 } 870 871 return size; 872 } 873 874 875 876 //----------------------------------------------------------------------------- 877 // 878 // exportTable() export the state transition table in the ICU4C format. 879 // 880 // Most of the table is 16 bit shorts. This function exports 881 // the whole thing as an array of shorts. 882 // 883 // The size of the array must be rounded up to a multiple of 884 // 8 bytes. 885 // 886 // See struct RBBIStateTable in ICU4C, common/rbbidata.h 887 // 888 //----------------------------------------------------------------------------- 889 890 short [] exportTable() { 891 int state; 892 int col; 893 894 if (fRB.fTreeRoots[fRootIx] == null) { 895 return new short[0]; 896 } 897 898 Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff && 899 fDStates.size() < 0x7fff); 900 901 int numStates = fDStates.size(); 902 903 // Size of table size in shorts. 904 // the "4" is the size of struct RBBIStateTableRow, the row header part only. 905 int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories(); 906 int tableSize = getTableSize() / 2; 907 908 909 short [] table = new short[tableSize]; 910 911 // 912 // Fill in the header fields. 913 // Annoying because they really want to be ints, not shorts. 914 // 915 // RBBIStateTable.fNumStates 916 table[RBBIDataWrapper.NUMSTATES] = (short)(numStates >>> 16); 917 table[RBBIDataWrapper.NUMSTATES+1] = (short)(numStates & 0x0000ffff); 918 919 // RBBIStateTable.fRowLen 920 table[RBBIDataWrapper.ROWLEN] = (short)(rowLen >>> 16); 921 table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff); 922 923 // RBBIStateTable.fFlags 924 int flags = 0; 925 if (fRB.fLookAheadHardBreak) { 926 flags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK; 927 } 928 if (fRB.fSetBuilder.sawBOF()) { 929 flags |= RBBIDataWrapper.RBBI_BOF_REQUIRED; 930 } 931 table[RBBIDataWrapper.FLAGS] = (short)(flags >>> 16); 932 table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff); 933 934 int numCharCategories = fRB.fSetBuilder.getNumCharCategories(); 935 for (state=0; state<numStates; state++) { 936 RBBIStateDescriptor sd = fDStates.get(state); 937 int row = 8 + state*rowLen; 938 Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767); 939 Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767); 940 table[row + RBBIDataWrapper.ACCEPTING] = (short)sd.fAccepting; 941 table[row + RBBIDataWrapper.LOOKAHEAD] = (short)sd.fLookAhead; 942 table[row + RBBIDataWrapper.TAGIDX] = (short)sd.fTagsIdx; 943 for (col=0; col<numCharCategories; col++) { 944 table[row + RBBIDataWrapper.NEXTSTATES + col] = (short)sd.fDtran[col]; 945 } 946 } 947 return table; 948 } 949 950 951 952 //----------------------------------------------------------------------------- 953 // 954 // printSet Debug function. Print the contents of a set of Nodes 955 // 956 //----------------------------------------------------------------------------- 957 958 void printSet(Collection<RBBINode> s) { 959 for (RBBINode n : s) { 960 RBBINode.printInt(n.fSerialNum, 8); 961 } 962 System.out.println(); 963 } 964 965 966 967 //----------------------------------------------------------------------------- 968 // 969 // printStates Debug Function. Dump the fully constructed state transition table. 970 // 971 //----------------------------------------------------------------------------- 972 973 void printStates() { 974 int c; // input "character" 975 int n; // state number 976 977 System.out.print("state | i n p u t s y m b o l s \n"); 978 System.out.print(" | Acc LA Tag"); 979 for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) { 980 RBBINode.printInt(c, 3); 981 } 982 System.out.print("\n"); 983 System.out.print(" |---------------"); 984 for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) { 985 System.out.print("---"); 986 } 987 System.out.print("\n"); 988 989 for (n=0; n<fDStates.size(); n++) { 990 RBBIStateDescriptor sd = fDStates.get(n); 991 RBBINode.printInt(n, 5); 992 System.out.print(" | "); 993 994 RBBINode.printInt(sd.fAccepting, 3); 995 RBBINode.printInt(sd.fLookAhead, 4); 996 RBBINode.printInt(sd.fTagsIdx, 6); 997 System.out.print(" "); 998 for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) { 999 RBBINode.printInt(sd.fDtran[c], 3); 1000 } 1001 System.out.print("\n"); 1002 } 1003 System.out.print("\n\n"); 1004 } 1005 1006 1007 1008 1009 //----------------------------------------------------------------------------- 1010 // 1011 // printRuleStatusTable Debug Function. Dump the common rule status table 1012 // 1013 //----------------------------------------------------------------------------- 1014 1015 void printRuleStatusTable() { 1016 int thisRecord = 0; 1017 int nextRecord = 0; 1018 int i; 1019 List<Integer> tbl = fRB.fRuleStatusVals; 1020 1021 System.out.print("index | tags \n"); 1022 System.out.print("-------------------\n"); 1023 1024 while (nextRecord < tbl.size()) { 1025 thisRecord = nextRecord; 1026 nextRecord = thisRecord + tbl.get(thisRecord).intValue() + 1; 1027 RBBINode.printInt(thisRecord, 7); 1028 for (i=thisRecord+1; i<nextRecord; i++) { 1029 int val = tbl.get(i).intValue(); 1030 RBBINode.printInt(val, 7); 1031 } 1032 System.out.print("\n"); 1033 } 1034 System.out.print("\n\n"); 1035 } 1036 1037 1038 1039 } 1040