1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package android.icu.text; 11 12 import java.text.ParsePosition; 13 import java.util.HashMap; 14 15 import android.icu.impl.Assert; 16 import android.icu.impl.Utility; 17 import android.icu.lang.UCharacter; 18 19 /** 20 * This class is part of the Rule Based Break Iterator rule compiler. 21 * It scans the rules and builds the parse tree. 22 * There is no public API here. 23 */ 24 class RBBIRuleScanner { 25 26 private final static int kStackSize = 100; // The size of the state stack for 27 // rules parsing. Corresponds roughly 28 // to the depth of parentheses nesting 29 // that is allowed in the rules. 30 31 static class RBBIRuleChar { 32 int fChar; 33 boolean fEscaped; 34 } 35 36 37 38 RBBIRuleBuilder fRB; // The rule builder that we are part of. 39 40 int fScanIndex; // Index of current character being processed 41 // in the rule input string. 42 int fNextIndex; // Index of the next character, which 43 // is the first character not yet scanned. 44 boolean fQuoteMode; // Scan is in a 'quoted region' 45 int fLineNum; // Line number in input file. 46 int fCharNum; // Char position within the line. 47 int fLastChar; // Previous char, needed to count CR-LF 48 // as a single line, not two. 49 50 RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine 51 // processing. 52 53 54 short fStack[] = new short[kStackSize]; // State stack, holds state pushes 55 int fStackPtr; // and pops as specified in the state 56 // transition rules. 57 58 RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created 59 // during the parse of a rule 60 int fNodeStackPtr; 61 62 63 boolean fReverseRule; // True if the rule currently being scanned 64 // is a reverse direction rule (if it 65 // starts with a '!') 66 67 boolean fLookAheadRule; // True if the rule includes a '/' 68 // somewhere within it. 69 70 boolean fNoChainInRule; // True if the current rule starts with a '^'. 71 72 73 RBBISymbolTable fSymbolTable; // symbol table, holds definitions of 74 // $variable symbols. 75 76 HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to 77 // the sets created while parsing rules. 78 // The key is the string used for creating 79 // the set. 80 81 UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during 82 // the scanning of RBBI rules. The 83 // Indices for these are assigned by the 84 // perl script that builds the state tables. 85 // See rbbirpt.h. 86 87 int fRuleNum; // Counts each rule as it is scanned. 88 89 int fOptionStart; // Input index of start of a !!option 90 // keyword, while being scanned. 91 92 93 // gRuleSet_rule_char_pattern is characters that may appear as literals in patterns without escaping or quoting. 94 static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]"; 95 static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]"; 96 static private String gRuleSet_digit_char_pattern = "[0-9]"; 97 static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]"; 98 static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]"; 99 static private String kAny = "any"; 100 101 102 103 104 //---------------------------------------------------------------------------------------- 105 // 106 // Constructor. 107 // 108 //---------------------------------------------------------------------------------------- 109 RBBIRuleScanner(RBBIRuleBuilder rb) { 110 fRB = rb; 111 fLineNum = 1; 112 113 // 114 // Set up the constant Unicode Sets. 115 // Note: These could be made static and shared among 116 // all instances of RBBIRuleScanners. 117 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern); 118 fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern); 119 fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern); 120 fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern); 121 fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern); 122 123 fSymbolTable = new RBBISymbolTable(this); 124 } 125 126 //---------------------------------------------------------------------------------------- 127 // 128 // doParseAction Do some action during rule parsing. 129 // Called by the parse state machine. 130 // Actions build the parse tree and Unicode Sets, 131 // and maintain the parse stack for nested expressions. 132 // 133 //---------------------------------------------------------------------------------------- 134 boolean doParseActions(int action) { 135 RBBINode n = null; 136 137 boolean returnVal = true; 138 139 switch (action) { 140 141 case RBBIRuleParseTable.doExprStart: 142 pushNewNode(RBBINode.opStart); 143 fRuleNum++; 144 break; 145 146 case RBBIRuleParseTable.doNoChain: 147 // Scanned a '^' while on the rule start state. 148 fNoChainInRule = true; 149 break; 150 151 152 case RBBIRuleParseTable.doExprOrOperator: { 153 fixOpStack(RBBINode.precOpCat); 154 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 155 RBBINode orNode = pushNewNode(RBBINode.opOr); 156 orNode.fLeftChild = operandNode; 157 operandNode.fParent = orNode; 158 } 159 break; 160 161 case RBBIRuleParseTable.doExprCatOperator: 162 // concatenation operator. 163 // For the implicit concatenation of adjacent terms in an expression 164 // that are 165 // not separated by any other operator. Action is invoked between the 166 // actions for the two terms. 167 { 168 fixOpStack(RBBINode.precOpCat); 169 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 170 RBBINode catNode = pushNewNode(RBBINode.opCat); 171 catNode.fLeftChild = operandNode; 172 operandNode.fParent = catNode; 173 } 174 break; 175 176 case RBBIRuleParseTable.doLParen: 177 // Open Paren. 178 // The openParen node is a dummy operation type with a low 179 // precedence, 180 // which has the affect of ensuring that any real binary op that 181 // follows within the parens binds more tightly to the operands than 182 // stuff outside of the parens. 183 pushNewNode(RBBINode.opLParen); 184 break; 185 186 case RBBIRuleParseTable.doExprRParen: 187 fixOpStack(RBBINode.precLParen); 188 break; 189 190 case RBBIRuleParseTable.doNOP: 191 break; 192 193 case RBBIRuleParseTable.doStartAssign: 194 // We've just scanned "$variable = " 195 // The top of the node stack has the $variable ref node. 196 197 // Save the start position of the RHS text in the StartExpression 198 // node 199 // that precedes the $variableReference node on the stack. 200 // This will eventually be used when saving the full $variable 201 // replacement 202 // text as a string. 203 n = fNodeStack[fNodeStackPtr - 1]; 204 n.fFirstPos = fNextIndex; // move past the '=' 205 206 // Push a new start-of-expression node; needed to keep parse of the 207 // RHS expression happy. 208 pushNewNode(RBBINode.opStart); 209 break; 210 211 case RBBIRuleParseTable.doEndAssign: { 212 // We have reached the end of an assignement statement. 213 // Current scan char is the ';' that terminates the assignment. 214 215 // Terminate expression, leaves expression parse tree rooted in TOS 216 // node. 217 fixOpStack(RBBINode.precStart); 218 219 RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2]; 220 RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1]; 221 RBBINode RHSExprNode = fNodeStack[fNodeStackPtr]; 222 223 // Save original text of right side of assignment, excluding the 224 // terminating ';' 225 // in the root of the node for the right-hand-side expression. 226 RHSExprNode.fFirstPos = startExprNode.fFirstPos; 227 RHSExprNode.fLastPos = fScanIndex; 228 // fRB.fRules.extractBetween(RHSExprNode.fFirstPos, 229 // RHSExprNode.fLastPos, RHSExprNode.fText); 230 RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos, 231 RHSExprNode.fLastPos); 232 233 // Expression parse tree becomes l. child of the $variable reference 234 // node. 235 varRefNode.fLeftChild = RHSExprNode; 236 RHSExprNode.fParent = varRefNode; 237 238 // Make a symbol table entry for the $variableRef node. 239 fSymbolTable.addEntry(varRefNode.fText, varRefNode); 240 241 // Clean up the stack. 242 fNodeStackPtr -= 3; 243 break; 244 } 245 246 case RBBIRuleParseTable.doEndOfRule: { 247 fixOpStack(RBBINode.precStart); // Terminate expression, leaves 248 // expression 249 250 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) { 251 printNodeStack("end of rule"); 252 } 253 Assert.assrt(fNodeStackPtr == 1); 254 RBBINode thisRule = fNodeStack[fNodeStackPtr]; 255 256 // If this rule includes a look-ahead '/', add a endMark node to the 257 // expression tree. 258 if (fLookAheadRule) { 259 RBBINode endNode = pushNewNode(RBBINode.endMark); 260 RBBINode catNode = pushNewNode(RBBINode.opCat); 261 fNodeStackPtr -= 2; 262 catNode.fLeftChild = thisRule; 263 catNode.fRightChild = endNode; 264 fNodeStack[fNodeStackPtr] = catNode; 265 endNode.fVal = fRuleNum; 266 endNode.fLookAheadEnd = true; 267 thisRule = catNode; 268 269 // TODO: Disable chaining out of look-ahead (hard break) rules. 270 // The break on rule match is forced, so there is no point in building up 271 // the state table to chain into another rule for a longer match. 272 } 273 274 // Mark this node as being the root of a rule. 275 thisRule.fRuleRoot = true; 276 277 // Flag if chaining into this rule is wanted. 278 // 279 if (fRB.fChainRules && // If rule chaining is enabled globally via !!chain 280 !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule 281 thisRule.fChainIn = true; 282 } 283 284 285 // All rule expressions are ORed together. 286 // The ';' that terminates an expression really just functions as a 287 // '|' with 288 // a low operator prededence. 289 // 290 // Each of the four sets of rules are collected separately. 291 // (forward, reverse, safe_forward, safe_reverse) 292 // OR this rule into the appropriate group of them. 293 // 294 295 int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree); 296 297 if (fRB.fTreeRoots[destRules] != null) { 298 // This is not the first rule encountered. 299 // OR previous stuff (from *destRules) 300 // with the current rule expression (on the Node Stack) 301 // with the resulting OR expression going to *destRules 302 // 303 thisRule = fNodeStack[fNodeStackPtr]; 304 RBBINode prevRules = fRB.fTreeRoots[destRules]; 305 RBBINode orNode = pushNewNode(RBBINode.opOr); 306 orNode.fLeftChild = prevRules; 307 prevRules.fParent = orNode; 308 orNode.fRightChild = thisRule; 309 thisRule.fParent = orNode; 310 fRB.fTreeRoots[destRules] = orNode; 311 } else { 312 // This is the first rule encountered (for this direction). 313 // Just move its parse tree from the stack to *destRules. 314 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr]; 315 } 316 fReverseRule = false; // in preparation for the next rule. 317 fLookAheadRule = false; 318 fNoChainInRule = false; 319 fNodeStackPtr = 0; 320 } 321 break; 322 323 case RBBIRuleParseTable.doRuleError: 324 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 325 returnVal = false; 326 break; 327 328 case RBBIRuleParseTable.doVariableNameExpectedErr: 329 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 330 break; 331 332 // 333 // Unary operands + ? * 334 // These all appear after the operand to which they apply. 335 // When we hit one, the operand (may be a whole sub expression) 336 // will be on the top of the stack. 337 // Unary Operator becomes TOS, with the old TOS as its one child. 338 case RBBIRuleParseTable.doUnaryOpPlus: { 339 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 340 RBBINode plusNode = pushNewNode(RBBINode.opPlus); 341 plusNode.fLeftChild = operandNode; 342 operandNode.fParent = plusNode; 343 } 344 break; 345 346 case RBBIRuleParseTable.doUnaryOpQuestion: { 347 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 348 RBBINode qNode = pushNewNode(RBBINode.opQuestion); 349 qNode.fLeftChild = operandNode; 350 operandNode.fParent = qNode; 351 } 352 break; 353 354 case RBBIRuleParseTable.doUnaryOpStar: { 355 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 356 RBBINode starNode = pushNewNode(RBBINode.opStar); 357 starNode.fLeftChild = operandNode; 358 operandNode.fParent = starNode; 359 } 360 break; 361 362 case RBBIRuleParseTable.doRuleChar: 363 // A "Rule Character" is any single character that is a literal part 364 // of the regular expression. Like a, b and c in the expression "(abc*) 365 // | [:L:]" 366 // These are pretty uncommon in break rules; the terms are more commonly 367 // sets. To keep things uniform, treat these characters like as 368 // sets that just happen to contain only one character. 369 { 370 n = pushNewNode(RBBINode.setRef); 371 String s = String.valueOf((char)fC.fChar); 372 findSetFor(s, n, null); 373 n.fFirstPos = fScanIndex; 374 n.fLastPos = fNextIndex; 375 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 376 break; 377 } 378 379 case RBBIRuleParseTable.doDotAny: 380 // scanned a ".", meaning match any single character. 381 { 382 n = pushNewNode(RBBINode.setRef); 383 findSetFor(kAny, n, null); 384 n.fFirstPos = fScanIndex; 385 n.fLastPos = fNextIndex; 386 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 387 break; 388 } 389 390 case RBBIRuleParseTable.doSlash: 391 // Scanned a '/', which identifies a look-ahead break position in a 392 // rule. 393 n = pushNewNode(RBBINode.lookAhead); 394 n.fVal = fRuleNum; 395 n.fFirstPos = fScanIndex; 396 n.fLastPos = fNextIndex; 397 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 398 fLookAheadRule = true; 399 break; 400 401 case RBBIRuleParseTable.doStartTagValue: 402 // Scanned a '{', the opening delimiter for a tag value within a 403 // rule. 404 n = pushNewNode(RBBINode.tag); 405 n.fVal = 0; 406 n.fFirstPos = fScanIndex; 407 n.fLastPos = fNextIndex; 408 break; 409 410 case RBBIRuleParseTable.doTagDigit: 411 // Just scanned a decimal digit that's part of a tag value 412 { 413 n = fNodeStack[fNodeStackPtr]; 414 int v = UCharacter.digit((char) fC.fChar, 10); 415 n.fVal = n.fVal * 10 + v; 416 break; 417 } 418 419 case RBBIRuleParseTable.doTagValue: 420 n = fNodeStack[fNodeStackPtr]; 421 n.fLastPos = fNextIndex; 422 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 423 break; 424 425 case RBBIRuleParseTable.doTagExpectedError: 426 error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG); 427 returnVal = false; 428 break; 429 430 case RBBIRuleParseTable.doOptionStart: 431 // Scanning a !!option. At the start of string. 432 fOptionStart = fScanIndex; 433 break; 434 435 case RBBIRuleParseTable.doOptionEnd: { 436 String opt = fRB.fRules.substring(fOptionStart, fScanIndex); 437 if (opt.equals("chain")) { 438 fRB.fChainRules = true; 439 } else if (opt.equals("LBCMNoChain")) { 440 fRB.fLBCMNoChain = true; 441 } else if (opt.equals("forward")) { 442 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree; 443 } else if (opt.equals("reverse")) { 444 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree; 445 } else if (opt.equals("safe_forward")) { 446 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree; 447 } else if (opt.equals("safe_reverse")) { 448 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree; 449 } else if (opt.equals("lookAheadHardBreak")) { 450 fRB.fLookAheadHardBreak = true; 451 } else if (opt.equals("quoted_literals_only")) { 452 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].clear(); 453 } else if (opt.equals("unquoted_literals")) { 454 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].applyPattern(gRuleSet_rule_char_pattern); 455 } else { 456 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION); 457 } 458 break; 459 } 460 461 case RBBIRuleParseTable.doReverseDir: 462 fReverseRule = true; 463 break; 464 465 case RBBIRuleParseTable.doStartVariableName: 466 n = pushNewNode(RBBINode.varRef); 467 n.fFirstPos = fScanIndex; 468 break; 469 470 case RBBIRuleParseTable.doEndVariableName: 471 n = fNodeStack[fNodeStackPtr]; 472 if (n == null || n.fType != RBBINode.varRef) { 473 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 474 break; 475 } 476 n.fLastPos = fScanIndex; 477 n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos); 478 // Look the newly scanned name up in the symbol table 479 // If there's an entry, set the l. child of the var ref to the 480 // replacement expression. 481 // (We also pass through here when scanning assignments, but no harm 482 // is done, other 483 // than a slight wasted effort that seems hard to avoid. Lookup will 484 // be null) 485 n.fLeftChild = fSymbolTable.lookupNode(n.fText); 486 break; 487 488 case RBBIRuleParseTable.doCheckVarDef: 489 n = fNodeStack[fNodeStackPtr]; 490 if (n.fLeftChild == null) { 491 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE); 492 returnVal = false; 493 } 494 break; 495 496 case RBBIRuleParseTable.doExprFinished: 497 break; 498 499 case RBBIRuleParseTable.doRuleErrorAssignExpr: 500 error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR); 501 returnVal = false; 502 break; 503 504 case RBBIRuleParseTable.doExit: 505 returnVal = false; 506 break; 507 508 case RBBIRuleParseTable.doScanUnicodeSet: 509 scanSet(); 510 break; 511 512 default: 513 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 514 returnVal = false; 515 break; 516 } 517 return returnVal; 518 } 519 520 //---------------------------------------------------------------------------------------- 521 // 522 // Error Throw and IllegalArgumentException in response to a rule parse 523 // error. 524 // 525 //---------------------------------------------------------------------------------------- 526 void error(int e) { 527 String s = "Error " + e + " at line " + fLineNum + " column " 528 + fCharNum; 529 IllegalArgumentException ex = new IllegalArgumentException(s); 530 throw ex; 531 532 } 533 534 //---------------------------------------------------------------------------------------- 535 // 536 // fixOpStack The parse stack holds partially assembled chunks of the parse 537 // tree. 538 // An entry on the stack may be as small as a single setRef node, 539 // or as large as the parse tree 540 // for an entire expression (this will be the one item left on the stack 541 // when the parsing of an RBBI rule completes. 542 // 543 // This function is called when a binary operator is encountered. 544 // It looks back up the stack for operators that are not yet associated 545 // with a right operand, and if the precedence of the stacked operator >= 546 // the precedence of the current operator, binds the operand left, 547 // to the previously encountered operator. 548 // 549 //---------------------------------------------------------------------------------------- 550 void fixOpStack(int p) { 551 RBBINode n; 552 // printNodeStack("entering fixOpStack()"); 553 for (;;) { 554 n = fNodeStack[fNodeStackPtr - 1]; // an operator node 555 if (n.fPrecedence == 0) { 556 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node"); 557 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 558 return; 559 } 560 561 if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) { 562 // The most recent operand goes with the current operator, 563 // not with the previously stacked one. 564 break; 565 } 566 // Stack operator is a binary op ( '|' or concatenation) 567 // TOS operand becomes right child of this operator. 568 // Resulting subexpression becomes the TOS operand. 569 n.fRightChild = fNodeStack[fNodeStackPtr]; 570 fNodeStack[fNodeStackPtr].fParent = n; 571 fNodeStackPtr--; 572 // printNodeStack("looping in fixOpStack() "); 573 } 574 575 if (p <= RBBINode.precLParen) { 576 // Scan is at a right paren or end of expression. 577 // The scanned item must match the stack, or else there was an 578 // error. 579 // Discard the left paren (or start expr) node from the stack, 580 // leaving the completed (sub)expression as TOS. 581 if (n.fPrecedence != p) { 582 // Right paren encountered matched start of expression node, or 583 // end of expression matched with a left paren node. 584 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN); 585 } 586 fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr]; 587 fNodeStackPtr--; 588 // Delete the now-discarded LParen or Start node. 589 // delete n; 590 } 591 // printNodeStack("leaving fixOpStack()"); 592 } 593 594 //---------------------------------------------------------------------------- 595 // 596 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 597 // been encountered. The val Node will be of nodetype uset 598 // and contain pointers to the actual UnicodeSets. 599 // The Key is the source string for initializing the set. 600 // 601 // The hash table is used to avoid creating duplicate 602 // unnamed (not $var references) UnicodeSets. 603 // 604 //---------------------------------------------------------------------------- 605 static class RBBISetTableEl { 606 String key; 607 608 RBBINode val; 609 } 610 611 612 //---------------------------------------------------------------------------------------- 613 // 614 // findSetFor given a String, 615 // - find the corresponding Unicode Set (uset node) 616 // (create one if necessary) 617 // - Set fLeftChild of the caller's node (should be a setRef node) 618 // to the uset node 619 // Maintain a hash table of uset nodes, so the same one is always used 620 // for the same string. 621 // If a "to adopt" set is provided and we haven't seen this key before, 622 // add the provided set to the hash table. 623 // If the string is one (32 bit) char in length, the set contains 624 // just one element which is the char in question. 625 // If the string is "any", return a set containing all chars. 626 // 627 //---------------------------------------------------------------------------------------- 628 void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) { 629 630 RBBISetTableEl el; 631 632 // First check whether we've already cached a set for this string. 633 // If so, just use the cached set in the new node. 634 // delete any set provided by the caller, since we own it. 635 el = fSetTable.get(s); 636 if (el != null) { 637 node.fLeftChild = el.val; 638 Assert.assrt(node.fLeftChild.fType == RBBINode.uset); 639 return; 640 } 641 642 // Haven't seen this set before. 643 // If the caller didn't provide us with a prebuilt set, 644 // create a new UnicodeSet now. 645 if (setToAdopt == null) { 646 if (s.equals(kAny)) { 647 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 648 } else { 649 int c; 650 c = UTF16.charAt(s, 0); 651 setToAdopt = new UnicodeSet(c, c); 652 } 653 } 654 655 // 656 // Make a new uset node to refer to this UnicodeSet 657 // This new uset node becomes the child of the caller's setReference 658 // node. 659 // 660 RBBINode usetNode = new RBBINode(RBBINode.uset); 661 usetNode.fInputSet = setToAdopt; 662 usetNode.fParent = node; 663 node.fLeftChild = usetNode; 664 usetNode.fText = s; 665 666 // 667 // Add the new uset node to the list of all uset nodes. 668 // 669 fRB.fUSetNodes.add(usetNode); 670 671 // 672 // Add the new set to the set hash table. 673 // 674 el = new RBBISetTableEl(); 675 el.key = s; 676 el.val = usetNode; 677 fSetTable.put(el.key, el); 678 679 return; 680 } 681 682 // 683 // Assorted Unicode character constants. 684 // Numeric because there is no portable way to enter them as literals. 685 // (Think EBCDIC). 686 // 687 static final int chNEL = 0x85; // NEL newline variant 688 689 static final int chLS = 0x2028; // Unicode Line Separator 690 691 //---------------------------------------------------------------------------------------- 692 // 693 // stripRules Return a rules string without unnecessary 694 // characters. 695 // 696 //---------------------------------------------------------------------------------------- 697 static String stripRules(String rules) { 698 StringBuilder strippedRules = new StringBuilder(); 699 int rulesLength = rules.length(); 700 for (int idx = 0; idx < rulesLength;) { 701 char ch = rules.charAt(idx++); 702 if (ch == '#') { 703 while (idx < rulesLength 704 && ch != '\r' && ch != '\n' && ch != chNEL) { 705 ch = rules.charAt(idx++); 706 } 707 } 708 if (!UCharacter.isISOControl(ch)) { 709 strippedRules.append(ch); 710 } 711 } 712 return strippedRules.toString(); 713 } 714 715 //---------------------------------------------------------------------------------------- 716 // 717 // nextCharLL Low Level Next Char from rule input source. 718 // Get a char from the input character iterator, 719 // keep track of input position for error reporting. 720 // 721 //---------------------------------------------------------------------------------------- 722 int nextCharLL() { 723 int ch; 724 725 if (fNextIndex >= fRB.fRules.length()) { 726 return -1; 727 } 728 ch = UTF16.charAt(fRB.fRules, fNextIndex); 729 fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1); 730 731 if (ch == '\r' || 732 ch == chNEL || 733 ch == chLS || 734 ch == '\n' && fLastChar != '\r') { 735 // Character is starting a new line. Bump up the line number, and 736 // reset the column to 0. 737 fLineNum++; 738 fCharNum = 0; 739 if (fQuoteMode) { 740 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING); 741 fQuoteMode = false; 742 } 743 } else { 744 // Character is not starting a new line. Except in the case of a 745 // LF following a CR, increment the column position. 746 if (ch != '\n') { 747 fCharNum++; 748 } 749 } 750 fLastChar = ch; 751 return ch; 752 } 753 754 //--------------------------------------------------------------------------------- 755 // 756 // nextChar for rules scanning. At this level, we handle stripping 757 // out comments and processing backslash character escapes. 758 // The rest of the rules grammar is handled at the next level up. 759 // 760 //--------------------------------------------------------------------------------- 761 void nextChar(RBBIRuleChar c) { 762 763 // Unicode Character constants needed for the processing done by nextChar(), 764 // in hex because literals wont work on EBCDIC machines. 765 766 fScanIndex = fNextIndex; 767 c.fChar = nextCharLL(); 768 c.fEscaped = false; 769 770 // 771 // check for '' sequence. 772 // These are recognized in all contexts, whether in quoted text or not. 773 // 774 if (c.fChar == '\'') { 775 if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') { 776 c.fChar = nextCharLL(); // get nextChar officially so character counts 777 c.fEscaped = true; // stay correct. 778 } else { 779 // Single quote, by itself. 780 // Toggle quoting mode. 781 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 782 fQuoteMode = !fQuoteMode; 783 if (fQuoteMode == true) { 784 c.fChar = '('; 785 } else { 786 c.fChar = ')'; 787 } 788 c.fEscaped = false; // The paren that we return is not escaped. 789 return; 790 } 791 } 792 793 if (fQuoteMode) { 794 c.fEscaped = true; 795 } else { 796 // We are not in a 'quoted region' of the source. 797 // 798 if (c.fChar == '#') { 799 // Start of a comment. Consume the rest of it. 800 // The new-line char that terminates the comment is always returned. 801 // It will be treated as white-space, and serves to break up anything 802 // that might otherwise incorrectly clump together with a comment in 803 // the middle (a variable name, for example.) 804 for (;;) { 805 c.fChar = nextCharLL(); 806 if (c.fChar == -1 || // EOF 807 c.fChar == '\r' || 808 c.fChar == '\n' || 809 c.fChar == chNEL || 810 c.fChar == chLS) 811 { 812 break; 813 } 814 } 815 } 816 if (c.fChar == -1) { 817 return; 818 } 819 820 // 821 // check for backslash escaped characters. 822 // Use String.unescapeAt() to handle them. 823 // 824 if (c.fChar == '\\') { 825 c.fEscaped = true; 826 int[] unescapeIndex = new int[1]; 827 unescapeIndex[0] = fNextIndex; 828 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex); 829 if (unescapeIndex[0] == fNextIndex) { 830 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED); 831 } 832 833 fCharNum += unescapeIndex[0] - fNextIndex; 834 fNextIndex = unescapeIndex[0]; 835 } 836 } 837 // putc(c.fChar, stdout); 838 } 839 840 //--------------------------------------------------------------------------------- 841 // 842 // Parse RBBI rules. The state machine for rules parsing is here. 843 // The state tables are hand-written in the file rbbirpt.txt, 844 // and converted to the form used here by a perl 845 // script rbbicst.pl 846 // 847 //--------------------------------------------------------------------------------- 848 void parse() { 849 int state; 850 RBBIRuleParseTable.RBBIRuleTableElement tableEl; 851 852 state = 1; 853 nextChar(fC); 854 // 855 // Main loop for the rule parsing state machine. 856 // Runs once per state transition. 857 // Each time through optionally performs, depending on the state table, 858 // - an advance to the the next input char 859 // - an action to be performed. 860 // - pushing or popping a state to/from the local state return stack. 861 // 862 for (;;) { 863 // Quit if state == 0. This is the normal way to exit the state machine. 864 // 865 if (state == 0) { 866 break; 867 } 868 869 // Find the state table element that matches the input char from the rule, or the 870 // class of the input character. Start with the first table row for this 871 // state, then linearly scan forward until we find a row that matches the 872 // character. The last row for each state always matches all characters, so 873 // the search will stop there, if not before. 874 // 875 tableEl = RBBIRuleParseTable.gRuleParseStateTable[state]; 876 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 877 System.out.println("char, line, col = (\'" + (char) fC.fChar 878 + "\', " + fLineNum + ", " + fCharNum + " state = " 879 + tableEl.fStateName); 880 } 881 882 for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state. 883 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow]; 884 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 885 System.out.print("."); 886 } 887 if (tableEl.fCharClass < 127 && fC.fEscaped == false 888 && tableEl.fCharClass == fC.fChar) { 889 // Table row specified an individual character, not a set, and 890 // the input character is not escaped, and 891 // the input character matched it. 892 break; 893 } 894 if (tableEl.fCharClass == 255) { 895 // Table row specified default, match anything character class. 896 break; 897 } 898 if (tableEl.fCharClass == 254 && fC.fEscaped) { 899 // Table row specified "escaped" and the char was escaped. 900 break; 901 } 902 if (tableEl.fCharClass == 253 && fC.fEscaped 903 && (fC.fChar == 0x50 || fC.fChar == 0x70)) { 904 // Table row specified "escaped P" and the char is either 'p' or 'P'. 905 break; 906 } 907 if (tableEl.fCharClass == 252 && fC.fChar == -1) { 908 // Table row specified eof and we hit eof on the input. 909 break; 910 } 911 912 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class && 913 fC.fEscaped == false && // char is not escaped && 914 fC.fChar != -1) { // char is not EOF 915 UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128]; 916 if (uniset.contains(fC.fChar)) { 917 // Table row specified a character class, or set of characters, 918 // and the current char matches it. 919 break; 920 } 921 } 922 } 923 924 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 925 System.out.println(""); 926 } 927 // 928 // We've found the row of the state table that matches the current input 929 // character from the rules string. 930 // Perform any action specified by this row in the state table. 931 if (doParseActions(tableEl.fAction) == false) { 932 // Break out of the state machine loop if the 933 // the action signalled some kind of error, or 934 // the action was to exit, occurs on normal end-of-rules-input. 935 break; 936 } 937 938 if (tableEl.fPushState != 0) { 939 fStackPtr++; 940 if (fStackPtr >= kStackSize) { 941 System.out.println("RBBIRuleScanner.parse() - state stack overflow."); 942 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 943 } 944 fStack[fStackPtr] = tableEl.fPushState; 945 } 946 947 if (tableEl.fNextChar) { 948 nextChar(fC); 949 } 950 951 // Get the next state from the table entry, or from the 952 // state stack if the next state was specified as "pop". 953 if (tableEl.fNextState != 255) { 954 state = tableEl.fNextState; 955 } else { 956 state = fStack[fStackPtr]; 957 fStackPtr--; 958 if (fStackPtr < 0) { 959 System.out.println("RBBIRuleScanner.parse() - state stack underflow."); 960 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 961 } 962 } 963 964 } 965 966 // If there are no forward rules throw an error. 967 // 968 if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) { 969 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 970 } 971 972 // 973 // If there were NO user specified reverse rules, set up the equivalent of ".*;" 974 // 975 if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) { 976 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar); 977 RBBINode operand = pushNewNode(RBBINode.setRef); 978 findSetFor(kAny, operand, null); 979 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand; 980 operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree]; 981 fNodeStackPtr -= 2; 982 } 983 984 // 985 // Parsing of the input RBBI rules is complete. 986 // We now have a parse tree for the rule expressions 987 // and a list of all UnicodeSets that are referenced. 988 // 989 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) { 990 fSymbolTable.rbbiSymtablePrint(); 991 } 992 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) { 993 System.out.println("Completed Forward Rules Parse Tree..."); 994 fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true); 995 System.out.println("\nCompleted Reverse Rules Parse Tree..."); 996 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true); 997 System.out.println("\nCompleted Safe Point Forward Rules Parse Tree..."); 998 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) { 999 System.out.println(" -- null -- "); 1000 } else { 1001 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true); 1002 } 1003 System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree..."); 1004 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) { 1005 System.out.println(" -- null -- "); 1006 } else { 1007 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true); 1008 } 1009 } 1010 } 1011 1012 //--------------------------------------------------------------------------------- 1013 // 1014 // printNodeStack for debugging... 1015 // 1016 //--------------------------------------------------------------------------------- 1017 ///CLOVER:OFF 1018 void printNodeStack(String title) { 1019 int i; 1020 System.out.println(title + ". Dumping node stack...\n"); 1021 for (i = fNodeStackPtr; i > 0; i--) { 1022 fNodeStack[i].printTree(true); 1023 } 1024 } 1025 ///CLOVER:ON 1026 1027 //--------------------------------------------------------------------------------- 1028 // 1029 // pushNewNode create a new RBBINode of the specified type and push it 1030 // onto the stack of nodes. 1031 // 1032 //--------------------------------------------------------------------------------- 1033 RBBINode pushNewNode(int nodeType) { 1034 fNodeStackPtr++; 1035 if (fNodeStackPtr >= kStackSize) { 1036 System.out.println("RBBIRuleScanner.pushNewNode - stack overflow."); 1037 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 1038 } 1039 fNodeStack[fNodeStackPtr] = new RBBINode(nodeType); 1040 return fNodeStack[fNodeStackPtr]; 1041 } 1042 1043 //--------------------------------------------------------------------------------- 1044 // 1045 // scanSet Construct a UnicodeSet from the text at the current scan 1046 // position. Advance the scan position to the first character 1047 // after the set. 1048 // 1049 // A new RBBI setref node referring to the set is pushed onto the node 1050 // stack. 1051 // 1052 // The scan position is normally under the control of the state machine 1053 // that controls rule parsing. UnicodeSets, however, are parsed by 1054 // the UnicodeSet constructor, not by the RBBI rule parser. 1055 // 1056 //--------------------------------------------------------------------------------- 1057 void scanSet() { 1058 UnicodeSet uset = null; 1059 int startPos; 1060 ParsePosition pos = new ParsePosition(fScanIndex); 1061 int i; 1062 1063 startPos = fScanIndex; 1064 try { 1065 uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE); 1066 } catch (Exception e) { // TODO: catch fewer exception types. 1067 // Repackage UnicodeSet errors as RBBI rule builder errors, with location info. 1068 error(RBBIRuleBuilder.U_BRK_MALFORMED_SET); 1069 } 1070 1071 // Verify that the set contains at least one code point. 1072 // 1073 if (uset.isEmpty()) { 1074 // This set is empty. 1075 // Make it an error, because it almost certainly is not what the user wanted. 1076 // Also, avoids having to think about corner cases in the tree manipulation code 1077 // that occurs later on. 1078 // TODO: this shouldn't be an error; it does happen. 1079 error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET); 1080 } 1081 1082 // Advance the RBBI parse postion over the UnicodeSet pattern. 1083 // Don't just set fScanIndex because the line/char positions maintained 1084 // for error reporting would be thrown off. 1085 i = pos.getIndex(); 1086 for (;;) { 1087 if (fNextIndex >= i) { 1088 break; 1089 } 1090 nextCharLL(); 1091 } 1092 1093 RBBINode n; 1094 1095 n = pushNewNode(RBBINode.setRef); 1096 n.fFirstPos = startPos; 1097 n.fLastPos = fNextIndex; 1098 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 1099 // findSetFor() serves several purposes here: 1100 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1101 // - Mantains collection of all sets in use, needed later for establishing 1102 // character categories for run time engine. 1103 // - Eliminates mulitiple instances of the same set. 1104 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1105 findSetFor(n.fText, n, uset); 1106 } 1107 1108 } 1109 1110