1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 /* 3 ******************************************************************************* 4 * Copyright (C) 2003-2011, International Business Machines Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 package android.icu.text; 9 10 import java.text.ParsePosition; 11 import java.util.HashMap; 12 13 import android.icu.impl.Assert; 14 import android.icu.impl.Utility; 15 import android.icu.lang.UCharacter; 16 17 /** 18 * This class is part of the Rule Based Break Iterator rule compiler. 19 * It scans the rules and builds the parse tree. 20 * There is no public API here. 21 */ 22 class RBBIRuleScanner { 23 24 private final static int kStackSize = 100; // The size of the state stack for 25 // rules parsing. Corresponds roughly 26 // to the depth of parentheses nesting 27 // that is allowed in the rules. 28 29 static class RBBIRuleChar { 30 int fChar; 31 boolean fEscaped; 32 } 33 34 35 36 RBBIRuleBuilder fRB; // The rule builder that we are part of. 37 38 int fScanIndex; // Index of current character being processed 39 // in the rule input string. 40 int fNextIndex; // Index of the next character, which 41 // is the first character not yet scanned. 42 boolean fQuoteMode; // Scan is in a 'quoted region' 43 int fLineNum; // Line number in input file. 44 int fCharNum; // Char position within the line. 45 int fLastChar; // Previous char, needed to count CR-LF 46 // as a single line, not two. 47 48 RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine 49 // processing. 50 String fVarName; // $variableName, valid when we've just 51 // scanned one. 52 53 54 short fStack[] = new short[kStackSize]; // State stack, holds state pushes 55 int fStackPtr; // and pops as specified in the state 56 // transition rules. 57 58 RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created 59 // during the parse of a rule 60 int fNodeStackPtr; 61 62 63 boolean fReverseRule; // True if the rule currently being scanned 64 // is a reverse direction rule (if it 65 // starts with a '!') 66 67 boolean fLookAheadRule; // True if the rule includes a '/' 68 // somewhere within it. 69 70 RBBISymbolTable fSymbolTable; // symbol table, holds definitions of 71 // $variable symbols. 72 73 HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to 74 // the sets created while parsing rules. 75 // The key is the string used for creating 76 // the set. 77 78 UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during 79 // the scanning of RBBI rules. The 80 // indicies for these are assigned by the 81 // perl script that builds the state tables. 82 // See rbbirpt.h. 83 84 int fRuleNum; // Counts each rule as it is scanned. 85 86 int fOptionStart; // Input index of start of a !!option 87 // keyword, while being scanned. 88 89 90 91 static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]"; 92 static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]"; 93 static private String gRuleSet_digit_char_pattern = "[0-9]"; 94 static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]"; 95 static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]"; 96 static private String kAny = "any"; 97 98 99 100 101 //---------------------------------------------------------------------------------------- 102 // 103 // Constructor. 104 // 105 //---------------------------------------------------------------------------------------- 106 RBBIRuleScanner(RBBIRuleBuilder rb) { 107 fRB = rb; 108 fLineNum = 1; 109 110 // 111 // Set up the constant Unicode Sets. 112 // Note: These could be made static and shared among 113 // all instances of RBBIRuleScanners. 114 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern); 115 fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern); 116 fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern); 117 fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern); 118 fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern); 119 120 fSymbolTable = new RBBISymbolTable(this, rb.fRules); 121 } 122 123 //---------------------------------------------------------------------------------------- 124 // 125 // doParseAction Do some action during rule parsing. 126 // Called by the parse state machine. 127 // Actions build the parse tree and Unicode Sets, 128 // and maintain the parse stack for nested expressions. 129 // 130 //---------------------------------------------------------------------------------------- 131 boolean doParseActions(int action) { 132 RBBINode n = null; 133 134 boolean returnVal = true; 135 136 switch (action) { 137 138 case RBBIRuleParseTable.doExprStart: 139 pushNewNode(RBBINode.opStart); 140 fRuleNum++; 141 break; 142 143 case RBBIRuleParseTable.doExprOrOperator: { 144 fixOpStack(RBBINode.precOpCat); 145 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 146 RBBINode orNode = pushNewNode(RBBINode.opOr); 147 orNode.fLeftChild = operandNode; 148 operandNode.fParent = orNode; 149 } 150 break; 151 152 case RBBIRuleParseTable.doExprCatOperator: 153 // concatenation operator. 154 // For the implicit concatenation of adjacent terms in an expression 155 // that are 156 // not separated by any other operator. Action is invoked between the 157 // actions for the two terms. 158 { 159 fixOpStack(RBBINode.precOpCat); 160 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 161 RBBINode catNode = pushNewNode(RBBINode.opCat); 162 catNode.fLeftChild = operandNode; 163 operandNode.fParent = catNode; 164 } 165 break; 166 167 case RBBIRuleParseTable.doLParen: 168 // Open Paren. 169 // The openParen node is a dummy operation type with a low 170 // precedence, 171 // which has the affect of ensuring that any real binary op that 172 // follows within the parens binds more tightly to the operands than 173 // stuff outside of the parens. 174 pushNewNode(RBBINode.opLParen); 175 break; 176 177 case RBBIRuleParseTable.doExprRParen: 178 fixOpStack(RBBINode.precLParen); 179 break; 180 181 case RBBIRuleParseTable.doNOP: 182 break; 183 184 case RBBIRuleParseTable.doStartAssign: 185 // We've just scanned "$variable = " 186 // The top of the node stack has the $variable ref node. 187 188 // Save the start position of the RHS text in the StartExpression 189 // node 190 // that precedes the $variableReference node on the stack. 191 // This will eventually be used when saving the full $variable 192 // replacement 193 // text as a string. 194 n = fNodeStack[fNodeStackPtr - 1]; 195 n.fFirstPos = fNextIndex; // move past the '=' 196 197 // Push a new start-of-expression node; needed to keep parse of the 198 // RHS expression happy. 199 pushNewNode(RBBINode.opStart); 200 break; 201 202 case RBBIRuleParseTable.doEndAssign: { 203 // We have reached the end of an assignement statement. 204 // Current scan char is the ';' that terminates the assignment. 205 206 // Terminate expression, leaves expression parse tree rooted in TOS 207 // node. 208 fixOpStack(RBBINode.precStart); 209 210 RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2]; 211 RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1]; 212 RBBINode RHSExprNode = fNodeStack[fNodeStackPtr]; 213 214 // Save original text of right side of assignment, excluding the 215 // terminating ';' 216 // in the root of the node for the right-hand-side expression. 217 RHSExprNode.fFirstPos = startExprNode.fFirstPos; 218 RHSExprNode.fLastPos = fScanIndex; 219 // fRB.fRules.extractBetween(RHSExprNode.fFirstPos, 220 // RHSExprNode.fLastPos, RHSExprNode.fText); 221 RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos, 222 RHSExprNode.fLastPos); 223 224 // Expression parse tree becomes l. child of the $variable reference 225 // node. 226 varRefNode.fLeftChild = RHSExprNode; 227 RHSExprNode.fParent = varRefNode; 228 229 // Make a symbol table entry for the $variableRef node. 230 fSymbolTable.addEntry(varRefNode.fText, varRefNode); 231 232 // Clean up the stack. 233 fNodeStackPtr -= 3; 234 break; 235 } 236 237 case RBBIRuleParseTable.doEndOfRule: { 238 fixOpStack(RBBINode.precStart); // Terminate expression, leaves 239 // expression 240 241 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) { 242 printNodeStack("end of rule"); 243 } 244 Assert.assrt(fNodeStackPtr == 1); 245 246 // If this rule includes a look-ahead '/', add a endMark node to the 247 // expression tree. 248 if (fLookAheadRule) { 249 RBBINode thisRule = fNodeStack[fNodeStackPtr]; 250 RBBINode endNode = pushNewNode(RBBINode.endMark); 251 RBBINode catNode = pushNewNode(RBBINode.opCat); 252 fNodeStackPtr -= 2; 253 catNode.fLeftChild = thisRule; 254 catNode.fRightChild = endNode; 255 fNodeStack[fNodeStackPtr] = catNode; 256 endNode.fVal = fRuleNum; 257 endNode.fLookAheadEnd = true; 258 } 259 260 // All rule expressions are ORed together. 261 // The ';' that terminates an expression really just functions as a 262 // '|' with 263 // a low operator prededence. 264 // 265 // Each of the four sets of rules are collected separately. 266 // (forward, reverse, safe_forward, safe_reverse) 267 // OR this rule into the appropriate group of them. 268 // 269 270 int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree); 271 272 if (fRB.fTreeRoots[destRules] != null) { 273 // This is not the first rule encounted. 274 // OR previous stuff (from *destRules) 275 // with the current rule expression (on the Node Stack) 276 // with the resulting OR expression going to *destRules 277 // 278 RBBINode thisRule = fNodeStack[fNodeStackPtr]; 279 RBBINode prevRules = fRB.fTreeRoots[destRules]; 280 RBBINode orNode = pushNewNode(RBBINode.opOr); 281 orNode.fLeftChild = prevRules; 282 prevRules.fParent = orNode; 283 orNode.fRightChild = thisRule; 284 thisRule.fParent = orNode; 285 fRB.fTreeRoots[destRules] = orNode; 286 } else { 287 // This is the first rule encountered (for this direction). 288 // Just move its parse tree from the stack to *destRules. 289 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr]; 290 } 291 fReverseRule = false; // in preparation for the next rule. 292 fLookAheadRule = false; 293 fNodeStackPtr = 0; 294 } 295 break; 296 297 case RBBIRuleParseTable.doRuleError: 298 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 299 returnVal = false; 300 break; 301 302 case RBBIRuleParseTable.doVariableNameExpectedErr: 303 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 304 break; 305 306 // 307 // Unary operands + ? * 308 // These all appear after the operand to which they apply. 309 // When we hit one, the operand (may be a whole sub expression) 310 // will be on the top of the stack. 311 // Unary Operator becomes TOS, with the old TOS as its one child. 312 case RBBIRuleParseTable.doUnaryOpPlus: { 313 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 314 RBBINode plusNode = pushNewNode(RBBINode.opPlus); 315 plusNode.fLeftChild = operandNode; 316 operandNode.fParent = plusNode; 317 } 318 break; 319 320 case RBBIRuleParseTable.doUnaryOpQuestion: { 321 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 322 RBBINode qNode = pushNewNode(RBBINode.opQuestion); 323 qNode.fLeftChild = operandNode; 324 operandNode.fParent = qNode; 325 } 326 break; 327 328 case RBBIRuleParseTable.doUnaryOpStar: { 329 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 330 RBBINode starNode = pushNewNode(RBBINode.opStar); 331 starNode.fLeftChild = operandNode; 332 operandNode.fParent = starNode; 333 } 334 break; 335 336 case RBBIRuleParseTable.doRuleChar: 337 // A "Rule Character" is any single character that is a literal part 338 // of the regular expression. Like a, b and c in the expression "(abc*) 339 // | [:L:]" 340 // These are pretty uncommon in break rules; the terms are more commonly 341 // sets. To keep things uniform, treat these characters like as 342 // sets that just happen to contain only one character. 343 { 344 n = pushNewNode(RBBINode.setRef); 345 String s = String.valueOf((char)fC.fChar); 346 findSetFor(s, n, null); 347 n.fFirstPos = fScanIndex; 348 n.fLastPos = fNextIndex; 349 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 350 break; 351 } 352 353 case RBBIRuleParseTable.doDotAny: 354 // scanned a ".", meaning match any single character. 355 { 356 n = pushNewNode(RBBINode.setRef); 357 findSetFor(kAny, n, null); 358 n.fFirstPos = fScanIndex; 359 n.fLastPos = fNextIndex; 360 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 361 break; 362 } 363 364 case RBBIRuleParseTable.doSlash: 365 // Scanned a '/', which identifies a look-ahead break position in a 366 // rule. 367 n = pushNewNode(RBBINode.lookAhead); 368 n.fVal = fRuleNum; 369 n.fFirstPos = fScanIndex; 370 n.fLastPos = fNextIndex; 371 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 372 fLookAheadRule = true; 373 break; 374 375 case RBBIRuleParseTable.doStartTagValue: 376 // Scanned a '{', the opening delimiter for a tag value within a 377 // rule. 378 n = pushNewNode(RBBINode.tag); 379 n.fVal = 0; 380 n.fFirstPos = fScanIndex; 381 n.fLastPos = fNextIndex; 382 break; 383 384 case RBBIRuleParseTable.doTagDigit: 385 // Just scanned a decimal digit that's part of a tag value 386 { 387 n = fNodeStack[fNodeStackPtr]; 388 int v = UCharacter.digit((char) fC.fChar, 10); 389 n.fVal = n.fVal * 10 + v; 390 break; 391 } 392 393 case RBBIRuleParseTable.doTagValue: 394 n = fNodeStack[fNodeStackPtr]; 395 n.fLastPos = fNextIndex; 396 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 397 break; 398 399 case RBBIRuleParseTable.doTagExpectedError: 400 error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG); 401 returnVal = false; 402 break; 403 404 case RBBIRuleParseTable.doOptionStart: 405 // Scanning a !!option. At the start of string. 406 fOptionStart = fScanIndex; 407 break; 408 409 case RBBIRuleParseTable.doOptionEnd: { 410 String opt = fRB.fRules.substring(fOptionStart, fScanIndex); 411 if (opt.equals("chain")) { 412 fRB.fChainRules = true; 413 } else if (opt.equals("LBCMNoChain")) { 414 fRB.fLBCMNoChain = true; 415 } else if (opt.equals("forward")) { 416 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree; 417 } else if (opt.equals("reverse")) { 418 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree; 419 } else if (opt.equals("safe_forward")) { 420 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree; 421 } else if (opt.equals("safe_reverse")) { 422 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree; 423 } else if (opt.equals("lookAheadHardBreak")) { 424 fRB.fLookAheadHardBreak = true; 425 } else { 426 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION); 427 } 428 break; 429 } 430 431 case RBBIRuleParseTable.doReverseDir: 432 fReverseRule = true; 433 break; 434 435 case RBBIRuleParseTable.doStartVariableName: 436 n = pushNewNode(RBBINode.varRef); 437 n.fFirstPos = fScanIndex; 438 break; 439 440 case RBBIRuleParseTable.doEndVariableName: 441 n = fNodeStack[fNodeStackPtr]; 442 if (n == null || n.fType != RBBINode.varRef) { 443 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 444 break; 445 } 446 n.fLastPos = fScanIndex; 447 n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos); 448 // Look the newly scanned name up in the symbol table 449 // If there's an entry, set the l. child of the var ref to the 450 // replacement expression. 451 // (We also pass through here when scanning assignments, but no harm 452 // is done, other 453 // than a slight wasted effort that seems hard to avoid. Lookup will 454 // be null) 455 n.fLeftChild = fSymbolTable.lookupNode(n.fText); 456 break; 457 458 case RBBIRuleParseTable.doCheckVarDef: 459 n = fNodeStack[fNodeStackPtr]; 460 if (n.fLeftChild == null) { 461 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE); 462 returnVal = false; 463 } 464 break; 465 466 case RBBIRuleParseTable.doExprFinished: 467 break; 468 469 case RBBIRuleParseTable.doRuleErrorAssignExpr: 470 error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR); 471 returnVal = false; 472 break; 473 474 case RBBIRuleParseTable.doExit: 475 returnVal = false; 476 break; 477 478 case RBBIRuleParseTable.doScanUnicodeSet: 479 scanSet(); 480 break; 481 482 default: 483 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 484 returnVal = false; 485 break; 486 } 487 return returnVal; 488 } 489 490 //---------------------------------------------------------------------------------------- 491 // 492 // Error Throw and IllegalArgumentException in response to a rule parse 493 // error. 494 // 495 //---------------------------------------------------------------------------------------- 496 void error(int e) { 497 String s = "Error " + e + " at line " + fLineNum + " column " 498 + fCharNum; 499 IllegalArgumentException ex = new IllegalArgumentException(s); 500 throw ex; 501 502 } 503 504 //---------------------------------------------------------------------------------------- 505 // 506 // fixOpStack The parse stack holds partially assembled chunks of the parse 507 // tree. 508 // An entry on the stack may be as small as a single setRef node, 509 // or as large as the parse tree 510 // for an entire expression (this will be the one item left on the stack 511 // when the parsing of an RBBI rule completes. 512 // 513 // This function is called when a binary operator is encountered. 514 // It looks back up the stack for operators that are not yet associated 515 // with a right operand, and if the precedence of the stacked operator >= 516 // the precedence of the current operator, binds the operand left, 517 // to the previously encountered operator. 518 // 519 //---------------------------------------------------------------------------------------- 520 void fixOpStack(int p) { 521 RBBINode n; 522 // printNodeStack("entering fixOpStack()"); 523 for (;;) { 524 n = fNodeStack[fNodeStackPtr - 1]; // an operator node 525 if (n.fPrecedence == 0) { 526 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node"); 527 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 528 return; 529 } 530 531 if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) { 532 // The most recent operand goes with the current operator, 533 // not with the previously stacked one. 534 break; 535 } 536 // Stack operator is a binary op ( '|' or concatenation) 537 // TOS operand becomes right child of this operator. 538 // Resulting subexpression becomes the TOS operand. 539 n.fRightChild = fNodeStack[fNodeStackPtr]; 540 fNodeStack[fNodeStackPtr].fParent = n; 541 fNodeStackPtr--; 542 // printNodeStack("looping in fixOpStack() "); 543 } 544 545 if (p <= RBBINode.precLParen) { 546 // Scan is at a right paren or end of expression. 547 // The scanned item must match the stack, or else there was an 548 // error. 549 // Discard the left paren (or start expr) node from the stack, 550 // leaving the completed (sub)expression as TOS. 551 if (n.fPrecedence != p) { 552 // Right paren encountered matched start of expression node, or 553 // end of expression matched with a left paren node. 554 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN); 555 } 556 fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr]; 557 fNodeStackPtr--; 558 // Delete the now-discarded LParen or Start node. 559 // delete n; 560 } 561 // printNodeStack("leaving fixOpStack()"); 562 } 563 564 //---------------------------------------------------------------------------- 565 // 566 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 567 // been encountered. The val Node will be of nodetype uset 568 // and contain pointers to the actual UnicodeSets. 569 // The Key is the source string for initializing the set. 570 // 571 // The hash table is used to avoid creating duplicate 572 // unnamed (not $var references) UnicodeSets. 573 // 574 //---------------------------------------------------------------------------- 575 static class RBBISetTableEl { 576 String key; 577 578 RBBINode val; 579 } 580 581 582 //---------------------------------------------------------------------------------------- 583 // 584 // findSetFor given a String, 585 // - find the corresponding Unicode Set (uset node) 586 // (create one if necessary) 587 // - Set fLeftChild of the caller's node (should be a setRef node) 588 // to the uset node 589 // Maintain a hash table of uset nodes, so the same one is always used 590 // for the same string. 591 // If a "to adopt" set is provided and we haven't seen this key before, 592 // add the provided set to the hash table. 593 // If the string is one (32 bit) char in length, the set contains 594 // just one element which is the char in question. 595 // If the string is "any", return a set containing all chars. 596 // 597 //---------------------------------------------------------------------------------------- 598 void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) { 599 600 RBBISetTableEl el; 601 602 // First check whether we've already cached a set for this string. 603 // If so, just use the cached set in the new node. 604 // delete any set provided by the caller, since we own it. 605 el = fSetTable.get(s); 606 if (el != null) { 607 node.fLeftChild = el.val; 608 Assert.assrt(node.fLeftChild.fType == RBBINode.uset); 609 return; 610 } 611 612 // Haven't seen this set before. 613 // If the caller didn't provide us with a prebuilt set, 614 // create a new UnicodeSet now. 615 if (setToAdopt == null) { 616 if (s.equals(kAny)) { 617 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 618 } else { 619 int c; 620 c = UTF16.charAt(s, 0); 621 setToAdopt = new UnicodeSet(c, c); 622 } 623 } 624 625 // 626 // Make a new uset node to refer to this UnicodeSet 627 // This new uset node becomes the child of the caller's setReference 628 // node. 629 // 630 RBBINode usetNode = new RBBINode(RBBINode.uset); 631 usetNode.fInputSet = setToAdopt; 632 usetNode.fParent = node; 633 node.fLeftChild = usetNode; 634 usetNode.fText = s; 635 636 // 637 // Add the new uset node to the list of all uset nodes. 638 // 639 fRB.fUSetNodes.add(usetNode); 640 641 // 642 // Add the new set to the set hash table. 643 // 644 el = new RBBISetTableEl(); 645 el.key = s; 646 el.val = usetNode; 647 fSetTable.put(el.key, el); 648 649 return; 650 } 651 652 // 653 // Assorted Unicode character constants. 654 // Numeric because there is no portable way to enter them as literals. 655 // (Think EBCDIC). 656 // 657 static final int chNEL = 0x85; // NEL newline variant 658 659 static final int chLS = 0x2028; // Unicode Line Separator 660 661 //---------------------------------------------------------------------------------------- 662 // 663 // stripRules Return a rules string without unnecessary 664 // characters. 665 // 666 //---------------------------------------------------------------------------------------- 667 static String stripRules(String rules) { 668 StringBuilder strippedRules = new StringBuilder(); 669 int rulesLength = rules.length(); 670 for (int idx = 0; idx < rulesLength;) { 671 char ch = rules.charAt(idx++); 672 if (ch == '#') { 673 while (idx < rulesLength 674 && ch != '\r' && ch != '\n' && ch != chNEL) { 675 ch = rules.charAt(idx++); 676 } 677 } 678 if (!UCharacter.isISOControl(ch)) { 679 strippedRules.append(ch); 680 } 681 } 682 return strippedRules.toString(); 683 } 684 685 //---------------------------------------------------------------------------------------- 686 // 687 // nextCharLL Low Level Next Char from rule input source. 688 // Get a char from the input character iterator, 689 // keep track of input position for error reporting. 690 // 691 //---------------------------------------------------------------------------------------- 692 int nextCharLL() { 693 int ch; 694 695 if (fNextIndex >= fRB.fRules.length()) { 696 return -1; 697 } 698 ch = UTF16.charAt(fRB.fRules, fNextIndex); 699 fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1); 700 701 if (ch == '\r' || 702 ch == chNEL || 703 ch == chLS || 704 ch == '\n' && fLastChar != '\r') { 705 // Character is starting a new line. Bump up the line number, and 706 // reset the column to 0. 707 fLineNum++; 708 fCharNum = 0; 709 if (fQuoteMode) { 710 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING); 711 fQuoteMode = false; 712 } 713 } else { 714 // Character is not starting a new line. Except in the case of a 715 // LF following a CR, increment the column position. 716 if (ch != '\n') { 717 fCharNum++; 718 } 719 } 720 fLastChar = ch; 721 return ch; 722 } 723 724 //--------------------------------------------------------------------------------- 725 // 726 // nextChar for rules scanning. At this level, we handle stripping 727 // out comments and processing backslash character escapes. 728 // The rest of the rules grammar is handled at the next level up. 729 // 730 //--------------------------------------------------------------------------------- 731 void nextChar(RBBIRuleChar c) { 732 733 // Unicode Character constants needed for the processing done by nextChar(), 734 // in hex because literals wont work on EBCDIC machines. 735 736 fScanIndex = fNextIndex; 737 c.fChar = nextCharLL(); 738 c.fEscaped = false; 739 740 // 741 // check for '' sequence. 742 // These are recognized in all contexts, whether in quoted text or not. 743 // 744 if (c.fChar == '\'') { 745 if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') { 746 c.fChar = nextCharLL(); // get nextChar officially so character counts 747 c.fEscaped = true; // stay correct. 748 } else { 749 // Single quote, by itself. 750 // Toggle quoting mode. 751 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 752 fQuoteMode = !fQuoteMode; 753 if (fQuoteMode == true) { 754 c.fChar = '('; 755 } else { 756 c.fChar = ')'; 757 } 758 c.fEscaped = false; // The paren that we return is not escaped. 759 return; 760 } 761 } 762 763 if (fQuoteMode) { 764 c.fEscaped = true; 765 } else { 766 // We are not in a 'quoted region' of the source. 767 // 768 if (c.fChar == '#') { 769 // Start of a comment. Consume the rest of it. 770 // The new-line char that terminates the comment is always returned. 771 // It will be treated as white-space, and serves to break up anything 772 // that might otherwise incorrectly clump together with a comment in 773 // the middle (a variable name, for example.) 774 for (;;) { 775 c.fChar = nextCharLL(); 776 if (c.fChar == -1 || // EOF 777 c.fChar == '\r' || 778 c.fChar == '\n' || 779 c.fChar == chNEL || 780 c.fChar == chLS) 781 { 782 break; 783 } 784 } 785 } 786 if (c.fChar == -1) { 787 return; 788 } 789 790 // 791 // check for backslash escaped characters. 792 // Use String.unescapeAt() to handle them. 793 // 794 if (c.fChar == '\\') { 795 c.fEscaped = true; 796 int[] unescapeIndex = new int[1]; 797 unescapeIndex[0] = fNextIndex; 798 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex); 799 if (unescapeIndex[0] == fNextIndex) { 800 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED); 801 } 802 803 fCharNum += unescapeIndex[0] - fNextIndex; 804 fNextIndex = unescapeIndex[0]; 805 } 806 } 807 // putc(c.fChar, stdout); 808 } 809 810 //--------------------------------------------------------------------------------- 811 // 812 // Parse RBBI rules. The state machine for rules parsing is here. 813 // The state tables are hand-written in the file rbbirpt.txt, 814 // and converted to the form used here by a perl 815 // script rbbicst.pl 816 // 817 //--------------------------------------------------------------------------------- 818 void parse() { 819 int state; 820 RBBIRuleParseTable.RBBIRuleTableElement tableEl; 821 822 state = 1; 823 nextChar(fC); 824 // 825 // Main loop for the rule parsing state machine. 826 // Runs once per state transition. 827 // Each time through optionally performs, depending on the state table, 828 // - an advance to the the next input char 829 // - an action to be performed. 830 // - pushing or popping a state to/from the local state return stack. 831 // 832 for (;;) { 833 // Quit if state == 0. This is the normal way to exit the state machine. 834 // 835 if (state == 0) { 836 break; 837 } 838 839 // Find the state table element that matches the input char from the rule, or the 840 // class of the input character. Start with the first table row for this 841 // state, then linearly scan forward until we find a row that matches the 842 // character. The last row for each state always matches all characters, so 843 // the search will stop there, if not before. 844 // 845 tableEl = RBBIRuleParseTable.gRuleParseStateTable[state]; 846 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 847 System.out.println("char, line, col = (\'" + (char) fC.fChar 848 + "\', " + fLineNum + ", " + fCharNum + " state = " 849 + tableEl.fStateName); 850 } 851 852 for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state. 853 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow]; 854 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 855 System.out.print("."); 856 } 857 if (tableEl.fCharClass < 127 && fC.fEscaped == false 858 && tableEl.fCharClass == fC.fChar) { 859 // Table row specified an individual character, not a set, and 860 // the input character is not escaped, and 861 // the input character matched it. 862 break; 863 } 864 if (tableEl.fCharClass == 255) { 865 // Table row specified default, match anything character class. 866 break; 867 } 868 if (tableEl.fCharClass == 254 && fC.fEscaped) { 869 // Table row specified "escaped" and the char was escaped. 870 break; 871 } 872 if (tableEl.fCharClass == 253 && fC.fEscaped 873 && (fC.fChar == 0x50 || fC.fChar == 0x70)) { 874 // Table row specified "escaped P" and the char is either 'p' or 'P'. 875 break; 876 } 877 if (tableEl.fCharClass == 252 && fC.fChar == -1) { 878 // Table row specified eof and we hit eof on the input. 879 break; 880 } 881 882 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class && 883 fC.fEscaped == false && // char is not escaped && 884 fC.fChar != -1) { // char is not EOF 885 UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128]; 886 if (uniset.contains(fC.fChar)) { 887 // Table row specified a character class, or set of characters, 888 // and the current char matches it. 889 break; 890 } 891 } 892 } 893 894 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 895 System.out.println(""); 896 } 897 // 898 // We've found the row of the state table that matches the current input 899 // character from the rules string. 900 // Perform any action specified by this row in the state table. 901 if (doParseActions(tableEl.fAction) == false) { 902 // Break out of the state machine loop if the 903 // the action signalled some kind of error, or 904 // the action was to exit, occurs on normal end-of-rules-input. 905 break; 906 } 907 908 if (tableEl.fPushState != 0) { 909 fStackPtr++; 910 if (fStackPtr >= kStackSize) { 911 System.out.println("RBBIRuleScanner.parse() - state stack overflow."); 912 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 913 } 914 fStack[fStackPtr] = tableEl.fPushState; 915 } 916 917 if (tableEl.fNextChar) { 918 nextChar(fC); 919 } 920 921 // Get the next state from the table entry, or from the 922 // state stack if the next state was specified as "pop". 923 if (tableEl.fNextState != 255) { 924 state = tableEl.fNextState; 925 } else { 926 state = fStack[fStackPtr]; 927 fStackPtr--; 928 if (fStackPtr < 0) { 929 System.out.println("RBBIRuleScanner.parse() - state stack underflow."); 930 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 931 } 932 } 933 934 } 935 936 // 937 // If there were NO user specified reverse rules, set up the equivalent of ".*;" 938 // 939 if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) { 940 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar); 941 RBBINode operand = pushNewNode(RBBINode.setRef); 942 findSetFor(kAny, operand, null); 943 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand; 944 operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree]; 945 fNodeStackPtr -= 2; 946 } 947 948 // 949 // Parsing of the input RBBI rules is complete. 950 // We now have a parse tree for the rule expressions 951 // and a list of all UnicodeSets that are referenced. 952 // 953 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) { 954 fSymbolTable.rbbiSymtablePrint(); 955 } 956 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) { 957 System.out.println("Completed Forward Rules Parse Tree..."); 958 fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true); 959 System.out.println("\nCompleted Reverse Rules Parse Tree..."); 960 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true); 961 System.out.println("\nCompleted Safe Point Forward Rules Parse Tree..."); 962 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) { 963 System.out.println(" -- null -- "); 964 } else { 965 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true); 966 } 967 System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree..."); 968 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) { 969 System.out.println(" -- null -- "); 970 } else { 971 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true); 972 } 973 } 974 } 975 976 //--------------------------------------------------------------------------------- 977 // 978 // printNodeStack for debugging... 979 // 980 //--------------------------------------------------------------------------------- 981 ///CLOVER:OFF 982 void printNodeStack(String title) { 983 int i; 984 System.out.println(title + ". Dumping node stack...\n"); 985 for (i = fNodeStackPtr; i > 0; i--) { 986 fNodeStack[i].printTree(true); 987 } 988 } 989 ///CLOVER:ON 990 991 //--------------------------------------------------------------------------------- 992 // 993 // pushNewNode create a new RBBINode of the specified type and push it 994 // onto the stack of nodes. 995 // 996 //--------------------------------------------------------------------------------- 997 RBBINode pushNewNode(int nodeType) { 998 fNodeStackPtr++; 999 if (fNodeStackPtr >= kStackSize) { 1000 System.out.println("RBBIRuleScanner.pushNewNode - stack overflow."); 1001 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 1002 } 1003 fNodeStack[fNodeStackPtr] = new RBBINode(nodeType); 1004 return fNodeStack[fNodeStackPtr]; 1005 } 1006 1007 //--------------------------------------------------------------------------------- 1008 // 1009 // scanSet Construct a UnicodeSet from the text at the current scan 1010 // position. Advance the scan position to the first character 1011 // after the set. 1012 // 1013 // A new RBBI setref node referring to the set is pushed onto the node 1014 // stack. 1015 // 1016 // The scan position is normally under the control of the state machine 1017 // that controls rule parsing. UnicodeSets, however, are parsed by 1018 // the UnicodeSet constructor, not by the RBBI rule parser. 1019 // 1020 //--------------------------------------------------------------------------------- 1021 void scanSet() { 1022 UnicodeSet uset = null; 1023 int startPos; 1024 ParsePosition pos = new ParsePosition(fScanIndex); 1025 int i; 1026 1027 startPos = fScanIndex; 1028 try { 1029 uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE); 1030 } catch (Exception e) { // TODO: catch fewer exception types. 1031 // Repackage UnicodeSet errors as RBBI rule builder errors, with location info. 1032 error(RBBIRuleBuilder.U_BRK_MALFORMED_SET); 1033 } 1034 1035 // Verify that the set contains at least one code point. 1036 // 1037 if (uset.isEmpty()) { 1038 // This set is empty. 1039 // Make it an error, because it almost certainly is not what the user wanted. 1040 // Also, avoids having to think about corner cases in the tree manipulation code 1041 // that occurs later on. 1042 // TODO: this shouldn't be an error; it does happen. 1043 error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET); 1044 } 1045 1046 // Advance the RBBI parse postion over the UnicodeSet pattern. 1047 // Don't just set fScanIndex because the line/char positions maintained 1048 // for error reporting would be thrown off. 1049 i = pos.getIndex(); 1050 for (;;) { 1051 if (fNextIndex >= i) { 1052 break; 1053 } 1054 nextCharLL(); 1055 } 1056 1057 RBBINode n; 1058 1059 n = pushNewNode(RBBINode.setRef); 1060 n.fFirstPos = startPos; 1061 n.fLastPos = fNextIndex; 1062 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 1063 // findSetFor() serves several purposes here: 1064 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1065 // - Mantains collection of all sets in use, needed later for establishing 1066 // character categories for run time engine. 1067 // - Eliminates mulitiple instances of the same set. 1068 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1069 findSetFor(n.fText, n, uset); 1070 } 1071 1072 } 1073 1074