1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // file: rbbiscan.cpp 5 // 6 // Copyright (C) 2002-2016, International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains the Rule Based Break Iterator Rule Builder functions for 10 // scanning the rules and assembling a parse tree. This is the first phase 11 // of compiling the rules. 12 // 13 // The overall of the rules is managed by class RBBIRuleBuilder, which will 14 // create and use an instance of this class as part of the process. 15 // 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_BREAK_ITERATION 20 21 #include "unicode/unistr.h" 22 #include "unicode/uniset.h" 23 #include "unicode/uchar.h" 24 #include "unicode/uchriter.h" 25 #include "unicode/parsepos.h" 26 #include "unicode/parseerr.h" 27 #include "cmemory.h" 28 #include "cstring.h" 29 30 #include "rbbirpt.h" // Contains state table for the rbbi rules parser. 31 // generated by a Perl script. 32 #include "rbbirb.h" 33 #include "rbbinode.h" 34 #include "rbbiscan.h" 35 #include "rbbitblb.h" 36 37 #include "uassert.h" 38 39 //------------------------------------------------------------------------------ 40 // 41 // Unicode Set init strings for each of the character classes needed for parsing a rule file. 42 // (Initialized with hex values for portability to EBCDIC based machines. 43 // Really ugly, but there's no good way to avoid it.) 44 // 45 // The sets are referred to by name in the rbbirpt.txt, which is the 46 // source form of the state transition table for the RBBI rule parser. 47 // 48 //------------------------------------------------------------------------------ 49 static const UChar gRuleSet_rule_char_pattern[] = { 50 // [ ^ [ \ p { Z } \ u 0 0 2 0 51 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30, 52 // - \ u 0 0 7 f ] - [ \ p 53 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 54 // { L } ] - [ \ p { N } ] ] 55 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0}; 56 57 static const UChar gRuleSet_name_char_pattern[] = { 58 // [ _ \ p { L } \ p { N } ] 59 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0}; 60 61 static const UChar gRuleSet_digit_char_pattern[] = { 62 // [ 0 - 9 ] 63 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; 64 65 static const UChar gRuleSet_name_start_char_pattern[] = { 66 // [ _ \ p { L } ] 67 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 }; 68 69 static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any" 70 71 72 U_CDECL_BEGIN 73 static void U_CALLCONV RBBISetTable_deleter(void *p) { 74 icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p; 75 delete px->key; 76 // Note: px->val is owned by the linked list "fSetsListHead" in scanner. 77 // Don't delete the value nodes here. 78 uprv_free(px); 79 } 80 U_CDECL_END 81 82 U_NAMESPACE_BEGIN 83 84 //------------------------------------------------------------------------------ 85 // 86 // Constructor. 87 // 88 //------------------------------------------------------------------------------ 89 RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) 90 { 91 fRB = rb; 92 fScanIndex = 0; 93 fNextIndex = 0; 94 fQuoteMode = FALSE; 95 fLineNum = 1; 96 fCharNum = 0; 97 fLastChar = 0; 98 99 fStateTable = NULL; 100 fStack[0] = 0; 101 fStackPtr = 0; 102 fNodeStack[0] = NULL; 103 fNodeStackPtr = 0; 104 105 fReverseRule = FALSE; 106 fLookAheadRule = FALSE; 107 fNoChainInRule = FALSE; 108 109 fSymbolTable = NULL; 110 fSetTable = NULL; 111 fRuleNum = 0; 112 fOptionStart = 0; 113 114 // Do not check status until after all critical fields are sufficiently initialized 115 // that the destructor can run cleanly. 116 if (U_FAILURE(*rb->fStatus)) { 117 return; 118 } 119 120 // 121 // Set up the constant Unicode Sets. 122 // Note: These could be made static, lazily initialized, and shared among 123 // all instances of RBBIRuleScanners. BUT this is quite a bit simpler, 124 // and the time to build these few sets should be small compared to a 125 // full break iterator build. 126 fRuleSets[kRuleSet_rule_char-128] 127 = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus); 128 // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:] 129 fRuleSets[kRuleSet_white_space-128]. 130 add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029); 131 fRuleSets[kRuleSet_name_char-128] 132 = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus); 133 fRuleSets[kRuleSet_name_start_char-128] 134 = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus); 135 fRuleSets[kRuleSet_digit_char-128] 136 = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus); 137 if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) { 138 // This case happens if ICU's data is missing. UnicodeSet tries to look up property 139 // names from the init string, can't find them, and claims an illegal argument. 140 // Change the error so that the actual problem will be clearer to users. 141 *rb->fStatus = U_BRK_INIT_ERROR; 142 } 143 if (U_FAILURE(*rb->fStatus)) { 144 return; 145 } 146 147 fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus); 148 if (fSymbolTable == NULL) { 149 *rb->fStatus = U_MEMORY_ALLOCATION_ERROR; 150 return; 151 } 152 fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus); 153 if (U_FAILURE(*rb->fStatus)) { 154 return; 155 } 156 uhash_setValueDeleter(fSetTable, RBBISetTable_deleter); 157 } 158 159 160 161 //------------------------------------------------------------------------------ 162 // 163 // Destructor 164 // 165 //------------------------------------------------------------------------------ 166 RBBIRuleScanner::~RBBIRuleScanner() { 167 delete fSymbolTable; 168 if (fSetTable != NULL) { 169 uhash_close(fSetTable); 170 fSetTable = NULL; 171 172 } 173 174 175 // Node Stack. 176 // Normally has one entry, which is the entire parse tree for the rules. 177 // If errors occured, there may be additional subtrees left on the stack. 178 while (fNodeStackPtr > 0) { 179 delete fNodeStack[fNodeStackPtr]; 180 fNodeStackPtr--; 181 } 182 183 } 184 185 //------------------------------------------------------------------------------ 186 // 187 // doParseAction Do some action during rule parsing. 188 // Called by the parse state machine. 189 // Actions build the parse tree and Unicode Sets, 190 // and maintain the parse stack for nested expressions. 191 // 192 // TODO: unify EParseAction and RBBI_RuleParseAction enum types. 193 // They represent exactly the same thing. They're separate 194 // only to work around enum forward declaration restrictions 195 // in some compilers, while at the same time avoiding multiple 196 // definitions problems. I'm sure that there's a better way. 197 // 198 //------------------------------------------------------------------------------ 199 UBool RBBIRuleScanner::doParseActions(int32_t action) 200 { 201 RBBINode *n = NULL; 202 203 UBool returnVal = TRUE; 204 205 switch (action) { 206 207 case doExprStart: 208 pushNewNode(RBBINode::opStart); 209 fRuleNum++; 210 break; 211 212 213 case doNoChain: 214 // Scanned a '^' while on the rule start state. 215 fNoChainInRule = TRUE; 216 break; 217 218 219 case doExprOrOperator: 220 { 221 fixOpStack(RBBINode::precOpCat); 222 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 223 RBBINode *orNode = pushNewNode(RBBINode::opOr); 224 if (U_FAILURE(*fRB->fStatus)) { 225 break; 226 } 227 orNode->fLeftChild = operandNode; 228 operandNode->fParent = orNode; 229 } 230 break; 231 232 case doExprCatOperator: 233 // concatenation operator. 234 // For the implicit concatenation of adjacent terms in an expression that are 235 // not separated by any other operator. Action is invoked between the 236 // actions for the two terms. 237 { 238 fixOpStack(RBBINode::precOpCat); 239 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 240 RBBINode *catNode = pushNewNode(RBBINode::opCat); 241 if (U_FAILURE(*fRB->fStatus)) { 242 break; 243 } 244 catNode->fLeftChild = operandNode; 245 operandNode->fParent = catNode; 246 } 247 break; 248 249 case doLParen: 250 // Open Paren. 251 // The openParen node is a dummy operation type with a low precedence, 252 // which has the affect of ensuring that any real binary op that 253 // follows within the parens binds more tightly to the operands than 254 // stuff outside of the parens. 255 pushNewNode(RBBINode::opLParen); 256 break; 257 258 case doExprRParen: 259 fixOpStack(RBBINode::precLParen); 260 break; 261 262 case doNOP: 263 break; 264 265 case doStartAssign: 266 // We've just scanned "$variable = " 267 // The top of the node stack has the $variable ref node. 268 269 // Save the start position of the RHS text in the StartExpression node 270 // that precedes the $variableReference node on the stack. 271 // This will eventually be used when saving the full $variable replacement 272 // text as a string. 273 n = fNodeStack[fNodeStackPtr-1]; 274 n->fFirstPos = fNextIndex; // move past the '=' 275 276 // Push a new start-of-expression node; needed to keep parse of the 277 // RHS expression happy. 278 pushNewNode(RBBINode::opStart); 279 break; 280 281 282 283 284 case doEndAssign: 285 { 286 // We have reached the end of an assignement statement. 287 // Current scan char is the ';' that terminates the assignment. 288 289 // Terminate expression, leaves expression parse tree rooted in TOS node. 290 fixOpStack(RBBINode::precStart); 291 292 RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2]; 293 RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1]; 294 RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr]; 295 296 // Save original text of right side of assignment, excluding the terminating ';' 297 // in the root of the node for the right-hand-side expression. 298 RHSExprNode->fFirstPos = startExprNode->fFirstPos; 299 RHSExprNode->fLastPos = fScanIndex; 300 fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText); 301 302 // Expression parse tree becomes l. child of the $variable reference node. 303 varRefNode->fLeftChild = RHSExprNode; 304 RHSExprNode->fParent = varRefNode; 305 306 // Make a symbol table entry for the $variableRef node. 307 fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus); 308 if (U_FAILURE(*fRB->fStatus)) { 309 // This is a round-about way to get the parse position set 310 // so that duplicate symbols error messages include a line number. 311 UErrorCode t = *fRB->fStatus; 312 *fRB->fStatus = U_ZERO_ERROR; 313 error(t); 314 } 315 316 // Clean up the stack. 317 delete startExprNode; 318 fNodeStackPtr-=3; 319 break; 320 } 321 322 case doEndOfRule: 323 { 324 fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression 325 if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node. 326 break; 327 } 328 #ifdef RBBI_DEBUG 329 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");} 330 #endif 331 U_ASSERT(fNodeStackPtr == 1); 332 RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 333 334 // If this rule includes a look-ahead '/', add a endMark node to the 335 // expression tree. 336 if (fLookAheadRule) { 337 RBBINode *endNode = pushNewNode(RBBINode::endMark); 338 RBBINode *catNode = pushNewNode(RBBINode::opCat); 339 if (U_FAILURE(*fRB->fStatus)) { 340 break; 341 } 342 fNodeStackPtr -= 2; 343 catNode->fLeftChild = thisRule; 344 catNode->fRightChild = endNode; 345 fNodeStack[fNodeStackPtr] = catNode; 346 endNode->fVal = fRuleNum; 347 endNode->fLookAheadEnd = TRUE; 348 thisRule = catNode; 349 350 // TODO: Disable chaining out of look-ahead (hard break) rules. 351 // The break on rule match is forced, so there is no point in building up 352 // the state table to chain into another rule for a longer match. 353 } 354 355 // Mark this node as being the root of a rule. 356 thisRule->fRuleRoot = TRUE; 357 358 // Flag if chaining into this rule is wanted. 359 // 360 if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain 361 !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule 362 thisRule->fChainIn = TRUE; 363 } 364 365 366 // All rule expressions are ORed together. 367 // The ';' that terminates an expression really just functions as a '|' with 368 // a low operator prededence. 369 // 370 // Each of the four sets of rules are collected separately. 371 // (forward, reverse, safe_forward, safe_reverse) 372 // OR this rule into the appropriate group of them. 373 // 374 RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree); 375 376 if (*destRules != NULL) { 377 // This is not the first rule encounted. 378 // OR previous stuff (from *destRules) 379 // with the current rule expression (on the Node Stack) 380 // with the resulting OR expression going to *destRules 381 // 382 RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 383 RBBINode *prevRules = *destRules; 384 RBBINode *orNode = pushNewNode(RBBINode::opOr); 385 if (U_FAILURE(*fRB->fStatus)) { 386 break; 387 } 388 orNode->fLeftChild = prevRules; 389 prevRules->fParent = orNode; 390 orNode->fRightChild = thisRule; 391 thisRule->fParent = orNode; 392 *destRules = orNode; 393 } 394 else 395 { 396 // This is the first rule encountered (for this direction). 397 // Just move its parse tree from the stack to *destRules. 398 *destRules = fNodeStack[fNodeStackPtr]; 399 } 400 fReverseRule = FALSE; // in preparation for the next rule. 401 fLookAheadRule = FALSE; 402 fNoChainInRule = FALSE; 403 fNodeStackPtr = 0; 404 } 405 break; 406 407 408 case doRuleError: 409 error(U_BRK_RULE_SYNTAX); 410 returnVal = FALSE; 411 break; 412 413 414 case doVariableNameExpectedErr: 415 error(U_BRK_RULE_SYNTAX); 416 break; 417 418 419 // 420 // Unary operands + ? * 421 // These all appear after the operand to which they apply. 422 // When we hit one, the operand (may be a whole sub expression) 423 // will be on the top of the stack. 424 // Unary Operator becomes TOS, with the old TOS as its one child. 425 case doUnaryOpPlus: 426 { 427 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 428 RBBINode *plusNode = pushNewNode(RBBINode::opPlus); 429 if (U_FAILURE(*fRB->fStatus)) { 430 break; 431 } 432 plusNode->fLeftChild = operandNode; 433 operandNode->fParent = plusNode; 434 } 435 break; 436 437 case doUnaryOpQuestion: 438 { 439 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 440 RBBINode *qNode = pushNewNode(RBBINode::opQuestion); 441 if (U_FAILURE(*fRB->fStatus)) { 442 break; 443 } 444 qNode->fLeftChild = operandNode; 445 operandNode->fParent = qNode; 446 } 447 break; 448 449 case doUnaryOpStar: 450 { 451 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 452 RBBINode *starNode = pushNewNode(RBBINode::opStar); 453 if (U_FAILURE(*fRB->fStatus)) { 454 break; 455 } 456 starNode->fLeftChild = operandNode; 457 operandNode->fParent = starNode; 458 } 459 break; 460 461 case doRuleChar: 462 // A "Rule Character" is any single character that is a literal part 463 // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]" 464 // These are pretty uncommon in break rules; the terms are more commonly 465 // sets. To keep things uniform, treat these characters like as 466 // sets that just happen to contain only one character. 467 { 468 n = pushNewNode(RBBINode::setRef); 469 if (U_FAILURE(*fRB->fStatus)) { 470 break; 471 } 472 findSetFor(UnicodeString(fC.fChar), n); 473 n->fFirstPos = fScanIndex; 474 n->fLastPos = fNextIndex; 475 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 476 break; 477 } 478 479 case doDotAny: 480 // scanned a ".", meaning match any single character. 481 { 482 n = pushNewNode(RBBINode::setRef); 483 if (U_FAILURE(*fRB->fStatus)) { 484 break; 485 } 486 findSetFor(UnicodeString(TRUE, kAny, 3), n); 487 n->fFirstPos = fScanIndex; 488 n->fLastPos = fNextIndex; 489 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 490 break; 491 } 492 493 case doSlash: 494 // Scanned a '/', which identifies a look-ahead break position in a rule. 495 n = pushNewNode(RBBINode::lookAhead); 496 if (U_FAILURE(*fRB->fStatus)) { 497 break; 498 } 499 n->fVal = fRuleNum; 500 n->fFirstPos = fScanIndex; 501 n->fLastPos = fNextIndex; 502 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 503 fLookAheadRule = TRUE; 504 break; 505 506 507 case doStartTagValue: 508 // Scanned a '{', the opening delimiter for a tag value within a rule. 509 n = pushNewNode(RBBINode::tag); 510 if (U_FAILURE(*fRB->fStatus)) { 511 break; 512 } 513 n->fVal = 0; 514 n->fFirstPos = fScanIndex; 515 n->fLastPos = fNextIndex; 516 break; 517 518 case doTagDigit: 519 // Just scanned a decimal digit that's part of a tag value 520 { 521 n = fNodeStack[fNodeStackPtr]; 522 uint32_t v = u_charDigitValue(fC.fChar); 523 U_ASSERT(v < 10); 524 n->fVal = n->fVal*10 + v; 525 break; 526 } 527 528 case doTagValue: 529 n = fNodeStack[fNodeStackPtr]; 530 n->fLastPos = fNextIndex; 531 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 532 break; 533 534 case doTagExpectedError: 535 error(U_BRK_MALFORMED_RULE_TAG); 536 returnVal = FALSE; 537 break; 538 539 case doOptionStart: 540 // Scanning a !!option. At the start of string. 541 fOptionStart = fScanIndex; 542 break; 543 544 case doOptionEnd: 545 { 546 UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart); 547 if (opt == UNICODE_STRING("chain", 5)) { 548 fRB->fChainRules = TRUE; 549 } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) { 550 fRB->fLBCMNoChain = TRUE; 551 } else if (opt == UNICODE_STRING("forward", 7)) { 552 fRB->fDefaultTree = &fRB->fForwardTree; 553 } else if (opt == UNICODE_STRING("reverse", 7)) { 554 fRB->fDefaultTree = &fRB->fReverseTree; 555 } else if (opt == UNICODE_STRING("safe_forward", 12)) { 556 fRB->fDefaultTree = &fRB->fSafeFwdTree; 557 } else if (opt == UNICODE_STRING("safe_reverse", 12)) { 558 fRB->fDefaultTree = &fRB->fSafeRevTree; 559 } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) { 560 fRB->fLookAheadHardBreak = TRUE; 561 } else { 562 error(U_BRK_UNRECOGNIZED_OPTION); 563 } 564 } 565 break; 566 567 case doReverseDir: 568 fReverseRule = TRUE; 569 break; 570 571 case doStartVariableName: 572 n = pushNewNode(RBBINode::varRef); 573 if (U_FAILURE(*fRB->fStatus)) { 574 break; 575 } 576 n->fFirstPos = fScanIndex; 577 break; 578 579 case doEndVariableName: 580 n = fNodeStack[fNodeStackPtr]; 581 if (n==NULL || n->fType != RBBINode::varRef) { 582 error(U_BRK_INTERNAL_ERROR); 583 break; 584 } 585 n->fLastPos = fScanIndex; 586 fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText); 587 // Look the newly scanned name up in the symbol table 588 // If there's an entry, set the l. child of the var ref to the replacement expression. 589 // (We also pass through here when scanning assignments, but no harm is done, other 590 // than a slight wasted effort that seems hard to avoid. Lookup will be null) 591 n->fLeftChild = fSymbolTable->lookupNode(n->fText); 592 break; 593 594 case doCheckVarDef: 595 n = fNodeStack[fNodeStackPtr]; 596 if (n->fLeftChild == NULL) { 597 error(U_BRK_UNDEFINED_VARIABLE); 598 returnVal = FALSE; 599 } 600 break; 601 602 case doExprFinished: 603 break; 604 605 case doRuleErrorAssignExpr: 606 error(U_BRK_ASSIGN_ERROR); 607 returnVal = FALSE; 608 break; 609 610 case doExit: 611 returnVal = FALSE; 612 break; 613 614 case doScanUnicodeSet: 615 scanSet(); 616 break; 617 618 default: 619 error(U_BRK_INTERNAL_ERROR); 620 returnVal = FALSE; 621 break; 622 } 623 return returnVal && U_SUCCESS(*fRB->fStatus); 624 } 625 626 627 628 629 //------------------------------------------------------------------------------ 630 // 631 // Error Report a rule parse error. 632 // Only report it if no previous error has been recorded. 633 // 634 //------------------------------------------------------------------------------ 635 void RBBIRuleScanner::error(UErrorCode e) { 636 if (U_SUCCESS(*fRB->fStatus)) { 637 *fRB->fStatus = e; 638 if (fRB->fParseError) { 639 fRB->fParseError->line = fLineNum; 640 fRB->fParseError->offset = fCharNum; 641 fRB->fParseError->preContext[0] = 0; 642 fRB->fParseError->postContext[0] = 0; 643 } 644 } 645 } 646 647 648 649 650 //------------------------------------------------------------------------------ 651 // 652 // fixOpStack The parse stack holds partially assembled chunks of the parse tree. 653 // An entry on the stack may be as small as a single setRef node, 654 // or as large as the parse tree 655 // for an entire expression (this will be the one item left on the stack 656 // when the parsing of an RBBI rule completes. 657 // 658 // This function is called when a binary operator is encountered. 659 // It looks back up the stack for operators that are not yet associated 660 // with a right operand, and if the precedence of the stacked operator >= 661 // the precedence of the current operator, binds the operand left, 662 // to the previously encountered operator. 663 // 664 //------------------------------------------------------------------------------ 665 void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { 666 RBBINode *n; 667 // printNodeStack("entering fixOpStack()"); 668 for (;;) { 669 n = fNodeStack[fNodeStackPtr-1]; // an operator node 670 if (n->fPrecedence == 0) { 671 RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node"); 672 error(U_BRK_INTERNAL_ERROR); 673 return; 674 } 675 676 if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { 677 // The most recent operand goes with the current operator, 678 // not with the previously stacked one. 679 break; 680 } 681 // Stack operator is a binary op ( '|' or concatenation) 682 // TOS operand becomes right child of this operator. 683 // Resulting subexpression becomes the TOS operand. 684 n->fRightChild = fNodeStack[fNodeStackPtr]; 685 fNodeStack[fNodeStackPtr]->fParent = n; 686 fNodeStackPtr--; 687 // printNodeStack("looping in fixOpStack() "); 688 } 689 690 if (p <= RBBINode::precLParen) { 691 // Scan is at a right paren or end of expression. 692 // The scanned item must match the stack, or else there was an error. 693 // Discard the left paren (or start expr) node from the stack, 694 // leaving the completed (sub)expression as TOS. 695 if (n->fPrecedence != p) { 696 // Right paren encountered matched start of expression node, or 697 // end of expression matched with a left paren node. 698 error(U_BRK_MISMATCHED_PAREN); 699 } 700 fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr]; 701 fNodeStackPtr--; 702 // Delete the now-discarded LParen or Start node. 703 delete n; 704 } 705 // printNodeStack("leaving fixOpStack()"); 706 } 707 708 709 710 711 //------------------------------------------------------------------------------ 712 // 713 // findSetFor given a UnicodeString, 714 // - find the corresponding Unicode Set (uset node) 715 // (create one if necessary) 716 // - Set fLeftChild of the caller's node (should be a setRef node) 717 // to the uset node 718 // Maintain a hash table of uset nodes, so the same one is always used 719 // for the same string. 720 // If a "to adopt" set is provided and we haven't seen this key before, 721 // add the provided set to the hash table. 722 // If the string is one (32 bit) char in length, the set contains 723 // just one element which is the char in question. 724 // If the string is "any", return a set containing all chars. 725 // 726 //------------------------------------------------------------------------------ 727 void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { 728 729 RBBISetTableEl *el; 730 731 // First check whether we've already cached a set for this string. 732 // If so, just use the cached set in the new node. 733 // delete any set provided by the caller, since we own it. 734 el = (RBBISetTableEl *)uhash_get(fSetTable, &s); 735 if (el != NULL) { 736 delete setToAdopt; 737 node->fLeftChild = el->val; 738 U_ASSERT(node->fLeftChild->fType == RBBINode::uset); 739 return; 740 } 741 742 // Haven't seen this set before. 743 // If the caller didn't provide us with a prebuilt set, 744 // create a new UnicodeSet now. 745 if (setToAdopt == NULL) { 746 if (s.compare(kAny, -1) == 0) { 747 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 748 } else { 749 UChar32 c; 750 c = s.char32At(0); 751 setToAdopt = new UnicodeSet(c, c); 752 } 753 } 754 755 // 756 // Make a new uset node to refer to this UnicodeSet 757 // This new uset node becomes the child of the caller's setReference node. 758 // 759 RBBINode *usetNode = new RBBINode(RBBINode::uset); 760 if (usetNode == NULL) { 761 error(U_MEMORY_ALLOCATION_ERROR); 762 return; 763 } 764 usetNode->fInputSet = setToAdopt; 765 usetNode->fParent = node; 766 node->fLeftChild = usetNode; 767 usetNode->fText = s; 768 769 770 // 771 // Add the new uset node to the list of all uset nodes. 772 // 773 fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus); 774 775 776 // 777 // Add the new set to the set hash table. 778 // 779 el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl)); 780 UnicodeString *tkey = new UnicodeString(s); 781 if (tkey == NULL || el == NULL || setToAdopt == NULL) { 782 // Delete to avoid memory leak 783 delete tkey; 784 tkey = NULL; 785 uprv_free(el); 786 el = NULL; 787 delete setToAdopt; 788 setToAdopt = NULL; 789 790 error(U_MEMORY_ALLOCATION_ERROR); 791 return; 792 } 793 el->key = tkey; 794 el->val = usetNode; 795 uhash_put(fSetTable, el->key, el, fRB->fStatus); 796 797 return; 798 } 799 800 801 802 // 803 // Assorted Unicode character constants. 804 // Numeric because there is no portable way to enter them as literals. 805 // (Think EBCDIC). 806 // 807 static const UChar chCR = 0x0d; // New lines, for terminating comments. 808 static const UChar chLF = 0x0a; 809 static const UChar chNEL = 0x85; // NEL newline variant 810 static const UChar chLS = 0x2028; // Unicode Line Separator 811 static const UChar chApos = 0x27; // single quote, for quoted chars. 812 static const UChar chPound = 0x23; // '#', introduces a comment. 813 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape 814 static const UChar chLParen = 0x28; 815 static const UChar chRParen = 0x29; 816 817 818 //------------------------------------------------------------------------------ 819 // 820 // stripRules Return a rules string without unnecessary 821 // characters. 822 // 823 //------------------------------------------------------------------------------ 824 UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { 825 UnicodeString strippedRules; 826 int rulesLength = rules.length(); 827 for (int idx = 0; idx < rulesLength; ) { 828 UChar ch = rules[idx++]; 829 if (ch == chPound) { 830 while (idx < rulesLength 831 && ch != chCR && ch != chLF && ch != chNEL) 832 { 833 ch = rules[idx++]; 834 } 835 } 836 if (!u_isISOControl(ch)) { 837 strippedRules.append(ch); 838 } 839 } 840 // strippedRules = strippedRules.unescape(); 841 return strippedRules; 842 } 843 844 845 //------------------------------------------------------------------------------ 846 // 847 // nextCharLL Low Level Next Char from rule input source. 848 // Get a char from the input character iterator, 849 // keep track of input position for error reporting. 850 // 851 //------------------------------------------------------------------------------ 852 UChar32 RBBIRuleScanner::nextCharLL() { 853 UChar32 ch; 854 855 if (fNextIndex >= fRB->fRules.length()) { 856 return (UChar32)-1; 857 } 858 ch = fRB->fRules.char32At(fNextIndex); 859 fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); 860 861 if (ch == chCR || 862 ch == chNEL || 863 ch == chLS || 864 (ch == chLF && fLastChar != chCR)) { 865 // Character is starting a new line. Bump up the line number, and 866 // reset the column to 0. 867 fLineNum++; 868 fCharNum=0; 869 if (fQuoteMode) { 870 error(U_BRK_NEW_LINE_IN_QUOTED_STRING); 871 fQuoteMode = FALSE; 872 } 873 } 874 else { 875 // Character is not starting a new line. Except in the case of a 876 // LF following a CR, increment the column position. 877 if (ch != chLF) { 878 fCharNum++; 879 } 880 } 881 fLastChar = ch; 882 return ch; 883 } 884 885 886 //------------------------------------------------------------------------------ 887 // 888 // nextChar for rules scanning. At this level, we handle stripping 889 // out comments and processing backslash character escapes. 890 // The rest of the rules grammar is handled at the next level up. 891 // 892 //------------------------------------------------------------------------------ 893 void RBBIRuleScanner::nextChar(RBBIRuleChar &c) { 894 895 // Unicode Character constants needed for the processing done by nextChar(), 896 // in hex because literals wont work on EBCDIC machines. 897 898 fScanIndex = fNextIndex; 899 c.fChar = nextCharLL(); 900 c.fEscaped = FALSE; 901 902 // 903 // check for '' sequence. 904 // These are recognized in all contexts, whether in quoted text or not. 905 // 906 if (c.fChar == chApos) { 907 if (fRB->fRules.char32At(fNextIndex) == chApos) { 908 c.fChar = nextCharLL(); // get nextChar officially so character counts 909 c.fEscaped = TRUE; // stay correct. 910 } 911 else 912 { 913 // Single quote, by itself. 914 // Toggle quoting mode. 915 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 916 fQuoteMode = !fQuoteMode; 917 if (fQuoteMode == TRUE) { 918 c.fChar = chLParen; 919 } else { 920 c.fChar = chRParen; 921 } 922 c.fEscaped = FALSE; // The paren that we return is not escaped. 923 return; 924 } 925 } 926 927 if (fQuoteMode) { 928 c.fEscaped = TRUE; 929 } 930 else 931 { 932 // We are not in a 'quoted region' of the source. 933 // 934 if (c.fChar == chPound) { 935 // Start of a comment. Consume the rest of it. 936 // The new-line char that terminates the comment is always returned. 937 // It will be treated as white-space, and serves to break up anything 938 // that might otherwise incorrectly clump together with a comment in 939 // the middle (a variable name, for example.) 940 for (;;) { 941 c.fChar = nextCharLL(); 942 if (c.fChar == (UChar32)-1 || // EOF 943 c.fChar == chCR || 944 c.fChar == chLF || 945 c.fChar == chNEL || 946 c.fChar == chLS) {break;} 947 } 948 } 949 if (c.fChar == (UChar32)-1) { 950 return; 951 } 952 953 // 954 // check for backslash escaped characters. 955 // Use UnicodeString::unescapeAt() to handle them. 956 // 957 if (c.fChar == chBackSlash) { 958 c.fEscaped = TRUE; 959 int32_t startX = fNextIndex; 960 c.fChar = fRB->fRules.unescapeAt(fNextIndex); 961 if (fNextIndex == startX) { 962 error(U_BRK_HEX_DIGITS_EXPECTED); 963 } 964 fCharNum += fNextIndex-startX; 965 } 966 } 967 // putc(c.fChar, stdout); 968 } 969 970 //------------------------------------------------------------------------------ 971 // 972 // Parse RBBI rules. The state machine for rules parsing is here. 973 // The state tables are hand-written in the file rbbirpt.txt, 974 // and converted to the form used here by a perl 975 // script rbbicst.pl 976 // 977 //------------------------------------------------------------------------------ 978 void RBBIRuleScanner::parse() { 979 uint16_t state; 980 const RBBIRuleTableEl *tableEl; 981 982 if (U_FAILURE(*fRB->fStatus)) { 983 return; 984 } 985 986 state = 1; 987 nextChar(fC); 988 // 989 // Main loop for the rule parsing state machine. 990 // Runs once per state transition. 991 // Each time through optionally performs, depending on the state table, 992 // - an advance to the the next input char 993 // - an action to be performed. 994 // - pushing or popping a state to/from the local state return stack. 995 // 996 for (;;) { 997 // Bail out if anything has gone wrong. 998 // RBBI rule file parsing stops on the first error encountered. 999 if (U_FAILURE(*fRB->fStatus)) { 1000 break; 1001 } 1002 1003 // Quit if state == 0. This is the normal way to exit the state machine. 1004 // 1005 if (state == 0) { 1006 break; 1007 } 1008 1009 // Find the state table element that matches the input char from the rule, or the 1010 // class of the input character. Start with the first table row for this 1011 // state, then linearly scan forward until we find a row that matches the 1012 // character. The last row for each state always matches all characters, so 1013 // the search will stop there, if not before. 1014 // 1015 tableEl = &gRuleParseStateTable[state]; 1016 #ifdef RBBI_DEBUG 1017 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { 1018 RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ", 1019 fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]); 1020 } 1021 #endif 1022 1023 for (;;) { 1024 #ifdef RBBI_DEBUG 1025 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);} 1026 #endif 1027 if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) { 1028 // Table row specified an individual character, not a set, and 1029 // the input character is not escaped, and 1030 // the input character matched it. 1031 break; 1032 } 1033 if (tableEl->fCharClass == 255) { 1034 // Table row specified default, match anything character class. 1035 break; 1036 } 1037 if (tableEl->fCharClass == 254 && fC.fEscaped) { 1038 // Table row specified "escaped" and the char was escaped. 1039 break; 1040 } 1041 if (tableEl->fCharClass == 253 && fC.fEscaped && 1042 (fC.fChar == 0x50 || fC.fChar == 0x70 )) { 1043 // Table row specified "escaped P" and the char is either 'p' or 'P'. 1044 break; 1045 } 1046 if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) { 1047 // Table row specified eof and we hit eof on the input. 1048 break; 1049 } 1050 1051 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 1052 fC.fEscaped == FALSE && // char is not escaped && 1053 fC.fChar != (UChar32)-1) { // char is not EOF 1054 U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets)); 1055 if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 1056 // Table row specified a character class, or set of characters, 1057 // and the current char matches it. 1058 break; 1059 } 1060 } 1061 1062 // No match on this row, advance to the next row for this state, 1063 tableEl++; 1064 } 1065 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");} 1066 1067 // 1068 // We've found the row of the state table that matches the current input 1069 // character from the rules string. 1070 // Perform any action specified by this row in the state table. 1071 if (doParseActions((int32_t)tableEl->fAction) == FALSE) { 1072 // Break out of the state machine loop if the 1073 // the action signalled some kind of error, or 1074 // the action was to exit, occurs on normal end-of-rules-input. 1075 break; 1076 } 1077 1078 if (tableEl->fPushState != 0) { 1079 fStackPtr++; 1080 if (fStackPtr >= kStackSize) { 1081 error(U_BRK_INTERNAL_ERROR); 1082 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow."); 1083 fStackPtr--; 1084 } 1085 fStack[fStackPtr] = tableEl->fPushState; 1086 } 1087 1088 if (tableEl->fNextChar) { 1089 nextChar(fC); 1090 } 1091 1092 // Get the next state from the table entry, or from the 1093 // state stack if the next state was specified as "pop". 1094 if (tableEl->fNextState != 255) { 1095 state = tableEl->fNextState; 1096 } else { 1097 state = fStack[fStackPtr]; 1098 fStackPtr--; 1099 if (fStackPtr < 0) { 1100 error(U_BRK_INTERNAL_ERROR); 1101 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow."); 1102 fStackPtr++; 1103 } 1104 } 1105 1106 } 1107 1108 if (U_FAILURE(*fRB->fStatus)) { 1109 return; 1110 } 1111 1112 // If there are no forward rules set an error. 1113 // 1114 if (fRB->fForwardTree == NULL) { 1115 error(U_BRK_RULE_SYNTAX); 1116 return; 1117 } 1118 1119 // 1120 // If there were NO user specified reverse rules, set up the equivalent of ".*;" 1121 // 1122 if (fRB->fReverseTree == NULL) { 1123 fRB->fReverseTree = pushNewNode(RBBINode::opStar); 1124 RBBINode *operand = pushNewNode(RBBINode::setRef); 1125 if (U_FAILURE(*fRB->fStatus)) { 1126 return; 1127 } 1128 findSetFor(UnicodeString(TRUE, kAny, 3), operand); 1129 fRB->fReverseTree->fLeftChild = operand; 1130 operand->fParent = fRB->fReverseTree; 1131 fNodeStackPtr -= 2; 1132 } 1133 1134 1135 // 1136 // Parsing of the input RBBI rules is complete. 1137 // We now have a parse tree for the rule expressions 1138 // and a list of all UnicodeSets that are referenced. 1139 // 1140 #ifdef RBBI_DEBUG 1141 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();} 1142 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) { 1143 RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n"); 1144 RBBINode::printTree(fRB->fForwardTree, TRUE); 1145 RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n"); 1146 RBBINode::printTree(fRB->fReverseTree, TRUE); 1147 RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n"); 1148 RBBINode::printTree(fRB->fSafeFwdTree, TRUE); 1149 RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n"); 1150 RBBINode::printTree(fRB->fSafeRevTree, TRUE); 1151 } 1152 #endif 1153 } 1154 1155 1156 //------------------------------------------------------------------------------ 1157 // 1158 // printNodeStack for debugging... 1159 // 1160 //------------------------------------------------------------------------------ 1161 #ifdef RBBI_DEBUG 1162 void RBBIRuleScanner::printNodeStack(const char *title) { 1163 int i; 1164 RBBIDebugPrintf("%s. Dumping node stack...\n", title); 1165 for (i=fNodeStackPtr; i>0; i--) {RBBINode::printTree(fNodeStack[i], TRUE);} 1166 } 1167 #endif 1168 1169 1170 1171 1172 //------------------------------------------------------------------------------ 1173 // 1174 // pushNewNode create a new RBBINode of the specified type and push it 1175 // onto the stack of nodes. 1176 // 1177 //------------------------------------------------------------------------------ 1178 RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) { 1179 if (U_FAILURE(*fRB->fStatus)) { 1180 return NULL; 1181 } 1182 fNodeStackPtr++; 1183 if (fNodeStackPtr >= kStackSize) { 1184 error(U_BRK_INTERNAL_ERROR); 1185 RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow."); 1186 *fRB->fStatus = U_BRK_INTERNAL_ERROR; 1187 return NULL; 1188 } 1189 fNodeStack[fNodeStackPtr] = new RBBINode(t); 1190 if (fNodeStack[fNodeStackPtr] == NULL) { 1191 *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR; 1192 } 1193 return fNodeStack[fNodeStackPtr]; 1194 } 1195 1196 1197 1198 //------------------------------------------------------------------------------ 1199 // 1200 // scanSet Construct a UnicodeSet from the text at the current scan 1201 // position. Advance the scan position to the first character 1202 // after the set. 1203 // 1204 // A new RBBI setref node referring to the set is pushed onto the node 1205 // stack. 1206 // 1207 // The scan position is normally under the control of the state machine 1208 // that controls rule parsing. UnicodeSets, however, are parsed by 1209 // the UnicodeSet constructor, not by the RBBI rule parser. 1210 // 1211 //------------------------------------------------------------------------------ 1212 void RBBIRuleScanner::scanSet() { 1213 UnicodeSet *uset; 1214 ParsePosition pos; 1215 int startPos; 1216 int i; 1217 1218 if (U_FAILURE(*fRB->fStatus)) { 1219 return; 1220 } 1221 1222 pos.setIndex(fScanIndex); 1223 startPos = fScanIndex; 1224 UErrorCode localStatus = U_ZERO_ERROR; 1225 uset = new UnicodeSet(); 1226 if (uset == NULL) { 1227 localStatus = U_MEMORY_ALLOCATION_ERROR; 1228 } else { 1229 uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus); 1230 } 1231 if (U_FAILURE(localStatus)) { 1232 // TODO: Get more accurate position of the error from UnicodeSet's return info. 1233 // UnicodeSet appears to not be reporting correctly at this time. 1234 #ifdef RBBI_DEBUG 1235 RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); 1236 #endif 1237 error(localStatus); 1238 delete uset; 1239 return; 1240 } 1241 1242 // Verify that the set contains at least one code point. 1243 // 1244 U_ASSERT(uset!=NULL); 1245 if (uset->isEmpty()) { 1246 // This set is empty. 1247 // Make it an error, because it almost certainly is not what the user wanted. 1248 // Also, avoids having to think about corner cases in the tree manipulation code 1249 // that occurs later on. 1250 error(U_BRK_RULE_EMPTY_SET); 1251 delete uset; 1252 return; 1253 } 1254 1255 1256 // Advance the RBBI parse postion over the UnicodeSet pattern. 1257 // Don't just set fScanIndex because the line/char positions maintained 1258 // for error reporting would be thrown off. 1259 i = pos.getIndex(); 1260 for (;;) { 1261 if (fNextIndex >= i) { 1262 break; 1263 } 1264 nextCharLL(); 1265 } 1266 1267 if (U_SUCCESS(*fRB->fStatus)) { 1268 RBBINode *n; 1269 1270 n = pushNewNode(RBBINode::setRef); 1271 if (U_FAILURE(*fRB->fStatus)) { 1272 return; 1273 } 1274 n->fFirstPos = startPos; 1275 n->fLastPos = fNextIndex; 1276 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 1277 // findSetFor() serves several purposes here: 1278 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1279 // - Mantains collection of all sets in use, needed later for establishing 1280 // character categories for run time engine. 1281 // - Eliminates mulitiple instances of the same set. 1282 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1283 findSetFor(n->fText, n, uset); 1284 } 1285 1286 } 1287 1288 U_NAMESPACE_END 1289 1290 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1291