1 """ANTLR3 runtime package""" 2 3 # begin[licence] 4 # 5 # [The "BSD licence"] 6 # Copyright (c) 2005-2008 Terence Parr 7 # All rights reserved. 8 # 9 # Redistribution and use in source and binary forms, with or without 10 # modification, are permitted provided that the following conditions 11 # are met: 12 # 1. Redistributions of source code must retain the above copyright 13 # notice, this list of conditions and the following disclaimer. 14 # 2. Redistributions in binary form must reproduce the above copyright 15 # notice, this list of conditions and the following disclaimer in the 16 # documentation and/or other materials provided with the distribution. 17 # 3. The name of the author may not be used to endorse or promote products 18 # derived from this software without specific prior written permission. 19 # 20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 # 31 # end[licence] 32 33 import sys 34 import inspect 35 36 from antlr3 import compatible_api_versions 37 from antlr3.constants import DEFAULT_CHANNEL, HIDDEN_CHANNEL, EOF, \ 38 EOR_TOKEN_TYPE, INVALID_TOKEN_TYPE 39 from antlr3.exceptions import RecognitionException, MismatchedTokenException, \ 40 MismatchedRangeException, MismatchedTreeNodeException, \ 41 NoViableAltException, EarlyExitException, MismatchedSetException, \ 42 MismatchedNotSetException, FailedPredicateException, \ 43 BacktrackingFailed, UnwantedTokenException, MissingTokenException 44 from antlr3.tokens import CommonToken, SKIP_TOKEN 45 from antlr3.compat import set, frozenset, reversed 46 47 48 class RecognizerSharedState(object): 49 """ 50 The set of fields needed by an abstract recognizer to recognize input 51 and recover from errors etc... As a separate state object, it can be 52 shared among multiple grammars; e.g., when one grammar imports another. 53 54 These fields are publically visible but the actual state pointer per 55 parser is protected. 56 """ 57 58 def __init__(self): 59 # Track the set of token types that can follow any rule invocation. 60 # Stack grows upwards. 61 self.following = [] 62 63 # This is true when we see an error and before having successfully 64 # matched a token. Prevents generation of more than one error message 65 # per error. 66 self.errorRecovery = False 67 68 # The index into the input stream where the last error occurred. 69 # This is used to prevent infinite loops where an error is found 70 # but no token is consumed during recovery...another error is found, 71 # ad naseum. This is a failsafe mechanism to guarantee that at least 72 # one token/tree node is consumed for two errors. 73 self.lastErrorIndex = -1 74 75 # If 0, no backtracking is going on. Safe to exec actions etc... 76 # If >0 then it's the level of backtracking. 77 self.backtracking = 0 78 79 # An array[size num rules] of Map<Integer,Integer> that tracks 80 # the stop token index for each rule. ruleMemo[ruleIndex] is 81 # the memoization table for ruleIndex. For key ruleStartIndex, you 82 # get back the stop token for associated rule or MEMO_RULE_FAILED. 83 # 84 # This is only used if rule memoization is on (which it is by default). 85 self.ruleMemo = None 86 87 ## Did the recognizer encounter a syntax error? Track how many. 88 self.syntaxErrors = 0 89 90 91 # LEXER FIELDS (must be in same state object to avoid casting 92 # constantly in generated code and Lexer object) :( 93 94 95 ## The goal of all lexer rules/methods is to create a token object. 96 # This is an instance variable as multiple rules may collaborate to 97 # create a single token. nextToken will return this object after 98 # matching lexer rule(s). If you subclass to allow multiple token 99 # emissions, then set this to the last token to be matched or 100 # something nonnull so that the auto token emit mechanism will not 101 # emit another token. 102 self.token = None 103 104 ## What character index in the stream did the current token start at? 105 # Needed, for example, to get the text for current token. Set at 106 # the start of nextToken. 107 self.tokenStartCharIndex = -1 108 109 ## The line on which the first character of the token resides 110 self.tokenStartLine = None 111 112 ## The character position of first character within the line 113 self.tokenStartCharPositionInLine = None 114 115 ## The channel number for the current token 116 self.channel = None 117 118 ## The token type for the current token 119 self.type = None 120 121 ## You can set the text for the current token to override what is in 122 # the input char buffer. Use setText() or can set this instance var. 123 self.text = None 124 125 126 class BaseRecognizer(object): 127 """ 128 @brief Common recognizer functionality. 129 130 A generic recognizer that can handle recognizers generated from 131 lexer, parser, and tree grammars. This is all the parsing 132 support code essentially; most of it is error recovery stuff and 133 backtracking. 134 """ 135 136 MEMO_RULE_FAILED = -2 137 MEMO_RULE_UNKNOWN = -1 138 139 # copies from Token object for convenience in actions 140 DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL 141 142 # for convenience in actions 143 HIDDEN = HIDDEN_CHANNEL 144 145 # overridden by generated subclasses 146 tokenNames = None 147 148 # The api_version attribute has been introduced in 3.3. If it is not 149 # overwritten in the generated recognizer, we assume a default of v0. 150 api_version = 0 151 152 def __init__(self, state=None): 153 # Input stream of the recognizer. Must be initialized by a subclass. 154 self.input = None 155 156 ## State of a lexer, parser, or tree parser are collected into a state 157 # object so the state can be shared. This sharing is needed to 158 # have one grammar import others and share same error variables 159 # and other state variables. It's a kind of explicit multiple 160 # inheritance via delegation of methods and shared state. 161 if state is None: 162 state = RecognizerSharedState() 163 self._state = state 164 165 if self.api_version not in compatible_api_versions: 166 raise RuntimeError( 167 ("ANTLR version mismatch: " 168 "The recognizer has been generated with API V%s, " 169 "but this runtime does not support this.") 170 % self.api_version) 171 172 # this one only exists to shut up pylint :( 173 def setInput(self, input): 174 self.input = input 175 176 177 def reset(self): 178 """ 179 reset the parser's state; subclasses must rewinds the input stream 180 """ 181 182 # wack everything related to error recovery 183 if self._state is None: 184 # no shared state work to do 185 return 186 187 self._state.following = [] 188 self._state.errorRecovery = False 189 self._state.lastErrorIndex = -1 190 self._state.syntaxErrors = 0 191 # wack everything related to backtracking and memoization 192 self._state.backtracking = 0 193 if self._state.ruleMemo is not None: 194 self._state.ruleMemo = {} 195 196 197 def match(self, input, ttype, follow): 198 """ 199 Match current input symbol against ttype. Attempt 200 single token insertion or deletion error recovery. If 201 that fails, throw MismatchedTokenException. 202 203 To turn off single token insertion or deletion error 204 recovery, override recoverFromMismatchedToken() and have it 205 throw an exception. See TreeParser.recoverFromMismatchedToken(). 206 This way any error in a rule will cause an exception and 207 immediate exit from rule. Rule would recover by resynchronizing 208 to the set of symbols that can follow rule ref. 209 """ 210 211 matchedSymbol = self.getCurrentInputSymbol(input) 212 if self.input.LA(1) == ttype: 213 self.input.consume() 214 self._state.errorRecovery = False 215 return matchedSymbol 216 217 if self._state.backtracking > 0: 218 # FIXME: need to return matchedSymbol here as well. damn!! 219 raise BacktrackingFailed 220 221 matchedSymbol = self.recoverFromMismatchedToken(input, ttype, follow) 222 return matchedSymbol 223 224 225 def matchAny(self, input): 226 """Match the wildcard: in a symbol""" 227 228 self._state.errorRecovery = False 229 self.input.consume() 230 231 232 def mismatchIsUnwantedToken(self, input, ttype): 233 return input.LA(2) == ttype 234 235 236 def mismatchIsMissingToken(self, input, follow): 237 if follow is None: 238 # we have no information about the follow; we can only consume 239 # a single token and hope for the best 240 return False 241 242 # compute what can follow this grammar element reference 243 if EOR_TOKEN_TYPE in follow: 244 viableTokensFollowingThisRule = self.computeContextSensitiveRuleFOLLOW() 245 follow = follow | viableTokensFollowingThisRule 246 247 if len(self._state.following) > 0: 248 # remove EOR if we're not the start symbol 249 follow = follow - set([EOR_TOKEN_TYPE]) 250 251 # if current token is consistent with what could come after set 252 # then we know we're missing a token; error recovery is free to 253 # "insert" the missing token 254 if input.LA(1) in follow or EOR_TOKEN_TYPE in follow: 255 return True 256 257 return False 258 259 260 def reportError(self, e): 261 """Report a recognition problem. 262 263 This method sets errorRecovery to indicate the parser is recovering 264 not parsing. Once in recovery mode, no errors are generated. 265 To get out of recovery mode, the parser must successfully match 266 a token (after a resync). So it will go: 267 268 1. error occurs 269 2. enter recovery mode, report error 270 3. consume until token found in resynch set 271 4. try to resume parsing 272 5. next match() will reset errorRecovery mode 273 274 If you override, make sure to update syntaxErrors if you care about 275 that. 276 277 """ 278 279 # if we've already reported an error and have not matched a token 280 # yet successfully, don't report any errors. 281 if self._state.errorRecovery: 282 return 283 284 self._state.syntaxErrors += 1 # don't count spurious 285 self._state.errorRecovery = True 286 287 self.displayRecognitionError(self.tokenNames, e) 288 289 290 def displayRecognitionError(self, tokenNames, e): 291 hdr = self.getErrorHeader(e) 292 msg = self.getErrorMessage(e, tokenNames) 293 self.emitErrorMessage(hdr+" "+msg) 294 295 296 def getErrorMessage(self, e, tokenNames): 297 """ 298 What error message should be generated for the various 299 exception types? 300 301 Not very object-oriented code, but I like having all error message 302 generation within one method rather than spread among all of the 303 exception classes. This also makes it much easier for the exception 304 handling because the exception classes do not have to have pointers back 305 to this object to access utility routines and so on. Also, changing 306 the message for an exception type would be difficult because you 307 would have to subclassing exception, but then somehow get ANTLR 308 to make those kinds of exception objects instead of the default. 309 This looks weird, but trust me--it makes the most sense in terms 310 of flexibility. 311 312 For grammar debugging, you will want to override this to add 313 more information such as the stack frame with 314 getRuleInvocationStack(e, this.getClass().getName()) and, 315 for no viable alts, the decision description and state etc... 316 317 Override this to change the message generated for one or more 318 exception types. 319 """ 320 321 if isinstance(e, UnwantedTokenException): 322 tokenName = "<unknown>" 323 if e.expecting == EOF: 324 tokenName = "EOF" 325 326 else: 327 tokenName = self.tokenNames[e.expecting] 328 329 msg = "extraneous input %s expecting %s" % ( 330 self.getTokenErrorDisplay(e.getUnexpectedToken()), 331 tokenName 332 ) 333 334 elif isinstance(e, MissingTokenException): 335 tokenName = "<unknown>" 336 if e.expecting == EOF: 337 tokenName = "EOF" 338 339 else: 340 tokenName = self.tokenNames[e.expecting] 341 342 msg = "missing %s at %s" % ( 343 tokenName, self.getTokenErrorDisplay(e.token) 344 ) 345 346 elif isinstance(e, MismatchedTokenException): 347 tokenName = "<unknown>" 348 if e.expecting == EOF: 349 tokenName = "EOF" 350 else: 351 tokenName = self.tokenNames[e.expecting] 352 353 msg = "mismatched input " \ 354 + self.getTokenErrorDisplay(e.token) \ 355 + " expecting " \ 356 + tokenName 357 358 elif isinstance(e, MismatchedTreeNodeException): 359 tokenName = "<unknown>" 360 if e.expecting == EOF: 361 tokenName = "EOF" 362 else: 363 tokenName = self.tokenNames[e.expecting] 364 365 msg = "mismatched tree node: %s expecting %s" \ 366 % (e.node, tokenName) 367 368 elif isinstance(e, NoViableAltException): 369 msg = "no viable alternative at input " \ 370 + self.getTokenErrorDisplay(e.token) 371 372 elif isinstance(e, EarlyExitException): 373 msg = "required (...)+ loop did not match anything at input " \ 374 + self.getTokenErrorDisplay(e.token) 375 376 elif isinstance(e, MismatchedSetException): 377 msg = "mismatched input " \ 378 + self.getTokenErrorDisplay(e.token) \ 379 + " expecting set " \ 380 + repr(e.expecting) 381 382 elif isinstance(e, MismatchedNotSetException): 383 msg = "mismatched input " \ 384 + self.getTokenErrorDisplay(e.token) \ 385 + " expecting set " \ 386 + repr(e.expecting) 387 388 elif isinstance(e, FailedPredicateException): 389 msg = "rule " \ 390 + e.ruleName \ 391 + " failed predicate: {" \ 392 + e.predicateText \ 393 + "}?" 394 395 else: 396 msg = str(e) 397 398 return msg 399 400 401 def getNumberOfSyntaxErrors(self): 402 """ 403 Get number of recognition errors (lexer, parser, tree parser). Each 404 recognizer tracks its own number. So parser and lexer each have 405 separate count. Does not count the spurious errors found between 406 an error and next valid token match 407 408 See also reportError() 409 """ 410 return self._state.syntaxErrors 411 412 413 def getErrorHeader(self, e): 414 """ 415 What is the error header, normally line/character position information? 416 """ 417 418 source_name = self.getSourceName() 419 if source_name is not None: 420 return "%s line %d:%d" % (source_name, e.line, e.charPositionInLine) 421 return "line %d:%d" % (e.line, e.charPositionInLine) 422 423 424 def getTokenErrorDisplay(self, t): 425 """ 426 How should a token be displayed in an error message? The default 427 is to display just the text, but during development you might 428 want to have a lot of information spit out. Override in that case 429 to use t.toString() (which, for CommonToken, dumps everything about 430 the token). This is better than forcing you to override a method in 431 your token objects because you don't have to go modify your lexer 432 so that it creates a new Java type. 433 """ 434 435 s = t.text 436 if s is None: 437 if t.type == EOF: 438 s = "<EOF>" 439 else: 440 s = "<"+t.type+">" 441 442 return repr(s) 443 444 445 def emitErrorMessage(self, msg): 446 """Override this method to change where error messages go""" 447 sys.stderr.write(msg + '\n') 448 449 450 def recover(self, input, re): 451 """ 452 Recover from an error found on the input stream. This is 453 for NoViableAlt and mismatched symbol exceptions. If you enable 454 single token insertion and deletion, this will usually not 455 handle mismatched symbol exceptions but there could be a mismatched 456 token that the match() routine could not recover from. 457 """ 458 459 # PROBLEM? what if input stream is not the same as last time 460 # perhaps make lastErrorIndex a member of input 461 if self._state.lastErrorIndex == input.index(): 462 # uh oh, another error at same token index; must be a case 463 # where LT(1) is in the recovery token set so nothing is 464 # consumed; consume a single token so at least to prevent 465 # an infinite loop; this is a failsafe. 466 input.consume() 467 468 self._state.lastErrorIndex = input.index() 469 followSet = self.computeErrorRecoverySet() 470 471 self.beginResync() 472 self.consumeUntil(input, followSet) 473 self.endResync() 474 475 476 def beginResync(self): 477 """ 478 A hook to listen in on the token consumption during error recovery. 479 The DebugParser subclasses this to fire events to the listenter. 480 """ 481 482 pass 483 484 485 def endResync(self): 486 """ 487 A hook to listen in on the token consumption during error recovery. 488 The DebugParser subclasses this to fire events to the listenter. 489 """ 490 491 pass 492 493 494 def computeErrorRecoverySet(self): 495 """ 496 Compute the error recovery set for the current rule. During 497 rule invocation, the parser pushes the set of tokens that can 498 follow that rule reference on the stack; this amounts to 499 computing FIRST of what follows the rule reference in the 500 enclosing rule. This local follow set only includes tokens 501 from within the rule; i.e., the FIRST computation done by 502 ANTLR stops at the end of a rule. 503 504 EXAMPLE 505 506 When you find a "no viable alt exception", the input is not 507 consistent with any of the alternatives for rule r. The best 508 thing to do is to consume tokens until you see something that 509 can legally follow a call to r *or* any rule that called r. 510 You don't want the exact set of viable next tokens because the 511 input might just be missing a token--you might consume the 512 rest of the input looking for one of the missing tokens. 513 514 Consider grammar: 515 516 a : '[' b ']' 517 | '(' b ')' 518 ; 519 b : c '^' INT ; 520 c : ID 521 | INT 522 ; 523 524 At each rule invocation, the set of tokens that could follow 525 that rule is pushed on a stack. Here are the various "local" 526 follow sets: 527 528 FOLLOW(b1_in_a) = FIRST(']') = ']' 529 FOLLOW(b2_in_a) = FIRST(')') = ')' 530 FOLLOW(c_in_b) = FIRST('^') = '^' 531 532 Upon erroneous input "[]", the call chain is 533 534 a -> b -> c 535 536 and, hence, the follow context stack is: 537 538 depth local follow set after call to rule 539 0 \<EOF> a (from main()) 540 1 ']' b 541 3 '^' c 542 543 Notice that ')' is not included, because b would have to have 544 been called from a different context in rule a for ')' to be 545 included. 546 547 For error recovery, we cannot consider FOLLOW(c) 548 (context-sensitive or otherwise). We need the combined set of 549 all context-sensitive FOLLOW sets--the set of all tokens that 550 could follow any reference in the call chain. We need to 551 resync to one of those tokens. Note that FOLLOW(c)='^' and if 552 we resync'd to that token, we'd consume until EOF. We need to 553 sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}. 554 In this case, for input "[]", LA(1) is in this set so we would 555 not consume anything and after printing an error rule c would 556 return normally. It would not find the required '^' though. 557 At this point, it gets a mismatched token error and throws an 558 exception (since LA(1) is not in the viable following token 559 set). The rule exception handler tries to recover, but finds 560 the same recovery set and doesn't consume anything. Rule b 561 exits normally returning to rule a. Now it finds the ']' (and 562 with the successful match exits errorRecovery mode). 563 564 So, you cna see that the parser walks up call chain looking 565 for the token that was a member of the recovery set. 566 567 Errors are not generated in errorRecovery mode. 568 569 ANTLR's error recovery mechanism is based upon original ideas: 570 571 "Algorithms + Data Structures = Programs" by Niklaus Wirth 572 573 and 574 575 "A note on error recovery in recursive descent parsers": 576 http://portal.acm.org/citation.cfm?id=947902.947905 577 578 Later, Josef Grosch had some good ideas: 579 580 "Efficient and Comfortable Error Recovery in Recursive Descent 581 Parsers": 582 ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip 583 584 Like Grosch I implemented local FOLLOW sets that are combined 585 at run-time upon error to avoid overhead during parsing. 586 """ 587 588 return self.combineFollows(False) 589 590 591 def computeContextSensitiveRuleFOLLOW(self): 592 """ 593 Compute the context-sensitive FOLLOW set for current rule. 594 This is set of token types that can follow a specific rule 595 reference given a specific call chain. You get the set of 596 viable tokens that can possibly come next (lookahead depth 1) 597 given the current call chain. Contrast this with the 598 definition of plain FOLLOW for rule r: 599 600 FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)} 601 602 where x in T* and alpha, beta in V*; T is set of terminals and 603 V is the set of terminals and nonterminals. In other words, 604 FOLLOW(r) is the set of all tokens that can possibly follow 605 references to r in *any* sentential form (context). At 606 runtime, however, we know precisely which context applies as 607 we have the call chain. We may compute the exact (rather 608 than covering superset) set of following tokens. 609 610 For example, consider grammar: 611 612 stat : ID '=' expr ';' // FOLLOW(stat)=={EOF} 613 | "return" expr '.' 614 ; 615 expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'} 616 atom : INT // FOLLOW(atom)=={'+',')',';','.'} 617 | '(' expr ')' 618 ; 619 620 The FOLLOW sets are all inclusive whereas context-sensitive 621 FOLLOW sets are precisely what could follow a rule reference. 622 For input input "i=(3);", here is the derivation: 623 624 stat => ID '=' expr ';' 625 => ID '=' atom ('+' atom)* ';' 626 => ID '=' '(' expr ')' ('+' atom)* ';' 627 => ID '=' '(' atom ')' ('+' atom)* ';' 628 => ID '=' '(' INT ')' ('+' atom)* ';' 629 => ID '=' '(' INT ')' ';' 630 631 At the "3" token, you'd have a call chain of 632 633 stat -> expr -> atom -> expr -> atom 634 635 What can follow that specific nested ref to atom? Exactly ')' 636 as you can see by looking at the derivation of this specific 637 input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}. 638 639 You want the exact viable token set when recovering from a 640 token mismatch. Upon token mismatch, if LA(1) is member of 641 the viable next token set, then you know there is most likely 642 a missing token in the input stream. "Insert" one by just not 643 throwing an exception. 644 """ 645 646 return self.combineFollows(True) 647 648 649 def combineFollows(self, exact): 650 followSet = set() 651 for idx, localFollowSet in reversed(list(enumerate(self._state.following))): 652 followSet |= localFollowSet 653 if exact: 654 # can we see end of rule? 655 if EOR_TOKEN_TYPE in localFollowSet: 656 # Only leave EOR in set if at top (start rule); this lets 657 # us know if have to include follow(start rule); i.e., EOF 658 if idx > 0: 659 followSet.remove(EOR_TOKEN_TYPE) 660 661 else: 662 # can't see end of rule, quit 663 break 664 665 return followSet 666 667 668 def recoverFromMismatchedToken(self, input, ttype, follow): 669 """Attempt to recover from a single missing or extra token. 670 671 EXTRA TOKEN 672 673 LA(1) is not what we are looking for. If LA(2) has the right token, 674 however, then assume LA(1) is some extra spurious token. Delete it 675 and LA(2) as if we were doing a normal match(), which advances the 676 input. 677 678 MISSING TOKEN 679 680 If current token is consistent with what could come after 681 ttype then it is ok to 'insert' the missing token, else throw 682 exception For example, Input 'i=(3;' is clearly missing the 683 ')'. When the parser returns from the nested call to expr, it 684 will have call chain: 685 686 stat -> expr -> atom 687 688 and it will be trying to match the ')' at this point in the 689 derivation: 690 691 => ID '=' '(' INT ')' ('+' atom)* ';' 692 ^ 693 match() will see that ';' doesn't match ')' and report a 694 mismatched token error. To recover, it sees that LA(1)==';' 695 is in the set of tokens that can follow the ')' token 696 reference in rule atom. It can assume that you forgot the ')'. 697 """ 698 699 e = None 700 701 # if next token is what we are looking for then "delete" this token 702 if self.mismatchIsUnwantedToken(input, ttype): 703 e = UnwantedTokenException(ttype, input) 704 705 self.beginResync() 706 input.consume() # simply delete extra token 707 self.endResync() 708 709 # report after consuming so AW sees the token in the exception 710 self.reportError(e) 711 712 # we want to return the token we're actually matching 713 matchedSymbol = self.getCurrentInputSymbol(input) 714 715 # move past ttype token as if all were ok 716 input.consume() 717 return matchedSymbol 718 719 # can't recover with single token deletion, try insertion 720 if self.mismatchIsMissingToken(input, follow): 721 inserted = self.getMissingSymbol(input, e, ttype, follow) 722 e = MissingTokenException(ttype, input, inserted) 723 724 # report after inserting so AW sees the token in the exception 725 self.reportError(e) 726 return inserted 727 728 # even that didn't work; must throw the exception 729 e = MismatchedTokenException(ttype, input) 730 raise e 731 732 733 def recoverFromMismatchedSet(self, input, e, follow): 734 """Not currently used""" 735 736 if self.mismatchIsMissingToken(input, follow): 737 self.reportError(e) 738 # we don't know how to conjure up a token for sets yet 739 return self.getMissingSymbol(input, e, INVALID_TOKEN_TYPE, follow) 740 741 # TODO do single token deletion like above for Token mismatch 742 raise e 743 744 745 def getCurrentInputSymbol(self, input): 746 """ 747 Match needs to return the current input symbol, which gets put 748 into the label for the associated token ref; e.g., x=ID. Token 749 and tree parsers need to return different objects. Rather than test 750 for input stream type or change the IntStream interface, I use 751 a simple method to ask the recognizer to tell me what the current 752 input symbol is. 753 754 This is ignored for lexers. 755 """ 756 757 return None 758 759 760 def getMissingSymbol(self, input, e, expectedTokenType, follow): 761 """Conjure up a missing token during error recovery. 762 763 The recognizer attempts to recover from single missing 764 symbols. But, actions might refer to that missing symbol. 765 For example, x=ID {f($x);}. The action clearly assumes 766 that there has been an identifier matched previously and that 767 $x points at that token. If that token is missing, but 768 the next token in the stream is what we want we assume that 769 this token is missing and we keep going. Because we 770 have to return some token to replace the missing token, 771 we have to conjure one up. This method gives the user control 772 over the tokens returned for missing tokens. Mostly, 773 you will want to create something special for identifier 774 tokens. For literals such as '{' and ',', the default 775 action in the parser or tree parser works. It simply creates 776 a CommonToken of the appropriate type. The text will be the token. 777 If you change what tokens must be created by the lexer, 778 override this method to create the appropriate tokens. 779 """ 780 781 return None 782 783 784 ## def recoverFromMissingElement(self, input, e, follow): 785 ## """ 786 ## This code is factored out from mismatched token and mismatched set 787 ## recovery. It handles "single token insertion" error recovery for 788 ## both. No tokens are consumed to recover from insertions. Return 789 ## true if recovery was possible else return false. 790 ## """ 791 792 ## if self.mismatchIsMissingToken(input, follow): 793 ## self.reportError(e) 794 ## return True 795 796 ## # nothing to do; throw exception 797 ## return False 798 799 800 def consumeUntil(self, input, tokenTypes): 801 """ 802 Consume tokens until one matches the given token or token set 803 804 tokenTypes can be a single token type or a set of token types 805 806 """ 807 808 if not isinstance(tokenTypes, (set, frozenset)): 809 tokenTypes = frozenset([tokenTypes]) 810 811 ttype = input.LA(1) 812 while ttype != EOF and ttype not in tokenTypes: 813 input.consume() 814 ttype = input.LA(1) 815 816 817 def getRuleInvocationStack(self): 818 """ 819 Return List<String> of the rules in your parser instance 820 leading up to a call to this method. You could override if 821 you want more details such as the file/line info of where 822 in the parser java code a rule is invoked. 823 824 This is very useful for error messages and for context-sensitive 825 error recovery. 826 827 You must be careful, if you subclass a generated recognizers. 828 The default implementation will only search the module of self 829 for rules, but the subclass will not contain any rules. 830 You probably want to override this method to look like 831 832 def getRuleInvocationStack(self): 833 return self._getRuleInvocationStack(<class>.__module__) 834 835 where <class> is the class of the generated recognizer, e.g. 836 the superclass of self. 837 """ 838 839 return self._getRuleInvocationStack(self.__module__) 840 841 842 def _getRuleInvocationStack(cls, module): 843 """ 844 A more general version of getRuleInvocationStack where you can 845 pass in, for example, a RecognitionException to get it's rule 846 stack trace. This routine is shared with all recognizers, hence, 847 static. 848 849 TODO: move to a utility class or something; weird having lexer call 850 this 851 """ 852 853 # mmmhhh,... perhaps look at the first argument 854 # (f_locals[co_varnames[0]]?) and test if it's a (sub)class of 855 # requested recognizer... 856 857 rules = [] 858 for frame in reversed(inspect.stack()): 859 code = frame[0].f_code 860 codeMod = inspect.getmodule(code) 861 if codeMod is None: 862 continue 863 864 # skip frames not in requested module 865 if codeMod.__name__ != module: 866 continue 867 868 # skip some unwanted names 869 if code.co_name in ('nextToken', '<module>'): 870 continue 871 872 rules.append(code.co_name) 873 874 return rules 875 876 _getRuleInvocationStack = classmethod(_getRuleInvocationStack) 877 878 879 def getBacktrackingLevel(self): 880 return self._state.backtracking 881 882 def setBacktrackingLevel(self, n): 883 self._state.backtracking = n 884 885 886 def getGrammarFileName(self): 887 """For debugging and other purposes, might want the grammar name. 888 889 Have ANTLR generate an implementation for this method. 890 """ 891 892 return self.grammarFileName 893 894 895 def getSourceName(self): 896 raise NotImplementedError 897 898 899 def toStrings(self, tokens): 900 """A convenience method for use most often with template rewrites. 901 902 Convert a List<Token> to List<String> 903 """ 904 905 if tokens is None: 906 return None 907 908 return [token.text for token in tokens] 909 910 911 def getRuleMemoization(self, ruleIndex, ruleStartIndex): 912 """ 913 Given a rule number and a start token index number, return 914 MEMO_RULE_UNKNOWN if the rule has not parsed input starting from 915 start index. If this rule has parsed input starting from the 916 start index before, then return where the rule stopped parsing. 917 It returns the index of the last token matched by the rule. 918 """ 919 920 if ruleIndex not in self._state.ruleMemo: 921 self._state.ruleMemo[ruleIndex] = {} 922 923 return self._state.ruleMemo[ruleIndex].get( 924 ruleStartIndex, self.MEMO_RULE_UNKNOWN 925 ) 926 927 928 def alreadyParsedRule(self, input, ruleIndex): 929 """ 930 Has this rule already parsed input at the current index in the 931 input stream? Return the stop token index or MEMO_RULE_UNKNOWN. 932 If we attempted but failed to parse properly before, return 933 MEMO_RULE_FAILED. 934 935 This method has a side-effect: if we have seen this input for 936 this rule and successfully parsed before, then seek ahead to 937 1 past the stop token matched for this rule last time. 938 """ 939 940 stopIndex = self.getRuleMemoization(ruleIndex, input.index()) 941 if stopIndex == self.MEMO_RULE_UNKNOWN: 942 return False 943 944 if stopIndex == self.MEMO_RULE_FAILED: 945 raise BacktrackingFailed 946 947 else: 948 input.seek(stopIndex + 1) 949 950 return True 951 952 953 def memoize(self, input, ruleIndex, ruleStartIndex, success): 954 """ 955 Record whether or not this rule parsed the input at this position 956 successfully. 957 """ 958 959 if success: 960 stopTokenIndex = input.index() - 1 961 else: 962 stopTokenIndex = self.MEMO_RULE_FAILED 963 964 if ruleIndex in self._state.ruleMemo: 965 self._state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex 966 967 968 def traceIn(self, ruleName, ruleIndex, inputSymbol): 969 sys.stdout.write("enter %s %s" % (ruleName, inputSymbol)) 970 971 if self._state.backtracking > 0: 972 sys.stdout.write(" backtracking=%s" % self._state.backtracking) 973 974 sys.stdout.write('\n') 975 976 977 def traceOut(self, ruleName, ruleIndex, inputSymbol): 978 sys.stdout.write("exit %s %s" % (ruleName, inputSymbol)) 979 980 if self._state.backtracking > 0: 981 sys.stdout.write(" backtracking=%s" % self._state.backtracking) 982 983 # mmmm... we use BacktrackingFailed exceptions now. So how could we 984 # get that information here? 985 #if self._state.failed: 986 # sys.stdout.write(" failed") 987 #else: 988 # sys.stdout.write(" succeeded") 989 990 sys.stdout.write('\n') 991 992 993 class TokenSource(object): 994 """ 995 @brief Abstract baseclass for token producers. 996 997 A source of tokens must provide a sequence of tokens via nextToken() 998 and also must reveal it's source of characters; CommonToken's text is 999 computed from a CharStream; it only store indices into the char stream. 1000 1001 Errors from the lexer are never passed to the parser. Either you want 1002 to keep going or you do not upon token recognition error. If you do not 1003 want to continue lexing then you do not want to continue parsing. Just 1004 throw an exception not under RecognitionException and Java will naturally 1005 toss you all the way out of the recognizers. If you want to continue 1006 lexing then you should not throw an exception to the parser--it has already 1007 requested a token. Keep lexing until you get a valid one. Just report 1008 errors and keep going, looking for a valid token. 1009 """ 1010 1011 def nextToken(self): 1012 """Return a Token object from your input stream (usually a CharStream). 1013 1014 Do not fail/return upon lexing error; keep chewing on the characters 1015 until you get a good one; errors are not passed through to the parser. 1016 """ 1017 1018 raise NotImplementedError 1019 1020 1021 def __iter__(self): 1022 """The TokenSource is an interator. 1023 1024 The iteration will not include the final EOF token, see also the note 1025 for the next() method. 1026 1027 """ 1028 1029 return self 1030 1031 1032 def next(self): 1033 """Return next token or raise StopIteration. 1034 1035 Note that this will raise StopIteration when hitting the EOF token, 1036 so EOF will not be part of the iteration. 1037 1038 """ 1039 1040 token = self.nextToken() 1041 if token is None or token.type == EOF: 1042 raise StopIteration 1043 return token 1044 1045 1046 class Lexer(BaseRecognizer, TokenSource): 1047 """ 1048 @brief Baseclass for generated lexer classes. 1049 1050 A lexer is recognizer that draws input symbols from a character stream. 1051 lexer grammars result in a subclass of this object. A Lexer object 1052 uses simplified match() and error recovery mechanisms in the interest 1053 of speed. 1054 """ 1055 1056 def __init__(self, input, state=None): 1057 BaseRecognizer.__init__(self, state) 1058 TokenSource.__init__(self) 1059 1060 # Where is the lexer drawing characters from? 1061 self.input = input 1062 1063 1064 def reset(self): 1065 BaseRecognizer.reset(self) # reset all recognizer state variables 1066 1067 if self.input is not None: 1068 # rewind the input 1069 self.input.seek(0) 1070 1071 if self._state is None: 1072 # no shared state work to do 1073 return 1074 1075 # wack Lexer state variables 1076 self._state.token = None 1077 self._state.type = INVALID_TOKEN_TYPE 1078 self._state.channel = DEFAULT_CHANNEL 1079 self._state.tokenStartCharIndex = -1 1080 self._state.tokenStartLine = -1 1081 self._state.tokenStartCharPositionInLine = -1 1082 self._state.text = None 1083 1084 1085 def makeEOFToken(self): 1086 eof = CommonToken( 1087 type=EOF, channel=DEFAULT_CHANNEL, 1088 input=self.input, 1089 start=self.input.index(), stop=self.input.index()) 1090 eof.line = self.input.line 1091 eof.charPositionInLine = self.input.charPositionInLine 1092 return eof 1093 1094 def nextToken(self): 1095 """ 1096 Return a token from this source; i.e., match a token on the char 1097 stream. 1098 """ 1099 1100 while 1: 1101 self._state.token = None 1102 self._state.channel = DEFAULT_CHANNEL 1103 self._state.tokenStartCharIndex = self.input.index() 1104 self._state.tokenStartCharPositionInLine = self.input.charPositionInLine 1105 self._state.tokenStartLine = self.input.line 1106 self._state.text = None 1107 if self.input.LA(1) == EOF: 1108 return self.makeEOFToken() 1109 1110 try: 1111 self.mTokens() 1112 1113 if self._state.token is None: 1114 self.emit() 1115 1116 elif self._state.token == SKIP_TOKEN: 1117 continue 1118 1119 return self._state.token 1120 1121 except NoViableAltException, re: 1122 self.reportError(re) 1123 self.recover(re) # throw out current char and try again 1124 1125 except RecognitionException, re: 1126 self.reportError(re) 1127 # match() routine has already called recover() 1128 1129 1130 def skip(self): 1131 """ 1132 Instruct the lexer to skip creating a token for current lexer rule 1133 and look for another token. nextToken() knows to keep looking when 1134 a lexer rule finishes with token set to SKIP_TOKEN. Recall that 1135 if token==null at end of any token rule, it creates one for you 1136 and emits it. 1137 """ 1138 1139 self._state.token = SKIP_TOKEN 1140 1141 1142 def mTokens(self): 1143 """This is the lexer entry point that sets instance var 'token'""" 1144 1145 # abstract method 1146 raise NotImplementedError 1147 1148 1149 def setCharStream(self, input): 1150 """Set the char stream and reset the lexer""" 1151 self.input = None 1152 self.reset() 1153 self.input = input 1154 1155 1156 def getSourceName(self): 1157 return self.input.getSourceName() 1158 1159 1160 def emit(self, token=None): 1161 """ 1162 The standard method called to automatically emit a token at the 1163 outermost lexical rule. The token object should point into the 1164 char buffer start..stop. If there is a text override in 'text', 1165 use that to set the token's text. Override this method to emit 1166 custom Token objects. 1167 1168 If you are building trees, then you should also override 1169 Parser or TreeParser.getMissingSymbol(). 1170 """ 1171 1172 if token is None: 1173 token = CommonToken( 1174 input=self.input, 1175 type=self._state.type, 1176 channel=self._state.channel, 1177 start=self._state.tokenStartCharIndex, 1178 stop=self.getCharIndex()-1 1179 ) 1180 token.line = self._state.tokenStartLine 1181 token.text = self._state.text 1182 token.charPositionInLine = self._state.tokenStartCharPositionInLine 1183 1184 self._state.token = token 1185 1186 return token 1187 1188 1189 def match(self, s): 1190 if isinstance(s, basestring): 1191 for c in s: 1192 if self.input.LA(1) != ord(c): 1193 if self._state.backtracking > 0: 1194 raise BacktrackingFailed 1195 1196 mte = MismatchedTokenException(c, self.input) 1197 self.recover(mte) 1198 raise mte 1199 1200 self.input.consume() 1201 1202 else: 1203 if self.input.LA(1) != s: 1204 if self._state.backtracking > 0: 1205 raise BacktrackingFailed 1206 1207 mte = MismatchedTokenException(unichr(s), self.input) 1208 self.recover(mte) # don't really recover; just consume in lexer 1209 raise mte 1210 1211 self.input.consume() 1212 1213 1214 def matchAny(self): 1215 self.input.consume() 1216 1217 1218 def matchRange(self, a, b): 1219 if self.input.LA(1) < a or self.input.LA(1) > b: 1220 if self._state.backtracking > 0: 1221 raise BacktrackingFailed 1222 1223 mre = MismatchedRangeException(unichr(a), unichr(b), self.input) 1224 self.recover(mre) 1225 raise mre 1226 1227 self.input.consume() 1228 1229 1230 def getLine(self): 1231 return self.input.line 1232 1233 1234 def getCharPositionInLine(self): 1235 return self.input.charPositionInLine 1236 1237 1238 def getCharIndex(self): 1239 """What is the index of the current character of lookahead?""" 1240 1241 return self.input.index() 1242 1243 1244 def getText(self): 1245 """ 1246 Return the text matched so far for the current token or any 1247 text override. 1248 """ 1249 if self._state.text is not None: 1250 return self._state.text 1251 1252 return self.input.substring( 1253 self._state.tokenStartCharIndex, 1254 self.getCharIndex()-1 1255 ) 1256 1257 1258 def setText(self, text): 1259 """ 1260 Set the complete text of this token; it wipes any previous 1261 changes to the text. 1262 """ 1263 self._state.text = text 1264 1265 1266 text = property(getText, setText) 1267 1268 1269 def reportError(self, e): 1270 ## TODO: not thought about recovery in lexer yet. 1271 1272 ## # if we've already reported an error and have not matched a token 1273 ## # yet successfully, don't report any errors. 1274 ## if self.errorRecovery: 1275 ## #System.err.print("[SPURIOUS] "); 1276 ## return; 1277 ## 1278 ## self.errorRecovery = True 1279 1280 self.displayRecognitionError(self.tokenNames, e) 1281 1282 1283 def getErrorMessage(self, e, tokenNames): 1284 msg = None 1285 1286 if isinstance(e, MismatchedTokenException): 1287 msg = "mismatched character " \ 1288 + self.getCharErrorDisplay(e.c) \ 1289 + " expecting " \ 1290 + self.getCharErrorDisplay(e.expecting) 1291 1292 elif isinstance(e, NoViableAltException): 1293 msg = "no viable alternative at character " \ 1294 + self.getCharErrorDisplay(e.c) 1295 1296 elif isinstance(e, EarlyExitException): 1297 msg = "required (...)+ loop did not match anything at character " \ 1298 + self.getCharErrorDisplay(e.c) 1299 1300 elif isinstance(e, MismatchedNotSetException): 1301 msg = "mismatched character " \ 1302 + self.getCharErrorDisplay(e.c) \ 1303 + " expecting set " \ 1304 + repr(e.expecting) 1305 1306 elif isinstance(e, MismatchedSetException): 1307 msg = "mismatched character " \ 1308 + self.getCharErrorDisplay(e.c) \ 1309 + " expecting set " \ 1310 + repr(e.expecting) 1311 1312 elif isinstance(e, MismatchedRangeException): 1313 msg = "mismatched character " \ 1314 + self.getCharErrorDisplay(e.c) \ 1315 + " expecting set " \ 1316 + self.getCharErrorDisplay(e.a) \ 1317 + ".." \ 1318 + self.getCharErrorDisplay(e.b) 1319 1320 else: 1321 msg = BaseRecognizer.getErrorMessage(self, e, tokenNames) 1322 1323 return msg 1324 1325 1326 def getCharErrorDisplay(self, c): 1327 if c == EOF: 1328 c = '<EOF>' 1329 return repr(c) 1330 1331 1332 def recover(self, re): 1333 """ 1334 Lexers can normally match any char in it's vocabulary after matching 1335 a token, so do the easy thing and just kill a character and hope 1336 it all works out. You can instead use the rule invocation stack 1337 to do sophisticated error recovery if you are in a fragment rule. 1338 """ 1339 1340 self.input.consume() 1341 1342 1343 def traceIn(self, ruleName, ruleIndex): 1344 inputSymbol = "%s line=%d:%s" % (self.input.LT(1), 1345 self.getLine(), 1346 self.getCharPositionInLine() 1347 ) 1348 1349 BaseRecognizer.traceIn(self, ruleName, ruleIndex, inputSymbol) 1350 1351 1352 def traceOut(self, ruleName, ruleIndex): 1353 inputSymbol = "%s line=%d:%s" % (self.input.LT(1), 1354 self.getLine(), 1355 self.getCharPositionInLine() 1356 ) 1357 1358 BaseRecognizer.traceOut(self, ruleName, ruleIndex, inputSymbol) 1359 1360 1361 1362 class Parser(BaseRecognizer): 1363 """ 1364 @brief Baseclass for generated parser classes. 1365 """ 1366 1367 def __init__(self, lexer, state=None): 1368 BaseRecognizer.__init__(self, state) 1369 1370 self.input = lexer 1371 1372 1373 def reset(self): 1374 BaseRecognizer.reset(self) # reset all recognizer state variables 1375 if self.input is not None: 1376 self.input.seek(0) # rewind the input 1377 1378 1379 def getCurrentInputSymbol(self, input): 1380 return input.LT(1) 1381 1382 1383 def getMissingSymbol(self, input, e, expectedTokenType, follow): 1384 if expectedTokenType == EOF: 1385 tokenText = "<missing EOF>" 1386 else: 1387 tokenText = "<missing " + self.tokenNames[expectedTokenType] + ">" 1388 t = CommonToken(type=expectedTokenType, text=tokenText) 1389 current = input.LT(1) 1390 if current.type == EOF: 1391 current = input.LT(-1) 1392 1393 if current is not None: 1394 t.line = current.line 1395 t.charPositionInLine = current.charPositionInLine 1396 t.channel = DEFAULT_CHANNEL 1397 return t 1398 1399 1400 def setTokenStream(self, input): 1401 """Set the token stream and reset the parser""" 1402 1403 self.input = None 1404 self.reset() 1405 self.input = input 1406 1407 1408 def getTokenStream(self): 1409 return self.input 1410 1411 1412 def getSourceName(self): 1413 return self.input.getSourceName() 1414 1415 1416 def traceIn(self, ruleName, ruleIndex): 1417 BaseRecognizer.traceIn(self, ruleName, ruleIndex, self.input.LT(1)) 1418 1419 1420 def traceOut(self, ruleName, ruleIndex): 1421 BaseRecognizer.traceOut(self, ruleName, ruleIndex, self.input.LT(1)) 1422 1423 1424 class RuleReturnScope(object): 1425 """ 1426 Rules can return start/stop info as well as possible trees and templates. 1427 """ 1428 1429 def getStart(self): 1430 """Return the start token or tree.""" 1431 return None 1432 1433 1434 def getStop(self): 1435 """Return the stop token or tree.""" 1436 return None 1437 1438 1439 def getTree(self): 1440 """Has a value potentially if output=AST.""" 1441 return None 1442 1443 1444 def getTemplate(self): 1445 """Has a value potentially if output=template.""" 1446 return None 1447 1448 1449 class ParserRuleReturnScope(RuleReturnScope): 1450 """ 1451 Rules that return more than a single value must return an object 1452 containing all the values. Besides the properties defined in 1453 RuleLabelScope.predefinedRulePropertiesScope there may be user-defined 1454 return values. This class simply defines the minimum properties that 1455 are always defined and methods to access the others that might be 1456 available depending on output option such as template and tree. 1457 1458 Note text is not an actual property of the return value, it is computed 1459 from start and stop using the input stream's toString() method. I 1460 could add a ctor to this so that we can pass in and store the input 1461 stream, but I'm not sure we want to do that. It would seem to be undefined 1462 to get the .text property anyway if the rule matches tokens from multiple 1463 input streams. 1464 1465 I do not use getters for fields of objects that are used simply to 1466 group values such as this aggregate. The getters/setters are there to 1467 satisfy the superclass interface. 1468 """ 1469 1470 def __init__(self): 1471 self.start = None 1472 self.stop = None 1473 self.tree = None # only used when output=AST 1474 1475 1476 def getStart(self): 1477 return self.start 1478 1479 1480 def getStop(self): 1481 return self.stop 1482 1483 1484 def getTree(self): 1485 return self.tree 1486