#!/usr/bin/ruby
# encoding: utf-8

=begin LICENSE

[The "BSD licence"]
Copyright (c) 2009-2010 Kyle Yetter
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

 1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
 3. The name of the author may not be used to endorse or promote products
    derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

=end

module ANTLR3
  unless const_defined?( :RecognizerSharedState )

    RecognizerSharedState = Struct.new(
      :following,
      :error_recovery,
      :last_error_index,
      :backtracking,
      :rule_memory,
      :syntax_errors,
      :token,
      :token_start_position,
      :token_start_line,
      :token_start_column,
      :channel,
      :type,
      :text
    )

=begin rdoc ANTLR3::RecognizerSharedState

A big Struct-based class containing most of the data that makes up a
recognizer's state. These attributes are externalized from the recognizer
itself so that recognizer delegation (which occurs when you import other
grammars into your grammar) can function; multiple recognizers can share a
common state.
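
For example, a recognizer can be handed a specific state object through the
+:state+ option of its constructor (the parser class and +tokens+ stream
below are hypothetical):

  state  = ANTLR3::RecognizerSharedState.new
  parser = Hypothetical::Parser.new( tokens, :state => state )
  parser.state.equal?( state )   # => true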

== Structure Attributes

following::
  a stack that tracks follow sets for error recovery
error_recovery::
  a flag indicating whether or not the recognizer is in error recovery mode
last_error_index::
  the index in the input stream of the last error
backtracking::
  tracks the backtracking depth
rule_memory::
  if a grammar is compiled with the memoization option, this will be
  set to a hash mapping previously parsed rules to cached indices
syntax_errors::
  tracks the number of syntax errors seen so far
token::
  holds newly constructed tokens for lexer rules
token_start_position::
  the input stream index at which the token starts
token_start_line::
  the input stream line number at which the token starts
token_start_column::
  the input stream column at which the token starts
channel::
  the channel value of the target token
type::
  the type value of the target token
text::
  the text of the target token

=end

    class RecognizerSharedState
      def initialize
        super( [], false, -1, 0, nil, 0, nil, -1 )
        # ^-- same as this --v
        # self.following = []
        # self.error_recovery = false
        # self.last_error_index = -1
        # self.backtracking = 0
        # self.syntax_errors = 0
        # self.token_start_position = -1
      end

      # restores all of the state variables to their respective
      # initial default values
      def reset!
        self.following.clear
        self.error_recovery = false
        self.last_error_index = -1
        self.backtracking = 0
        self.rule_memory and rule_memory.clear
        self.syntax_errors = 0
        self.token = nil
        self.token_start_position = -1
        self.token_start_line = nil
        self.token_start_column = nil
        self.channel = nil
        self.type = nil
        self.text = nil
      end
    end

  end # unless const_defined?( :RecognizerSharedState )

=begin rdoc ANTLR3::Scope

= Scope

Scope is used to represent instances of ANTLR's various attribute scopes.
It is identical to Ruby's built-in Struct class, but it takes string
attribute declarations from the ANTLR grammar as parameters, and overrides
the #initialize method to set the default values if any are present in
the scope declaration.

  Block = Scope.new( "name", "depth = 0", "variables = {}" )
  Block.new                    # => #<struct Block name=nil, depth=0, variables={}>
  Block.new( "function" )      # => #<struct Block name="function", depth=0, variables={}>
  Block.new( 'a', 1, :x => 3 ) # => #<struct Block name="a", depth=1, variables={ :x => 3 }>

=end

  class Scope < ::Struct
    def self.new( *declarations, &body )
      names = []
      defaults = {}
      for decl in declarations
        name, default = decl.to_s.split( /\s*=\s*/, 2 )
        names << ( name = name.to_sym )
        default and defaults[ name ] = default
      end
      super( *names ) do

        # If no defaults, leave the initialize method the same as
        # the struct's default initialize for speed. Otherwise,
        # overwrite the initialize to populate with default values.
        unless defaults.empty?
          parameters = names.map do | name |
            "#{ name } = " << defaults.fetch( name, 'nil' )
          end.join( ', ' )
          class_eval( <<-END )
            def initialize( #{ parameters } )
              super( #{ names.join( ', ' ) } )
            end
          END
        end

        body and class_eval( &body )
      end
    end
  end

=begin rdoc ANTLR3::Recognizer

= Recognizer

As the base class of all ANTLR-generated recognizers, Recognizer provides
much of the shared functionality and structure used in the recognition
process. For all practical purposes, the class and its immediate subclasses
Lexer, Parser, and TreeParser are abstract classes. They can be instantiated,
but they're pretty useless on their own. Instead, to make useful code, you
write an ANTLR grammar and ANTLR will generate classes which inherit from one
of the recognizer base classes, providing the implementation of the grammar
rules itself. Recognizer defines methods related to:

* token and character matching
* prediction and recognition strategy
* recovering from errors
* reporting errors
* memoization
* simple rule tracing and debugging

=end

  class Recognizer
    include Constants
    include Error
    include TokenFactory
    extend ClassMacros

    @rules = {}

    # inherited class methods and hooks
    class << self
      attr_reader :grammar_file_name,
                  :antlr_version,
                  :antlr_version_string,
                  :library_version_string,
                  :grammar_home

      attr_accessor :token_scheme, :default_rule

      # generated recognizer code uses this method to stamp
      # the code with the name of the grammar file and
      # the current version of ANTLR being used to generate
      # the code
      def generated_using( grammar_file, antlr_version, library_version = nil )
        @grammar_file_name = grammar_file.freeze
        @antlr_version_string = antlr_version.freeze
        @library_version = Util.parse_version( library_version )
        if @antlr_version_string =~ /^(\d+)\.(\d+)(?:\.(\d+)(?:b(\d+))?)?(.*)$/
          @antlr_version = [ $1, $2, $3, $4 ].map! { |str| str.to_i }
          timestamp = $5.strip
          #@antlr_release_time = $5.empty? ? nil : Time.parse($5)
        else
          raise "bad version string: %p" % @antlr_version_string
        end
      end

      # this method is used to generate return-value structures for
      # rules with multiple return values. To avoid generating
      # a special class for every rule in AST parsers and such
      # (where most rules have the same default set of return values),
      # each recognizer gets a default return value structure
      # assigned to the constant +Return+. Rules which don't
      # require additional custom members will have a rule-return
      # name constant that just points to the generic return
      # value.
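      #
      # Schematically (the struct name and the +:value+ member here are
      # hypothetical), a rule declared with an extra return value gets a
      # structure roughly like:
      #
      #   ExprReturnValue = define_return_scope( :value )
      #   ExprReturnValue.new   # => #<struct ExprReturnValue value=nil, start=nil, stop=nil>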
      def define_return_scope( *members )
        if members.empty? then generic_return_scope
        else
          members += return_scope_members
          Struct.new( *members )
        end
      end

      # used as a hook to add additional default members
      # to default return value structures.
      # For example, all AST-building parsers override
      # this method to add an extra +:tree+ field to
      # all rule return structures.
      def return_scope_members
        [ :start, :stop ]
      end

      # sets up and returns the generic rule return
      # scope for a recognizer
      def generic_return_scope
        @generic_return_scope ||= begin
          struct = Struct.new( *return_scope_members )
          const_set( :Return, struct )
        end
      end

      def imported_grammars
        @imported_grammars ||= Set.new
      end

      def master_grammars
        @master_grammars ||= []
      end

      def master
        master_grammars.last
      end

      def masters( *grammar_names )
        for grammar in grammar_names
          unless master_grammars.include?( grammar )
            master_grammars << grammar
            attr_reader( Util.snake_case( grammar ) )
          end
        end
      end
      private :masters

      def imports( *grammar_names )
        for grammar in grammar_names
          imported_grammars.add?( grammar.to_sym ) and
            attr_reader( Util.snake_case( grammar ) )
        end
        return imported_grammars
      end
      private :imports

      def rules
        self::RULE_METHODS.dup rescue []
      end

      def default_rule
        @default_rule ||= rules.first
      end

      def debug?
        return false
      end

      def profile?
        return false
      end

      def Scope( *declarations, &body )
        Scope.new( *declarations, &body )
      end

      def token_class
        @token_class ||= begin
          self::Token rescue
          superclass.token_class rescue
          ANTLR3::CommonToken
        end
      end
      private :generated_using
    end

    @grammar_file_name = nil
    @antlr_version = ANTLR3::ANTLR_VERSION
    @antlr_version_string = ANTLR3::ANTLR_VERSION_STRING

    def grammar_file_name
      self.class.grammar_file_name
    end

    def antlr_version
      self.class.antlr_version
    end

    def antlr_version_string
      self.class.antlr_version_string
    end

    attr_accessor :input
    attr_reader :state

    def each_delegate
      block_given? or return enum_for( __method__ )
      for grammar in self.class.imported_grammars
        del = __send__( Util.snake_case( grammar ) ) and
          yield( del )
      end
    end

    # Create a new recognizer. The constructor simply ensures that
    # all recognizers are initialized with a shared state object.
    # See the main recognizer subclasses for more specific
    # information about creating recognizer objects like
    # lexers and parsers.
    def initialize( options = {} )
      @state = options[ :state ] || RecognizerSharedState.new
      @error_output = options.fetch( :error_output, $stderr )
      defined?( @input ) or @input = nil
      initialize_dfas
    end
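
    # For example, error output can be redirected or silenced via the
    # +:error_output+ option (the parser class and +tokens+ stream here
    # are hypothetical):
    #
    #   parser = Hypothetical::Parser.new( tokens, :error_output => nil )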

    # Resets the recognizer's state data to initial values.
    # As a result, all error tracking and error recovery
    # data accumulated in the current state will be cleared.
    # It will also attempt to reset the input stream
    # via input.reset, but it ignores any errors received
    # from doing so. Thus the input stream is not guaranteed
    # to be rewound to its initial position
    def reset
      @state and @state.reset!
      @input and @input.reset rescue nil
    end

    # Attempt to match the current input symbol against the token type
    # specified by +type+. If the symbol matches the type,
    # consume the current symbol and return its value. If
    # the symbol doesn't match, attempt to use the follow-set
    # data provided by +follow+ to recover from the mismatched
    # token.
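    #
    # Generated parser code calls this method for each token reference in
    # a rule; schematically (the token type and follow-set constant names
    # here are hypothetical):
    #
    #   id_token = match( ID, TOKENS_FOLLOWING_ID_IN_declaration )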
    def match( type, follow )
      matched_symbol = current_symbol
      if @input.peek == type
        @input.consume
        @state.error_recovery = false
        return matched_symbol
      end
      raise( BacktrackingFailed ) if @state.backtracking > 0
      return recover_from_mismatched_token( type, follow )
    end

    # match anything -- i.e. wildcard match. Simply consume
    # the current symbol from the input stream.
    def match_any
      @state.error_recovery = false
      @input.consume
    end

    ##############################################################################################
    ###################################### Error Reporting ######################################
    ##############################################################################################

    # When a recognition error occurs, this method is the main
    # hook for carrying out the error reporting process. The
    # default implementation calls +display_recognition_error+
    # to display the error info on $stderr.
    def report_error( e = $! )
      @state.error_recovery and return
      @state.syntax_errors += 1
      @state.error_recovery = true
      display_recognition_error( e )
    end

    # error reporting hook for presenting the information.
    # The default implementation builds appropriate error
    # message text using +error_header+ and +error_message+,
    # and calls +emit_error_message+ to write the error
    # message out to some source
    def display_recognition_error( e = $! )
      header = error_header( e )
      message = error_message( e )
      emit_error_message( "#{ header } #{ message }" )
    end

    # used to construct an appropriate error message
    # based on the specific type of error and the
    # error's attributes
    def error_message( e = $! )
      case e
      when UnwantedToken
        token_name = token_name( e.expecting )
        "extraneous input #{ token_error_display( e.unexpected_token ) } expecting #{ token_name }"
      when MissingToken
        token_name = token_name( e.expecting )
        "missing #{ token_name } at #{ token_error_display( e.symbol ) }"
      when MismatchedToken
        token_name = token_name( e.expecting )
        "mismatched input #{ token_error_display( e.symbol ) } expecting #{ token_name }"
      when MismatchedTreeNode
        token_name = token_name( e.expecting )
        "mismatched tree node: #{ e.symbol } expecting #{ token_name }"
      when NoViableAlternative
        "no viable alternative at input " << token_error_display( e.symbol )
      when MismatchedSet
        "mismatched input %s expecting set %s" %
          [ token_error_display( e.symbol ), e.expecting.inspect ]
      when MismatchedNotSet
        "mismatched input %s expecting set %s" %
          [ token_error_display( e.symbol ), e.expecting.inspect ]
      when FailedPredicate
        "rule %s failed predicate: { %s }?" % [ e.rule_name, e.predicate_text ]
      else e.message
      end
    end

    #
    # used to add a tag to the error message that indicates
    # the location of the input stream when the error
    # occurred
    #
    def error_header( e = $! )
      e.location
    end

    #
    # formats a token object appropriately for inspection
    # within an error message
    #
    def token_error_display( token )
      unless text = token.text || ( token.source_text rescue nil )
        text =
          case
          when token.type == EOF then '<EOF>'
          when name = token_name( token.type ) rescue nil then "<#{ name }>"
          when token.respond_to?( :name ) then "<#{ token.name }>"
          else "<#{ token.type }>"
          end
      end
      return text.inspect
    end

    #
    # Write the error report data out to some source. By default,
    # the error message is written to $stderr
    #
    def emit_error_message( message )
      @error_output.puts( message ) if @error_output
    end

    ##############################################################################################
    ####################################### Error Recovery #######################################
    ##############################################################################################

    def recover( error = $! )
      @state.last_error_index == @input.index and @input.consume
      @state.last_error_index = @input.index

      follow_set = compute_error_recovery_set

      resync { consume_until( follow_set ) }
    end

    def resync
      begin_resync
      return( yield )
    ensure
      end_resync
    end

    # overridable hook method that is executed at the start of the
    # resyncing procedure in recover
    #
    # by default, it does nothing
    def begin_resync
      # do nothing
    end

    # overridable hook method that is executed after the resyncing
    # procedure has completed
    #
    # by default, it does nothing
    def end_resync
      # do nothing
    end

    # (The following explanation has been lifted directly from the
    # source code documentation of the ANTLR Java runtime library)
    #
    # Compute the error recovery set for the current rule. During
    # rule invocation, the parser pushes the set of tokens that can
    # follow that rule reference on the stack; this amounts to
    # computing FIRST of what follows the rule reference in the
    # enclosing rule. This local follow set only includes tokens
    # from within the rule; i.e., the FIRST computation done by
    # ANTLR stops at the end of a rule.
    #
    # EXAMPLE
    #
    # When you find a "no viable alt exception", the input is not
    # consistent with any of the alternatives for rule r. The best
    # thing to do is to consume tokens until you see something that
    # can legally follow a call to r *or* any rule that called r.
    # You don't want the exact set of viable next tokens because the
    # input might just be missing a token--you might consume the
    # rest of the input looking for one of the missing tokens.
    #
    # Consider grammar:
    #
    #   a : '[' b ']'
    #     | '(' b ')'
    #     ;
    #   b : c '^' INT ;
    #   c : ID
    #     | INT
    #     ;
    #
    # At each rule invocation, the set of tokens that could follow
    # that rule is pushed on a stack. Here are the various "local"
    # follow sets:
    #
    #   FOLLOW( b1_in_a ) = FIRST( ']' ) = ']'
    #   FOLLOW( b2_in_a ) = FIRST( ')' ) = ')'
    #   FOLLOW( c_in_b )  = FIRST( '^' ) = '^'
    #
    # Upon erroneous input "[]", the call chain is
    #
    #   a -> b -> c
    #
    # and, hence, the follow context stack is:
    #
    #   depth   local follow set   after call to rule
    #     0          \<EOF>          a (from main( ) )
    #     1           ']'            b
    #     2           '^'            c
    #
    # Notice that <tt>')'</tt> is not included, because b would have to have
    # been called from a different context in rule a for ')' to be
    # included.
    #
    # For error recovery, we cannot consider FOLLOW(c)
    # (context-sensitive or otherwise). We need the combined set of
    # all context-sensitive FOLLOW sets--the set of all tokens that
    # could follow any reference in the call chain. We need to
    # resync to one of those tokens. Note that FOLLOW(c)='^' and if
    # we resync'd to that token, we'd consume until EOF. We need to
    # sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
    # In this case, for input "[]", LA(1) is in this set so we would
    # not consume anything and after printing an error rule c would
    # return normally. It would not find the required '^' though.
    # At this point, it gets a mismatched token error and throws an
    # exception (since LA(1) is not in the viable following token
    # set). The rule exception handler tries to recover, but finds
    # the same recovery set and doesn't consume anything. Rule b
    # exits normally returning to rule a. Now it finds the ']' (and
    # with the successful match exits errorRecovery mode).
    #
    # So, you can see that the parser walks up the call chain looking
    # for the token that was a member of the recovery set.
    #
    # Errors are not generated in errorRecovery mode.
    #
    # ANTLR's error recovery mechanism is based upon original ideas:
    #
    # "Algorithms + Data Structures = Programs" by Niklaus Wirth
    #
    # and
    #
    # "A note on error recovery in recursive descent parsers":
    # http://portal.acm.org/citation.cfm?id=947902.947905
    #
    # Later, Josef Grosch had some good ideas:
    #
    # "Efficient and Comfortable Error Recovery in Recursive Descent
    # Parsers":
    # ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
    #
    # Like Grosch I implemented local FOLLOW sets that are combined
    # at run-time upon error to avoid overhead during parsing.
    def compute_error_recovery_set
      combine_follows( false )
    end

    # factor out what to do upon token mismatch so
    # tree parsers can behave differently
    #
    # * override this method in your parser to do things
    #   like bailing out after the first error
    # * just raise the exception instead of
    #   calling the recovery method
    #
    def recover_from_mismatched_token( type, follow )
      if mismatch_is_unwanted_token?( type )
        err = UnwantedToken( type )
        resync { @input.consume }
        report_error( err )

        return @input.consume
      end

      if mismatch_is_missing_token?( follow )
        inserted = missing_symbol( nil, type, follow )
        report_error( MissingToken( type, inserted ) )
        return inserted
      end

      raise MismatchedToken( type )
    end

    def recover_from_mismatched_set( e, follow )
      if mismatch_is_missing_token?( follow )
        report_error( e )
        return missing_symbol( e, INVALID_TOKEN_TYPE, follow )
      end
      raise e
    end

    def recover_from_mismatched_element( e, follow )
      follow.nil? and return false
      if follow.include?( EOR_TOKEN_TYPE )
        viable_tokens = compute_context_sensitive_rule_follow
        follow = ( follow | viable_tokens ) - Set[ EOR_TOKEN_TYPE ]
      end
      if follow.include?( @input.peek )
        report_error( e )
        return true
      end
      return false
    end

    # Conjure up a missing token during error recovery.
    #
    # The recognizer attempts to recover from single missing
    # symbols. But, actions might refer to that missing symbol.
    # For example, x=ID {f($x);}. The action clearly assumes
    # that there has been an identifier matched previously and that
    # $x points at that token. If that token is missing, but
    # the next token in the stream is what we want, we assume that
    # this token is missing and we keep going. Because we
    # have to return some token to replace the missing token,
    # we have to conjure one up. This method gives the user control
    # over the tokens returned for missing tokens. Mostly,
    # you will want to create something special for identifier
    # tokens. For literals such as '{' and ',', the default
    # action in the parser or tree parser works. It simply creates
    # a CommonToken of the appropriate type. The text will be the token.
    # If you change what tokens must be created by the lexer,
    # override this method to create the appropriate tokens.
    def missing_symbol( error, expected_token_type, follow )
      return nil
    end

    def mismatch_is_unwanted_token?( type )
      @input.peek( 2 ) == type
    end

    def mismatch_is_missing_token?( follow )
      follow.nil? and return false
      if follow.include?( EOR_TOKEN_TYPE )
        viable_tokens = compute_context_sensitive_rule_follow
        follow = follow | viable_tokens

        follow.delete( EOR_TOKEN_TYPE ) unless @state.following.empty?
      end
      if follow.include?( @input.peek ) or follow.include?( EOR_TOKEN_TYPE )
        return true
      end
      return false
    end

    def syntax_errors?
      ( error_count = @state.syntax_errors ) > 0 and return( error_count )
    end

    # returns the number of syntax errors encountered so far,
    # as tracked by the recognizer's shared state
    def number_of_syntax_errors
      @state.syntax_errors
    end

    #
    # Compute the context-sensitive +FOLLOW+ set for the current rule.
    # This is the set of token types that can follow a specific rule
    # reference given a specific call chain. You get the set of
    # viable tokens that can possibly come next (look depth 1)
    # given the current call chain. Contrast this with the
    # definition of plain FOLLOW for rule r:
    #
    #   FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
    #
    # where x in T* and alpha, beta in V*; T is the set of terminals and
    # V is the set of terminals and nonterminals. In other words,
    # FOLLOW(r) is the set of all tokens that can possibly follow
    # references to r in *any* sentential form (context). At
    # runtime, however, we know precisely which context applies as
    # we have the call chain. We may compute the exact (rather
    # than covering superset) set of following tokens.
    #
    # For example, consider grammar:
    #
    #   stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
    #        | "return" expr '.'
    #        ;
    #   expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
    #   atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
    #        | '(' expr ')'
    #        ;
    #
    # The FOLLOW sets are all inclusive whereas context-sensitive
    # FOLLOW sets are precisely what could follow a rule reference.
    # For input "i=(3);", here is the derivation:
    #
    #   stat => ID '=' expr ';'
    #        => ID '=' atom ('+' atom)* ';'
    #        => ID '=' '(' expr ')' ('+' atom)* ';'
    #        => ID '=' '(' atom ')' ('+' atom)* ';'
    #        => ID '=' '(' INT ')' ('+' atom)* ';'
    #        => ID '=' '(' INT ')' ';'
    #
    # At the "3" token, you'd have a call chain of
    #
    #   stat -> expr -> atom -> expr -> atom
    #
    # What can follow that specific nested ref to atom? Exactly ')'
    # as you can see by looking at the derivation of this specific
    # input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
    #
    # You want the exact viable token set when recovering from a
    # token mismatch. Upon token mismatch, if LA(1) is a member of
    # the viable next token set, then you know there is most likely
    # a missing token in the input stream. "Insert" one by just not
    # throwing an exception.
    #
    def compute_context_sensitive_rule_follow
      combine_follows true
    end

    def combine_follows( exact )
      follow_set = Set.new
      @state.following.each_with_index.reverse_each do |local_follow_set, index|
        follow_set |= local_follow_set
        if exact
          if local_follow_set.include?( EOR_TOKEN_TYPE )
            follow_set.delete( EOR_TOKEN_TYPE ) if index > 0
          else
            break
          end
        end
      end
      return follow_set
    end

    #
    # Match needs to return the current input symbol, which gets put
    # into the label for the associated token ref; e.g., x=ID. Token
    # and tree parsers need to return different objects. Rather than test
    # for input stream type or change the IntStream interface, I use
    # a simple method to ask the recognizer to tell me what the current
    # input symbol is.
    #
    # This is ignored for lexers.
    #
    def current_symbol
      @input.look
    end

    #
    # Consume input symbols until one matches a type within types
    #
    # types can be a single symbol type or a set of symbol types
    #
    def consume_until( types )
      types.is_a?( Set ) or types = Set[ *types ]
      type = @input.peek
      until type == EOF or types.include?( type )
        @input.consume
        type = @input.peek
      end
      return( type )
    end
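
    # For example (the token type names here are hypothetical), a custom
    # recovery routine could skip ahead to the next statement boundary with:
    #
    #   consume_until( Set[ SEMICOLON, RIGHT_BRACE ] )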

    #
    # Returns true if the recognizer is currently in a decision for which
    # backtracking has been enabled
    #
    def backtracking?
      @state.backtracking > 0
    end

    def backtracking_level
      @state.backtracking
    end

    def backtracking_level=( n )
      @state.backtracking = n
    end

    def backtrack
      @state.backtracking += 1
      start = @input.mark
      success =
        begin yield
        rescue BacktrackingFailed then false
        else true
        end
      return success
    ensure
      @input.rewind( start )
      @state.backtracking -= 1
    end

    def syntactic_predicate?( name )
      backtrack { send name }
    end

    alias backtracking backtracking_level
    alias backtracking= backtracking_level=

    def rule_memoization( rule, start_index )
      @state.rule_memory.fetch( rule ) do
        @state.rule_memory[ rule ] = Hash.new( MEMO_RULE_UNKNOWN )
      end[ start_index ]
    end

    def already_parsed_rule?( rule )
      stop_index = rule_memoization( rule, @input.index )
      case stop_index
      when MEMO_RULE_UNKNOWN then return false
      when MEMO_RULE_FAILED
        raise BacktrackingFailed
      else
        @input.seek( stop_index + 1 )
      end
      return true
    end

    def memoize( rule, start_index, success )
      stop_index = success ? @input.index - 1 : MEMO_RULE_FAILED
      memo = @state.rule_memory[ rule ] and memo[ start_index ] = stop_index
    end

    def trace_in( rule_name, rule_index, input_symbol )
      @error_output.printf( "--> enter %s on %s", rule_name, input_symbol )
      @state.backtracking > 0 and @error_output.printf(
        " (in backtracking mode: depth = %s)", @state.backtracking
      )
      @error_output.print( "\n" )
    end

    def trace_out( rule_name, rule_index, input_symbol )
      @error_output.printf( "<-- exit %s on %s", rule_name, input_symbol )
      @state.backtracking > 0 and @error_output.printf(
        " (in backtracking mode: depth = %s)", @state.backtracking
      )
      @error_output.print( "\n" )
    end

    private

    def initialize_dfas
      # do nothing
    end
  end

  # constant alias for compatibility with older versions of the
  # runtime library
  BaseRecognizer = Recognizer

=begin rdoc ANTLR3::Lexer

= Lexer

Lexer is the default superclass of all lexers generated by ANTLR. The class
tailors the core functionality provided by Recognizer to the task of
matching patterns in the text input and breaking the input into tokens.

== About Lexers

A lexer's job is to take input text and break it up into _tokens_ -- objects
that encapsulate a piece of text, a type label (such as ID or INTEGER), and the
position of the text with respect to the input. Thus, a lexer is essentially a
complicated iterator that steps through an input stream and produces a sequence
of tokens. Sometimes lexers are enough to carry out a goal on their own, as in
tasks like source code highlighting and simple code analysis. Usually, however,
the lexer converts text into tokens for use by a parser, which recognizes larger
structures within the text.

ANTLR parsers have a variety of entry points specified by parser rules, each of
which defines the structure of a specific type of sentence in a grammar. Lexers,
however, are primarily intended to have a single entry point: the lexer looks at
the characters starting at the current input position, decides if the chunk of
text matches one of a number of possible token type definitions, wraps the chunk
into a token with information on its type and location, and advances the input
stream to the next place.
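
Because lexers are token iterators, they can also be driven directly. For
instance, using the hypothetical generated lexer from the example in the next
section, the token sequence can simply be enumerated:

  lexer = Hypothetical::Lexer.new( "some hypothetical source code" )
  lexer.each { |token| puts token.inspect }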

== ANTLR Lexers and the Lexer API

ANTLR-generated lexers will subclass this class, unless specified otherwise
within a grammar file. The generated class will provide an implementation of
each lexer rule as a method of the same name. The subclass will also provide an
implementation for the abstract method #token!, the purpose of which is to
multiplex the token type definitions and predict what rule definition to execute
to fetch a token. The primary method in the lexer API, #next_token, uses
#token! to fetch the next token and drive the iteration.

If the lexer is preparing tokens for use by an ANTLR generated parser, the lexer
will generally be used to build a TokenStream object. The following code example
demonstrates the typical setup for using ANTLR parsers and lexers in Ruby.

  # in HypotheticalLexer.rb
  module Hypothetical
    class Lexer < ANTLR3::Lexer
      # ...
      # ANTLR generated code
      # ...
    end
  end

  # in HypotheticalParser.rb
  module Hypothetical
    class Parser < ANTLR3::Parser
      # ...
      # more ANTLR generated code
      # ...
    end
  end

  # to take hypothetical source code and prepare it for parsing,
  # there is generally a four-step construction process

  source = "some hypothetical source code"
  input  = ANTLR3::StringStream.new( source, :file => 'blah-de-blah.hyp' )
  lexer  = Hypothetical::Lexer.new( input )
  tokens = ANTLR3::CommonTokenStream.new( lexer )
  parser = Hypothetical::Parser.new( tokens )

  # if you're using the standard streams, ANTLR3::StringStream and
  # ANTLR3::CommonTokenStream, you can write the same process
  # shown above more succinctly:

  lexer  = Hypothetical::Lexer.new( "some hypothetical source code", :file => 'blah-de-blah.hyp' )
  parser = Hypothetical::Parser.new( lexer )

=end
  class Lexer < Recognizer
    include TokenSource
    @token_class = CommonToken

    def self.default_rule
      @default_rule ||= :token!
    end

    def self.main( argv = ARGV, options = {} )
      if argv.is_a?( ::Hash ) then argv, options = ARGV, argv end
      main = ANTLR3::Main::LexerMain.new( self, options )
      block_given? ? yield( main ) : main.execute( argv )
    end

    def self.associated_parser
      @associated_parser ||= begin
        @grammar_home and @grammar_home::Parser
      rescue NameError
        grammar_name = @grammar_home.name.split( "::" ).last
        begin
          require "#{ grammar_name }Parser"
          @grammar_home::Parser
        rescue LoadError, NameError
        end
      end
    end

    def initialize( input, options = {} )
      super( options )
      @input = cast_input( input, options )
    end

    def current_symbol
      nil
    end

    def next_token
      loop do
        @state.token = nil
        @state.channel = DEFAULT_CHANNEL
        @state.token_start_position = @input.index
        @state.token_start_column = @input.column
        @state.token_start_line = @input.line
        @state.text = nil
        @input.peek == EOF and return EOF_TOKEN
        begin
          token!

          case token = @state.token
          when nil then return( emit )
          when SKIP_TOKEN then next
          else
            return token
          end
        rescue NoViableAlternative => re
          report_error( re )
          recover( re )
        rescue Error::RecognitionError => re
          report_error( re )
        end
      end
    end

    def skip
      @state.token = SKIP_TOKEN
    end

    abstract :token!
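
    # Generated lexers fill in #token! with a dispatch over the grammar's
    # lexer rules. Purely as an illustrative sketch (the rule names and the
    # character ranges here are hypothetical), a generated #token! behaves
    # roughly like:
    #
    #   def token!
    #     case @input.peek             # inspect the next character code
    #     when 0x30..0x39 then int!    # digits: run the INT rule
    #     else id!                     # otherwise: run the ID rule
    #     end
    #   end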

    def exhaust
      self.to_a
    end

    def char_stream=( input )
      @input = nil
      reset()
      @input = input
    end

    def source_name
      @input.source_name
    end

    def emit( token = @state.token )
      token ||= create_token
      @state.token = token
      return token
    end

    def match( expected )
      case expected
      when String
        expected.each_byte do |char|
          unless @input.peek == char
            @state.backtracking > 0 and raise BacktrackingFailed
            error = MismatchedToken( char )
            recover( error )
            raise error
          end
          @input.consume()
        end
      else # single integer character
        unless @input.peek == expected
          @state.backtracking > 0 and raise BacktrackingFailed
          error = MismatchedToken( expected )
          recover( error )
          raise error
        end
        @input.consume
      end
      return true
    end

    def match_any
      @input.consume
    end

    def match_range( min, max )
      char = @input.peek
      if char.between?( min, max ) then @input.consume
      else
        @state.backtracking > 0 and raise BacktrackingFailed
        error = MismatchedRange( min.chr, max.chr )
        recover( error )
        raise( error )
      end
      return true
    end

    def line
      @input.line
    end

    def column
      @input.column
    end

    def character_index
      @input.index
    end

    def text
      @state.text and return @state.text
      @input.substring( @state.token_start_position, character_index - 1 )
    end

    def text=( text )
      @state.text = text
    end

    def report_error( e )
      display_recognition_error( e )
    end

    def error_message( e )
      char = character_error_display( e.symbol ) rescue nil
      case e
      when Error::MismatchedToken
        expecting = character_error_display( e.expecting )
        "mismatched character #{ char }; expecting #{ expecting }"
      when Error::NoViableAlternative
        "no viable alternative at character #{ char }"
      when Error::EarlyExit
        "required ( ... )+ loop did not match anything at character #{ char }"
      when Error::MismatchedNotSet
        "mismatched character %s; expecting set %p" % [ char, e.expecting ]
      when Error::MismatchedSet
        "mismatched character %s; expecting set %p" % [ char, e.expecting ]
      when Error::MismatchedRange
        a = character_error_display( e.min )
        b = character_error_display( e.max )
        "mismatched character %s; expecting set %s..%s" % [ char, a, b ]
      else super
      end
    end

    def character_error_display( char )
      case char
      when EOF then '<EOF>'
      when Integer then char.chr.inspect
      else char.inspect
      end
    end

    def recover( re )
      @input.consume
    end

    alias input= char_stream=

    private

    def cast_input( input, options )
      case input
      when CharacterStream then input
      when ::String then StringStream.new( input, options )
      when ::IO, ARGF then FileStream.new( input, options )
      else input
      end
    end

    def trace_in( rule_name, rule_index )
      if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
      else symbol = '<EOF>' end
      input_symbol = "#{ symbol } @ line #{ line } / col #{ column }"
      super( rule_name, rule_index, input_symbol )
    end

    def trace_out( rule_name, rule_index )
      if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
      else symbol = '<EOF>' end
      input_symbol = "#{ symbol } @ line #{ line } / col #{ column }"
      super( rule_name, rule_index, input_symbol )
    end

    def create_token( &b )
      if block_given? then super( &b )
      else
        super do |t|
          t.input   = @input
          t.type    = @state.type
          t.channel = @state.channel
          t.start   = @state.token_start_position
          t.stop    = @input.index - 1
          t.line    = @state.token_start_line
          t.text    = self.text
          t.column  = @state.token_start_column
        end
      end
    end
  end

=begin rdoc ANTLR3::Parser

= Parser

Parser is the default base class of ANTLR-generated parser classes. The class
tailors the functionality provided by Recognizer to the task of parsing.

== About Parsing

This is just a loose overview of parsing. For considerably more in-depth
coverage of the topic, read the ANTLR documentation or check out the ANTLR
website (http://www.antlr.org).

A grammar defines the vocabulary and the sentence structure of a language. While
a lexer concerns the basic vocabulary symbols of the language, a parser's
primary task is to implement the sentence structure.

Parsers are set up by providing a stream of tokens, which is usually created by
a corresponding lexer. Then, the user requests a specific sentence-structure
within the grammar, such as "class_definition" or "xml_node", from the parser.
It iterates through the tokens, verifying the syntax of the sentence and
performing actions specified by the grammar. It stops when it encounters an
error or when it has matched the full sentence according to its defined
structure.

== ANTLR Parsers and the Parser API

Plain ANTLR-generated parsers directly subclass this class, unless specified
otherwise within the grammar options. The generated code will provide a method
for each parser rule defined in the ANTLR grammar, as well as any other
customized member attributes and methods specified in the source grammar.
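
For example, with the hypothetical grammar classes used in the Lexer
documentation, parsing a sentence of type +class_definition+ would look like:

  tokens = ANTLR3::CommonTokenStream.new( Hypothetical::Lexer.new( source ) )
  parser = Hypothetical::Parser.new( tokens )
  result = parser.class_definition   # each grammar rule becomes a method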

This class does not override much of the functionality in Recognizer, and
thus the API closely mirrors Recognizer.

=end
  class Parser < Recognizer
    def self.main( argv = ARGV, options = {} )
      if argv.is_a?( ::Hash ) then argv, options = ARGV, argv end
      main = ANTLR3::Main::ParserMain.new( self, options )
      block_given? ? yield( main ) : main.execute( argv )
    end

    def self.associated_lexer
      @associated_lexer ||= begin
        @grammar_home and @grammar_home::Lexer
      rescue NameError
        grammar_name = @grammar_home.name.split( "::" ).last
        begin
          require "#{ grammar_name }Lexer"
          @grammar_home::Lexer
        rescue LoadError, NameError
        end
      end
    end

    def initialize( input, options = {} )
      super( options )
      @input = nil
      reset
      @input = cast_input( input, options )
    end

    def missing_symbol( error, expected_type, follow )
      current = @input.look
      current = @input.look( -1 ) if current == ANTLR3::EOF_TOKEN
      t =
        case
        when current && current != ANTLR3::EOF_TOKEN then current.clone
        when @input.token_class then @input.token_class.new
        else ( create_token rescue CommonToken.new )
        end

      t.type = expected_type
      name = t.name.gsub( /(^<)|(>$)/, '' )
      t.text = "<missing #{ name }>"
      t.channel = DEFAULT_CHANNEL
      return( t )
    end

    def token_stream=( input )
      @input = nil
      reset
      @input = input
    end
    alias token_stream input

    def source_name
      @input.source_name
    end

    private

    def trace_in( rule_name, rule_index )
      super( rule_name, rule_index, @input.look.inspect )
    end

    def trace_out( rule_name, rule_index )
      super( rule_name, rule_index, @input.look.inspect )
    end

    def cast_input( input, options )
      case input
      when TokenStream then input
      when TokenSource then CommonTokenStream.new( input, options )
      when IO, String, CharacterStream
        if lexer_class = self.class.associated_lexer
          CommonTokenStream.new( lexer_class.new( input, options ), options )
        else
          raise ArgumentError, Util.tidy( <<-END, true )
          | unable to automatically convert input #{ input.inspect }
          | to an ANTLR3::TokenStream object as #{ self.class }
          | does not appear to have an associated lexer class
          END
        end
      else
        # assume it's a stream if it at least implements peek and consume
        unless input.respond_to?( :peek ) and input.respond_to?( :consume )
          raise ArgumentError, Util.tidy( <<-END, true )
          | #{ self.class } requires a token stream as input, but
          | #{ input.inspect } was provided
          END
        end
        input
      end
    end

  end

end