#!/usr/bin/ruby
# encoding: utf-8

=begin LICENSE

[The "BSD licence"]
Copyright (c) 2009-2010 Kyle Yetter
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

 1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
 3. The name of the author may not be used to endorse or promote products
    derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

=end

module ANTLR3
unless const_defined?( :RecognizerSharedState )

RecognizerSharedState = Struct.new(
  :following,
  :error_recovery,
  :last_error_index,
  :backtracking,
  :rule_memory,
  :syntax_errors,
  :token,
  :token_start_position,
  :token_start_line,
  :token_start_column,
  :channel,
  :type,
  :text
)

=begin rdoc ANTLR3::RecognizerSharedState

A big Struct-based class containing most of the data that makes up a
recognizer's state. These attributes are externalized from the recognizer itself
so that recognizer delegation (which occurs when you import other grammars into
your grammar) can function; multiple recognizers can share a common state.

== Structure Attributes

following::
  a stack that tracks follow sets for error recovery
error_recovery::
  a flag indicating whether or not the recognizer is in error recovery mode
last_error_index::
  the index in the input stream of the last error
backtracking::
  tracks the backtracking depth
rule_memory::
  if a grammar is compiled with the memoization option, this will be
  set to a hash mapping previously parsed rules to cached indices
syntax_errors::
  tracks the number of syntax errors seen so far
token::
  holds newly constructed tokens for lexer rules
token_start_position::
  the input stream index at which the token starts
token_start_line::
  the input stream line number at which the token starts
token_start_column::
  the input stream column at which the token starts
channel::
  the channel value of the target token
type::
  the type value of the target token
text::
  the text of the target token

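== Example: Sharing State

As a loose illustrative sketch (the parser class names below are
hypothetical, not part of this library), a single state object can be
handed to multiple recognizers via the +:state+ option accepted by
Recognizer's constructor:

  state  = ANTLR3::RecognizerSharedState.new
  master = MasterParser.new( tokens, :state => state )
  helper = HelperParser.new( tokens, :state => state )
  # both recognizers now share error-tracking and memoization data
  state.reset!  # restores the shared state to its default values
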
=end

class RecognizerSharedState
  def initialize
    super( [], false, -1, 0, nil, 0, nil, -1 )
    # ^-- same as this --v
    # self.following = []
    # self.error_recovery = false
    # self.last_error_index = -1
    # self.backtracking = 0
    # self.syntax_errors = 0
    # self.token_start_position = -1
  end


  # restores all of the state variables to their respective
  # initial default values
  def reset!
    self.following.clear
    self.error_recovery = false
    self.last_error_index = -1
    self.backtracking = 0
    self.rule_memory and rule_memory.clear
    self.syntax_errors = 0
    self.token = nil
    self.token_start_position = -1
    self.token_start_line = nil
    self.token_start_column = nil
    self.channel = nil
    self.type = nil
    self.text = nil
  end
end

end # unless const_defined?( :RecognizerSharedState )

=begin rdoc ANTLR3::Scope

= Scope

Scope is used to represent instances of ANTLR's various attribute scopes.
It is identical to Ruby's built-in Struct class, but it takes string
attribute declarations from the ANTLR grammar as parameters, and overrides
the #initialize method to set the default values if any are present in
the scope declaration.

  Block = Scope.new( "name", "depth = 0", "variables = {}" )
  Block.new                    # => #<struct Block name=nil, depth=0, variables={}>
  Block.new( "function" )      # => #<struct Block name="function", depth=0, variables={}>
  Block.new( 'a', 1, :x => 3 ) # => #<struct Block name="a", depth=1, variables={ :x => 3 }>

=end

class Scope < ::Struct
  def self.new( *declarations, &body )
    names = []
    defaults = {}
    for decl in declarations
      name, default = decl.to_s.split( /\s*=\s*/, 2 )
      names << ( name = name.to_sym )
      default and defaults[ name ] = default
    end
    super( *names ) do

      # If no defaults, leave the initialize method the same as
      # the struct's default initialize for speed. Otherwise,
      # overwrite the initialize to populate with default values.
      unless defaults.empty?
        parameters = names.map do | name |
          "#{ name } = " << defaults.fetch( name, 'nil' )
        end.join( ', ' )
        class_eval( <<-END )
          def initialize( #{ parameters } )
            super( #{ names.join( ', ' ) } )
          end
        END
      end

      body and class_eval( &body )
    end
  end
end
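
# The following is purely illustrative and not part of the library: since
# Scope#new class-evals an optional block into the generated struct, helper
# methods can be attached to scope instances. The +Counter+ scope below is
# hypothetical:
#
#   Counter = Scope.new( "count = 0" ) do
#     def tick!
#       self.count += 1
#     end
#   end
#
#   c = Counter.new
#   c.tick!
#   c.count  # => 1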

=begin rdoc ANTLR3::Recognizer

= Recognizer

As the base class of all ANTLR-generated recognizers, Recognizer provides
much of the shared functionality and structure used in the recognition process.
For all practical purposes, the class and its immediate subclasses Lexer,
Parser, and TreeParser are abstract classes. They can be instantiated, but
they're pretty useless on their own. Instead, to make useful code, you write an
ANTLR grammar and ANTLR will generate classes which inherit from one of the
recognizer base classes, providing the implementation of the grammar rules
itself. Recognizer defines methods related to:

* token and character matching
* prediction and recognition strategy
* recovering from errors
* reporting errors
* memoization
* simple rule tracing and debugging

=end

class Recognizer
  include Constants
  include Error
  include TokenFactory
  extend ClassMacros

  @rules = {}

  # inherited class methods and hooks
  class << self
    attr_reader :grammar_file_name,
                :antlr_version,
                :antlr_version_string,
                :library_version_string,
                :grammar_home

    attr_accessor :token_scheme, :default_rule

    # generated recognizer code uses this method to stamp
    # the code with the name of the grammar file and
    # the current version of ANTLR being used to generate
    # the code
    def generated_using( grammar_file, antlr_version, library_version = nil )
      @grammar_file_name = grammar_file.freeze
      @antlr_version_string = antlr_version.freeze
      @library_version = Util.parse_version( library_version )
      if @antlr_version_string =~ /^(\d+)\.(\d+)(?:\.(\d+)(?:b(\d+))?)?(.*)$/
        @antlr_version = [ $1, $2, $3, $4 ].map! { |str| str.to_i }
        timestamp = $5.strip
        #@antlr_release_time = $5.empty? ? nil : Time.parse($5)
      else
        raise "bad version string: %p" % @antlr_version_string
      end
    end

    # this method is used to generate return-value structures for
    # rules with multiple return values. To avoid generating
    # a special class for every rule in AST parsers and such
    # (where most rules have the same default set of return values),
    # each recognizer gets a default return value structure
    # assigned to the constant +Return+. Rules which don't
    # require additional custom members will have a rule-return
    # name constant that just points to the generic return
    # value.
    def define_return_scope( *members )
      if members.empty? then generic_return_scope
      else
        members += return_scope_members
        Struct.new( *members )
      end
    end
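
    # As a rough illustration (the rule name is hypothetical; this is not
    # actual generated code), a rule declared with +returns [name]+ would
    # produce a structure holding the custom member plus the defaults:
    #
    #   NameReturn = define_return_scope( :name )
    #   NameReturn.new.members  # roughly [ :name, :start, :stop ]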

    # used as a hook to add additional default members
    # to default return value structures
    # For example, all AST-building parsers override
    # this method to add an extra +:tree+ field to
    # all rule return structures.
    def return_scope_members
      [ :start, :stop ]
    end

    # sets up and returns the generic rule return
    # scope for a recognizer
    def generic_return_scope
      @generic_return_scope ||= begin
        struct = Struct.new( *return_scope_members )
        const_set( :Return, struct )
      end
    end

    def imported_grammars
      @imported_grammars ||= Set.new
    end

    def master_grammars
      @master_grammars ||= []
    end

    def master
      master_grammars.last
    end

    def masters( *grammar_names )
      for grammar in grammar_names
        unless master_grammars.include?( grammar )
          master_grammars << grammar
          attr_reader( Util.snake_case( grammar ) )
        end
      end
    end
    private :masters

    def imports( *grammar_names )
      for grammar in grammar_names
        imported_grammars.add?( grammar.to_sym ) and
          attr_reader( Util.snake_case( grammar ) )
      end
      return imported_grammars
    end
    private :imports

    def rules
      self::RULE_METHODS.dup rescue []
    end

    def default_rule
      @default_rule ||= rules.first
    end

    def debug?
      return false
    end

    def profile?
      return false
    end

    def Scope( *declarations, &body )
      Scope.new( *declarations, &body )
    end

    def token_class
      @token_class ||= begin
        self::Token            rescue
        superclass.token_class rescue
        ANTLR3::CommonToken
      end
    end
    private :generated_using
  end

  @grammar_file_name = nil
  @antlr_version = ANTLR3::ANTLR_VERSION
  @antlr_version_string = ANTLR3::ANTLR_VERSION_STRING

  def grammar_file_name
    self.class.grammar_file_name
  end

  def antlr_version
    self.class.antlr_version
  end

  def antlr_version_string
    self.class.antlr_version_string
  end

  attr_accessor :input
  attr_reader :state

  def each_delegate
    block_given? or return enum_for( __method__ )
    for grammar in self.class.imported_grammars
      del = __send__( Util.snake_case( grammar ) ) and
        yield( del )
    end
  end

  # Create a new recognizer. The constructor simply ensures that
  # all recognizers are initialized with a shared state object.
  # See the main recognizer subclasses for more specific
  # information about creating recognizer objects like
  # lexers and parsers.
  def initialize( options = {} )
    @state  = options[ :state ] || RecognizerSharedState.new
    @error_output = options.fetch( :error_output, $stderr )
    defined?( @input ) or @input = nil
    initialize_dfas
  end

  # Resets the recognizer's state data to initial values.
  # As a result, all error tracking and error recovery
  # data accumulated in the current state will be cleared.
  # It will also attempt to reset the input stream
  # via input.reset, but it ignores any errors received
  # from doing so. Thus the input stream is not guaranteed
  # to be rewound to its initial position.
  def reset
    @state and @state.reset!
    @input and @input.reset rescue nil
  end

  # Attempt to match the current input symbol to the token type
  # specified by +type+. If the symbol matches the type,
  # consume the current symbol and return its value. If
  # the symbol doesn't match, attempt to use the follow-set
  # data provided by +follow+ to recover from the mismatched
  # token.
  def match( type, follow )
    matched_symbol = current_symbol
    if @input.peek == type
      @input.consume
      @state.error_recovery = false
      return matched_symbol
    end
    raise( BacktrackingFailed ) if @state.backtracking > 0
    return recover_from_mismatched_token( type, follow )
  end
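
  # A hedged sketch of how generated parser code typically drives #match --
  # the token type and follow-set constant names here are hypothetical:
  #
  #   id_token = match( ID, TOKENS_FOLLOWING_ID_IN_declaration )
  #   # => consumes and returns the ID token, or attempts recovery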

  # match anything -- i.e. wildcard match. Simply consume
  # the current symbol from the input stream.
  def match_any
    @state.error_recovery = false
    @input.consume
  end

  ##############################################################################################
  ###################################### Error Reporting #######################################
  ##############################################################################################

  # When a recognition error occurs, this method is the main
  # hook for carrying out the error reporting process. The
  # default implementation calls +display_recognition_error+
  # to display the error info on $stderr.
  def report_error( e = $! )
    @state.error_recovery and return
    @state.syntax_errors += 1
    @state.error_recovery = true
    display_recognition_error( e )
  end

  # error reporting hook for presenting the information
  # The default implementation builds appropriate error
  # message text using +error_header+ and +error_message+,
  # and calls +emit_error_message+ to write the error
  # message out to some source
  def display_recognition_error( e = $! )
    header = error_header( e )
    message = error_message( e )
    emit_error_message( "#{ header } #{ message }" )
  end

  # used to construct an appropriate error message
  # based on the specific type of error and the
  # error's attributes
  def error_message( e = $! )
    case e
    when UnwantedToken
      token_name = token_name( e.expecting )
      "extraneous input #{ token_error_display( e.unexpected_token ) } expecting #{ token_name }"
    when MissingToken
      token_name = token_name( e.expecting )
      "missing #{ token_name } at #{ token_error_display( e.symbol ) }"
    when MismatchedToken
      token_name = token_name( e.expecting )
      "mismatched input #{ token_error_display( e.symbol ) } expecting #{ token_name }"
    when MismatchedTreeNode
      token_name = token_name( e.expecting )
      "mismatched tree node: #{ e.symbol } expecting #{ token_name }"
    when NoViableAlternative
      "no viable alternative at input " << token_error_display( e.symbol )
    when MismatchedSet
      "mismatched input %s expecting set %s" %
        [ token_error_display( e.symbol ), e.expecting.inspect ]
    when MismatchedNotSet
      "mismatched input %s expecting set %s" %
        [ token_error_display( e.symbol ), e.expecting.inspect ]
    when FailedPredicate
      "rule %s failed predicate: { %s }?" % [ e.rule_name, e.predicate_text ]
    else e.message
    end
  end

  #
  # used to add a tag to the error message that indicates
  # the location of the input stream when the error
  # occurred
  #
  def error_header( e = $! )
    e.location
  end

  #
  # formats a token object appropriately for inspection
  # within an error message
  #
  def token_error_display( token )
    unless text = token.text || ( token.source_text rescue nil )
      text =
        case
        when token.type == EOF then '<EOF>'
        when name = token_name( token.type ) rescue nil then "<#{ name }>"
        when token.respond_to?( :name ) then "<#{ token.name }>"
        else "<#{ token.type }>"
        end
    end
    return text.inspect
  end

  #
  # Write the error report data out to some source. By default,
  # the error message is written to $stderr
  #
  def emit_error_message( message )
    @error_output.puts( message ) if @error_output
  end
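
  # Because @error_output defaults to $stderr but is configurable via the
  # :error_output option to #initialize, error text can be captured rather
  # than printed. A small sketch (the parser class is hypothetical;
  # StringIO is from the Ruby standard library):
  #
  #   require 'stringio'
  #   log    = StringIO.new
  #   parser = Hypothetical::Parser.new( tokens, :error_output => log )
  #   # after parsing, log.string holds any emitted error messages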

  ##############################################################################################
  ###################################### Error Recovery ########################################
  ##############################################################################################

  def recover( error = $! )
    @state.last_error_index == @input.index and @input.consume
    @state.last_error_index = @input.index

    follow_set = compute_error_recovery_set

    resync { consume_until( follow_set ) }
  end

  def resync
    begin_resync
    return( yield )
  ensure
    end_resync
  end

  # overridable hook method that is executed at the start of the
  # resyncing procedure in recover
  #
  # by default, it does nothing
  def begin_resync
    # do nothing
  end

  # overridable hook method that is executed after the resyncing
  # procedure has completed
  #
  # by default, it does nothing
  def end_resync
    # do nothing
  end

  # (The following explanation has been lifted directly from the
  #  source code documentation of the ANTLR Java runtime library)
  #
  # Compute the error recovery set for the current rule.  During
  # rule invocation, the parser pushes the set of tokens that can
  # follow that rule reference on the stack; this amounts to
  # computing FIRST of what follows the rule reference in the
  # enclosing rule. This local follow set only includes tokens
  # from within the rule; i.e., the FIRST computation done by
  # ANTLR stops at the end of a rule.
  #
  # EXAMPLE
  #
  # When you find a "no viable alt exception", the input is not
  # consistent with any of the alternatives for rule r.  The best
  # thing to do is to consume tokens until you see something that
  # can legally follow a call to r *or* any rule that called r.
  # You don't want the exact set of viable next tokens because the
  # input might just be missing a token--you might consume the
  # rest of the input looking for one of the missing tokens.
  #
  # Consider grammar:
  #
  #   a : '[' b ']'
  #     | '(' b ')'
  #     ;
  #   b : c '^' INT ;
  #   c : ID
  #     | INT
  #     ;
  #
  # At each rule invocation, the set of tokens that could follow
  # that rule is pushed on a stack.  Here are the various "local"
  # follow sets:
  #
  #   FOLLOW( b1_in_a ) = FIRST( ']' ) = ']'
  #   FOLLOW( b2_in_a ) = FIRST( ')' ) = ')'
  #   FOLLOW( c_in_b ) = FIRST( '^' ) = '^'
  #
  # Upon erroneous input "[]", the call chain is
  #
  #   a -> b -> c
  #
  # and, hence, the follow context stack is:
  #
  #   depth  local follow set     after call to rule
  #     0         \<EOF>                   a (from main( ) )
  #     1          ']'                     b
  #     2          '^'                     c
  #
  # Notice that <tt>')'</tt> is not included, because b would have to have
  # been called from a different context in rule a for ')' to be
  # included.
  #
  # For error recovery, we cannot consider FOLLOW(c)
  # (context-sensitive or otherwise).  We need the combined set of
  # all context-sensitive FOLLOW sets--the set of all tokens that
  # could follow any reference in the call chain.  We need to
  # resync to one of those tokens.  Note that FOLLOW(c)='^' and if
  # we resync'd to that token, we'd consume until EOF.  We need to
  # sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
  # In this case, for input "[]", LA(1) is in this set so we would
  # not consume anything and after printing an error rule c would
  # return normally.  It would not find the required '^' though.
  # At this point, it gets a mismatched token error and throws an
  # exception (since LA(1) is not in the viable following token
  # set).  The rule exception handler tries to recover, but finds
  # the same recovery set and doesn't consume anything.  Rule b
  # exits normally returning to rule a.  Now it finds the ']' (and
  # with the successful match exits errorRecovery mode).
  #
  # So, you can see that the parser walks up the call chain looking
  # for the token that was a member of the recovery set.
  #
  # Errors are not generated in errorRecovery mode.
  #
  # ANTLR's error recovery mechanism is based upon original ideas:
  #
  # "Algorithms + Data Structures = Programs" by Niklaus Wirth
  #
  # and
  #
  # "A note on error recovery in recursive descent parsers":
  # http://portal.acm.org/citation.cfm?id=947902.947905
  #
  # Later, Josef Grosch had some good ideas:
  #
  # "Efficient and Comfortable Error Recovery in Recursive Descent
  # Parsers":
  # ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
  #
  # Like Grosch I implemented local FOLLOW sets that are combined
  # at run-time upon error to avoid overhead during parsing.
  def compute_error_recovery_set
    combine_follows( false )
  end

  def recover_from_mismatched_token( type, follow )
    if mismatch_is_unwanted_token?( type )
      err = UnwantedToken( type )
      resync { @input.consume }
      report_error( err )

      return @input.consume
    end

    if mismatch_is_missing_token?( follow )
      inserted = missing_symbol( nil, type, follow )
      report_error( MissingToken( type, inserted ) )
      return inserted
    end

    raise MismatchedToken( type )
  end

  def recover_from_mismatched_set( e, follow )
    if mismatch_is_missing_token?( follow )
      report_error( e )
      return missing_symbol( e, INVALID_TOKEN_TYPE, follow )
    end
    raise e
  end

  def recover_from_mismatched_element( e, follow )
    follow.nil? and return false
    if follow.include?( EOR_TOKEN_TYPE )
      viable_tokens = compute_context_sensitive_rule_follow
      follow = ( follow | viable_tokens ) - Set[ EOR_TOKEN_TYPE ]
    end
    if follow.include?( @input.peek )
      report_error( e )
      return true
    end
    return false
  end

  # Conjure up a missing token during error recovery.
  #
  # The recognizer attempts to recover from single missing
  # symbols. But, actions might refer to that missing symbol.
  # For example, x=ID {f($x);}. The action clearly assumes
  # that there has been an identifier matched previously and that
  # $x points at that token. If that token is missing, but
  # the next token in the stream is what we want we assume that
  # this token is missing and we keep going. Because we
  # have to return some token to replace the missing token,
  # we have to conjure one up. This method gives the user control
  # over the tokens returned for missing tokens. Mostly,
  # you will want to create something special for identifier
  # tokens. For literals such as '{' and ',', the default
  # action in the parser or tree parser works. It simply creates
  # a CommonToken of the appropriate type. The text will be the token.
  # If you change what tokens must be created by the lexer,
  # override this method to create the appropriate tokens.
  def missing_symbol( error, expected_token_type, follow )
    return nil
  end

  def mismatch_is_unwanted_token?( type )
    @input.peek( 2 ) == type
  end

  def mismatch_is_missing_token?( follow )
    follow.nil? and return false
    if follow.include?( EOR_TOKEN_TYPE )
      viable_tokens = compute_context_sensitive_rule_follow
      follow = follow | viable_tokens

      follow.delete( EOR_TOKEN_TYPE ) unless @state.following.empty?
    end
    if follow.include?( @input.peek ) or follow.include?( EOR_TOKEN_TYPE )
      return true
    end
    return false
  end

  def syntax_errors?
    ( error_count = @state.syntax_errors ) > 0 and return( error_count )
  end

  # factor out what to do upon token mismatch so
  # tree parsers can behave differently.
  #
  # * override this method in your parser to do things
  #   like bailing out after the first error
  # * just raise the exception instead of
  #   calling the recovery method.
  #
  def number_of_syntax_errors
    @state.syntax_errors
  end

  #
  # Compute the context-sensitive +FOLLOW+ set for the current rule.
  # This is the set of token types that can follow a specific rule
  # reference given a specific call chain.  You get the set of
  # viable tokens that can possibly come next (look depth 1)
  # given the current call chain.  Contrast this with the
  # definition of plain FOLLOW for rule r:
  #
  #    FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
  #
  # where x in T* and alpha, beta in V*; T is set of terminals and
  # V is the set of terminals and nonterminals.  In other words,
  # FOLLOW(r) is the set of all tokens that can possibly follow
  # references to r in *any* sentential form (context).  At
  # runtime, however, we know precisely which context applies as
  # we have the call chain.  We may compute the exact (rather
  # than covering superset) set of following tokens.
  #
  # For example, consider grammar:
  #
  #   stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
  #        | "return" expr '.'
  #        ;
  #   expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
  #   atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
  #        | '(' expr ')'
  #        ;
  #
  # The FOLLOW sets are all inclusive whereas context-sensitive
  # FOLLOW sets are precisely what could follow a rule reference.
  # For input "i=(3);", here is the derivation:
  #
  #   stat => ID '=' expr ';'
  #        => ID '=' atom ('+' atom)* ';'
  #        => ID '=' '(' expr ')' ('+' atom)* ';'
  #        => ID '=' '(' atom ')' ('+' atom)* ';'
  #        => ID '=' '(' INT ')' ('+' atom)* ';'
  #        => ID '=' '(' INT ')' ';'
  #
  # At the "3" token, you'd have a call chain of
  #
  #   stat -> expr -> atom -> expr -> atom
  #
  # What can follow that specific nested ref to atom?  Exactly ')'
  # as you can see by looking at the derivation of this specific
  # input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
  #
  # You want the exact viable token set when recovering from a
  # token mismatch.  Upon token mismatch, if LA(1) is member of
  # the viable next token set, then you know there is most likely
  # a missing token in the input stream.  "Insert" one by just not
  # throwing an exception.
  #
  def compute_context_sensitive_rule_follow
    combine_follows true
  end

  def combine_follows( exact )
    follow_set = Set.new
    @state.following.each_with_index.reverse_each do |local_follow_set, index|
      follow_set |= local_follow_set
      if exact
        if local_follow_set.include?( EOR_TOKEN_TYPE )
          follow_set.delete( EOR_TOKEN_TYPE ) if index > 0
        else
          break
        end
      end
    end
    return follow_set
  end

  #
  # Match needs to return the current input symbol, which gets put
  # into the label for the associated token ref; e.g., x=ID.  Token
  # and tree parsers need to return different objects. Rather than test
  # for input stream type or change the IntStream interface, I use
  # a simple method to ask the recognizer to tell me what the current
  # input symbol is.
  #
  # This is ignored for lexers.
  #
  def current_symbol
    @input.look
  end

  #
  # Consume input symbols until one matches a type within types
  #
  # types can be a single symbol type or a set of symbol types
  #
  def consume_until( types )
    types.is_a?( Set ) or types = Set[ *types ]
    type = @input.peek
    until type == EOF or types.include?( type )
      @input.consume
      type = @input.peek
    end
    return( type )
  end

  #
  # Returns true if the recognizer is currently in a decision for which
  # backtracking has been enabled
  #
  def backtracking?
    @state.backtracking > 0
  end

  def backtracking_level
    @state.backtracking
  end

  def backtracking_level=( n )
    @state.backtracking = n
  end

  def backtrack
    @state.backtracking += 1
    start = @input.mark
    success =
      begin yield
      rescue BacktrackingFailed then false
      else true
      end
    return success
  ensure
    @input.rewind( start )
    @state.backtracking -= 1
  end

  def syntactic_predicate?( name )
    backtrack { send name }
  end
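
  # Sketch: #backtrack marks the input, increments the backtracking depth,
  # runs the block, and rewinds afterward, converting a BacktrackingFailed
  # error into +false+. Generated code uses it roughly like this (the
  # synpred method name is hypothetical):
  #
  #   if syntactic_predicate?( :synpred1_Expr )
  #     # input ahead matches the predicated alternative
  #   end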

  alias backtracking backtracking_level
  alias backtracking= backtracking_level=

  def rule_memoization( rule, start_index )
    @state.rule_memory.fetch( rule ) do
      @state.rule_memory[ rule ] = Hash.new( MEMO_RULE_UNKNOWN )
    end[ start_index ]
  end

  def already_parsed_rule?( rule )
    stop_index = rule_memoization( rule, @input.index )
    case stop_index
    when MEMO_RULE_UNKNOWN then return false
    when MEMO_RULE_FAILED
      raise BacktrackingFailed
    else
      @input.seek( stop_index + 1 )
    end
    return true
  end

  def memoize( rule, start_index, success )
    stop_index = success ? @input.index - 1 : MEMO_RULE_FAILED
    memo = @state.rule_memory[ rule ] and memo[ start_index ] = stop_index
  end
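
  # Sketch of the memoization protocol, roughly as a generated rule method
  # applies it when the +memoize+ grammar option is enabled (the rule name
  # and overall shape are illustrative, not actual generated code):
  #
  #   def declaration
  #     start_index = @input.index
  #     return if @state.rule_memory and already_parsed_rule?( :declaration )
  #     success = false
  #     begin
  #       # ... match the rule body ...
  #       success = true
  #     ensure
  #       @state.rule_memory and memoize( :declaration, start_index, success )
  #     end
  #   end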

  def trace_in( rule_name, rule_index, input_symbol )
    @error_output.printf( "--> enter %s on %s", rule_name, input_symbol )
    @state.backtracking > 0 and @error_output.printf(
      " (in backtracking mode: depth = %s)", @state.backtracking
    )
    @error_output.print( "\n" )
  end

  def trace_out( rule_name, rule_index, input_symbol )
    @error_output.printf( "<-- exit %s on %s", rule_name, input_symbol )
    @state.backtracking > 0 and @error_output.printf(
      " (in backtracking mode: depth = %s)", @state.backtracking
    )
    @error_output.print( "\n" )
  end

private

  def initialize_dfas
    # do nothing
  end
end


# constant alias for compatibility with older versions of the
# runtime library
BaseRecognizer = Recognizer

=begin rdoc ANTLR3::Lexer

= Lexer

Lexer is the default superclass of all lexers generated by ANTLR. The class
tailors the core functionality provided by Recognizer to the task of
matching patterns in the text input and breaking the input into tokens.

== About Lexers

A lexer's job is to take input text and break it up into _tokens_ -- objects
that encapsulate a piece of text, a type label (such as ID or INTEGER), and the
position of the text with respect to the input. Thus, a lexer is essentially a
complicated iterator that steps through an input stream and produces a sequence
of tokens. Sometimes lexers are enough to carry out a goal on their own, such as
tasks like source code highlighting and simple code analysis. Usually, however,
the lexer converts text into tokens for use by a parser, which recognizes larger
structures within the text.

ANTLR parsers have a variety of entry points specified by parser rules, each of
which defines the structure of a specific type of sentence in a grammar. Lexers,
however, are primarily intended to have a single entry point. A lexer looks at
the characters starting at the current input position, decides whether the chunk
of text matches one of a number of possible token type definitions, wraps the
chunk into a token with information on its type and location, and advances the
input stream to the next position.

== ANTLR Lexers and the Lexer API

ANTLR-generated lexers will subclass this class, unless specified otherwise
within a grammar file. The generated class will provide an implementation of
each lexer rule as a method of the same name. The subclass will also provide an
implementation for the abstract method #token!, the purpose of which is to
multiplex the token type definitions and predict what rule definition to execute
to fetch a token. The primary method in the lexer API, #next_token, uses
#token! to fetch the next token and drive the iteration.

If the lexer is preparing tokens for use by an ANTLR generated parser, the lexer
will generally be used to build a TokenStream object. The following code example
demonstrates the typical setup for using ANTLR parsers and lexers in Ruby.

  # in HypotheticalLexer.rb
  module Hypothetical
  class Lexer < ANTLR3::Lexer
    # ...
    # ANTLR generated code
    # ...
  end
  end

  # in HypotheticalParser.rb
  module Hypothetical
  class Parser < ANTLR3::Parser
    # ...
    # more ANTLR generated code
    # ...
  end
  end

  # to take hypothetical source code and prepare it for parsing,
  # there is generally a four-step construction process

  source = "some hypothetical source code"
  input = ANTLR3::StringStream.new(source, :file => 'blah-de-blah.hyp')
  lexer = Hypothetical::Lexer.new( input )
  tokens = ANTLR3::CommonTokenStream.new( lexer )
  parser = Hypothetical::Parser.new( tokens )

  # if you're using the standard streams, ANTLR3::StringStream and
  # ANTLR3::CommonTokenStream, you can write the same process
  # shown above more succinctly:

  lexer  = Hypothetical::Lexer.new("some hypothetical source code", :file => 'blah-de-blah.hyp')
  parser = Hypothetical::Parser.new( lexer )

=end
class Lexer < Recognizer
  include TokenSource
  @token_class = CommonToken

  def self.default_rule
    @default_rule ||= :token!
  end

  def self.main( argv = ARGV, options = {} )
    if argv.is_a?( ::Hash ) then argv, options = ARGV, argv end
    main = ANTLR3::Main::LexerMain.new( self, options )
    block_given? ? yield( main ) : main.execute( argv )
  end

  def self.associated_parser
    @associated_parser ||= begin
      @grammar_home and @grammar_home::Parser
    rescue NameError
      grammar_name = @grammar_home.name.split( "::" ).last
      begin
        require "#{ grammar_name }Parser"
        @grammar_home::Parser
      rescue LoadError, NameError
      end
    end
  end

  def initialize( input, options = {} )
    super( options )
    @input = cast_input( input, options )
  end

  def current_symbol
    nil
  end

  def next_token
    loop do
      @state.token = nil
      @state.channel = DEFAULT_CHANNEL
      @state.token_start_position = @input.index
      @state.token_start_column = @input.column
      @state.token_start_line = @input.line
      @state.text = nil
      @input.peek == EOF and return EOF_TOKEN
      begin
        token!

        case token = @state.token
        when nil then return( emit )
        when SKIP_TOKEN then next
        else
          return token
        end
      rescue NoViableAlternative => re
        report_error( re )
        recover( re )
      rescue Error::RecognitionError => re
        report_error( re )
      end
    end
  end

  def skip
    @state.token = SKIP_TOKEN
  end

  abstract :token!

  def exhaust
    self.to_a
  end
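
  # Since Lexer includes TokenSource, a lexer can be treated as a simple
  # token iterator. A brief sketch using the hypothetical lexer class from
  # the documentation above:
  #
  #   lexer = Hypothetical::Lexer.new( "some hypothetical source code" )
  #   lexer.next_token  # => the first token
  #   lexer.exhaust     # => an array of all remaining tokens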

  def char_stream=( input )
    @input = nil
    reset()
    @input = input
  end

  def source_name
    @input.source_name
  end

  def emit( token = @state.token )
    token ||= create_token
    @state.token = token
    return token
  end

  def match( expected )
    case expected
    when String
      expected.each_byte do |char|
        unless @input.peek == char
          @state.backtracking > 0 and raise BacktrackingFailed
          error = MismatchedToken( char )
          recover( error )
          raise error
        end
        @input.consume()
      end
    else # single integer character
      unless @input.peek == expected
        @state.backtracking > 0 and raise BacktrackingFailed
        error = MismatchedToken( expected )
        recover( error )
        raise error
      end
      @input.consume
    end
    return true
  end

  def match_any
    @input.consume
  end

  def match_range( min, max )
    char = @input.peek
    if char.between?( min, max ) then @input.consume
    else
      @state.backtracking > 0 and raise BacktrackingFailed
      error = MismatchedRange( min.chr, max.chr )
      recover( error )
      raise( error )
    end
    return true
  end
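
  # Sketch: a generated lexer rule method drives these matching primitives
  # roughly as below (the rule and token-type names are hypothetical, and
  # real generated code is considerably more involved):
  #
  #   def hex!
  #     match( "0x" )              # match the literal prefix text
  #     match_range( 0x30, 0x39 )  # match one character in '0'..'9'
  #     @state.type = HEX
  #   end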

  def line
    @input.line
  end

  def column
    @input.column
  end

  def character_index
    @input.index
  end

  def text
    @state.text and return @state.text
    @input.substring( @state.token_start_position, character_index - 1 )
  end

  def text=( text )
    @state.text = text
  end

  def report_error( e )
    display_recognition_error( e )
  end

  def error_message( e )
    char = character_error_display( e.symbol ) rescue nil
    case e
    when Error::MismatchedToken
      expecting = character_error_display( e.expecting )
      "mismatched character #{ char }; expecting #{ expecting }"
    when Error::NoViableAlternative
      "no viable alternative at character #{ char }"
    when Error::EarlyExit
      "required ( ... )+ loop did not match anything at character #{ char }"
    when Error::MismatchedNotSet
      "mismatched character %s; expecting set %p" % [ char, e.expecting ]
    when Error::MismatchedSet
      "mismatched character %s; expecting set %p" % [ char, e.expecting ]
    when Error::MismatchedRange
      a = character_error_display( e.min )
      b = character_error_display( e.max )
      "mismatched character %s; expecting set %s..%s" % [ char, a, b ]
    else super
    end
  end

  def character_error_display( char )
    case char
    when EOF then '<EOF>'
    when Integer then char.chr.inspect
    else char.inspect
    end
  end

  def recover( re )
    @input.consume
  end

  alias input= char_stream=

private

  def cast_input( input, options )
    case input
    when CharacterStream then input
    when ::String then StringStream.new( input, options )
    when ::IO, ARGF then FileStream.new( input, options )
    else input
    end
  end

  def trace_in( rule_name, rule_index )
    if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
    else symbol = '<EOF>' end
    input_symbol = "#{ symbol } @ line #{ line } / col #{ column }"
    super( rule_name, rule_index, input_symbol )
  end

  def trace_out( rule_name, rule_index )
    if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
    else symbol = '<EOF>' end
    input_symbol = "#{ symbol } @ line #{ line } / col #{ column }"
    super( rule_name, rule_index, input_symbol )
  end

  def create_token( &b )
    if block_given? then super( &b )
    else
      super do |t|
        t.input = @input
        t.type = @state.type
        t.channel = @state.channel
        t.start = @state.token_start_position
        t.stop = @input.index - 1
        t.line = @state.token_start_line
        t.text = self.text
        t.column = @state.token_start_column
      end
    end
  end
end


=begin rdoc ANTLR3::Parser

= Parser

Parser is the default base class of ANTLR-generated parser classes. The class
tailors the functionality provided by Recognizer to the task of parsing.

== About Parsing

This is just a loose overview of parsing. For considerably more in-depth coverage
of the topic, read the ANTLR documentation or check out the ANTLR website
(http://www.antlr.org).

A grammar defines the vocabulary and the sentence structure of a language. While
a lexer concerns the basic vocabulary symbols of the language, a parser's
primary task is to implement the sentence structure.

Parsers are set up by providing a stream of tokens, which is usually created by
a corresponding lexer. Then, the user requests a specific sentence-structure
within the grammar, such as "class_definition" or "xml_node", from the parser.
It iterates through the tokens, verifying the syntax of the sentence and
performing actions specified by the grammar. It stops when it encounters an
error or when it has matched the full sentence according to its defined
structure.
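
For example, with a hypothetical grammar defining a parser rule named
+class_definition+, a parse would look roughly like:

  tokens = ANTLR3::CommonTokenStream.new( lexer )
  parser = Hypothetical::Parser.new( tokens )
  result = parser.class_definition  # parse one class_definition sentence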

== ANTLR Parsers and the Parser API

Plain ANTLR-generated parsers directly subclass this class, unless specified
otherwise within the grammar options. The generated code will provide a method
for each parser rule defined in the ANTLR grammar, as well as any other
customized member attributes and methods specified in the source grammar.

This class does not override much of the functionality in Recognizer, and
thus the API closely mirrors Recognizer.

=end
class Parser < Recognizer
  def self.main( argv = ARGV, options = {} )
    if argv.is_a?( ::Hash ) then argv, options = ARGV, argv end
    main = ANTLR3::Main::ParserMain.new( self, options )
    block_given? ? yield( main ) : main.execute( argv )
  end

  def self.associated_lexer
    @associated_lexer ||= begin
      @grammar_home and @grammar_home::Lexer
    rescue NameError
      grammar_name = @grammar_home.name.split( "::" ).last
      begin
        require "#{ grammar_name }Lexer"
        @grammar_home::Lexer
      rescue LoadError, NameError
      end
    end
  end


  def initialize( input, options = {} )
    super( options )
    @input = nil
    reset
    @input = cast_input( input, options )
  end

  def missing_symbol( error, expected_type, follow )
    current = @input.look
    current = @input.look( -1 ) if current == ANTLR3::EOF_TOKEN
    t =
      case
      when current && current != ANTLR3::EOF_TOKEN then current.clone
      when @input.token_class then @input.token_class.new
      else ( create_token rescue CommonToken.new )
      end

    t.type = expected_type
    name = t.name.gsub( /(^<)|(>$)/,'' )
    t.text = "<missing #{ name }>"
    t.channel = DEFAULT_CHANNEL
    return( t )
  end

  def token_stream=( input )
    @input = nil
    reset
    @input = input
  end
  alias token_stream input

  def source_name
    @input.source_name
  end


private

  def trace_in( rule_name, rule_index )
    super( rule_name, rule_index, @input.look.inspect )
  end

  def trace_out( rule_name, rule_index )
    super( rule_name, rule_index, @input.look.inspect )
  end

  def cast_input( input, options )
    case input
    when TokenStream then input
    when TokenSource then CommonTokenStream.new( input, options )
    when IO, String, CharacterStream
      if lexer_class = self.class.associated_lexer
        CommonTokenStream.new( lexer_class.new( input, options ), options )
      else
        raise ArgumentError, Util.tidy( <<-END, true )
        | unable to automatically convert input #{ input.inspect }
        | to an ANTLR3::TokenStream object as #{ self.class }
        | does not appear to have an associated lexer class
        END
      end
    else
      # assume it's a stream if it at least implements peek and consume
      unless input.respond_to?( :peek ) and input.respond_to?( :consume )
        raise ArgumentError, Util.tidy( <<-END, true )
        | #{ self.class } requires a token stream as input, but
        | #{ input.inspect } was provided
        END
      end
      input
    end
  end

end

end