1 #!/usr/bin/ruby 2 # encoding: utf-8 3 4 =begin LICENSE 5 6 [The "BSD licence"] 7 Copyright (c) 2009-2010 Kyle Yetter 8 All rights reserved. 9 10 Redistribution and use in source and binary forms, with or without 11 modification, are permitted provided that the following conditions 12 are met: 13 14 1. Redistributions of source code must retain the above copyright 15 notice, this list of conditions and the following disclaimer. 16 2. Redistributions in binary form must reproduce the above copyright 17 notice, this list of conditions and the following disclaimer in the 18 documentation and/or other materials provided with the distribution. 19 3. The name of the author may not be used to endorse or promote products 20 derived from this software without specific prior written permission. 21 22 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 23 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 24 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 25 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 27 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 31 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 33 =end 34 35 module ANTLR3 36 37 =begin rdoc ANTLR3::Token 38 39 At a minimum, tokens are data structures that bind together a chunk of text and 40 a corresponding type symbol, which categorizes/characterizes the content of the 41 text. Tokens also usually carry information about their location in the input, 42 such as absolute character index, line number, and position within the line (or 43 column). 
44 45 Furthermore, ANTLR tokens are assigned a "channel" number, an extra degree of 46 categorization that groups things on a larger scale. Parsers will usually ignore 47 tokens that have channel value 99 (the HIDDEN_CHANNEL), so you can keep things 48 like comment and white space huddled together with neighboring tokens, 49 effectively ignoring them without discarding them. 50 51 ANTLR tokens also keep a reference to the source stream from which they 52 originated. Token streams will also provide an index value for the token, which 53 indicates the position of the token relative to other tokens in the stream, 54 starting at zero. For example, the 22nd token pulled from a lexer by 55 CommonTokenStream will have index value 21. 56 57 == Token as an Interface 58 59 This library provides a token implementation (see CommonToken). Additionally, 60 you may write your own token class as long as you provide methods that give 61 access to the attributes expected by a token. Even though most of the ANTLR 62 library tries to use duck-typing techniques instead of pure object-oriented type 63 checking, it's a good idea to include this ANTLR3::Token into your customized 64 token class. 
=end

module Token
  include ANTLR3::Constants
  include Comparable

  # the token's associated chunk of text
  attr_accessor :text

  # the integer value associated with the token's type
  attr_accessor :type

  # the text's starting line number within the source (indexed starting at 1)
  attr_accessor :line

  # the text's starting position in the line within the source (indexed starting at 0)
  attr_accessor :column

  # the integer value of the channel to which the token is assigned
  attr_accessor :channel

  # the index of the token with respect to the other tokens produced during lexing
  attr_accessor :index

  # a reference to the input stream from which the token was extracted
  attr_accessor :input

  # the absolute character index in the input at which the text starts
  attr_accessor :start

  # the absolute character index in the input at which the text ends
  attr_accessor :stop

  alias :input_stream :input
  alias :input_stream= :input=
  alias :token_index :index
  alias :token_index= :index=

  #
  # The match operator has been implemented to match against several different
  # attributes of a token for convenience in quick scripts
  #
  # @example Match against an integer token type constant
  #   token =~ VARIABLE_NAME   => true/false
  # @example Match against a token type name as a Symbol
  #   token =~ :FLOAT          => true/false
  # @example Match the token text against a Regular Expression
  #   token =~ /^@[a-z_]\w*$/i
  # @example Compare the token's text to a string
  #   token =~ "class"
  #
  def =~ obj
    case obj
    when Integer then type == obj
    when Symbol then name == obj.to_s
    when Regexp then obj =~ text
    when String then text == obj
    else super
    end
  end

  #
  # Tokens are comparable by their stream index values
  #
  def <=> tk2
    index <=> tk2.index
  end

  # invoked by #clone / #dup: reset the stream index to -1 (the copy is not
  # part of any stream) and clone the text so the copy can mutate it freely
  def initialize_copy( orig )
    self.index = -1
    self.type = orig.type
    self.channel = orig.channel
    self.text = orig.text.clone if orig.text
    self.start = orig.start
    self.stop = orig.stop
    self.line = orig.line
    self.column = orig.column
    self.input = orig.input
  end

  # true when the token is tied to actual source text: it has an input
  # stream and both character-bound indexes
  def concrete?
    input && start && stop ? true : false
  end

  # true when the token is not anchored to source text (e.g. a synthesized
  # or "imaginary" token); the logical negation of #concrete?
  def imaginary?
    input && start && stop ? false : true
  end

  # the name of the token's type, looked up via the private #token_name hook
  def name
    token_name( type )
  end

  # the name of the source stream this token came from, or nil if the token
  # has no input stream
  def source_name
    i = input and i.source_name
  end

  # true if the token is assigned to the hidden channel
  def hidden?
    channel == HIDDEN_CHANNEL
  end

  # the token's original text extracted from the input stream when the token
  # is concrete; falls back to the #text attribute otherwise
  def source_text
    concrete? ? input.substring( start, stop ) : text
  end

  #
  # Sets the token's channel value to HIDDEN_CHANNEL
  #
  def hide!
    self.channel = HIDDEN_CHANNEL
  end

  # compact single-line description of the token: index, type name, text,
  # line/column position, stream range and (non-default) channel
  def inspect
    text_inspect = text ? "[#{ text.inspect }] " : ' '
    text_position = line > 0 ? "@ line #{ line } col #{ column } " : ''
    stream_position = start ? "(#{ range.inspect })" : ''

    front = index >= 0 ? "#{ index } " : ''
    rep = front << name << text_inspect <<
          text_position << stream_position
    rep.strip!
    channel == DEFAULT_CHANNEL or rep << " (#{ channel.to_s })"
    return( rep )
  end

  def pretty_print( printer )
    printer.text( inspect )
  end

  # the start..stop character-index range of the token; nil if either bound
  # is missing (the rescue covers nil bounds)
  def range
    start..stop rescue nil
  end

  # tokens coerce to their stream index
  def to_i
    index.to_i
  end

  # tokens coerce to their text (empty string when text is nil)
  def to_s
    text.to_s
  end

private

  # default type-name lookup; token schemes override this to supply
  # grammar-specific names
  def token_name( type )
    BUILT_IN_TOKEN_NAMES[ type ]
  end
end

CommonToken = Struct.new( :type, :channel, :text, :input, :start,
                          :stop, :index, :line, :column )

=begin rdoc ANTLR3::CommonToken

The base class for the standard implementation of Token. It is implemented as a
simple Struct as tokens are basically simple data structures binding together a
bunch of different information and Structs are slightly faster than a standard
Object with accessor methods implementation.
222 223 By default, ANTLR generated ruby code will provide a customized subclass of 224 CommonToken to track token-type names efficiently for debugging, inspection, and 225 general utility. Thus code generated for a standard combo lexer-parser grammar 226 named XYZ will have a base module named XYZ and a customized CommonToken 227 subclass named XYZ::Token. 228 229 Here is the token structure attribute list in order: 230 231 * <tt>type</tt> 232 * <tt>channel</tt> 233 * <tt>text</tt> 234 * <tt>input</tt> 235 * <tt>start</tt> 236 * <tt>stop</tt> 237 * <tt>index</tt> 238 * <tt>line</tt> 239 * <tt>column</tt> 240 241 =end 242 243 class CommonToken 244 include Token 245 DEFAULT_VALUES = { 246 :channel => DEFAULT_CHANNEL, 247 :index => -1, 248 :line => 0, 249 :column => -1 250 }.freeze 251 252 def self.token_name( type ) 253 BUILT_IN_TOKEN_NAMES[ type ] 254 end 255 256 def self.create( fields = {} ) 257 fields = DEFAULT_VALUES.merge( fields ) 258 args = members.map { |name| fields[ name.to_sym ] } 259 new( *args ) 260 end 261 262 # allows you to make a copy of a token with a different class 263 def self.from_token( token ) 264 new( 265 token.type, token.channel, token.text ? token.text.clone : nil, 266 token.input, token.start, token.stop, -1, token.line, token.column 267 ) 268 end 269 270 def initialize( type = nil, channel = DEFAULT_CHANNEL, text = nil, 271 input = nil, start = nil, stop = nil, index = -1, 272 line = 0, column = -1 ) 273 super 274 block_given? and yield( self ) 275 self.text.nil? 
&& self.start && self.stop and 276 self.text = self.input.substring( self.start, self.stop ) 277 end 278 279 alias :input_stream :input 280 alias :input_stream= :input= 281 alias :token_index :index 282 alias :token_index= :index= 283 end 284 285 module Constants 286 287 # End of File / End of Input character and token type 288 EOF_TOKEN = CommonToken.new( EOF ).freeze 289 INVALID_TOKEN = CommonToken.new( INVALID_TOKEN_TYPE ).freeze 290 SKIP_TOKEN = CommonToken.new( INVALID_TOKEN_TYPE ).freeze 291 end 292 293 294 295 =begin rdoc ANTLR3::TokenSource 296 297 TokenSource is a simple mixin module that demands an 298 implementation of the method #next_token. In return, it 299 defines methods #next and #each, which provide basic 300 iterator methods for token generators. Furthermore, it 301 includes Enumerable to provide the standard Ruby iteration 302 methods to token generators, like lexers. 303 304 =end 305 306 module TokenSource 307 include Constants 308 include Enumerable 309 extend ClassMacros 310 311 abstract :next_token 312 313 def next 314 token = next_token() 315 raise StopIteration if token.nil? || token.type == EOF 316 return token 317 end 318 319 def each 320 block_given? or return enum_for( :each ) 321 while token = next_token and token.type != EOF 322 yield( token ) 323 end 324 return self 325 end 326 327 def to_stream( options = {} ) 328 if block_given? 329 CommonTokenStream.new( self, options ) { | t, stream | yield( t, stream ) } 330 else 331 CommonTokenStream.new( self, options ) 332 end 333 end 334 end 335 336 337 =begin rdoc ANTLR3::TokenFactory 338 339 There are a variety of different entities throughout the ANTLR runtime library 340 that need to create token objects This module serves as a mixin that provides 341 methods for constructing tokens. 342 343 Including this module provides a +token_class+ attribute. Instance of the 344 including class can create tokens using the token class (which defaults to 345 ANTLR3::CommonToken). 
Token classes are presumed to have an #initialize method 346 that can be called without any parameters and the token objects are expected to 347 have the standard token attributes (see ANTLR3::Token). 348 349 =end 350 351 module TokenFactory 352 attr_writer :token_class 353 def token_class 354 @token_class ||= begin 355 self.class.token_class rescue 356 self::Token rescue 357 ANTLR3::CommonToken 358 end 359 end 360 361 def create_token( *args ) 362 if block_given? 363 token_class.new( *args ) do |*targs| 364 yield( *targs ) 365 end 366 else 367 token_class.new( *args ) 368 end 369 end 370 end 371 372 373 =begin rdoc ANTLR3::TokenScheme 374 375 TokenSchemes exist to handle the problem of defining token types as integer 376 values while maintaining meaningful text names for the types. They are 377 dynamically defined modules that map integer values to constants with token-type 378 names. 379 380 --- 381 382 Fundamentally, tokens exist to take a chunk of text and identify it as belonging 383 to some category, like "VARIABLE" or "INTEGER". In code, the category is 384 represented by an integer -- some arbitrary value that ANTLR will decide to use 385 as it is creating the recognizer. The purpose of using an integer (instead of 386 say, a ruby symbol) is that ANTLR's decision logic often needs to test whether a 387 token's type falls within a range, which is not possible with symbols. 388 389 The downside of token types being represented as integers is that a developer 390 needs to be able to reference the unknown type value by name in action code. 391 Furthermore, code that references the type by name and tokens that can be 392 inspected with names in place of type values are more meaningful to a developer. 393 394 Since ANTLR requires token type names to follow capital-letter naming 395 conventions, defining types as named constants of the recognizer class resolves 396 the problem of referencing type values by name. 
Thus, a token type like
``VARIABLE'' can be represented by a number like 5 and referenced within code by
+VARIABLE+. However, when a recognizer creates tokens, the name of the token's
type cannot be seen without using the data defined in the recognizer.

Of course, tokens could be defined with a name attribute that could be specified
when tokens are created. However, doing so would make tokens take up more space
than necessary, as well as making it difficult to change the type of a token
while maintaining a correct name value.

TokenSchemes exist as a technique to manage token type referencing and name
extraction. They:

1. keep token type references clear and understandable in recognizer code
2. permit access to a token's type-name independently of recognizer objects
3. allow multiple classes to share the same token information

== Building Token Schemes

TokenScheme is a subclass of Module. Thus, it has the method
<tt>TokenScheme.new(tk_class = nil) { ... module-level code ...}</tt>, which
will evaluate the block in the context of the scheme (module), similarly to
Module#module_eval. Before evaluating the block, <tt>.new</tt> will set up the
module with the following actions:

1. define a customized token class (more on that below)
2. add a new constant, TOKEN_NAMES, which is a hash that maps types to names
3. dynamically populate the new scheme module with a couple instance methods
4. include ANTLR3::Constants in the new scheme module

As the TokenScheme class functions as a metaclass, figuring out some of the
scoping behavior can be mildly confusing if you're trying to get a handle on the
entity for your own purposes. Remember that all of the instance methods of
TokenScheme function as module-level methods of TokenScheme instances, ala
+attr_accessor+ and friends.

<tt>TokenScheme#define_token(name_symbol, int_value)</tt> adds a constant
definition <tt>name_symbol</tt> with the value <tt>int_value</tt>. It is
essentially like <tt>Module#const_set</tt>, except it forbids constant
overwriting (which would mess up recognizer code fairly badly) and adds an
inverse type-to-name map to its own <tt>TOKEN_NAMES</tt> table.
<tt>TokenScheme#define_tokens</tt> is a convenience method for defining many
types with a hash pairing names to values.

<tt>TokenScheme#register_name(value, name_string)</tt> specifies a custom
type-to-name definition. This is particularly useful for the anonymous tokens
that ANTLR generates for literal strings in the grammar specification. For
example, if you refer to the literal <tt>'='</tt> in some parser rule in your
grammar, ANTLR will add a lexer rule for the literal and give the token a name
like <tt>T__<i>x</i></tt>, where <tt><i>x</i></tt> is the type's integer value.
Since this is pretty meaningless to a developer, generated code should add a
special name definition for type value <tt><i>x</i></tt> with the string
<tt>"'='"</tt>.

=== Sample TokenScheme Construction

  TokenData = ANTLR3::TokenScheme.new do
    define_tokens(
      :INT  => 4,
      :ID   => 6,
      :T__5 => 5,
      :WS   => 7
    )

    # note the self:: scoping below is due to the fact that
    # ruby lexically-scopes constant names instead of
    # looking up in the current scope
    register_name(self::T__5, "'='")
  end

  TokenData::ID             # => 6
  TokenData::T__5           # => 5
  TokenData.token_name(4)   # => 'INT'
  TokenData.token_name(5)   # => "'='"

  class ARecognizerOrSuch < ANTLR3::Parser
    include TokenData
    ID   # => 6
  end

== Custom Token Classes and Relationship with Tokens

When a TokenScheme is created, it will define a subclass of ANTLR3::CommonToken
and assign it to the constant name +Token+.
This token class will both include 480 and extend the scheme module. Since token schemes define the private instance 481 method <tt>token_name(type)</tt>, instances of the token class are now able to 482 provide their type names. The Token method <tt>name</tt> uses the 483 <tt>token_name</tt> method to provide the type name as if it were a simple 484 attribute without storing the name itself. 485 486 When a TokenScheme is included in a recognizer class, the class will now have 487 the token types as named constants, a type-to-name map constant +TOKEN_NAMES+, 488 and a grammar-specific subclass of ANTLR3::CommonToken assigned to the constant 489 Token. Thus, when recognizers need to manufacture tokens, instead of using the 490 generic CommonToken class, they can create tokens using the customized Token 491 class provided by the token scheme. 492 493 If you need to use a token class other than CommonToken, you can pass the class 494 as a parameter to TokenScheme.new, which will be used in place of the 495 dynamically-created CommonToken subclass. 496 497 =end 498 499 class TokenScheme < ::Module 500 include TokenFactory 501 502 def self.new( tk_class = nil, &body ) 503 super() do 504 tk_class ||= Class.new( ::ANTLR3::CommonToken ) 505 self.token_class = tk_class 506 507 const_set( :TOKEN_NAMES, ::ANTLR3::Constants::BUILT_IN_TOKEN_NAMES.clone ) 508 509 @types = ::ANTLR3::Constants::BUILT_IN_TOKEN_NAMES.invert 510 @unused = ::ANTLR3::Constants::MIN_TOKEN_TYPE 511 512 scheme = self 513 define_method( :token_scheme ) { scheme } 514 define_method( :token_names ) { scheme::TOKEN_NAMES } 515 define_method( :token_name ) do |type| 516 begin 517 token_names[ type ] or super 518 rescue NoMethodError 519 ::ANTLR3::CommonToken.token_name( type ) 520 end 521 end 522 module_function :token_name, :token_names 523 524 include ANTLR3::Constants 525 526 body and module_eval( &body ) 527 end 528 end 529 530 def self.build( *token_names ) 531 token_names = [ token_names ].flatten! 
532 token_names.compact! 533 token_names.uniq! 534 tk_class = Class === token_names.first ? token_names.shift : nil 535 value_maps, names = token_names.partition { |i| Hash === i } 536 new( tk_class ) do 537 for value_map in value_maps 538 define_tokens( value_map ) 539 end 540 541 for name in names 542 define_token( name ) 543 end 544 end 545 end 546 547 548 def included( mod ) 549 super 550 mod.extend( self ) 551 end 552 private :included 553 554 attr_reader :unused, :types 555 556 def define_tokens( token_map = {} ) 557 for token_name, token_value in token_map 558 define_token( token_name, token_value ) 559 end 560 return self 561 end 562 563 def define_token( name, value = nil ) 564 name = name.to_s 565 566 if current_value = @types[ name ] 567 # token type has already been defined 568 # raise an error unless value is the same as the current value 569 value ||= current_value 570 unless current_value == value 571 raise NameError.new( 572 "new token type definition ``#{ name } = #{ value }'' conflicts " << 573 "with existing type definition ``#{ name } = #{ current_value }''", name 574 ) 575 end 576 else 577 value ||= @unused 578 if name =~ /^[A-Z]\w*$/ 579 const_set( name, @types[ name ] = value ) 580 else 581 constant = "T__#{ value }" 582 const_set( constant, @types[ constant ] = value ) 583 @types[ name ] = value 584 end 585 register_name( value, name ) unless built_in_type?( value ) 586 end 587 588 value >= @unused and @unused = value + 1 589 return self 590 end 591 592 def register_names( *names ) 593 if names.length == 1 and Hash === names.first 594 names.first.each do |value, name| 595 register_name( value, name ) 596 end 597 else 598 names.each_with_index do |name, i| 599 type_value = Constants::MIN_TOKEN_TYPE + i 600 register_name( type_value, name ) 601 end 602 end 603 end 604 605 def register_name( type_value, name ) 606 name = name.to_s.freeze 607 if token_names.has_key?( type_value ) 608 current_name = token_names[ type_value ] 609 current_name == 
name and return name 610 611 if current_name == "T__#{ type_value }" 612 # only an anonymous name is registered -- upgrade the name to the full literal name 613 token_names[ type_value ] = name 614 elsif name == "T__#{ type_value }" 615 # ignore name downgrade from literal to anonymous constant 616 return current_name 617 else 618 error = NameError.new( 619 "attempted assignment of token type #{ type_value }" << 620 " to name #{ name } conflicts with existing name #{ current_name }", name 621 ) 622 raise error 623 end 624 else 625 token_names[ type_value ] = name.to_s.freeze 626 end 627 end 628 629 def built_in_type?( type_value ) 630 Constants::BUILT_IN_TOKEN_NAMES.fetch( type_value, false ) and true 631 end 632 633 def token_defined?( name_or_value ) 634 case value 635 when Integer then token_names.has_key?( name_or_value ) 636 else const_defined?( name_or_value.to_s ) 637 end 638 end 639 640 def []( name_or_value ) 641 case name_or_value 642 when Integer then token_names.fetch( name_or_value, nil ) 643 else const_get( name_or_value.to_s ) rescue token_names.index( name_or_value ) 644 end 645 end 646 647 def token_class 648 self::Token 649 end 650 651 def token_class=( klass ) 652 Class === klass or raise( TypeError, "token_class must be a Class" ) 653 Util.silence_warnings do 654 klass < self or klass.send( :include, self ) 655 const_set( :Token, klass ) 656 end 657 end 658 659 end 660 661 end 662