Home | History | Annotate | Download | only in functional
      1 /*
      2  [The 'BSD licence']
      3  Copyright (c) 2004 Terence Parr and Loring Craymer
      4  All rights reserved.
      5 
      6  Redistribution and use in source and binary forms, with or without
      7  modification, are permitted provided that the following conditions
      8  are met:
      9  1. Redistributions of source code must retain the above copyright
     10     notice, this list of conditions and the following disclaimer.
     11  2. Redistributions in binary form must reproduce the above copyright
     12     notice, this list of conditions and the following disclaimer in the
     13     documentation and/or other materials provided with the distribution.
     14  3. The name of the author may not be used to endorse or promote products
     15     derived from this software without specific prior written permission.
     16 
     17  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     18  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     19  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     20  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     21  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     22  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     26  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 */
     28 
     29 /** Python 2.3.3 Grammar
     30  *
     31  *  Terence Parr and Loring Craymer
     32  *  February 2004
     33  *
     34  *  Converted to ANTLR v3 November 2005 by Terence Parr.
     35  *
     36  *  This grammar was derived automatically from the Python 2.3.3
     37  *  parser grammar to get a syntactically correct ANTLR grammar
     38  *  for Python.  Then Terence hand tweaked it to be semantically
     39  *  correct; i.e., removed lookahead issues etc...  It is LL(1)
     40  *  except for the (sometimes optional) trailing commas and semi-colons.
     41  *  It needs two symbols of lookahead in this case.
     42  *
     43  *  Starting with Loring's preliminary lexer for Python, I modified it
     44  *  to do my version of the whole nasty INDENT/DEDENT issue just so I
     45  *  could understand the problem better.  This grammar requires
     46  *  PythonTokenStream.java to work.  Also I used some rules from the
     47  *  semi-formal grammar on the web for Python (automatically
     48  *  translated to ANTLR format by an ANTLR grammar, naturally <grin>).
     49  *  The lexical rules for python are particularly nasty and it took me
     50  *  a long time to get it 'right'; i.e., think about it in the proper
     51  *  way.  Resist changing the lexer unless you've used ANTLR a lot. ;)
     52  *
     53  *  I (Terence) tested this by running it on the jython-2.1/Lib
     54  *  directory of 40k lines of Python.
     55  *
     56  *  REQUIRES ANTLR v3
     57  */
     58 grammar Python;
     59 options {language=JavaScript;}
     60 
     61 tokens {
     62     INDENT;
     63     DEDENT;
     64 }
     65 
     66 @lexer::members {
     67 /** Handles context-sensitive lexing of implicit line joining such as
     68  *  the case where newline is ignored in cases like this:
     69  *  a = [3,
     70  *       4]
     71  */
     72 	this.implicitLineJoiningLevel= 0;
     73 	this.startPos = -1;
     74 }
     75 
     76 single_input
     77     : NEWLINE
     78 	| simple_stmt
     79 	| compound_stmt NEWLINE
     80 	;
     81 
     82 file_input
     83     :   (NEWLINE | stmt)*
     84 	;
     85 
     86 eval_input
     87     :   (NEWLINE)* testlist (NEWLINE)*
     88 	;
     89 
     90 funcdef
     91     :   'def' NAME parameters COLON suite
     92 	{xlog("found method def "+$NAME.text);}
     93 	;
     94 
     95 parameters
     96     :   LPAREN (varargslist)? RPAREN
     97 	;
     98 
     99 varargslist
    100     :   defparameter (options {greedy=true;}:COMMA defparameter)*
    101         (COMMA
    102             ( STAR NAME (COMMA DOUBLESTAR NAME)?
    103             | DOUBLESTAR NAME
    104             )?
    105         )?
    106     |   STAR NAME (COMMA DOUBLESTAR NAME)?
    107     |   DOUBLESTAR NAME
    108     ;
    109 
    110 defparameter
    111     :   fpdef (ASSIGN test)?
    112     ;
    113 
    114 fpdef
    115     :   NAME
    116 	|   LPAREN fplist RPAREN
    117 	;
    118 
    119 fplist
    120     :   fpdef (options {greedy=true;}:COMMA fpdef)* (COMMA)?
    121 	;
    122 
    123 
    124 stmt: simple_stmt
    125 	| compound_stmt
    126 	;
    127 
    128 simple_stmt
    129     :   small_stmt (options {greedy=true;}:SEMI small_stmt)* (SEMI)? NEWLINE
    130 	;
    131 
    132 small_stmt: expr_stmt
    133 	| print_stmt
    134 	| del_stmt
    135 	| pass_stmt
    136 	| flow_stmt
    137 	| import_stmt
    138 	| global_stmt
    139 	| exec_stmt
    140 	| assert_stmt
    141 	;
    142 
    143 expr_stmt
    144 	:	testlist
    145 		(	augassign testlist
    146 		|	(ASSIGN testlist)+
    147 		)?
    148 	;
    149 
    150 augassign
    151     : PLUSEQUAL
    152 	| MINUSEQUAL
    153 	| STAREQUAL
    154 	| SLASHEQUAL
    155 	| PERCENTEQUAL
    156 	| AMPEREQUAL
    157 	| VBAREQUAL
    158 	| CIRCUMFLEXEQUAL
    159 	| LEFTSHIFTEQUAL
    160 	| RIGHTSHIFTEQUAL
    161 	| DOUBLESTAREQUAL
    162 	| DOUBLESLASHEQUAL
    163 	;
    164 
    165 print_stmt:
    166         'print'
    167         (   testlist
    168         |   RIGHTSHIFT testlist
    169         )?
    170 	;
    171 
    172 del_stmt: 'del' exprlist
    173 	;
    174 
    175 pass_stmt: 'pass'
    176 	;
    177 
    178 flow_stmt: break_stmt
    179 	| continue_stmt
    180 	| return_stmt
    181 	| raise_stmt
    182 	| yield_stmt
    183 	;
    184 
    185 break_stmt: 'break'
    186 	;
    187 
    188 continue_stmt: 'continue'
    189 	;
    190 
    191 return_stmt: 'return' (testlist)?
    192 	;
    193 
    194 yield_stmt: 'yield' testlist
    195 	;
    196 
    197 raise_stmt: 'raise' (test (COMMA test (COMMA test)?)?)?
    198 	;
    199 
    200 import_stmt
    201     :   'import' dotted_as_name (COMMA dotted_as_name)*
    202 	|   'from' dotted_name 'import'
    203         (STAR | import_as_name (COMMA import_as_name)*)
    204 	;
    205 
    206 import_as_name
    207     :   NAME (NAME NAME)?
    208 	;
    209 
    210 dotted_as_name: dotted_name (NAME NAME)?
    211 	;
    212 
    213 dotted_name: NAME (DOT NAME)*
    214 	;
    215 
    216 global_stmt: 'global' NAME (COMMA NAME)*
    217 	;
    218 
    219 exec_stmt: 'exec' expr ('in' test (COMMA test)?)?
    220 	;
    221 
    222 assert_stmt: 'assert' test (COMMA test)?
    223 	;
    224 
    225 
    226 compound_stmt: if_stmt
    227 	| while_stmt
    228 	| for_stmt
    229 	| try_stmt
    230 	| funcdef
    231 	| classdef
    232 	;
    233 
    234 if_stmt: 'if' test COLON suite ('elif' test COLON suite)* ('else' COLON suite)?
    235 	;
    236 
    237 while_stmt: 'while' test COLON suite ('else' COLON suite)?
    238 	;
    239 
    240 for_stmt: 'for' exprlist 'in' testlist COLON suite ('else' COLON suite)?
    241 	;
    242 
    243 try_stmt
    244     :   'try' COLON suite
    245         (   (except_clause COLON suite)+ ('else' COLON suite)?
    246         |   'finally' COLON suite
    247         )
    248 	;
    249 
    250 except_clause: 'except' (test (COMMA test)?)?
    251 	;
    252 
    253 suite: simple_stmt
    254 	| NEWLINE INDENT (stmt)+ DEDENT
    255 	;
    256 
    257 
    258 test: and_test ('or' and_test)*
    259 	| lambdef
    260 	;
    261 
    262 and_test
    263 	: not_test ('and' not_test)*
    264 	;
    265 
    266 not_test
    267 	: 'not' not_test
    268 	| comparison
    269 	;
    270 
    271 comparison: expr (comp_op expr)*
    272 	;
    273 
    274 comp_op: LESS
    275 	|GREATER
    276 	|EQUAL
    277 	|GREATEREQUAL
    278 	|LESSEQUAL
    279 	|ALT_NOTEQUAL
    280 	|NOTEQUAL
    281 	|'in'
    282 	|'not' 'in'
    283 	|'is'
    284 	|'is' 'not'
    285 	;
    286 
    287 expr: xor_expr (VBAR xor_expr)*
    288 	;
    289 
    290 xor_expr: and_expr (CIRCUMFLEX and_expr)*
    291 	;
    292 
    293 and_expr: shift_expr (AMPER shift_expr)*
    294 	;
    295 
    296 shift_expr: arith_expr ((LEFTSHIFT|RIGHTSHIFT) arith_expr)*
    297 	;
    298 
    299 arith_expr: term ((PLUS|MINUS) term)*
    300 	;
    301 
    302 term: factor ((STAR | SLASH | PERCENT | DOUBLESLASH ) factor)*
    303 	;
    304 
    305 factor
    306 	: (PLUS|MINUS|TILDE) factor
    307 	| power
    308 	;
    309 
    310 power
    311 	:   atom (trailer)* (options {greedy=true;}:DOUBLESTAR factor)?
    312 	;
    313 
    314 atom: LPAREN (testlist)? RPAREN
    315 	| LBRACK (listmaker)? RBRACK
    316 	| LCURLY (dictmaker)? RCURLY
    317 	| BACKQUOTE testlist BACKQUOTE
    318 	| NAME
    319 	| INT
    320     | LONGINT
    321     | FLOAT
    322     | COMPLEX
    323 	| (STRING)+
    324 	;
    325 
    326 listmaker: test ( list_for | (options {greedy=true;}:COMMA test)* ) (COMMA)?
    327 	;
    328 
    329 lambdef: 'lambda' (varargslist)? COLON test
    330 	;
    331 
    332 trailer: LPAREN (arglist)? RPAREN
    333 	| LBRACK subscriptlist RBRACK
    334 	| DOT NAME
    335 	;
    336 
    337 subscriptlist
    338     :   subscript (options {greedy=true;}:COMMA subscript)* (COMMA)?
    339 	;
    340 
    341 subscript
    342 	: DOT DOT DOT
    343     | test (COLON (test)? (sliceop)?)?
    344     | COLON (test)? (sliceop)?
    345     ;
    346 
    347 sliceop: COLON (test)?
    348 	;
    349 
    350 exprlist
    351     :   expr (options {k=2;}:COMMA expr)* (COMMA)?
    352 	;
    353 
    354 testlist
    355     :   test (options {k=2;}: COMMA test)* (COMMA)?
    356     ;
    357 
    358 dictmaker
    359     :   test COLON test
    360         (options {k=2;}:COMMA test COLON test)* (COMMA)?
    361     ;
    362 
    363 classdef: 'class' NAME (LPAREN testlist RPAREN)? COLON suite
    364 	{xlog("found class def "+$NAME.text);}
    365 	;
    366 
    367 arglist: argument (COMMA argument)*
    368         ( COMMA
    369           ( STAR test (COMMA DOUBLESTAR test)?
    370           | DOUBLESTAR test
    371           )?
    372         )?
    373     |   STAR test (COMMA DOUBLESTAR test)?
    374     |   DOUBLESTAR test
    375     ;
    376 
    377 argument : test (ASSIGN test)?
    378          ;
    379 
    380 list_iter: list_for
    381 	| list_if
    382 	;
    383 
    384 list_for: 'for' exprlist 'in' testlist (list_iter)?
    385 	;
    386 
    387 list_if: 'if' test (list_iter)?
    388 	;
    389 
    390 LPAREN	: '(' {this.implicitLineJoiningLevel++;} ;
    391 
    392 RPAREN	: ')' {this.implicitLineJoiningLevel--;} ;
    393 
    394 LBRACK	: '[' {this.implicitLineJoiningLevel++;} ;
    395 
    396 RBRACK	: ']' {this.implicitLineJoiningLevel--;} ;
    397 
    398 COLON 	: ':' ;
    399 
    400 COMMA	: ',' ;
    401 
    402 SEMI	: ';' ;
    403 
    404 PLUS	: '+' ;
    405 
    406 MINUS	: '-' ;
    407 
    408 STAR	: '*' ;
    409 
    410 SLASH	: '/' ;
    411 
    412 VBAR	: '|' ;
    413 
    414 AMPER	: '&' ;
    415 
    416 LESS	: '<' ;
    417 
    418 GREATER	: '>' ;
    419 
    420 ASSIGN	: '=' ;
    421 
    422 PERCENT	: '%' ;
    423 
    424 BACKQUOTE	: '`' ;
    425 
    426 LCURLY	: '{' {this.implicitLineJoiningLevel++;} ;
    427 
    428 RCURLY	: '}' {this.implicitLineJoiningLevel--;} ;
    429 
    430 CIRCUMFLEX	: '^' ;
    431 
    432 TILDE	: '~' ;
    433 
    434 EQUAL	: '==' ;
    435 
    436 NOTEQUAL	: '!=' ;
    437 
    438 ALT_NOTEQUAL: '<>' ;
    439 
    440 LESSEQUAL	: '<=' ;
    441 
    442 LEFTSHIFT	: '<<' ;
    443 
    444 GREATEREQUAL	: '>=' ;
    445 
    446 RIGHTSHIFT	: '>>' ;
    447 
    448 PLUSEQUAL	: '+=' ;
    449 
    450 MINUSEQUAL	: '-=' ;
    451 
    452 DOUBLESTAR	: '**' ;
    453 
    454 STAREQUAL	: '*=' ;
    455 
    456 DOUBLESLASH	: '//' ;
    457 
    458 SLASHEQUAL	: '/=' ;
    459 
    460 VBAREQUAL	: '|=' ;
    461 
    462 PERCENTEQUAL	: '%=' ;
    463 
    464 AMPEREQUAL	: '&=' ;
    465 
    466 CIRCUMFLEXEQUAL	: '^=' ;
    467 
    468 LEFTSHIFTEQUAL	: '<<=' ;
    469 
    470 RIGHTSHIFTEQUAL	: '>>=' ;
    471 
    472 DOUBLESTAREQUAL	: '**=' ;
    473 
    474 DOUBLESLASHEQUAL	: '//=' ;
    475 
    476 DOT : '.' ;
    477 
    478 FLOAT
    479 	:	'.' DIGITS (Exponent)?
    480     |   DIGITS ('.' (DIGITS (Exponent)?)? | Exponent)
    481     ;
    482 
    483 LONGINT
    484     :   INT ('l'|'L')
    485     ;
    486 
    487 fragment
    488 Exponent
    489 	:	('e' | 'E') ( '+' | '-' )? DIGITS
    490 	;
    491 
    492 INT :   // Hex
    493         '0' ('x' | 'X') ( '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' )+
    494         ('l' | 'L')?
    495     |   // Octal
    496         '0' DIGITS*
    497     |   '1'..'9' DIGITS*
    498     ;
    499 
    500 COMPLEX
    501     :   INT ('j'|'J')
    502     |   FLOAT ('j'|'J')
    503     ;
    504 
    505 fragment
    506 DIGITS : ( '0' .. '9' )+ ;
    507 
    508 NAME:	( 'a' .. 'z' | 'A' .. 'Z' | '_')
    509         ( 'a' .. 'z' | 'A' .. 'Z' | '_' | '0' .. '9' )*
    510     ;
    511 
    512 /** Match various string types.  Note that greedy=false implies '''
    513  *  should make us exit loop not continue.
    514  */
    515 STRING
    516     :   ('r'|'u'|'ur')?
    517         (   '\'\'\'' (options {greedy=false;}:.)* '\'\'\''
    518         |   '"""' (options {greedy=false;}:.)* '"""'
    519         |   '"' (ESC|~('\\'|'\n'|'"'))* '"'
    520         |   '\'' (ESC|~('\\'|'\n'|'\''))* '\''
    521         )
    522 	;
    523 
    524 fragment
    525 ESC
    526 	:	'\\' .
    527 	;
    528 
    529 /** Consume a newline and any whitespace at start of next line */
    530 CONTINUED_LINE
    531 	:	'\\' ('\r')? '\n' (' '|'\t')* { $channel=HIDDEN; }
    532 	;
    533 
    534 /** Treat a sequence of blank lines as a single blank line.  If
    535  *  nested within a (..), {..}, or [..], then ignore newlines.
    536  *  If the first newline starts in column one, they are to be ignored.
    537  */
    538 NEWLINE
    539     :   (('\r')? '\n' )+
    540         {if ( this.startPos==0 || this.implicitLineJoiningLevel>0 )
    541             $channel=HIDDEN;
    542         }
    543     ;
    544 
    545 WS	:	{this.startPos>0}?=> (' '|'\t')+ {$channel=HIDDEN;}
    546 	;
    547 
    548 /** Grab everything before a real symbol.  Then if newline, kill it
    549  *  as this is a blank line.  If whitespace followed by comment, kill it
    550  *  as it's a comment on a line by itself.
    551  *
    552  *  Ignore leading whitespace when nested in [..], (..), {..}.
    553  */
    554 LEADING_WS
    555 @init {
    556     var spaces = 0;
    557 }
    558     :   {this.startPos==0}?=>
    559     	(   {this.implicitLineJoiningLevel>0}? ( ' ' | '\t' )+ {$channel=HIDDEN;}
    560        	|	( 	' '  { spaces++; }
    561         	|	'\t' { spaces += 8; spaces -= (spaces \% 8); }
    562        		)+
    563         	{
    564             // make a string of n spaces where n is column number - 1
    565             var indentation = new Array(spaces);
    566             for (var i=0; i<spaces; i++) {
    567                 indentation[i] = ' ';
    568             }
    569             var s = indentation.join("");
    570             this.emit(new org.antlr.runtime.CommonToken(this.LEADING_WS,s));
    571         	}
    572         	// kill trailing newline if present and then ignore
    573         	( ('\r')? '\n' {if (this.state.token!=null) this.state.token.setChannel(HIDDEN); else $channel=HIDDEN;})*
    574            // {this.token.setChannel(99); }
    575         )
    576     ;
    577 
    578 /** Comments not on line by themselves are turned into newlines.
    579 
    580     b = a # end of line comment
    581 
    582     or
    583 
    584     a = [1, # weird
    585          2]
    586 
    587     This rule is invoked directly by nextToken when the comment is in
    588     first column or when comment is on end of nonwhitespace line.
    589 
    590 	Only match \n here if we didn't start on left edge; let NEWLINE return that.
    591 	Kill if newlines if we live on a line by ourselves
    592 
    593 	Consume any leading whitespace if it starts on left edge.
    594  */
    595 COMMENT
    596 @init {
    597     $channel=HIDDEN;
    598 }
    599     :	{this.startPos==0}?=> (' '|'\t')* '#' (~'\n')* '\n'+
    600     |	{this.startPos>0}?=> '#' (~'\n')* // let NEWLINE handle \n unless char pos==0 for '#'
    601     ;
    602