tests/functional/Python.g

/*
 [The 'BSD licence']
 Copyright (c) 2004 Terence Parr and Loring Craymer
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
 3. The name of the author may not be used to endorse or promote products
    derived from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** Python 2.3.3 Grammar
 *
 *  Terence Parr and Loring Craymer
 *  February 2004
 *
 *  Converted to ANTLR v3 November 2005 by Terence Parr.
 *
 *  This grammar was derived automatically from the Python 2.3.3
 *  parser grammar to get a syntactically correct ANTLR grammar
 *  for Python.  Then Terence hand tweaked it to be semantically
 *  correct; i.e., removed lookahead issues etc...  It is LL(1)
 *  except for the (sometimes optional) trailing commas and semi-colons.
 *  It needs two symbols of lookahead in this case.
 *
 *  Starting with Loring's preliminary lexer for Python, I modified it
 *  to do my version of the whole nasty INDENT/DEDENT issue just so I
 *  could understand the problem better.  This grammar requires
 *  PythonTokenStream.java to work.  Also I used some rules from the
 *  semi-formal grammar on the web for Python (automatically
 *  translated to ANTLR format by an ANTLR grammar, naturally <grin>).
 *  The lexical rules for python are particularly nasty and it took me
 *  a long time to get it 'right'; i.e., think about it in the proper
 *  way.  Resist changing the lexer unless you've used ANTLR a lot. ;)
 *
 *  I (Terence) tested this by running it on the jython-2.1/Lib
 *  directory of 40k lines of Python.
 *
 *  REQUIRES ANTLR v3
 */
grammar Python;
options {language=JavaScript;}

tokens {
    INDENT;
    DEDENT;
}

@lexer::members {
/** Handles context-sensitive lexing of implicit line joining such as
 *  the case where newline is ignored in cases like this:
 *  a = [3,
 *       4]
 */
	this.implicitLineJoiningLevel= 0;
	this.startPos = -1;
}

single_input
    : NEWLINE
	| simple_stmt
	| compound_stmt NEWLINE
	;

file_input
    :   (NEWLINE | stmt)*
	;

eval_input
    :   (NEWLINE)* testlist (NEWLINE)*
	;

funcdef
    :   'def' NAME parameters COLON suite
	{xlog("found method def "+$NAME.text);}
	;

parameters
    :   LPAREN (varargslist)? RPAREN
	;

varargslist
    :   defparameter (options {greedy=true;}:COMMA defparameter)*
        (COMMA
            ( STAR NAME (COMMA DOUBLESTAR NAME)?
            | DOUBLESTAR NAME
            )?
        )?
    |   STAR NAME (COMMA DOUBLESTAR NAME)?
    |   DOUBLESTAR NAME
    ;

defparameter
    :   fpdef (ASSIGN test)?
    ;

fpdef
    :   NAME
	|   LPAREN fplist RPAREN
	;

fplist
    :   fpdef (options {greedy=true;}:COMMA fpdef)* (COMMA)?
	;


stmt: simple_stmt
	| compound_stmt
	;

simple_stmt
    :   small_stmt (options {greedy=true;}:SEMI small_stmt)* (SEMI)? NEWLINE
	;

small_stmt: expr_stmt
	| print_stmt
	| del_stmt
	| pass_stmt
	| flow_stmt
	| import_stmt
	| global_stmt
	| exec_stmt
	| assert_stmt
	;

expr_stmt
	:	testlist
		(	augassign testlist
		|	(ASSIGN testlist)+
		)?
	;

augassign
    : PLUSEQUAL
	| MINUSEQUAL
	| STAREQUAL
	| SLASHEQUAL
	| PERCENTEQUAL
	| AMPEREQUAL
	| VBAREQUAL
	| CIRCUMFLEXEQUAL
	| LEFTSHIFTEQUAL
	| RIGHTSHIFTEQUAL
	| DOUBLESTAREQUAL
	| DOUBLESLASHEQUAL
	;

print_stmt:
        'print'
        (   testlist
        |   RIGHTSHIFT testlist
        )?
	;

del_stmt: 'del' exprlist
	;

pass_stmt: 'pass'
	;

flow_stmt: break_stmt
	| continue_stmt
	| return_stmt
	| raise_stmt
	| yield_stmt
	;

break_stmt: 'break'
	;

continue_stmt: 'continue'
	;

return_stmt: 'return' (testlist)?
	;

yield_stmt: 'yield' testlist
	;

raise_stmt: 'raise' (test (COMMA test (COMMA test)?)?)?
	;

import_stmt
    :   'import' dotted_as_name (COMMA dotted_as_name)*
	|   'from' dotted_name 'import'
        (STAR | import_as_name (COMMA import_as_name)*)
	;

import_as_name
    :   NAME (NAME NAME)?
	;

dotted_as_name: dotted_name (NAME NAME)?
	;

dotted_name: NAME (DOT NAME)*
	;

global_stmt: 'global' NAME (COMMA NAME)*
	;

exec_stmt: 'exec' expr ('in' test (COMMA test)?)?
	;

assert_stmt: 'assert' test (COMMA test)?
	;


compound_stmt: if_stmt
	| while_stmt
	| for_stmt
	| try_stmt
	| funcdef
	| classdef
	;

if_stmt: 'if' test COLON suite ('elif' test COLON suite)* ('else' COLON suite)?
	;

while_stmt: 'while' test COLON suite ('else' COLON suite)?
	;

for_stmt: 'for' exprlist 'in' testlist COLON suite ('else' COLON suite)?
	;

try_stmt
    :   'try' COLON suite
        (   (except_clause COLON suite)+ ('else' COLON suite)?
        |   'finally' COLON suite
        )
	;

except_clause: 'except' (test (COMMA test)?)?
	;

suite: simple_stmt
	| NEWLINE INDENT (stmt)+ DEDENT
	;


test: and_test ('or' and_test)*
	| lambdef
	;

and_test
	: not_test ('and' not_test)*
	;

not_test
	: 'not' not_test
	| comparison
	;

comparison: expr (comp_op expr)*
	;

comp_op: LESS
	|GREATER
	|EQUAL
	|GREATEREQUAL
	|LESSEQUAL
	|ALT_NOTEQUAL
	|NOTEQUAL
	|'in'
	|'not' 'in'
	|'is'
	|'is' 'not'
	;

expr: xor_expr (VBAR xor_expr)*
	;

xor_expr: and_expr (CIRCUMFLEX and_expr)*
	;

and_expr: shift_expr (AMPER shift_expr)*
	;

shift_expr: arith_expr ((LEFTSHIFT|RIGHTSHIFT) arith_expr)*
	;

arith_expr: term ((PLUS|MINUS) term)*
	;

term: factor ((STAR | SLASH | PERCENT | DOUBLESLASH ) factor)*
	;

factor
	: (PLUS|MINUS|TILDE) factor
	| power
	;

power
	:   atom (trailer)* (options {greedy=true;}:DOUBLESTAR factor)?
	;

atom: LPAREN (testlist)? RPAREN
	| LBRACK (listmaker)? RBRACK
	| LCURLY (dictmaker)? RCURLY
	| BACKQUOTE testlist BACKQUOTE
	| NAME
	| INT
    | LONGINT
    | FLOAT
    | COMPLEX
	| (STRING)+
	;

listmaker: test ( list_for | (options {greedy=true;}:COMMA test)* ) (COMMA)?
	;

lambdef: 'lambda' (varargslist)? COLON test
	;

trailer: LPAREN (arglist)? RPAREN
	| LBRACK subscriptlist RBRACK
	| DOT NAME
	;

subscriptlist
    :   subscript (options {greedy=true;}:COMMA subscript)* (COMMA)?
	;

subscript
	: DOT DOT DOT
    | test (COLON (test)? (sliceop)?)?
    | COLON (test)? (sliceop)?
    ;

sliceop: COLON (test)?
	;

exprlist
    :   expr (options {k=2;}:COMMA expr)* (COMMA)?
	;

testlist
    :   test (options {k=2;}: COMMA test)* (COMMA)?
    ;

dictmaker
    :   test COLON test
        (options {k=2;}:COMMA test COLON test)* (COMMA)?
    ;

classdef: 'class' NAME (LPAREN testlist RPAREN)? COLON suite
	{xlog("found class def "+$NAME.text);}
	;

arglist: argument (COMMA argument)*
        ( COMMA
          ( STAR test (COMMA DOUBLESTAR test)?
          | DOUBLESTAR test
          )?
        )?
    |   STAR test (COMMA DOUBLESTAR test)?
    |   DOUBLESTAR test
    ;

argument : test (ASSIGN test)?
         ;

list_iter: list_for
	| list_if
	;

list_for: 'for' exprlist 'in' testlist (list_iter)?
	;

list_if: 'if' test (list_iter)?
	;

LPAREN	: '(' {this.implicitLineJoiningLevel++;} ;

RPAREN	: ')' {this.implicitLineJoiningLevel--;} ;

LBRACK	: '[' {this.implicitLineJoiningLevel++;} ;

RBRACK	: ']' {this.implicitLineJoiningLevel--;} ;

COLON 	: ':' ;

COMMA	: ',' ;

SEMI	: ';' ;

PLUS	: '+' ;

MINUS	: '-' ;

STAR	: '*' ;

SLASH	: '/' ;

VBAR	: '|' ;

AMPER	: '&' ;

LESS	: '<' ;

GREATER	: '>' ;

ASSIGN	: '=' ;

PERCENT	: '%' ;

BACKQUOTE	: '`' ;

LCURLY	: '{' {this.implicitLineJoiningLevel++;} ;

RCURLY	: '}' {this.implicitLineJoiningLevel--;} ;

CIRCUMFLEX	: '^' ;

TILDE	: '~' ;

EQUAL	: '==' ;

NOTEQUAL	: '!=' ;

ALT_NOTEQUAL: '<>' ;

LESSEQUAL	: '<=' ;

LEFTSHIFT	: '<<' ;

GREATEREQUAL	: '>=' ;

RIGHTSHIFT	: '>>' ;

PLUSEQUAL	: '+=' ;

MINUSEQUAL	: '-=' ;

DOUBLESTAR	: '**' ;

STAREQUAL	: '*=' ;

DOUBLESLASH	: '//' ;

SLASHEQUAL	: '/=' ;

VBAREQUAL	: '|=' ;

PERCENTEQUAL	: '%=' ;

AMPEREQUAL	: '&=' ;

CIRCUMFLEXEQUAL	: '^=' ;

LEFTSHIFTEQUAL	: '<<=' ;

RIGHTSHIFTEQUAL	: '>>=' ;

DOUBLESTAREQUAL	: '**=' ;

DOUBLESLASHEQUAL	: '//=' ;

DOT : '.' ;

FLOAT
	:	'.' DIGITS (Exponent)?
    |   DIGITS ('.' (DIGITS (Exponent)?)? | Exponent)
    ;

LONGINT
    :   INT ('l'|'L')
    ;

fragment
Exponent
	:	('e' | 'E') ( '+' | '-' )? DIGITS
	;

INT :   // Hex
        '0' ('x' | 'X') ( '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' )+
        ('l' | 'L')?
    |   // Octal
        '0' DIGITS*
    |   '1'..'9' DIGITS*
    ;

COMPLEX
    :   INT ('j'|'J')
    |   FLOAT ('j'|'J')
    ;

fragment
DIGITS : ( '0' .. '9' )+ ;

NAME:	( 'a' .. 'z' | 'A' .. 'Z' | '_')
        ( 'a' .. 'z' | 'A' .. 'Z' | '_' | '0' .. '9' )*
    ;

/** Match various string types.  Note that greedy=false implies '''
 *  should make us exit loop not continue.
 */
STRING
    :   ('r'|'u'|'ur')?
        (   '\'\'\'' (options {greedy=false;}:.)* '\'\'\''
        |   '"""' (options {greedy=false;}:.)* '"""'
        |   '"' (ESC|~('\\'|'\n'|'"'))* '"'
        |   '\'' (ESC|~('\\'|'\n'|'\''))* '\''
        )
	;

fragment
ESC
	:	'\\' .
	;

/** Consume a newline and any whitespace at start of next line */
CONTINUED_LINE
	:	'\\' ('\r')? '\n' (' '|'\t')* { $channel=HIDDEN; }
	;

/** Treat a sequence of blank lines as a single blank line.  If
 *  nested within a (..), {..}, or [..], then ignore newlines.
 *  If the first newline starts in column one, they are to be ignored.
 */
NEWLINE
    :   (('\r')? '\n' )+
        {if ( this.startPos==0 || this.implicitLineJoiningLevel>0 )
            $channel=HIDDEN;
        }
    ;

WS	:	{this.startPos>0}?=> (' '|'\t')+ {$channel=HIDDEN;}
	;

/** Grab everything before a real symbol.  Then if newline, kill it
 *  as this is a blank line.  If whitespace followed by comment, kill it
 *  as it's a comment on a line by itself.
 *
 *  Ignore leading whitespace when nested in [..], (..), {..}.
 */
LEADING_WS
@init {
    var spaces = 0;
}
    :   {this.startPos==0}?=>
    	(   {this.implicitLineJoiningLevel>0}? ( ' ' | '\t' )+ {$channel=HIDDEN;}
       	|	( 	' '  { spaces++; }
        	|	'\t' { spaces += 8; spaces -= (spaces \% 8); }
       		)+
        	{
            // make a string of n spaces where n is column number - 1
            var indentation = new Array(spaces);
            for (var i=0; i<spaces; i++) {
                indentation[i] = ' ';
            }
            var s = indentation.join("");
            this.emit(new org.antlr.runtime.CommonToken(this.LEADING_WS,s));
        	}
        	// kill trailing newline if present and then ignore
        	( ('\r')? '\n' {if (this.state.token!=null) this.state.token.setChannel(HIDDEN); else $channel=HIDDEN;})*
           // {this.token.setChannel(99); }
        )
    ;

/** Comments not on line by themselves are turned into newlines.

    b = a # end of line comment

    or

    a = [1, # weird
         2]

    This rule is invoked directly by nextToken when the comment is in
    first column or when comment is on end of nonwhitespace line.

	Only match \n here if we didn't start on left edge; let NEWLINE return that.
	Kill if newlines if we live on a line by ourselves

	Consume any leading whitespace if it starts on left edge.
 */
COMMENT
@init {
    $channel=HIDDEN;
}
    :	{this.startPos==0}?=> (' '|'\t')* '#' (~'\n')* '\n'+
    |	{this.startPos>0}?=> '#' (~'\n')* // let NEWLINE handle \n unless char pos==0 for '#'
    ;