/*
 [The 'BSD licence']
 Copyright (c) 2004 Terence Parr and Loring Craymer
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
 3. The name of the author may not be used to endorse or promote products
    derived from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** Python 2.3.3 Grammar
 *
 *  Terence Parr and Loring Craymer
 *  February 2004
 *
 *  Converted to ANTLR v3 November 2005 by Terence Parr.
 *
 *  This grammar was derived automatically from the Python 2.3.3
 *  parser grammar to get a syntactically correct ANTLR grammar
 *  for Python.  Then Terence hand tweaked it to be semantically
 *  correct; i.e., removed lookahead issues etc...  It is LL(1)
 *  except for the (sometimes optional) trailing commas and semi-colons.
 *  It needs two symbols of lookahead in this case.
 *
 *  Starting with Loring's preliminary lexer for Python, I modified it
 *  to do my version of the whole nasty INDENT/DEDENT issue just so I
 *  could understand the problem better.  This grammar requires
 *  PythonTokenStream.java to work.  Also I used some rules from the
 *  semi-formal grammar on the web for Python (automatically
 *  translated to ANTLR format by an ANTLR grammar, naturally <grin>).
 *  The lexical rules for python are particularly nasty and it took me
 *  a long time to get it 'right'; i.e., think about it in the proper
 *  way.  Resist changing the lexer unless you've used ANTLR a lot. ;)
 *
 *  I (Terence) tested this by running it on the jython-2.1/Lib
 *  directory of 40k lines of Python.
 *
 *  REQUIRES ANTLR v3
 */
grammar Python;
options {language=JavaScript;}

// Imaginary tokens: no lexer rule in this grammar matches them directly.
tokens {
    INDENT;
    DEDENT;
}

@lexer::members {
/** Lexer state consulted by the NEWLINE, WS, LEADING_WS and COMMENT rules.
 *
 *  implicitLineJoiningLevel counts currently open (, [ and { so that a
 *  newline inside brackets can be ignored, as in:
 *      a = [3,
 *           4]
 *
 *  startPos is only read by this grammar; whatever drives the lexer is
 *  expected to maintain it (presumably the column where the current
 *  token starts -- confirm against the runtime glue code).
 */
this.implicitLineJoiningLevel = 0;
this.startPos = -1;
}

// ---------------------------------------------------------------------
// Entry points
// ---------------------------------------------------------------------

// One interactive statement (or an empty line).
single_input
    : NEWLINE
    | simple_stmt
    | compound_stmt NEWLINE
    ;

// A whole source file: any mix of blank lines and statements.
file_input
    : ( NEWLINE | stmt )*
    ;

// A single expression list, optionally surrounded by newlines.
eval_input
    : NEWLINE* testlist NEWLINE*
    ;

// ---------------------------------------------------------------------
// Function definitions and parameter lists
// ---------------------------------------------------------------------

// Logs the function name through xlog once the whole definition matched.
funcdef
    : 'def' NAME parameters COLON suite
      {xlog("found method def "+$NAME.text);}
    ;

parameters
    : LPAREN varargslist? RPAREN
    ;

// Ordinary parameters first, then optional *NAME and/or **NAME.
varargslist
    : defparameter ( options {greedy=true;}:COMMA defparameter )*
      ( COMMA
        ( STAR NAME ( COMMA DOUBLESTAR NAME )?
        | DOUBLESTAR NAME
        )?
      )?
    | STAR NAME ( COMMA DOUBLESTAR NAME )?
    | DOUBLESTAR NAME
    ;

// A parameter with an optional default value.
defparameter
    : fpdef ( ASSIGN test )?
    ;

// A plain name or a parenthesized (tuple) parameter.
fpdef
    : NAME
    | LPAREN fplist RPAREN
    ;

fplist
    : fpdef ( options {greedy=true;}:COMMA fpdef )* COMMA?
    ;

// ---------------------------------------------------------------------
// Simple statements
// ---------------------------------------------------------------------

stmt
    : simple_stmt
    | compound_stmt
    ;

// One or more small statements separated by ';' on a single line.
simple_stmt
    : small_stmt ( options {greedy=true;}:SEMI small_stmt )* SEMI? NEWLINE
    ;

small_stmt
    : expr_stmt
    | print_stmt
    | del_stmt
    | pass_stmt
    | flow_stmt
    | import_stmt
    | global_stmt
    | exec_stmt
    | assert_stmt
    ;

// Bare expression, augmented assignment, or chained '=' assignment.
expr_stmt
    : testlist
      ( augassign testlist
      | ( ASSIGN testlist )+
      )?
    ;

augassign
    : PLUSEQUAL
    | MINUSEQUAL
    | STAREQUAL
    | SLASHEQUAL
    | PERCENTEQUAL
    | AMPEREQUAL
    | VBAREQUAL
    | CIRCUMFLEXEQUAL
    | LEFTSHIFTEQUAL
    | RIGHTSHIFTEQUAL
    | DOUBLESTAREQUAL
    | DOUBLESLASHEQUAL
    ;

// 'print', 'print a, b' or 'print >> stream, a, b'.
print_stmt
    : 'print'
      ( testlist
      | RIGHTSHIFT testlist
      )?
    ;

del_stmt
    : 'del' exprlist
    ;

pass_stmt
    : 'pass'
    ;

flow_stmt
    : break_stmt
    | continue_stmt
    | return_stmt
    | raise_stmt
    | yield_stmt
    ;

break_stmt
    : 'break'
    ;

continue_stmt
    : 'continue'
    ;

return_stmt
    : 'return' testlist?
    ;

yield_stmt
    : 'yield' testlist
    ;

// Up to three comma-separated expressions may follow 'raise'.
raise_stmt
    : 'raise' ( test ( COMMA test ( COMMA test )? )? )?
    ;

import_stmt
    : 'import' dotted_as_name ( COMMA dotted_as_name )*
    | 'from' dotted_name 'import'
      ( STAR | import_as_name ( COMMA import_as_name )* )
    ;

// The optional pair of NAMEs is presumably "as alias"; the grammar does
// not check the text of the first NAME.
import_as_name
    : NAME ( NAME NAME )?
    ;

// Same optional NAME NAME suffix as import_as_name, after a dotted name.
dotted_as_name
    : dotted_name ( NAME NAME )?
    ;

dotted_name
    : NAME ( DOT NAME )*
    ;

global_stmt
    : 'global' NAME ( COMMA NAME )*
    ;

exec_stmt
    : 'exec' expr ( 'in' test ( COMMA test )? )?
    ;

assert_stmt
    : 'assert' test ( COMMA test )?
    ;

// ---------------------------------------------------------------------
// Compound statements
// ---------------------------------------------------------------------

compound_stmt
    : if_stmt
    | while_stmt
    | for_stmt
    | try_stmt
    | funcdef
    | classdef
    ;

if_stmt
    : 'if' test COLON suite
      ( 'elif' test COLON suite )*
      ( 'else' COLON suite )?
    ;

while_stmt
    : 'while' test COLON suite ( 'else' COLON suite )?
    ;

for_stmt
    : 'for' exprlist 'in' testlist COLON suite ( 'else' COLON suite )?
    ;

// Either try/except[/else] or try/finally; the two forms do not mix.
try_stmt
    : 'try' COLON suite
      ( ( except_clause COLON suite )+ ( 'else' COLON suite )?
      | 'finally' COLON suite
      )
    ;

except_clause
    : 'except' ( test ( COMMA test )? )?
    ;

// A statement on the same line, or an indented block of statements.
suite
    : simple_stmt
    | NEWLINE INDENT stmt+ DEDENT
    ;

// ---------------------------------------------------------------------
// Expressions, lowest precedence first
// ---------------------------------------------------------------------

test
    : and_test ( 'or' and_test )*
    | lambdef
    ;

and_test
    : not_test ( 'and' not_test )*
    ;

not_test
    : 'not' not_test
    | comparison
    ;

comparison
    : expr ( comp_op expr )*
    ;

comp_op
    : LESS
    | GREATER
    | EQUAL
    | GREATEREQUAL
    | LESSEQUAL
    | ALT_NOTEQUAL
    | NOTEQUAL
    | 'in'
    | 'not' 'in'
    | 'is'
    | 'is' 'not'
    ;

expr
    : xor_expr ( VBAR xor_expr )*
    ;

xor_expr
    : and_expr ( CIRCUMFLEX and_expr )*
    ;

and_expr
    : shift_expr ( AMPER shift_expr )*
    ;

shift_expr
    : arith_expr ( ( LEFTSHIFT | RIGHTSHIFT ) arith_expr )*
    ;

arith_expr
    : term ( ( PLUS | MINUS ) term )*
    ;

term
    : factor ( ( STAR | SLASH | PERCENT | DOUBLESLASH ) factor )*
    ;

// Unary prefix operators.
factor
    : ( PLUS | MINUS | TILDE ) factor
    | power
    ;

// An atom, its trailers (call / subscript / attribute), then optional '**'.
power
    : atom trailer* ( options {greedy=true;}:DOUBLESTAR factor )?
    ;

// Adjacent STRING tokens form a single atom.
atom
    : LPAREN testlist? RPAREN
    | LBRACK listmaker? RBRACK
    | LCURLY dictmaker? RCURLY
    | BACKQUOTE testlist BACKQUOTE
    | NAME
    | INT
    | LONGINT
    | FLOAT
    | COMPLEX
    | STRING+
    ;

// Contents of [...]: a list comprehension or a comma-separated list.
listmaker
    : test ( list_for | ( options {greedy=true;}:COMMA test )* ) COMMA?
    ;

lambdef
    : 'lambda' varargslist? COLON test
    ;

trailer
    : LPAREN arglist? RPAREN
    | LBRACK subscriptlist RBRACK
    | DOT NAME
    ;

subscriptlist
    : subscript ( options {greedy=true;}:COMMA subscript )* COMMA?
    ;

// Ellipsis, a plain index, or a slice with optional bounds and step.
subscript
    : DOT DOT DOT
    | test ( COLON test? sliceop? )?
    | COLON test? sliceop?
    ;

sliceop
    : COLON test?
    ;

// The k=2 subrule option gives the loop two tokens of lookahead; the header
// comment above explains why trailing commas need it.
exprlist
    : expr ( options {k=2;}:COMMA expr )* COMMA?
    ;

testlist
    : test ( options {k=2;}: COMMA test )* COMMA?
    ;

dictmaker
    : test COLON test
      ( options {k=2;}:COMMA test COLON test )* COMMA?
    ;

// Logs the class name through xlog once the whole definition matched.
classdef
    : 'class' NAME ( LPAREN testlist RPAREN )? COLON suite
      {xlog("found class def "+$NAME.text);}
    ;

// Call arguments: ordinary/keyword arguments, then optional *test / **test.
arglist
    : argument ( COMMA argument )*
      ( COMMA
        ( STAR test ( COMMA DOUBLESTAR test )?
        | DOUBLESTAR test
        )?
      )?
    | STAR test ( COMMA DOUBLESTAR test )?
    | DOUBLESTAR test
    ;

argument
    : test ( ASSIGN test )?
    ;

// ---------------------------------------------------------------------
// List comprehension tails
// ---------------------------------------------------------------------

list_iter
    : list_for
    | list_if
    ;

list_for
    : 'for' exprlist 'in' testlist list_iter?
    ;

list_if
    : 'if' test list_iter?
    ;

// ---------------------------------------------------------------------
// Punctuation and operator tokens.  Opening brackets increment
// implicitLineJoiningLevel and closing brackets decrement it.
// ---------------------------------------------------------------------

LPAREN          : '(' {this.implicitLineJoiningLevel++;} ;

RPAREN          : ')' {this.implicitLineJoiningLevel--;} ;

LBRACK          : '[' {this.implicitLineJoiningLevel++;} ;

RBRACK          : ']' {this.implicitLineJoiningLevel--;} ;

COLON           : ':' ;

COMMA           : ',' ;

SEMI            : ';' ;

PLUS            : '+' ;

MINUS           : '-' ;

STAR            : '*' ;

SLASH           : '/' ;

VBAR            : '|' ;

AMPER           : '&' ;

LESS            : '<' ;

GREATER         : '>' ;

ASSIGN          : '=' ;

PERCENT         : '%' ;

BACKQUOTE       : '`' ;

LCURLY          : '{' {this.implicitLineJoiningLevel++;} ;

RCURLY          : '}' {this.implicitLineJoiningLevel--;} ;

CIRCUMFLEX      : '^' ;

TILDE           : '~' ;

EQUAL           : '==' ;

NOTEQUAL        : '!=' ;

ALT_NOTEQUAL    : '<>' ;

LESSEQUAL       : '<=' ;

LEFTSHIFT       : '<<' ;

GREATEREQUAL    : '>=' ;

RIGHTSHIFT      : '>>' ;

PLUSEQUAL       : '+=' ;

MINUSEQUAL      : '-=' ;

DOUBLESTAR      : '**' ;

STAREQUAL       : '*=' ;

DOUBLESLASH     : '//' ;

SLASHEQUAL      : '/=' ;

VBAREQUAL       : '|=' ;

PERCENTEQUAL    : '%=' ;

AMPEREQUAL      : '&=' ;

CIRCUMFLEXEQUAL : '^=' ;

LEFTSHIFTEQUAL  : '<<=' ;

RIGHTSHIFTEQUAL  : '>>=' ;

DOUBLESTAREQUAL  : '**=' ;

DOUBLESLASHEQUAL : '//=' ;

DOT : '.' ;

/** Floating point literal: '.5', '1.', '1.5', '1.5e3' or '1e3'. */
FLOAT
    : '.' DIGITS (Exponent)?
    | DIGITS ('.' (DIGITS (Exponent)?)? | Exponent)
    ;

/** An INT followed by an 'l' or 'L' suffix. */
LONGINT
    : INT ('l'|'L')
    ;

fragment
Exponent
    : ('e' | 'E') ( '+' | '-' )? DIGITS
    ;

/** Integer literal.  The hex alternative swallows an optional 'l'/'L'
 *  suffix itself; the other alternatives rely on LONGINT for it.
 */
INT : // Hex
      '0' ('x' | 'X') ( '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' )+
      ('l' | 'L')?
    | // Octal
      '0' DIGITS*
    | '1'..'9' DIGITS*
    ;

/** Imaginary literal: an INT or FLOAT followed by 'j' or 'J'. */
COMPLEX
    : INT ('j'|'J')
    | FLOAT ('j'|'J')
    ;

fragment
DIGITS : ( '0' .. '9' )+ ;

NAME: ( 'a' .. 'z' | 'A' .. 'Z' | '_')
      ( 'a' .. 'z' | 'A' .. 'Z' | '_' | '0' .. '9' )*
    ;

/** Match various string types.  Note that greedy=false implies that the
 *  closing triple quote should make us exit the loop, not continue.
 *
 *  The optional prefix accepts every spelling listed by the Python 2.3
 *  lexical grammar (r, u, ur, R, U, UR, Ur, uR), so that e.g. R"..." is
 *  a single STRING rather than a NAME followed by a STRING.
 */
STRING
    : ('r'|'u'|'ur'|'R'|'U'|'UR'|'Ur'|'uR')?
      ( '\'\'\'' (options {greedy=false;}:.)* '\'\'\''
      | '"""' (options {greedy=false;}:.)* '"""'
      | '"' (ESC|~('\\'|'\n'|'"'))* '"'
      | '\'' (ESC|~('\\'|'\n'|'\''))* '\''
      )
    ;

/** A backslash followed by any single character. */
fragment
ESC
    : '\\' .
    ;

/** Consume a backslash-newline and any whitespace at start of next line */
CONTINUED_LINE
    : '\\' ('\r')? '\n' (' '|'\t')* { $channel=HIDDEN; }
    ;

/** Treat a sequence of blank lines as a single blank line.  The token is
 *  hidden when nested within a (..), {..}, or [..]
 *  (implicitLineJoiningLevel>0) or when startPos==0, i.e. the newline
 *  run presumably begins at the left edge and the line was blank.
 */
NEWLINE
    : (('\r')? '\n' )+
      {if ( this.startPos==0 || this.implicitLineJoiningLevel>0 )
           $channel=HIDDEN;
      }
    ;

/** Whitespace that is not at the start of a line (startPos>0); hidden. */
WS  : {this.startPos>0}?=> (' '|'\t')+ {$channel=HIDDEN;}
    ;

/** Grab everything before a real symbol.  Then if newline, kill it
 *  as this is a blank line.  If whitespace followed by comment, kill it
 *  as it's a comment on a line by itself.
 *
 *  Ignore leading whitespace when nested in [..], (..), {..}.
 *
 *  Otherwise a LEADING_WS token is emitted whose text is a run of blanks
 *  as wide as the indentation: a space counts 1 and a tab advances to the
 *  next multiple of 8.
 */
LEADING_WS
@init {
    var spaces = 0;
}
    : {this.startPos==0}?=>
      ( {this.implicitLineJoiningLevel>0}? ( ' ' | '\t' )+ {$channel=HIDDEN;}
      | ( ' '  { spaces++; }
        | '\t' { spaces += 8; spaces -= (spaces \% 8); }
        )+
        {
            // Token text is `spaces` blanks: joining spaces+1 empty
            // slots with " " yields exactly that many separators.
            var s = new Array(spaces + 1).join(" ");
            this.emit(new org.antlr.runtime.CommonToken(this.LEADING_WS,s));
        }
        // kill trailing newline if present and then ignore
        ( ('\r')? '\n' {if (this.state.token!=null) this.state.token.setChannel(HIDDEN); else $channel=HIDDEN;})*
      )
    ;

/** Comments not on a line by themselves are turned into newlines.

    b = a # end of line comment

    or

    a = [1, # weird
         2]

    This rule is invoked directly by nextToken when the comment is in
    first column or when comment is on end of nonwhitespace line.

    Only match \n here if we started on the left edge (startPos==0), where
    the comment has the line to itself and its newlines are consumed with
    it; otherwise stop before the \n and let NEWLINE return it.

    Consume any leading whitespace if it starts on left edge.
 */
COMMENT
@init {
    $channel=HIDDEN;
}
    : {this.startPos==0}?=> (' '|'\t')* '#' (~'\n')* '\n'+
    | {this.startPos>0}?=> '#' (~'\n')* // let NEWLINE handle \n unless char pos==0 for '#'
    ;