Home | History | Annotate | Download | only in functional
      1 /*
      2  [The "BSD licence"]
      3  Copyright (c) 2004 Terence Parr and Loring Craymer
      4  All rights reserved.
      5 
      6  Redistribution and use in source and binary forms, with or without
      7  modification, are permitted provided that the following conditions
      8  are met:
      9  1. Redistributions of source code must retain the above copyright
     10     notice, this list of conditions and the following disclaimer.
     11  2. Redistributions in binary form must reproduce the above copyright
     12     notice, this list of conditions and the following disclaimer in the
     13     documentation and/or other materials provided with the distribution.
     14  3. The name of the author may not be used to endorse or promote products
     15     derived from this software without specific prior written permission.
     16 
     17  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     18  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     19  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     20  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     21  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     22  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     26  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 */
     28 
     29 
     30 /** Python does not explicitly provide begin and end nesting signals.
     31  Rather, the indentation level indicates when you begin and end.
     32  This is an interesting lexical problem because multiple DEDENT
     33  tokens should be sent to the parser sometimes without a corresponding
     34  input symbol!  Consider the following example:
     35 
     36  a=1
     37  if a>1:
     38      print a
     39  b=3
     40 
     41  Here the "b" token on the left edge signals that a DEDENT is needed
     42  after the "print a \n" and before the "b".  The sequence should be
     43 
     44  ... 1 COLON NEWLINE INDENT PRINT a NEWLINE DEDENT b ASSIGN 3 ...
     45 
     46  For more examples, see the big comment at the bottom of this file.
     47 
     48  This TokenStream normally just passes tokens through to the parser.
     49  Upon NEWLINE token from the lexer, however, an INDENT or DEDENT token
     50  may need to be sent to the parser.  The NEWLINE is the trigger for
     51  this class to do it's job.  NEWLINE is saved and then the first token
     52  of the next line is examined.  If non-leading-whitespace token,
     53  then check against stack for indent vs dedent.  If LEADING_WS, then
     54  the column of the next non-whitespace token will dictate indent vs
     55  dedent.  The column of the next real token is number of spaces
     56  in the LEADING_WS token + 1 (to move past the whitespace).  The
     57  lexer grammar must set the text of the LEADING_WS token to be
     58  the proper number of spaces (and do tab conversion etc...).
     59 
     60  A stack of column numbers is tracked and used to detect changes
     61  in indent level from one token to the next.
     62 
     63  A queue of tokens is built up to hold multiple DEDENT tokens that
     64  are generated.  Before asking the lexer for another token via
     65  nextToken(), the queue is flushed first one token at a time.
     66 
     67  Terence Parr and Loring Craymer
     68  February 2004
     69  */
     70 PythonTokenSource = function(stream) {
     71     this.stream = stream;
     72 	/** The stack of indent levels (column numbers) */
     73 	this.indentStack = new Array(PythonTokenSource.MAX_INDENTS);
     74 	/** stack pointer */
     75 	this.sp=-1; // grow upwards
     76 
     77 	/** The queue of tokens */
     78 	this.tokens = [];
     79 	this.lastTokenAddedIndex = -1;
     80 	this.push(PythonTokenSource.FIRST_CHAR_POSITION);
     81 };
     82 
     83 ANTLR.lang.augmentObject(PythonTokenSource, {
     84 	MAX_INDENTS: 100,
     85 	FIRST_CHAR_POSITION: 0,
     86 });
     87 
     88 PythonTokenSource.prototype = {
     89 	getSourceName: function() {
     90 		return this.stream.getSourceName();
     91 	},
     92 
     93 	/** From http://www.python.org/doc/2.2.3/ref/indentation.html
     94 
     95 	 "Before the first line of the file is read, a single zero is
     96 	 pushed on the stack; this will never be popped off again. The
     97 	 numbers pushed on the stack will always be strictly increasing
     98 	 from bottom to top. At the beginning of each logical line, the
     99 	 line's indentation level is compared to the top of the
    100 	 stack. If it is equal, nothing happens. If it is larger, it is
    101 	 pushed on the stack, and one INDENT token is generated. If it
    102 	 is smaller, it must be one of the numbers occurring on the
    103 	 stack; all numbers on the stack that are larger are popped
    104 	 off, and for each number popped off a DEDENT token is
    105 	 generated. At the end of the file, a DEDENT token is generated
    106 	 for each number remaining on the stack that is larger than
    107 	 zero."
    108 
    109 	 I use char position in line 0..n-1 instead.
    110 
    111 	 The DEDENTS possibly needed at EOF are gracefully handled by forcing
    112 	 EOF to have char pos 0 even though with UNIX it's hard to get EOF
    113 	 at a non left edge.
    114 	 */
    115 	nextToken: function() {
    116 		// if something in queue, just remove and return it
    117 		if (this.tokens.length>0 ) {
    118 			var t = this.tokens[0];
    119 			this.tokens.splice(0,1);
    120 			return t;
    121 		}
    122 
    123 		this.insertImaginaryIndentDedentTokens();
    124 
    125 		return this.nextToken();
    126 	},
    127 
    128 	insertImaginaryIndentDedentTokens: function()
    129 	{
    130 		var t = this.stream.LT(1);
    131 		this.stream.consume();
    132 
    133 		// if not a NEWLINE, doesn't signal indent/dedent work; just enqueue
    134 		if ( t.getType()!=PythonLexer.NEWLINE ) {
    135 			var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
    136 			if ( hiddenTokens!=null ) {
    137 				this.tokens = this.tokens.concat(hiddenTokens);
    138 			}
    139 			this.lastTokenAddedIndex = t.getTokenIndex();
    140 			this.tokens.push(t);
    141 			return;
    142 		}
    143 
    144 		// save NEWLINE in the queue
    145 		var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
    146 		if ( hiddenTokens!=null ) {
    147 			this.tokens = this.tokens.concat(hiddenTokens);
    148 		}
    149 		this.lastTokenAddedIndex = t.getTokenIndex();
    150 		this.tokens.push(t);
    151 
    152 		// grab first token of next line
    153 		t = this.stream.LT(1);
    154 		this.stream.consume();
    155 
    156 		hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
    157 		if ( hiddenTokens!=null ) {
    158 			this.tokens = this.tokens.concat(hiddenTokens);
    159 		}
    160 		this.lastTokenAddedIndex = t.getTokenIndex();
    161 
    162 		// compute cpos as the char pos of next non-WS token in line
    163 		var cpos = t.getCharPositionInLine(); // column dictates indent/dedent
    164 		if ( t.getType()==ANTLR.runtime.Token.EOF ) {
    165 			cpos = -1; // pretend EOF always happens at left edge
    166 		}
    167 		else if ( t.getType()==PythonLexer.LEADING_WS ) {
    168 			cpos = t.getText().length;
    169 		}
    170 
    171 		// compare to last indent level
    172 		var lastIndent = this.peek();
    173 		if ( cpos > lastIndent ) { // they indented; track and gen INDENT
    174 			this.push(cpos);
    175 			var indent = new ANTLR.runtime.CommonToken(PythonParser.INDENT, "");
    176 			indent.setCharPositionInLine(t.getCharPositionInLine());
    177 			indent.setLine(t.getLine());
    178 			this.tokens.push(indent);
    179 		}
    180 		else if ( cpos < lastIndent ) { // they dedented
    181 			// how far back did we dedent?
    182 			var prevIndex = this.findPreviousIndent(cpos);
    183 			// generate DEDENTs for each indent level we backed up over
    184 			for (var d=this.sp-1; d>=prevIndex; d--) {
    185 				var dedent = new ANTLR.runtime.CommonToken(PythonParser.DEDENT, "");
    186 				dedent.setCharPositionInLine(t.getCharPositionInLine());
    187 				dedent.setLine(t.getLine());
    188 				this.tokens.push(dedent);
    189 			}
    190 			this.sp = prevIndex; // pop those off indent level
    191 		}
    192 		if ( t.getType()!=PythonLexer.LEADING_WS ) { // discard WS
    193 			this.tokens.push(t);
    194 		}
    195 	},
    196 
    197 	//  T O K E N  S T A C K  M E T H O D S
    198 
    199 	push: function(i) {
    200 		if (this.sp>=PythonTokenSource.MAX_INDENTS) {
    201 			throw new Error("stack overflow");
    202 		}
    203 		this.sp++;
    204 		this.indentStack[this.sp] = i;
    205 	},
    206 
    207 	pop: function() {
    208 		if (this.sp<0) {
    209 			throw new Error("stack underflow");
    210 		}
    211 		var top = this.indentStack[this.sp];
    212 		this.sp--;
    213 		return top;
    214 	},
    215 
    216 	peek: function() {
    217 		return this.indentStack[this.sp];
    218 	},
    219 
    220 	/** Return the index on stack of previous indent level == i else -1 */
    221 	findPreviousIndent: function(i) {
    222 		for (var j=this.sp-1; j>=0; j--) {
    223 			if (this.indentStack[j]==i ) {
    224 				return j;
    225 			}
    226 		}
    227 		return PythonTokenSource.FIRST_CHAR_POSITION;
    228 	},
    229 
    230 	stackString: function() {
    231 		var buf = [];
    232 		for (var j=this.sp; j>=0; j--) {
    233 			buf.push(this.indentStack[j]);
    234 		}
    235 		return buf.join(" ");
    236 	}
    237 
    238 }
    239 
    240 /* More example input / output pairs with code simplified to single chars
    241 ------- t1 -------
    242 a a
    243         b b
    244         c
    245 d
    246 a a \n INDENT b b \n c \n DEDENT d \n EOF
    247 ------- t2 -------
    248 a  c
    249  b
    250 c
    251 a c \n INDENT b \n DEDENT c \n EOF
    252 ------- t3 -------
    253 a
    254         b
    255                 c
    256 d
    257 a \n INDENT b \n INDENT c \n DEDENT DEDENT d \n EOF
    258 ------- t4 -------
    259 a
    260     c
    261                   d
    262     e
    263     f
    264              g
    265              h
    266              i
    267               j
    268     k
    269 a \n INDENT c \n INDENT d \n DEDENT e \n f \n INDENT g \n h \n i \n INDENT j \n DEDENT DEDENT k \n DEDENT EOF
    270 ------- t5 -------
    271 a
    272         b
    273         c
    274                 d
    275                 e
    276 a \n INDENT b \n c \n INDENT d \n e \n DEDENT DEDENT EOF
    277 */
    278