1 /* 2 [The "BSD licence"] 3 Copyright (c) 2004 Terence Parr and Loring Craymer 4 All rights reserved. 5 6 Redistribution and use in source and binary forms, with or without 7 modification, are permitted provided that the following conditions 8 are met: 9 1. Redistributions of source code must retain the above copyright 10 notice, this list of conditions and the following disclaimer. 11 2. Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 3. The name of the author may not be used to endorse or promote products 15 derived from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 30 /** Python does not explicitly provide begin and end nesting signals. 31 Rather, the indentation level indicates when you begin and end. 32 This is an interesting lexical problem because multiple DEDENT 33 tokens should be sent to the parser sometimes without a corresponding 34 input symbol! Consider the following example: 35 36 a=1 37 if a>1: 38 print a 39 b=3 40 41 Here the "b" token on the left edge signals that a DEDENT is needed 42 after the "print a \n" and before the "b". The sequence should be 43 44 ... 1 COLON NEWLINE INDENT PRINT a NEWLINE DEDENT b ASSIGN 3 ... 45 46 For more examples, see the big comment at the bottom of this file. 47 48 This TokenStream normally just passes tokens through to the parser. 49 Upon NEWLINE token from the lexer, however, an INDENT or DEDENT token 50 may need to be sent to the parser. The NEWLINE is the trigger for 51 this class to do it's job. NEWLINE is saved and then the first token 52 of the next line is examined. If non-leading-whitespace token, 53 then check against stack for indent vs dedent. If LEADING_WS, then 54 the column of the next non-whitespace token will dictate indent vs 55 dedent. The column of the next real token is number of spaces 56 in the LEADING_WS token + 1 (to move past the whitespace). The 57 lexer grammar must set the text of the LEADING_WS token to be 58 the proper number of spaces (and do tab conversion etc...). 59 60 A stack of column numbers is tracked and used to detect changes 61 in indent level from one token to the next. 62 63 A queue of tokens is built up to hold multiple DEDENT tokens that 64 are generated. Before asking the lexer for another token via 65 nextToken(), the queue is flushed first one token at a time. 66 67 Terence Parr and Loring Craymer 68 February 2004 69 */ 70 PythonTokenSource = function(stream) { 71 this.stream = stream; 72 /** The stack of indent levels (column numbers) */ 73 this.indentStack = new Array(PythonTokenSource.MAX_INDENTS); 74 /** stack pointer */ 75 this.sp=-1; // grow upwards 76 77 /** The queue of tokens */ 78 this.tokens = []; 79 this.lastTokenAddedIndex = -1; 80 this.push(PythonTokenSource.FIRST_CHAR_POSITION); 81 }; 82 83 ANTLR.lang.augmentObject(PythonTokenSource, { 84 MAX_INDENTS: 100, 85 FIRST_CHAR_POSITION: 0, 86 }); 87 88 PythonTokenSource.prototype = { 89 getSourceName: function() { 90 return this.stream.getSourceName(); 91 }, 92 93 /** From http://www.python.org/doc/2.2.3/ref/indentation.html 94 95 "Before the first line of the file is read, a single zero is 96 pushed on the stack; this will never be popped off again. The 97 numbers pushed on the stack will always be strictly increasing 98 from bottom to top. At the beginning of each logical line, the 99 line's indentation level is compared to the top of the 100 stack. If it is equal, nothing happens. If it is larger, it is 101 pushed on the stack, and one INDENT token is generated. If it 102 is smaller, it must be one of the numbers occurring on the 103 stack; all numbers on the stack that are larger are popped 104 off, and for each number popped off a DEDENT token is 105 generated. At the end of the file, a DEDENT token is generated 106 for each number remaining on the stack that is larger than 107 zero." 108 109 I use char position in line 0..n-1 instead. 110 111 The DEDENTS possibly needed at EOF are gracefully handled by forcing 112 EOF to have char pos 0 even though with UNIX it's hard to get EOF 113 at a non left edge. 114 */ 115 nextToken: function() { 116 // if something in queue, just remove and return it 117 if (this.tokens.length>0 ) { 118 var t = this.tokens[0]; 119 this.tokens.splice(0,1); 120 return t; 121 } 122 123 this.insertImaginaryIndentDedentTokens(); 124 125 return this.nextToken(); 126 }, 127 128 insertImaginaryIndentDedentTokens: function() 129 { 130 var t = this.stream.LT(1); 131 this.stream.consume(); 132 133 // if not a NEWLINE, doesn't signal indent/dedent work; just enqueue 134 if ( t.getType()!=PythonLexer.NEWLINE ) { 135 var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1); 136 if ( hiddenTokens!=null ) { 137 this.tokens = this.tokens.concat(hiddenTokens); 138 } 139 this.lastTokenAddedIndex = t.getTokenIndex(); 140 this.tokens.push(t); 141 return; 142 } 143 144 // save NEWLINE in the queue 145 var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1); 146 if ( hiddenTokens!=null ) { 147 this.tokens = this.tokens.concat(hiddenTokens); 148 } 149 this.lastTokenAddedIndex = t.getTokenIndex(); 150 this.tokens.push(t); 151 152 // grab first token of next line 153 t = this.stream.LT(1); 154 this.stream.consume(); 155 156 hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1); 157 if ( hiddenTokens!=null ) { 158 this.tokens = this.tokens.concat(hiddenTokens); 159 } 160 this.lastTokenAddedIndex = t.getTokenIndex(); 161 162 // compute cpos as the char pos of next non-WS token in line 163 var cpos = t.getCharPositionInLine(); // column dictates indent/dedent 164 if ( t.getType()==ANTLR.runtime.Token.EOF ) { 165 cpos = -1; // pretend EOF always happens at left edge 166 } 167 else if ( t.getType()==PythonLexer.LEADING_WS ) { 168 cpos = t.getText().length; 169 } 170 171 // compare to last indent level 172 var lastIndent = this.peek(); 173 if ( cpos > lastIndent ) { // they indented; track and gen INDENT 174 this.push(cpos); 175 var indent = new ANTLR.runtime.CommonToken(PythonParser.INDENT, ""); 176 indent.setCharPositionInLine(t.getCharPositionInLine()); 177 indent.setLine(t.getLine()); 178 this.tokens.push(indent); 179 } 180 else if ( cpos < lastIndent ) { // they dedented 181 // how far back did we dedent? 182 var prevIndex = this.findPreviousIndent(cpos); 183 // generate DEDENTs for each indent level we backed up over 184 for (var d=this.sp-1; d>=prevIndex; d--) { 185 var dedent = new ANTLR.runtime.CommonToken(PythonParser.DEDENT, ""); 186 dedent.setCharPositionInLine(t.getCharPositionInLine()); 187 dedent.setLine(t.getLine()); 188 this.tokens.push(dedent); 189 } 190 this.sp = prevIndex; // pop those off indent level 191 } 192 if ( t.getType()!=PythonLexer.LEADING_WS ) { // discard WS 193 this.tokens.push(t); 194 } 195 }, 196 197 // T O K E N S T A C K M E T H O D S 198 199 push: function(i) { 200 if (this.sp>=PythonTokenSource.MAX_INDENTS) { 201 throw new Error("stack overflow"); 202 } 203 this.sp++; 204 this.indentStack[this.sp] = i; 205 }, 206 207 pop: function() { 208 if (this.sp<0) { 209 throw new Error("stack underflow"); 210 } 211 var top = this.indentStack[this.sp]; 212 this.sp--; 213 return top; 214 }, 215 216 peek: function() { 217 return this.indentStack[this.sp]; 218 }, 219 220 /** Return the index on stack of previous indent level == i else -1 */ 221 findPreviousIndent: function(i) { 222 for (var j=this.sp-1; j>=0; j--) { 223 if (this.indentStack[j]==i ) { 224 return j; 225 } 226 } 227 return PythonTokenSource.FIRST_CHAR_POSITION; 228 }, 229 230 stackString: function() { 231 var buf = []; 232 for (var j=this.sp; j>=0; j--) { 233 buf.push(this.indentStack[j]); 234 } 235 return buf.join(" "); 236 } 237 238 } 239 240 /* More example input / output pairs with code simplified to single chars 241 ------- t1 ------- 242 a a 243 b b 244 c 245 d 246 a a \n INDENT b b \n c \n DEDENT d \n EOF 247 ------- t2 ------- 248 a c 249 b 250 c 251 a c \n INDENT b \n DEDENT c \n EOF 252 ------- t3 ------- 253 a 254 b 255 c 256 d 257 a \n INDENT b \n INDENT c \n DEDENT DEDENT d \n EOF 258 ------- t4 ------- 259 a 260 c 261 d 262 e 263 f 264 g 265 h 266 i 267 j 268 k 269 a \n INDENT c \n INDENT d \n DEDENT e \n f \n INDENT g \n h \n i \n INDENT j \n DEDENT DEDENT k \n DEDENT EOF 270 ------- t5 ------- 271 a 272 b 273 c 274 d 275 e 276 a \n INDENT b \n c \n INDENT d \n e \n DEDENT DEDENT EOF 277 */ 278