1 /* 2 [The "BSD license"] 3 Copyright (c) 2005-2009 Terence Parr 4 All rights reserved. 5 6 Redistribution and use in source and binary forms, with or without 7 modification, are permitted provided that the following conditions 8 are met: 9 1. Redistributions of source code must retain the above copyright 10 notice, this list of conditions and the following disclaimer. 11 2. Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 3. The name of the author may not be used to endorse or promote products 15 derived from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 package org.antlr.runtime; 29 30 /** A lexer is recognizer that draws input symbols from a character stream. 31 * lexer grammars result in a subclass of this object. A Lexer object 32 * uses simplified match() and error recovery mechanisms in the interest 33 * of speed. 34 */ 35 public abstract class Lexer extends BaseRecognizer implements TokenSource { 36 /** Where is the lexer drawing characters from? */ 37 protected CharStream input; 38 39 public Lexer() { 40 } 41 42 public Lexer(CharStream input) { 43 this.input = input; 44 } 45 46 public Lexer(CharStream input, RecognizerSharedState state) { 47 super(state); 48 this.input = input; 49 } 50 51 public void reset() { 52 super.reset(); // reset all recognizer state variables 53 // wack Lexer state variables 54 if ( input!=null ) { 55 input.seek(0); // rewind the input 56 } 57 if ( state==null ) { 58 return; // no shared state work to do 59 } 60 state.token = null; 61 state.type = Token.INVALID_TOKEN_TYPE; 62 state.channel = Token.DEFAULT_CHANNEL; 63 state.tokenStartCharIndex = -1; 64 state.tokenStartCharPositionInLine = -1; 65 state.tokenStartLine = -1; 66 state.text = null; 67 } 68 69 /** Return a token from this source; i.e., match a token on the char 70 * stream. 71 */ 72 public Token nextToken() { 73 while (true) { 74 state.token = null; 75 state.channel = Token.DEFAULT_CHANNEL; 76 state.tokenStartCharIndex = input.index(); 77 state.tokenStartCharPositionInLine = input.getCharPositionInLine(); 78 state.tokenStartLine = input.getLine(); 79 state.text = null; 80 if ( input.LA(1)==CharStream.EOF ) { 81 Token eof = new CommonToken((CharStream)input,Token.EOF, 82 Token.DEFAULT_CHANNEL, 83 input.index(),input.index()); 84 eof.setLine(getLine()); 85 eof.setCharPositionInLine(getCharPositionInLine()); 86 return eof; 87 } 88 try { 89 mTokens(); 90 if ( state.token==null ) { 91 emit(); 92 } 93 else if ( state.token==Token.SKIP_TOKEN ) { 94 continue; 95 } 96 return state.token; 97 } 98 catch (MismatchedRangeException re) { 99 reportError(re); 100 // matchRange() routine has already called recover() 101 } 102 catch (MismatchedTokenException re) { 103 reportError(re); 104 // match() routine has already called recover() 105 } 106 catch (RecognitionException re) { 107 reportError(re); 108 recover(re); // throw out current char and try again 109 } 110 } 111 } 112 113 /** Instruct the lexer to skip creating a token for current lexer rule 114 * and look for another token. nextToken() knows to keep looking when 115 * a lexer rule finishes with token set to SKIP_TOKEN. Recall that 116 * if token==null at end of any token rule, it creates one for you 117 * and emits it. 118 */ 119 public void skip() { 120 state.token = Token.SKIP_TOKEN; 121 } 122 123 /** This is the lexer entry point that sets instance var 'token' */ 124 public abstract void mTokens() throws RecognitionException; 125 126 /** Set the char stream and reset the lexer */ 127 public void setCharStream(CharStream input) { 128 this.input = null; 129 reset(); 130 this.input = input; 131 } 132 133 public CharStream getCharStream() { 134 return this.input; 135 } 136 137 public String getSourceName() { 138 return input.getSourceName(); 139 } 140 141 /** Currently does not support multiple emits per nextToken invocation 142 * for efficiency reasons. Subclass and override this method and 143 * nextToken (to push tokens into a list and pull from that list rather 144 * than a single variable as this implementation does). 145 */ 146 public void emit(Token token) { 147 state.token = token; 148 } 149 150 /** The standard method called to automatically emit a token at the 151 * outermost lexical rule. The token object should point into the 152 * char buffer start..stop. If there is a text override in 'text', 153 * use that to set the token's text. Override this method to emit 154 * custom Token objects. 155 * 156 * If you are building trees, then you should also override 157 * Parser or TreeParser.getMissingSymbol(). 158 */ 159 public Token emit() { 160 Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1); 161 t.setLine(state.tokenStartLine); 162 t.setText(state.text); 163 t.setCharPositionInLine(state.tokenStartCharPositionInLine); 164 emit(t); 165 return t; 166 } 167 168 public void match(String s) throws MismatchedTokenException { 169 int i = 0; 170 while ( i<s.length() ) { 171 if ( input.LA(1)!=s.charAt(i) ) { 172 if ( state.backtracking>0 ) { 173 state.failed = true; 174 return; 175 } 176 MismatchedTokenException mte = 177 new MismatchedTokenException(s.charAt(i), input); 178 recover(mte); 179 throw mte; 180 } 181 i++; 182 input.consume(); 183 state.failed = false; 184 } 185 } 186 187 public void matchAny() { 188 input.consume(); 189 } 190 191 public void match(int c) throws MismatchedTokenException { 192 if ( input.LA(1)!=c ) { 193 if ( state.backtracking>0 ) { 194 state.failed = true; 195 return; 196 } 197 MismatchedTokenException mte = 198 new MismatchedTokenException(c, input); 199 recover(mte); // don't really recover; just consume in lexer 200 throw mte; 201 } 202 input.consume(); 203 state.failed = false; 204 } 205 206 public void matchRange(int a, int b) 207 throws MismatchedRangeException 208 { 209 if ( input.LA(1)<a || input.LA(1)>b ) { 210 if ( state.backtracking>0 ) { 211 state.failed = true; 212 return; 213 } 214 MismatchedRangeException mre = 215 new MismatchedRangeException(a,b,input); 216 recover(mre); 217 throw mre; 218 } 219 input.consume(); 220 state.failed = false; 221 } 222 223 public int getLine() { 224 return input.getLine(); 225 } 226 227 public int getCharPositionInLine() { 228 return input.getCharPositionInLine(); 229 } 230 231 /** What is the index of the current character of lookahead? */ 232 public int getCharIndex() { 233 return input.index(); 234 } 235 236 /** Return the text matched so far for the current token or any 237 * text override. 238 */ 239 public String getText() { 240 if ( state.text!=null ) { 241 return state.text; 242 } 243 return input.substring(state.tokenStartCharIndex,getCharIndex()-1); 244 } 245 246 /** Set the complete text of this token; it wipes any previous 247 * changes to the text. 248 */ 249 public void setText(String text) { 250 state.text = text; 251 } 252 253 public void reportError(RecognitionException e) { 254 /** TODO: not thought about recovery in lexer yet. 255 * 256 // if we've already reported an error and have not matched a token 257 // yet successfully, don't report any errors. 258 if ( errorRecovery ) { 259 //System.err.print("[SPURIOUS] "); 260 return; 261 } 262 errorRecovery = true; 263 */ 264 265 displayRecognitionError(this.getTokenNames(), e); 266 } 267 268 public String getErrorMessage(RecognitionException e, String[] tokenNames) { 269 String msg = null; 270 if ( e instanceof MismatchedTokenException ) { 271 MismatchedTokenException mte = (MismatchedTokenException)e; 272 msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting "+getCharErrorDisplay(mte.expecting); 273 } 274 else if ( e instanceof NoViableAltException ) { 275 NoViableAltException nvae = (NoViableAltException)e; 276 // for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>" 277 // and "(decision="+nvae.decisionNumber+") and 278 // "state "+nvae.stateNumber 279 msg = "no viable alternative at character "+getCharErrorDisplay(e.c); 280 } 281 else if ( e instanceof EarlyExitException ) { 282 EarlyExitException eee = (EarlyExitException)e; 283 // for development, can add "(decision="+eee.decisionNumber+")" 284 msg = "required (...)+ loop did not match anything at character "+getCharErrorDisplay(e.c); 285 } 286 else if ( e instanceof MismatchedNotSetException ) { 287 MismatchedNotSetException mse = (MismatchedNotSetException)e; 288 msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+mse.expecting; 289 } 290 else if ( e instanceof MismatchedSetException ) { 291 MismatchedSetException mse = (MismatchedSetException)e; 292 msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+mse.expecting; 293 } 294 else if ( e instanceof MismatchedRangeException ) { 295 MismatchedRangeException mre = (MismatchedRangeException)e; 296 msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+ 297 getCharErrorDisplay(mre.a)+".."+getCharErrorDisplay(mre.b); 298 } 299 else { 300 msg = super.getErrorMessage(e, tokenNames); 301 } 302 return msg; 303 } 304 305 public String getCharErrorDisplay(int c) { 306 String s = String.valueOf((char)c); 307 switch ( c ) { 308 case Token.EOF : 309 s = "<EOF>"; 310 break; 311 case '\n' : 312 s = "\\n"; 313 break; 314 case '\t' : 315 s = "\\t"; 316 break; 317 case '\r' : 318 s = "\\r"; 319 break; 320 } 321 return "'"+s+"'"; 322 } 323 324 /** Lexers can normally match any char in it's vocabulary after matching 325 * a token, so do the easy thing and just kill a character and hope 326 * it all works out. You can instead use the rule invocation stack 327 * to do sophisticated error recovery if you are in a fragment rule. 328 */ 329 public void recover(RecognitionException re) { 330 //System.out.println("consuming char "+(char)input.LA(1)+" during recovery"); 331 //re.printStackTrace(); 332 input.consume(); 333 } 334 335 public void traceIn(String ruleName, int ruleIndex) { 336 String inputSymbol = ((char)input.LT(1))+" line="+getLine()+":"+getCharPositionInLine(); 337 super.traceIn(ruleName, ruleIndex, inputSymbol); 338 } 339 340 public void traceOut(String ruleName, int ruleIndex) { 341 String inputSymbol = ((char)input.LT(1))+" line="+getLine()+":"+getCharPositionInLine(); 342 super.traceOut(ruleName, ruleIndex, inputSymbol); 343 } 344 } 345