1 /* 2 [The "BSD license"] 3 Copyright (c) 2005-2009 Terence Parr 4 All rights reserved. 5 6 Redistribution and use in source and binary forms, with or without 7 modification, are permitted provided that the following conditions 8 are met: 9 1. Redistributions of source code must retain the above copyright 10 notice, this list of conditions and the following disclaimer. 11 2. Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 3. The name of the author may not be used to endorse or promote products 15 derived from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 package org.antlr.runtime; 29 30 /** A lexer is recognizer that draws input symbols from a character stream. 31 * lexer grammars result in a subclass of this object. A Lexer object 32 * uses simplified match() and error recovery mechanisms in the interest 33 * of speed. 34 */ 35 public abstract class Lexer extends BaseRecognizer implements TokenSource { 36 /** Where is the lexer drawing characters from? */ 37 protected CharStream input; 38 39 public Lexer() { 40 } 41 42 public Lexer(CharStream input) { 43 this.input = input; 44 } 45 46 public Lexer(CharStream input, RecognizerSharedState state) { 47 super(state); 48 this.input = input; 49 } 50 51 public void reset() { 52 super.reset(); // reset all recognizer state variables 53 // wack Lexer state variables 54 if ( input!=null ) { 55 input.seek(0); // rewind the input 56 } 57 if ( state==null ) { 58 return; // no shared state work to do 59 } 60 state.token = null; 61 state.type = Token.INVALID_TOKEN_TYPE; 62 state.channel = Token.DEFAULT_CHANNEL; 63 state.tokenStartCharIndex = -1; 64 state.tokenStartCharPositionInLine = -1; 65 state.tokenStartLine = -1; 66 state.text = null; 67 } 68 69 /** Return a token from this source; i.e., match a token on the char 70 * stream. 71 */ 72 public Token nextToken() { 73 while (true) { 74 state.token = null; 75 state.channel = Token.DEFAULT_CHANNEL; 76 state.tokenStartCharIndex = input.index(); 77 state.tokenStartCharPositionInLine = input.getCharPositionInLine(); 78 state.tokenStartLine = input.getLine(); 79 state.text = null; 80 if ( input.LA(1)==CharStream.EOF ) { 81 Token eof = new CommonToken((CharStream)input,Token.EOF, 82 Token.DEFAULT_CHANNEL, 83 input.index(),input.index()); 84 eof.setLine(getLine()); 85 eof.setCharPositionInLine(getCharPositionInLine()); 86 return eof; 87 } 88 try { 89 mTokens(); 90 if ( state.token==null ) { 91 emit(); 92 } 93 else if ( state.token==Token.SKIP_TOKEN ) { 94 continue; 95 } 96 return state.token; 97 } 98 catch (NoViableAltException nva) { 99 reportError(nva); 100 recover(nva); // throw out current char and try again 101 } 102 catch (RecognitionException re) { 103 reportError(re); 104 // match() routine has already called recover() 105 } 106 } 107 } 108 109 /** Instruct the lexer to skip creating a token for current lexer rule 110 * and look for another token. nextToken() knows to keep looking when 111 * a lexer rule finishes with token set to SKIP_TOKEN. Recall that 112 * if token==null at end of any token rule, it creates one for you 113 * and emits it. 114 */ 115 public void skip() { 116 state.token = Token.SKIP_TOKEN; 117 } 118 119 /** This is the lexer entry point that sets instance var 'token' */ 120 public abstract void mTokens() throws RecognitionException; 121 122 /** Set the char stream and reset the lexer */ 123 public void setCharStream(CharStream input) { 124 this.input = null; 125 reset(); 126 this.input = input; 127 } 128 129 public CharStream getCharStream() { 130 return this.input; 131 } 132 133 public String getSourceName() { 134 return input.getSourceName(); 135 } 136 137 /** Currently does not support multiple emits per nextToken invocation 138 * for efficiency reasons. Subclass and override this method and 139 * nextToken (to push tokens into a list and pull from that list rather 140 * than a single variable as this implementation does). 141 */ 142 public void emit(Token token) { 143 state.token = token; 144 } 145 146 /** The standard method called to automatically emit a token at the 147 * outermost lexical rule. The token object should point into the 148 * char buffer start..stop. If there is a text override in 'text', 149 * use that to set the token's text. Override this method to emit 150 * custom Token objects. 151 * 152 * If you are building trees, then you should also override 153 * Parser or TreeParser.getMissingSymbol(). 154 */ 155 public Token emit() { 156 Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1); 157 t.setLine(state.tokenStartLine); 158 t.setText(state.text); 159 t.setCharPositionInLine(state.tokenStartCharPositionInLine); 160 emit(t); 161 return t; 162 } 163 164 public void match(String s) throws MismatchedTokenException { 165 int i = 0; 166 while ( i<s.length() ) { 167 if ( input.LA(1)!=s.charAt(i) ) { 168 if ( state.backtracking>0 ) { 169 state.failed = true; 170 return; 171 } 172 MismatchedTokenException mte = 173 new MismatchedTokenException(s.charAt(i), input); 174 recover(mte); 175 throw mte; 176 } 177 i++; 178 input.consume(); 179 state.failed = false; 180 } 181 } 182 183 public void matchAny() { 184 input.consume(); 185 } 186 187 public void match(int c) throws MismatchedTokenException { 188 if ( input.LA(1)!=c ) { 189 if ( state.backtracking>0 ) { 190 state.failed = true; 191 return; 192 } 193 MismatchedTokenException mte = 194 new MismatchedTokenException(c, input); 195 recover(mte); // don't really recover; just consume in lexer 196 throw mte; 197 } 198 input.consume(); 199 state.failed = false; 200 } 201 202 public void matchRange(int a, int b) 203 throws MismatchedRangeException 204 { 205 if ( input.LA(1)<a || input.LA(1)>b ) { 206 if ( state.backtracking>0 ) { 207 state.failed = true; 208 return; 209 } 210 MismatchedRangeException mre = 211 new MismatchedRangeException(a,b,input); 212 recover(mre); 213 throw mre; 214 } 215 input.consume(); 216 state.failed = false; 217 } 218 219 public int getLine() { 220 return input.getLine(); 221 } 222 223 public int getCharPositionInLine() { 224 return input.getCharPositionInLine(); 225 } 226 227 /** What is the index of the current character of lookahead? */ 228 public int getCharIndex() { 229 return input.index(); 230 } 231 232 /** Return the text matched so far for the current token or any 233 * text override. 234 */ 235 public String getText() { 236 if ( state.text!=null ) { 237 return state.text; 238 } 239 return input.substring(state.tokenStartCharIndex,getCharIndex()-1); 240 } 241 242 /** Set the complete text of this token; it wipes any previous 243 * changes to the text. 244 */ 245 public void setText(String text) { 246 state.text = text; 247 } 248 249 public void reportError(RecognitionException e) { 250 /** TODO: not thought about recovery in lexer yet. 251 * 252 // if we've already reported an error and have not matched a token 253 // yet successfully, don't report any errors. 254 if ( errorRecovery ) { 255 //System.err.print("[SPURIOUS] "); 256 return; 257 } 258 errorRecovery = true; 259 */ 260 261 displayRecognitionError(this.getTokenNames(), e); 262 } 263 264 public String getErrorMessage(RecognitionException e, String[] tokenNames) { 265 String msg = null; 266 if ( e instanceof MismatchedTokenException ) { 267 MismatchedTokenException mte = (MismatchedTokenException)e; 268 msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting "+getCharErrorDisplay(mte.expecting); 269 } 270 else if ( e instanceof NoViableAltException ) { 271 NoViableAltException nvae = (NoViableAltException)e; 272 // for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>" 273 // and "(decision="+nvae.decisionNumber+") and 274 // "state "+nvae.stateNumber 275 msg = "no viable alternative at character "+getCharErrorDisplay(e.c); 276 } 277 else if ( e instanceof EarlyExitException ) { 278 EarlyExitException eee = (EarlyExitException)e; 279 // for development, can add "(decision="+eee.decisionNumber+")" 280 msg = "required (...)+ loop did not match anything at character "+getCharErrorDisplay(e.c); 281 } 282 else if ( e instanceof MismatchedNotSetException ) { 283 MismatchedNotSetException mse = (MismatchedNotSetException)e; 284 msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+mse.expecting; 285 } 286 else if ( e instanceof MismatchedSetException ) { 287 MismatchedSetException mse = (MismatchedSetException)e; 288 msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+mse.expecting; 289 } 290 else if ( e instanceof MismatchedRangeException ) { 291 MismatchedRangeException mre = (MismatchedRangeException)e; 292 msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+ 293 getCharErrorDisplay(mre.a)+".."+getCharErrorDisplay(mre.b); 294 } 295 else { 296 msg = super.getErrorMessage(e, tokenNames); 297 } 298 return msg; 299 } 300 301 public String getCharErrorDisplay(int c) { 302 String s = String.valueOf((char)c); 303 switch ( c ) { 304 case Token.EOF : 305 s = "<EOF>"; 306 break; 307 case '\n' : 308 s = "\\n"; 309 break; 310 case '\t' : 311 s = "\\t"; 312 break; 313 case '\r' : 314 s = "\\r"; 315 break; 316 } 317 return "'"+s+"'"; 318 } 319 320 /** Lexers can normally match any char in it's vocabulary after matching 321 * a token, so do the easy thing and just kill a character and hope 322 * it all works out. You can instead use the rule invocation stack 323 * to do sophisticated error recovery if you are in a fragment rule. 324 */ 325 public void recover(RecognitionException re) { 326 //System.out.println("consuming char "+(char)input.LA(1)+" during recovery"); 327 //re.printStackTrace(); 328 input.consume(); 329 } 330 331 public void traceIn(String ruleName, int ruleIndex) { 332 String inputSymbol = ((char)input.LT(1))+" line="+getLine()+":"+getCharPositionInLine(); 333 super.traceIn(ruleName, ruleIndex, inputSymbol); 334 } 335 336 public void traceOut(String ruleName, int ruleIndex) { 337 String inputSymbol = ((char)input.LT(1))+" line="+getLine()+":"+getCharPositionInLine(); 338 super.traceOut(ruleName, ruleIndex, inputSymbol); 339 } 340 } 341