Home | History | Annotate | Download | only in runtime
      1 /*
      2  [The "BSD license"]
      3  Copyright (c) 2005-2009 Terence Parr
      4  All rights reserved.
      5 
      6  Redistribution and use in source and binary forms, with or without
      7  modification, are permitted provided that the following conditions
      8  are met:
      9  1. Redistributions of source code must retain the above copyright
     10      notice, this list of conditions and the following disclaimer.
     11  2. Redistributions in binary form must reproduce the above copyright
     12      notice, this list of conditions and the following disclaimer in the
     13      documentation and/or other materials provided with the distribution.
     14  3. The name of the author may not be used to endorse or promote products
     15      derived from this software without specific prior written permission.
     16 
     17  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     18  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     19  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     20  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     21  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     22  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     26  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 package org.antlr.runtime;
     29 
     30 /** A lexer is recognizer that draws input symbols from a character stream.
     31  *  lexer grammars result in a subclass of this object. A Lexer object
     32  *  uses simplified match() and error recovery mechanisms in the interest
     33  *  of speed.
     34  */
     35 public abstract class Lexer extends BaseRecognizer implements TokenSource {
     36 	/** Where is the lexer drawing characters from? */
     37 	protected CharStream input;
     38 
     39 	public Lexer() {
     40 	}
     41 
     42 	public Lexer(CharStream input) {
     43 		this.input = input;
     44 	}
     45 
     46 	public Lexer(CharStream input, RecognizerSharedState state) {
     47 		super(state);
     48 		this.input = input;
     49 	}
     50 
     51 	public void reset() {
     52 		super.reset(); // reset all recognizer state variables
     53 		// wack Lexer state variables
     54 		if ( input!=null ) {
     55 			input.seek(0); // rewind the input
     56 		}
     57 		if ( state==null ) {
     58 			return; // no shared state work to do
     59 		}
     60 		state.token = null;
     61 		state.type = Token.INVALID_TOKEN_TYPE;
     62 		state.channel = Token.DEFAULT_CHANNEL;
     63 		state.tokenStartCharIndex = -1;
     64 		state.tokenStartCharPositionInLine = -1;
     65 		state.tokenStartLine = -1;
     66 		state.text = null;
     67 	}
     68 
     69 	/** Return a token from this source; i.e., match a token on the char
     70 	 *  stream.
     71 	 */
     72 	public Token nextToken() {
     73 		while (true) {
     74 			state.token = null;
     75 			state.channel = Token.DEFAULT_CHANNEL;
     76 			state.tokenStartCharIndex = input.index();
     77 			state.tokenStartCharPositionInLine = input.getCharPositionInLine();
     78 			state.tokenStartLine = input.getLine();
     79 			state.text = null;
     80 			if ( input.LA(1)==CharStream.EOF ) {
     81                 Token eof = new CommonToken((CharStream)input,Token.EOF,
     82                                             Token.DEFAULT_CHANNEL,
     83                                             input.index(),input.index());
     84                 eof.setLine(getLine());
     85                 eof.setCharPositionInLine(getCharPositionInLine());
     86                 return eof;
     87 			}
     88 			try {
     89 				mTokens();
     90 				if ( state.token==null ) {
     91 					emit();
     92 				}
     93 				else if ( state.token==Token.SKIP_TOKEN ) {
     94 					continue;
     95 				}
     96 				return state.token;
     97 			}
     98 			catch (MismatchedRangeException re) {
     99 				reportError(re);
    100 				// matchRange() routine has already called recover()
    101 			}
    102 			catch (MismatchedTokenException re) {
    103 				reportError(re);
    104 				// match() routine has already called recover()
    105 			}
    106 			catch (RecognitionException re) {
    107 				reportError(re);
    108 				recover(re); // throw out current char and try again
    109 			}
    110 		}
    111 	}
    112 
    113 	/** Instruct the lexer to skip creating a token for current lexer rule
    114 	 *  and look for another token.  nextToken() knows to keep looking when
    115 	 *  a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
    116 	 *  if token==null at end of any token rule, it creates one for you
    117 	 *  and emits it.
    118 	 */
    119 	public void skip() {
    120 		state.token = Token.SKIP_TOKEN;
    121 	}
    122 
    123 	/** This is the lexer entry point that sets instance var 'token' */
    124 	public abstract void mTokens() throws RecognitionException;
    125 
    126 	/** Set the char stream and reset the lexer */
    127 	public void setCharStream(CharStream input) {
    128 		this.input = null;
    129 		reset();
    130 		this.input = input;
    131 	}
    132 
    133 	public CharStream getCharStream() {
    134 		return this.input;
    135 	}
    136 
    137 	public String getSourceName() {
    138 		return input.getSourceName();
    139 	}
    140 
    141 	/** Currently does not support multiple emits per nextToken invocation
    142 	 *  for efficiency reasons.  Subclass and override this method and
    143 	 *  nextToken (to push tokens into a list and pull from that list rather
    144 	 *  than a single variable as this implementation does).
    145 	 */
    146 	public void emit(Token token) {
    147 		state.token = token;
    148 	}
    149 
    150 	/** The standard method called to automatically emit a token at the
    151 	 *  outermost lexical rule.  The token object should point into the
    152 	 *  char buffer start..stop.  If there is a text override in 'text',
    153 	 *  use that to set the token's text.  Override this method to emit
    154 	 *  custom Token objects.
    155 	 *
    156 	 *  If you are building trees, then you should also override
    157 	 *  Parser or TreeParser.getMissingSymbol().
    158 	 */
    159 	public Token emit() {
    160 		Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1);
    161 		t.setLine(state.tokenStartLine);
    162 		t.setText(state.text);
    163 		t.setCharPositionInLine(state.tokenStartCharPositionInLine);
    164 		emit(t);
    165 		return t;
    166 	}
    167 
    168 	public void match(String s) throws MismatchedTokenException {
    169 		int i = 0;
    170 		while ( i<s.length() ) {
    171 			if ( input.LA(1)!=s.charAt(i) ) {
    172 				if ( state.backtracking>0 ) {
    173 					state.failed = true;
    174 					return;
    175 				}
    176 				MismatchedTokenException mte =
    177 					new MismatchedTokenException(s.charAt(i), input);
    178 				recover(mte);
    179 				throw mte;
    180 			}
    181 			i++;
    182 			input.consume();
    183 			state.failed = false;
    184 		}
    185 	}
    186 
    187 	public void matchAny() {
    188 		input.consume();
    189 	}
    190 
    191 	public void match(int c) throws MismatchedTokenException {
    192 		if ( input.LA(1)!=c ) {
    193 			if ( state.backtracking>0 ) {
    194 				state.failed = true;
    195 				return;
    196 			}
    197 			MismatchedTokenException mte =
    198 				new MismatchedTokenException(c, input);
    199 			recover(mte);  // don't really recover; just consume in lexer
    200 			throw mte;
    201 		}
    202 		input.consume();
    203 		state.failed = false;
    204 	}
    205 
    206 	public void matchRange(int a, int b)
    207 		throws MismatchedRangeException
    208 	{
    209 		if ( input.LA(1)<a || input.LA(1)>b ) {
    210 			if ( state.backtracking>0 ) {
    211 				state.failed = true;
    212 				return;
    213 			}
    214 			MismatchedRangeException mre =
    215 				new MismatchedRangeException(a,b,input);
    216 			recover(mre);
    217 			throw mre;
    218 		}
    219 		input.consume();
    220 		state.failed = false;
    221 	}
    222 
    223 	public int getLine() {
    224 		return input.getLine();
    225 	}
    226 
    227 	public int getCharPositionInLine() {
    228 		return input.getCharPositionInLine();
    229 	}
    230 
    231 	/** What is the index of the current character of lookahead? */
    232 	public int getCharIndex() {
    233 		return input.index();
    234 	}
    235 
    236 	/** Return the text matched so far for the current token or any
    237 	 *  text override.
    238 	 */
    239 	public String getText() {
    240 		if ( state.text!=null ) {
    241 			return state.text;
    242 		}
    243 		return input.substring(state.tokenStartCharIndex,getCharIndex()-1);
    244 	}
    245 
    246 	/** Set the complete text of this token; it wipes any previous
    247 	 *  changes to the text.
    248 	 */
    249 	public void setText(String text) {
    250 		state.text = text;
    251 	}
    252 
    253 	public void reportError(RecognitionException e) {
    254 		/** TODO: not thought about recovery in lexer yet.
    255 		 *
    256 		// if we've already reported an error and have not matched a token
    257 		// yet successfully, don't report any errors.
    258 		if ( errorRecovery ) {
    259 			//System.err.print("[SPURIOUS] ");
    260 			return;
    261 		}
    262 		errorRecovery = true;
    263 		 */
    264 
    265 		displayRecognitionError(this.getTokenNames(), e);
    266 	}
    267 
    268 	public String getErrorMessage(RecognitionException e, String[] tokenNames) {
    269 		String msg = null;
    270 		if ( e instanceof MismatchedTokenException ) {
    271 			MismatchedTokenException mte = (MismatchedTokenException)e;
    272 			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting "+getCharErrorDisplay(mte.expecting);
    273 		}
    274 		else if ( e instanceof NoViableAltException ) {
    275 			NoViableAltException nvae = (NoViableAltException)e;
    276 			// for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>"
    277 			// and "(decision="+nvae.decisionNumber+") and
    278 			// "state "+nvae.stateNumber
    279 			msg = "no viable alternative at character "+getCharErrorDisplay(e.c);
    280 		}
    281 		else if ( e instanceof EarlyExitException ) {
    282 			EarlyExitException eee = (EarlyExitException)e;
    283 			// for development, can add "(decision="+eee.decisionNumber+")"
    284 			msg = "required (...)+ loop did not match anything at character "+getCharErrorDisplay(e.c);
    285 		}
    286 		else if ( e instanceof MismatchedNotSetException ) {
    287 			MismatchedNotSetException mse = (MismatchedNotSetException)e;
    288 			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+mse.expecting;
    289 		}
    290 		else if ( e instanceof MismatchedSetException ) {
    291 			MismatchedSetException mse = (MismatchedSetException)e;
    292 			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+mse.expecting;
    293 		}
    294 		else if ( e instanceof MismatchedRangeException ) {
    295 			MismatchedRangeException mre = (MismatchedRangeException)e;
    296 			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+
    297 				  getCharErrorDisplay(mre.a)+".."+getCharErrorDisplay(mre.b);
    298 		}
    299 		else {
    300 			msg = super.getErrorMessage(e, tokenNames);
    301 		}
    302 		return msg;
    303 	}
    304 
    305 	public String getCharErrorDisplay(int c) {
    306 		String s = String.valueOf((char)c);
    307 		switch ( c ) {
    308 			case Token.EOF :
    309 				s = "<EOF>";
    310 				break;
    311 			case '\n' :
    312 				s = "\\n";
    313 				break;
    314 			case '\t' :
    315 				s = "\\t";
    316 				break;
    317 			case '\r' :
    318 				s = "\\r";
    319 				break;
    320 		}
    321 		return "'"+s+"'";
    322 	}
    323 
    324 	/** Lexers can normally match any char in it's vocabulary after matching
    325 	 *  a token, so do the easy thing and just kill a character and hope
    326 	 *  it all works out.  You can instead use the rule invocation stack
    327 	 *  to do sophisticated error recovery if you are in a fragment rule.
    328 	 */
    329 	public void recover(RecognitionException re) {
    330 		//System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
    331 		//re.printStackTrace();
    332 		input.consume();
    333 	}
    334 
    335 	public void traceIn(String ruleName, int ruleIndex)  {
    336 		String inputSymbol = ((char)input.LT(1))+" line="+getLine()+":"+getCharPositionInLine();
    337 		super.traceIn(ruleName, ruleIndex, inputSymbol);
    338 	}
    339 
    340 	public void traceOut(String ruleName, int ruleIndex)  {
    341 		String inputSymbol = ((char)input.LT(1))+" line="+getLine()+":"+getCharPositionInLine();
    342 		super.traceOut(ruleName, ruleIndex, inputSymbol);
    343 	}
    344 }
    345