Home | History | Annotate | Download | only in runtime
      1 /*
      2  [The "BSD license"]
      3  Copyright (c) 2005-2009 Terence Parr
      4  All rights reserved.
      5 
      6  Redistribution and use in source and binary forms, with or without
      7  modification, are permitted provided that the following conditions
      8  are met:
      9  1. Redistributions of source code must retain the above copyright
     10      notice, this list of conditions and the following disclaimer.
     11  2. Redistributions in binary form must reproduce the above copyright
     12      notice, this list of conditions and the following disclaimer in the
     13      documentation and/or other materials provided with the distribution.
     14  3. The name of the author may not be used to endorse or promote products
     15      derived from this software without specific prior written permission.
     16 
     17  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     18  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     19  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     20  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     21  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     22  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     26  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 package org.antlr.runtime;
     29 
     30 /** A lexer is recognizer that draws input symbols from a character stream.
     31  *  lexer grammars result in a subclass of this object. A Lexer object
     32  *  uses simplified match() and error recovery mechanisms in the interest
     33  *  of speed.
     34  */
     35 public abstract class Lexer extends BaseRecognizer implements TokenSource {
     36 	/** Where is the lexer drawing characters from? */
     37 	protected CharStream input;
     38 
     39 	public Lexer() {
     40 	}
     41 
     42 	public Lexer(CharStream input) {
     43 		this.input = input;
     44 	}
     45 
     46 	public Lexer(CharStream input, RecognizerSharedState state) {
     47 		super(state);
     48 		this.input = input;
     49 	}
     50 
     51 	public void reset() {
     52 		super.reset(); // reset all recognizer state variables
     53 		// wack Lexer state variables
     54 		if ( input!=null ) {
     55 			input.seek(0); // rewind the input
     56 		}
     57 		if ( state==null ) {
     58 			return; // no shared state work to do
     59 		}
     60 		state.token = null;
     61 		state.type = Token.INVALID_TOKEN_TYPE;
     62 		state.channel = Token.DEFAULT_CHANNEL;
     63 		state.tokenStartCharIndex = -1;
     64 		state.tokenStartCharPositionInLine = -1;
     65 		state.tokenStartLine = -1;
     66 		state.text = null;
     67 	}
     68 
     69 	/** Return a token from this source; i.e., match a token on the char
     70 	 *  stream.
     71 	 */
     72 	public Token nextToken() {
     73 		while (true) {
     74 			state.token = null;
     75 			state.channel = Token.DEFAULT_CHANNEL;
     76 			state.tokenStartCharIndex = input.index();
     77 			state.tokenStartCharPositionInLine = input.getCharPositionInLine();
     78 			state.tokenStartLine = input.getLine();
     79 			state.text = null;
     80 			if ( input.LA(1)==CharStream.EOF ) {
     81                 Token eof = new CommonToken((CharStream)input,Token.EOF,
     82                                             Token.DEFAULT_CHANNEL,
     83                                             input.index(),input.index());
     84                 eof.setLine(getLine());
     85                 eof.setCharPositionInLine(getCharPositionInLine());
     86                 return eof;
     87 			}
     88 			try {
     89 				mTokens();
     90 				if ( state.token==null ) {
     91 					emit();
     92 				}
     93 				else if ( state.token==Token.SKIP_TOKEN ) {
     94 					continue;
     95 				}
     96 				return state.token;
     97 			}
     98 			catch (NoViableAltException nva) {
     99 				reportError(nva);
    100 				recover(nva); // throw out current char and try again
    101 			}
    102 			catch (RecognitionException re) {
    103 				reportError(re);
    104 				// match() routine has already called recover()
    105 			}
    106 		}
    107 	}
    108 
    109 	/** Instruct the lexer to skip creating a token for current lexer rule
    110 	 *  and look for another token.  nextToken() knows to keep looking when
    111 	 *  a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
    112 	 *  if token==null at end of any token rule, it creates one for you
    113 	 *  and emits it.
    114 	 */
    115 	public void skip() {
    116 		state.token = Token.SKIP_TOKEN;
    117 	}
    118 
    119 	/** This is the lexer entry point that sets instance var 'token' */
    120 	public abstract void mTokens() throws RecognitionException;
    121 
    122 	/** Set the char stream and reset the lexer */
    123 	public void setCharStream(CharStream input) {
    124 		this.input = null;
    125 		reset();
    126 		this.input = input;
    127 	}
    128 
    129 	public CharStream getCharStream() {
    130 		return this.input;
    131 	}
    132 
    133 	public String getSourceName() {
    134 		return input.getSourceName();
    135 	}
    136 
    137 	/** Currently does not support multiple emits per nextToken invocation
    138 	 *  for efficiency reasons.  Subclass and override this method and
    139 	 *  nextToken (to push tokens into a list and pull from that list rather
    140 	 *  than a single variable as this implementation does).
    141 	 */
    142 	public void emit(Token token) {
    143 		state.token = token;
    144 	}
    145 
    146 	/** The standard method called to automatically emit a token at the
    147 	 *  outermost lexical rule.  The token object should point into the
    148 	 *  char buffer start..stop.  If there is a text override in 'text',
    149 	 *  use that to set the token's text.  Override this method to emit
    150 	 *  custom Token objects.
    151 	 *
    152 	 *  If you are building trees, then you should also override
    153 	 *  Parser or TreeParser.getMissingSymbol().
    154 	 */
    155 	public Token emit() {
    156 		Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1);
    157 		t.setLine(state.tokenStartLine);
    158 		t.setText(state.text);
    159 		t.setCharPositionInLine(state.tokenStartCharPositionInLine);
    160 		emit(t);
    161 		return t;
    162 	}
    163 
    164 	public void match(String s) throws MismatchedTokenException {
    165 		int i = 0;
    166 		while ( i<s.length() ) {
    167 			if ( input.LA(1)!=s.charAt(i) ) {
    168 				if ( state.backtracking>0 ) {
    169 					state.failed = true;
    170 					return;
    171 				}
    172 				MismatchedTokenException mte =
    173 					new MismatchedTokenException(s.charAt(i), input);
    174 				recover(mte);
    175 				throw mte;
    176 			}
    177 			i++;
    178 			input.consume();
    179 			state.failed = false;
    180 		}
    181 	}
    182 
    183 	public void matchAny() {
    184 		input.consume();
    185 	}
    186 
    187 	public void match(int c) throws MismatchedTokenException {
    188 		if ( input.LA(1)!=c ) {
    189 			if ( state.backtracking>0 ) {
    190 				state.failed = true;
    191 				return;
    192 			}
    193 			MismatchedTokenException mte =
    194 				new MismatchedTokenException(c, input);
    195 			recover(mte);  // don't really recover; just consume in lexer
    196 			throw mte;
    197 		}
    198 		input.consume();
    199 		state.failed = false;
    200 	}
    201 
    202 	public void matchRange(int a, int b)
    203 		throws MismatchedRangeException
    204 	{
    205 		if ( input.LA(1)<a || input.LA(1)>b ) {
    206 			if ( state.backtracking>0 ) {
    207 				state.failed = true;
    208 				return;
    209 			}
    210 			MismatchedRangeException mre =
    211 				new MismatchedRangeException(a,b,input);
    212 			recover(mre);
    213 			throw mre;
    214 		}
    215 		input.consume();
    216 		state.failed = false;
    217 	}
    218 
    219 	public int getLine() {
    220 		return input.getLine();
    221 	}
    222 
    223 	public int getCharPositionInLine() {
    224 		return input.getCharPositionInLine();
    225 	}
    226 
    227 	/** What is the index of the current character of lookahead? */
    228 	public int getCharIndex() {
    229 		return input.index();
    230 	}
    231 
    232 	/** Return the text matched so far for the current token or any
    233 	 *  text override.
    234 	 */
    235 	public String getText() {
    236 		if ( state.text!=null ) {
    237 			return state.text;
    238 		}
    239 		return input.substring(state.tokenStartCharIndex,getCharIndex()-1);
    240 	}
    241 
    242 	/** Set the complete text of this token; it wipes any previous
    243 	 *  changes to the text.
    244 	 */
    245 	public void setText(String text) {
    246 		state.text = text;
    247 	}
    248 
    249 	public void reportError(RecognitionException e) {
    250 		/** TODO: not thought about recovery in lexer yet.
    251 		 *
    252 		// if we've already reported an error and have not matched a token
    253 		// yet successfully, don't report any errors.
    254 		if ( errorRecovery ) {
    255 			//System.err.print("[SPURIOUS] ");
    256 			return;
    257 		}
    258 		errorRecovery = true;
    259 		 */
    260 
    261 		displayRecognitionError(this.getTokenNames(), e);
    262 	}
    263 
    264 	public String getErrorMessage(RecognitionException e, String[] tokenNames) {
    265 		String msg = null;
    266 		if ( e instanceof MismatchedTokenException ) {
    267 			MismatchedTokenException mte = (MismatchedTokenException)e;
    268 			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting "+getCharErrorDisplay(mte.expecting);
    269 		}
    270 		else if ( e instanceof NoViableAltException ) {
    271 			NoViableAltException nvae = (NoViableAltException)e;
    272 			// for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>"
    273 			// and "(decision="+nvae.decisionNumber+") and
    274 			// "state "+nvae.stateNumber
    275 			msg = "no viable alternative at character "+getCharErrorDisplay(e.c);
    276 		}
    277 		else if ( e instanceof EarlyExitException ) {
    278 			EarlyExitException eee = (EarlyExitException)e;
    279 			// for development, can add "(decision="+eee.decisionNumber+")"
    280 			msg = "required (...)+ loop did not match anything at character "+getCharErrorDisplay(e.c);
    281 		}
    282 		else if ( e instanceof MismatchedNotSetException ) {
    283 			MismatchedNotSetException mse = (MismatchedNotSetException)e;
    284 			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+mse.expecting;
    285 		}
    286 		else if ( e instanceof MismatchedSetException ) {
    287 			MismatchedSetException mse = (MismatchedSetException)e;
    288 			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+mse.expecting;
    289 		}
    290 		else if ( e instanceof MismatchedRangeException ) {
    291 			MismatchedRangeException mre = (MismatchedRangeException)e;
    292 			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+
    293 				  getCharErrorDisplay(mre.a)+".."+getCharErrorDisplay(mre.b);
    294 		}
    295 		else {
    296 			msg = super.getErrorMessage(e, tokenNames);
    297 		}
    298 		return msg;
    299 	}
    300 
    301 	public String getCharErrorDisplay(int c) {
    302 		String s = String.valueOf((char)c);
    303 		switch ( c ) {
    304 			case Token.EOF :
    305 				s = "<EOF>";
    306 				break;
    307 			case '\n' :
    308 				s = "\\n";
    309 				break;
    310 			case '\t' :
    311 				s = "\\t";
    312 				break;
    313 			case '\r' :
    314 				s = "\\r";
    315 				break;
    316 		}
    317 		return "'"+s+"'";
    318 	}
    319 
    320 	/** Lexers can normally match any char in it's vocabulary after matching
    321 	 *  a token, so do the easy thing and just kill a character and hope
    322 	 *  it all works out.  You can instead use the rule invocation stack
    323 	 *  to do sophisticated error recovery if you are in a fragment rule.
    324 	 */
    325 	public void recover(RecognitionException re) {
    326 		//System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
    327 		//re.printStackTrace();
    328 		input.consume();
    329 	}
    330 
    331 	public void traceIn(String ruleName, int ruleIndex)  {
    332 		String inputSymbol = ((char)input.LT(1))+" line="+getLine()+":"+getCharPositionInLine();
    333 		super.traceIn(ruleName, ruleIndex, inputSymbol);
    334 	}
    335 
    336 	public void traceOut(String ruleName, int ruleIndex)  {
    337 		String inputSymbol = ((char)input.LT(1))+" line="+getLine()+":"+getCharPositionInLine();
    338 		super.traceOut(ruleName, ruleIndex, inputSymbol);
    339 	}
    340 }
    341