Home | History | Annotate | Download | only in java_cup
      1 package java_cup;
      2 
      3 import java.util.Hashtable;
      4 
      5 import java_cup.runtime.str_token;
      6 import java_cup.runtime.token;
      7 
      8 /** This class implements a small scanner (aka lexical analyzer or lexer) for
      9  *  the JavaCup specification.  This scanner reads characters from standard
     10  *  input (System.in) and returns integers corresponding to the terminal
     11  *  number of the next token.  Once end of input is reached the EOF token is
     12  *  returned on every subsequent call.<p>
     13  *  Tokens currently returned include: <pre>
     14  *    Symbol        Constant Returned     Symbol        Constant Returned
     15  *    ------        -----------------     ------        -----------------
     16  *    "package"     PACKAGE               "import"      IMPORT
     17  *    "code"        CODE                  "action"      ACTION
     18  *    "parser"      PARSER                "terminal"    TERMINAL
     19  *    "non"         NON                   "init"        INIT
     20  *    "scan"        SCAN                  "with"        WITH
     21  *    "start"       START                   ;           SEMI
     22  *      ,           COMMA                   *           STAR
     23  *      .           DOT                     :           COLON
     24  *      ::=         COLON_COLON_EQUALS      |           BAR
     25  *    identifier    ID                    {:...:}       CODE_STRING
     26  *    "debug"       DEBUG
     27  *  </pre>
     28  *  All symbol constants are defined in sym.java which is generated by
     29  *  JavaCup from parser.cup.<p>
     30  *
     31  *  In addition to the scanner proper (called first via init() then with
     32  *  next_token() to get each token) this class provides simple error and
     33  *  warning routines and keeps a count of errors and warnings that is
     34  *  publicly accessible.<p>
     35  *
     36  *  This class is "static" (i.e., it has only static members and methods).
     37  *
     38  * @version last updated: 11/25/95
     39  * @author  Scott Hudson
     40  */
     41 public class lexer {
     42 
     43   /*-----------------------------------------------------------*/
     44   /*--- Constructor(s) ----------------------------------------*/
     45   /*-----------------------------------------------------------*/
     46 
     47   /** The only constructor is private, so no instances can be created. */
     48   private lexer() { }
     49 
     50   /*-----------------------------------------------------------*/
     51   /*--- Static (Class) Variables ------------------------------*/
     52   /*-----------------------------------------------------------*/
     53 
     54   /** First character of lookahead. */
     55   protected static int next_char;
     56 
     57   /** Second character of lookahead. */
     58   protected static int next_char2;
     59 
     60   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
     61 
     62   /** EOF constant. */
     63   protected static final int EOF_CHAR = -1;
     64 
     65   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
     66 
     67   /** Table of keywords.  Keywords are initially treated as identifiers.
     68    *  Just before they are returned we look them up in this table to see if
     69    *  they match one of the keywords.  The string of the name is the key here,
     70    *  which indexes Integer objects holding the symbol number.
     71    */
     72   protected static Hashtable keywords = new Hashtable(23);
     73 
     74   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
     75 
     76   /** Table of single character symbols.  For ease of implementation, we
     77    *  store all unambiguous single character tokens in this table of Integer
     78    *  objects keyed by Integer objects with the numerical value of the
     79    *  appropriate char (currently Character objects have a bug which precludes
     80    *  their use in tables).
     81    */
     82   protected static Hashtable char_symbols = new Hashtable(11);
     83 
     84   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
     85 
     86   /** Current line number for use in error messages. */
     87   protected static int current_line = 1;
     88 
     89   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
     90 
     91   /** Character position in current line. */
     92   protected static int current_position = 1;
     93 
     94   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
     95 
     96   /** Count of total errors detected so far. */
     97   public static int error_count = 0;
     98 
     99   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    100 
    101   /** Count of warnings issued so far */
    102   public static int warning_count = 0;
    103 
    104   /*-----------------------------------------------------------*/
    105   /*--- Static Methods ----------------------------------------*/
    106   /*-----------------------------------------------------------*/
    107 
    108   /** Initialize the scanner.  This sets up the keywords and char_symbols
    109     * tables and reads the first two characters of lookahead.
    110     */
    111   public static void init() throws java.io.IOException
    112     {
    113       /* set up the keyword table */
    114       keywords.put("package",  new Integer(sym.PACKAGE));
    115       keywords.put("import",   new Integer(sym.IMPORT));
    116       keywords.put("code",     new Integer(sym.CODE));
    117       keywords.put("action",   new Integer(sym.ACTION));
    118       keywords.put("parser",   new Integer(sym.PARSER));
    119       keywords.put("terminal", new Integer(sym.TERMINAL));
    120       keywords.put("non",      new Integer(sym.NON));
    121       keywords.put("init",     new Integer(sym.INIT));
    122       keywords.put("scan",     new Integer(sym.SCAN));
    123       keywords.put("with",     new Integer(sym.WITH));
    124       keywords.put("start",    new Integer(sym.START));
    125       keywords.put("debug",    new Integer(sym.DEBUG));
    126 
    127       /* set up the table of single character symbols */
    128       char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
    129       char_symbols.put(new Integer(','), new Integer(sym.COMMA));
    130       char_symbols.put(new Integer('*'), new Integer(sym.STAR));
    131       char_symbols.put(new Integer('.'), new Integer(sym.DOT));
    132       char_symbols.put(new Integer('|'), new Integer(sym.BAR));
    133 
    134       /* read two characters of lookahead */
    135       next_char = System.in.read();
    136       if (next_char == EOF_CHAR)
    137     next_char2 = EOF_CHAR;
    138       else
    139     next_char2 = System.in.read();
    140     }
    141 
    142   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    143 
    144   /** Advance the scanner one character in the input stream.  This moves
    145    * next_char2 to next_char and then reads a new next_char2.
    146    */
    147   protected static void advance() throws java.io.IOException
    148     {
    149       int old_char;
    150 
    151       old_char = next_char;
    152       next_char = next_char2;
    153       if (next_char == EOF_CHAR)
    154     next_char2 = EOF_CHAR;
    155       else
    156     next_char2 = System.in.read();
    157 
    158       /* count this */
    159       current_position++;
    160       if (old_char == '\n')
    161     {
    162       current_line++;
    163       current_position = 1;
    164     }
    165     }
    166 
    167   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    168 
    169   /** Emit an error message.  The message will be marked with both the
    170    *  current line number and the position in the line.  Error messages
    171    *  are printed on standard error (System.err).
    172    * @param message the message to print.
    173    */
    174   public static void emit_error(String message)
    175     {
    176       System.err.println("Error at " + current_line + "(" + current_position +
    177              "): " + message);
    178       error_count++;
    179     }
    180 
    181   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    182 
    183   /** Emit a warning message.  The message will be marked with both the
    184    *  current line number and the position in the line.  Messages are
    185    *  printed on standard error (System.err).
    186    * @param message the message to print.
    187    */
    188   public static void emit_warn(String message)
    189     {
    190       System.err.println("Warning at " + current_line + "(" + current_position +
    191              "): " + message);
    192       warning_count++;
    193     }
    194 
    195   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    196 
    197   /** Determine if a character is ok to start an id.
    198    * @param ch the character in question.
    199    */
    200   protected static boolean id_start_char(int ch)
    201     {
    202       return (ch >= 'a' &&  ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
    203          (ch == '_');
    204 
    205       // later need to deal with non-8-bit chars here
    206     }
    207 
    208   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    209 
    210   /** Determine if a character is ok for the middle of an id.
    211    * @param ch the character in question.
    212    */
    213   protected static boolean id_char(int ch)
    214     {
    215       return id_start_char(ch) || (ch >= '0' && ch <= '9');
    216     }
    217 
    218   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    219 
    220   /** Try to look up a single character symbol, returns -1 for not found.
    221    * @param ch the character in question.
    222    */
    223   protected static int find_single_char(int ch)
    224     {
    225       Integer result;
    226 
    227       result = (Integer)char_symbols.get(new Integer((char)ch));
    228       if (result == null)
    229     return -1;
    230       else
    231     return result.intValue();
    232     }
    233 
    234   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    235 
    236   /** Handle swallowing up a comment.  Both old style C and new style C++
    237    *  comments are handled.
    238    */
    239   protected static void swallow_comment() throws java.io.IOException
    240     {
    241       /* next_char == '/' at this point */
    242 
    243       /* is it a traditional comment */
    244       if (next_char2 == '*')
    245     {
    246       /* swallow the opener */
    247       advance(); advance();
    248 
    249       /* swallow the comment until end of comment or EOF */
    250       for (;;)
    251         {
    252           /* if its EOF we have an error */
    253           if (next_char == EOF_CHAR)
    254         {
    255           emit_error("Specification file ends inside a comment");
    256           return;
    257         }
    258 
    259           /* if we can see the closer we are done */
    260           if (next_char == '*' && next_char2 == '/')
    261         {
    262           advance();
    263           advance();
    264           return;
    265         }
    266 
    267           /* otherwise swallow char and move on */
    268           advance();
    269         }
    270     }
    271 
    272       /* is its a new style comment */
    273       if (next_char2 == '/')
    274     {
    275       /* swallow the opener */
    276       advance(); advance();
    277 
    278       /* swallow to '\n', '\f', or EOF */
    279       while (next_char != '\n' && next_char != '\f' && next_char!=EOF_CHAR)
    280         advance();
    281 
    282       return;
    283 
    284     }
    285 
    286       /* shouldn't get here, but... if we get here we have an error */
    287       emit_error("Malformed comment in specification -- ignored");
    288       advance();
    289     }
    290 
    291   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    292 
    293   /** Swallow up a code string.  Code strings begin with "{:" and include
    294       all characters up to the first occurrence of ":}" (there is no way to
    295       include ":}" inside a code string).  The routine returns an str_token
    296       object suitable for return by the scanner.
    297    */
    298   protected static token do_code_string() throws java.io.IOException
    299     {
    300       StringBuffer result = new StringBuffer();
    301 
    302       /* at this point we have lookahead of "{:" -- swallow that */
    303       advance(); advance();
    304 
    305       /* save chars until we see ":}" */
    306       while (!(next_char == ':' && next_char2 == '}'))
    307     {
    308       /* if we have run off the end issue a message and break out of loop */
    309       if (next_char == EOF_CHAR)
    310         {
    311           emit_error("Specification file ends inside a code string");
    312           break;
    313         }
    314 
    315       /* otherwise record the char and move on */
    316       result.append(new Character((char)next_char));
    317       advance();
    318     }
    319 
    320       /* advance past the closer and build a return token */
    321       advance(); advance();
    322       return new str_token(sym.CODE_STRING, result.toString());
    323     }
    324 
    325   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    326 
    327   /** Process an identifier.  Identifiers begin with a letter, underscore,
    328    *  or dollar sign, which is followed by zero or more letters, numbers,
    329    *  underscores or dollar signs.  This routine returns an str_token suitable
    330    *  for return by the scanner.
    331    */
    332   protected static token do_id() throws java.io.IOException
    333     {
    334       StringBuffer result = new StringBuffer();
    335       String       result_str;
    336       Integer      keyword_num;
    337       char         buffer[] = new char[1];
    338 
    339       /* next_char holds first character of id */
    340       buffer[0] = (char)next_char;
    341       result.append(buffer,0,1);
    342       advance();
    343 
    344       /* collect up characters while they fit in id */
    345       while(id_char(next_char))
    346     {
    347           buffer[0] = (char)next_char;
    348       result.append(buffer,0,1);
    349       advance();
    350     }
    351 
    352       /* extract a string and try to look it up as a keyword */
    353       result_str = result.toString();
    354       keyword_num = (Integer)keywords.get(result_str);
    355 
    356       /* if we found something, return that keyword */
    357       if (keyword_num != null)
    358     return new token(keyword_num.intValue());
    359 
    360       /* otherwise build and return an id token with an attached string */
    361       return new str_token(sym.ID, result_str);
    362     }
    363 
    364   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    365 
    366   /** Return one token.  This is the main external interface to the scanner.
    367    *  It consumes sufficient characters to determine the next input token
    368    *  and returns it.  To help with debugging, this routine actually calls
    369    *  real_next_token() which does the work.  If you need to debug the
    370    *  parser, this can be changed to call debug_next_token() which prints
    371    *  a debugging message before returning the token.
    372    */
    373   public static token next_token() throws java.io.IOException
    374     {
    375       return real_next_token();
    376     }
    377 
    378   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    379 
    380   /** Debugging version of next_token().  This routine calls the real scanning
    381    *  routine, prints a message on System.out indicating what the token is,
    382    *  then returns it.
    383    */
    384   public static token debug_next_token() throws java.io.IOException
    385     {
    386       token result = real_next_token();
    387       System.out.println("# next_token() => " + result.sym);
    388       return result;
    389     }
    390 
    391   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
    392 
    393   /** The actual routine to return one token.  This is normally called from
    394    *  next_token(), but for debugging purposes can be called indirectly from
    395    *  debug_next_token().
    396    */
    397   protected static token real_next_token() throws java.io.IOException
    398     {
    399       int sym_num;
    400 
    401       for (;;)
    402     {
    403       /* look for white space */
    404       if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
    405           next_char == '\f' ||  next_char == '\r')
    406         {
    407           /* advance past it and try the next character */
    408           advance();
    409           continue;
    410         }
    411 
    412       /* look for a single character symbol */
    413       sym_num = find_single_char(next_char);
    414       if (sym_num != -1)
    415         {
    416           /* found one -- advance past it and return a token for it */
    417           advance();
    418           return new token(sym_num);
    419         }
    420 
    421       /* look for : or ::= */
    422       if (next_char == ':')
    423         {
    424           /* if we don't have a second ':' return COLON */
    425           if (next_char2 != ':')
    426         {
    427           advance();
    428           return new token(sym.COLON);
    429         }
    430 
    431           /* move forward and look for the '=' */
    432           advance();
    433           if (next_char2 == '=')
    434         {
    435           advance(); advance();
    436           return new token(sym.COLON_COLON_EQUALS);
    437         }
    438           else
    439         {
    440           /* return just the colon (already consumed) */
    441           return new token(sym.COLON);
    442         }
    443         }
    444 
    445       /* look for a comment */
    446       if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
    447         {
    448           /* swallow then continue the scan */
    449           swallow_comment();
    450           continue;
    451         }
    452 
    453       /* look for start of code string */
    454       if (next_char == '{' && next_char2 == ':')
    455         return do_code_string();
    456 
    457       /* look for an id or keyword */
    458       if (id_start_char(next_char)) return do_id();
    459 
    460       /* look for EOF */
    461       if (next_char == EOF_CHAR) return new token(sym.EOF);
    462 
    463       /* if we get here, we have an unrecognized character */
    464       emit_warn("Unrecognized character '" +
    465         new Character((char)next_char) + "'(" + next_char +
    466         ") -- ignored");
    467 
    468       /* advance past it */
    469       advance();
    470     }
    471     }
    472 
    473   /*-----------------------------------------------------------*/
    474 };
    475 
    476