Home | History | Annotate | Download | only in core
      1 /*
      2 * Conditions Of Use
      3 *
      4 * This software was developed by employees of the National Institute of
      5 * Standards and Technology (NIST), an agency of the Federal Government.
      6 * Pursuant to title 15 United States Code Section 105, works of NIST
      7 * employees are not subject to copyright protection in the United States
      8 * and are considered to be in the public domain.  As a result, a formal
      9 * license is not needed to use the software.
     10 *
     11 * This software is provided by NIST as a service and is expressly
     12 * provided "AS IS."  NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED
     13 * OR STATUTORY, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF
     14 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT
     15 * AND DATA ACCURACY.  NIST does not warrant or make any representations
     16 * regarding the use of the software or the results thereof, including but
     17 * not limited to the correctness, accuracy, reliability or usefulness of
     18 * the software.
     19 *
     20 * Permission to use this software is contingent upon your acceptance
     21 * of the terms of this agreement
     22 *
     23 * .
     24 *
     25 */
     26 package gov.nist.core;
     27 
     28 import java.text.ParseException;
     29 import java.util.Hashtable;
     30 
     31 /** A lexical analyzer that is used by all parsers in our implementation.
     32  *
     33  *@version 1.2
     34  *@since 1.1
     35  *
     36  *@author M. Ranganathan
     37  */
     38 public class LexerCore extends StringTokenizer {
     39 
     40     // IMPORTANT - All keyword matches should be between START and END
     41     public static final int START = 2048;
     42     public static final int END = START + 2048;
     43     // IMPORTANT -- This should be < END
     44     public static final int ID = END - 1;
     45     public static final int SAFE = END - 2;
     46     // Individial token classes.
     47     public static final int WHITESPACE = END + 1;
     48     public static final int DIGIT = END + 2;
     49     public static final int ALPHA = END + 3;
     50     public static final int BACKSLASH = (int) '\\';
     51     public static final int QUOTE = (int) '\'';
     52     public static final int AT = (int) '@';
     53     public static final int SP = (int) ' ';
     54     public static final int HT = (int) '\t';
     55     public static final int COLON = (int) ':';
     56     public static final int STAR = (int) '*';
     57     public static final int DOLLAR = (int) '$';
     58     public static final int PLUS = (int) '+';
     59     public static final int POUND = (int) '#';
     60     public static final int MINUS = (int) '-';
     61     public static final int DOUBLEQUOTE = (int) '\"';
     62     public static final int TILDE = (int) '~';
     63     public static final int BACK_QUOTE = (int) '`';
     64     public static final int NULL = (int) '\0';
     65     public static final int EQUALS = (int) '=';
     66     public static final int SEMICOLON = (int) ';';
     67     public static final int SLASH = (int) '/';
     68     public static final int L_SQUARE_BRACKET = (int) '[';
     69     public static final int R_SQUARE_BRACKET = (int) ']';
     70     public static final int R_CURLY = (int) '}';
     71     public static final int L_CURLY = (int) '{';
     72     public static final int HAT = (int) '^';
     73     public static final int BAR = (int) '|';
     74     public static final int DOT = (int) '.';
     75     public static final int EXCLAMATION = (int) '!';
     76     public static final int LPAREN = (int) '(';
     77     public static final int RPAREN = (int) ')';
     78     public static final int GREATER_THAN = (int) '>';
     79     public static final int LESS_THAN = (int) '<';
     80     public static final int PERCENT = (int) '%';
     81     public static final int QUESTION = (int) '?';
     82     public static final int AND = (int) '&';
     83     public static final int UNDERSCORE = (int) '_';
     84 
     85     protected static final Hashtable globalSymbolTable;
     86     protected static final Hashtable lexerTables;
     87     protected Hashtable currentLexer;
     88     protected String currentLexerName;
     89     protected Token currentMatch;
     90 
     91     static {
     92         globalSymbolTable = new Hashtable();
     93         lexerTables = new Hashtable();
     94     }
     95 
     96     protected void addKeyword(String name, int value) {
     97         // System.out.println("addKeyword " + name + " value = " + value);
     98         // new Exception().printStackTrace();
     99         Integer val = Integer.valueOf(value);
    100         currentLexer.put(name, val);
    101         if (!globalSymbolTable.containsKey(val))
    102             globalSymbolTable.put(val, name);
    103     }
    104 
    105     public String lookupToken(int value) {
    106         if (value > START) {
    107             return (String) globalSymbolTable.get(Integer.valueOf(value));
    108         } else {
    109             Character ch = Character.valueOf((char) value);
    110             return ch.toString();
    111         }
    112     }
    113 
    114     protected Hashtable addLexer(String lexerName) {
    115         currentLexer = (Hashtable) lexerTables.get(lexerName);
    116         if (currentLexer == null) {
    117             currentLexer = new Hashtable();
    118             lexerTables.put(lexerName, currentLexer);
    119         }
    120         return currentLexer;
    121     }
    122 
    123     //public abstract void selectLexer(String lexerName);
    124 
    125     public void selectLexer(String lexerName) {
    126         this.currentLexerName = lexerName;
    127     }
    128 
    /** Create a lexer with an empty keyword table, named "charLexer". */
    protected LexerCore() {
        currentLexer = new Hashtable();
        currentLexerName = "charLexer";
    }
    133 
    /** Initialize the lexer with a buffer.
     *
     *@param lexerName name recorded as the current lexer name.
     *@param buffer input text handed to the superclass constructor.
     */
    public LexerCore(String lexerName, String buffer) {
        super(buffer);
        // Start from an empty keyword table, exactly as the no-argument
        // constructor does, so that peekNextToken() and match() never
        // dereference a null table before a real one is installed.
        this.currentLexer = new Hashtable();
        this.currentLexerName = lexerName;
    }
    140 
    141     /** Peek the next id but dont move the buffer pointer forward.
    142      */
    143 
    144     public String peekNextId() {
    145         int oldPtr = ptr;
    146         String retval = ttoken();
    147         savedPtr = ptr;
    148         ptr = oldPtr;
    149         return retval;
    150     }
    151 
    152     /** Get the next id.
    153      */
    154     public String getNextId() {
    155         return ttoken();
    156     }
    157 
    158     // call this after you call match
    159     public Token getNextToken() {
    160         return this.currentMatch;
    161 
    162     }
    163 
    164     /** Look ahead for one token.
    165      */
    166     public Token peekNextToken() throws ParseException {
    167         return (Token) peekNextToken(1)[0];
    168     }
    169 
    170     public Token[] peekNextToken(int ntokens) throws ParseException {
    171         int old = ptr;
    172         Token[] retval = new Token[ntokens];
    173         for (int i = 0; i < ntokens; i++) {
    174             Token tok = new Token();
    175             if (startsId()) {
    176                 String id = ttoken();
    177                 tok.tokenValue = id;
    178                 String idUppercase = id.toUpperCase();
    179                 if (currentLexer.containsKey(idUppercase)) {
    180                     Integer type = (Integer) currentLexer.get(idUppercase);
    181                     tok.tokenType = type.intValue();
    182                 } else
    183                     tok.tokenType = ID;
    184             } else {
    185                 char nextChar = getNextChar();
    186                 tok.tokenValue = String.valueOf(nextChar);
    187                 if (isAlpha(nextChar)) {
    188                     tok.tokenType = ALPHA;
    189                 } else if (isDigit(nextChar)) {
    190                     tok.tokenType = DIGIT;
    191                 } else
    192                     tok.tokenType = (int) nextChar;
    193             }
    194             retval[i] = tok;
    195         }
    196         savedPtr = ptr;
    197         ptr = old;
    198         return retval;
    199     }
    200 
    201     /** Match the given token or throw an exception if no such token
    202      * can be matched.
    203      */
    204     public Token match(int tok) throws ParseException {
    205         if (Debug.parserDebug) {
    206             Debug.println("match " + tok);
    207         }
    208         if (tok > START && tok < END) {
    209             if (tok == ID) {
    210                 // Generic ID sought.
    211                 if (!startsId())
    212                     throw new ParseException(buffer + "\nID expected", ptr);
    213                 String id = getNextId();
    214                 this.currentMatch = new Token();
    215                 this.currentMatch.tokenValue = id;
    216                 this.currentMatch.tokenType = ID;
    217             } else if (tok == SAFE) {
    218                 if (!startsSafeToken())
    219                     throw new ParseException(buffer + "\nID expected", ptr);
    220                 String id = ttokenSafe();
    221                 this.currentMatch = new Token();
    222                 this.currentMatch.tokenValue = id;
    223                 this.currentMatch.tokenType = SAFE;
    224             } else {
    225                 String nexttok = getNextId();
    226                 Integer cur = (Integer) currentLexer.get(nexttok.toUpperCase());
    227 
    228                 if (cur == null || cur.intValue() != tok)
    229                     throw new ParseException(
    230                         buffer + "\nUnexpected Token : " + nexttok,
    231                         ptr);
    232                 this.currentMatch = new Token();
    233                 this.currentMatch.tokenValue = nexttok;
    234                 this.currentMatch.tokenType = tok;
    235             }
    236         } else if (tok > END) {
    237             // Character classes.
    238             char next = lookAhead(0);
    239             if (tok == DIGIT) {
    240                 if (!isDigit(next))
    241                     throw new ParseException(buffer + "\nExpecting DIGIT", ptr);
    242                 this.currentMatch = new Token();
    243                 this.currentMatch.tokenValue =
    244                     String.valueOf(next);
    245                 this.currentMatch.tokenType = tok;
    246                 consume(1);
    247 
    248             } else if (tok == ALPHA) {
    249                 if (!isAlpha(next))
    250                     throw new ParseException(buffer + "\nExpecting ALPHA", ptr);
    251                 this.currentMatch = new Token();
    252                 this.currentMatch.tokenValue =
    253                     String.valueOf(next);
    254                 this.currentMatch.tokenType = tok;
    255                 consume(1);
    256 
    257             }
    258 
    259         } else {
    260             // This is a direct character spec.
    261             char ch = (char) tok;
    262             char next = lookAhead(0);
    263             if (next == ch) {
    264                 /*this.currentMatch = new Token();
    265                 this.currentMatch.tokenValue =
    266                     String.valueOf(ch);
    267                 this.currentMatch.tokenType = tok;*/
    268                 consume(1);
    269             } else
    270                 throw new ParseException(
    271                     buffer + "\nExpecting  >>>" + ch + "<<< got >>>"
    272                     + next + "<<<", ptr);
    273         }
    274         return this.currentMatch;
    275     }
    276 
    277     public void SPorHT() {
    278         try {
    279             char c = lookAhead(0);
    280             while (c == ' ' || c == '\t') {
    281                 consume(1);
    282                 c = lookAhead(0);
    283             }
    284         } catch (ParseException ex) {
    285             // Ignore
    286         }
    287     }
    288 
    289     /**
    290      * JvB: utility function added to validate tokens
    291      *
    292      * @see RFC3261 section 25.1:
    293      * token       =  1*(alphanum / "-" / "." / "!" / "%" / "*"
    294                      / "_" / "+" / "`" / "'" / "~" )
    295 
    296      * @param c - character to check
    297      * @return true iff character c is a valid token character as per RFC3261
    298      */
    299     public static final boolean isTokenChar( char c ) {
    300         if ( isAlphaDigit(c) ) return true;
    301         else switch (c)
    302         {
    303             case '-':
    304             case '.':
    305             case '!':
    306             case '%':
    307             case '*':
    308             case '_':
    309             case '+':
    310             case '`':
    311             case '\'':
    312             case '~':
    313                 return true;
    314             default:
    315                 return false;
    316         }
    317     }
    318 
    319 
    320     public boolean startsId() {
    321         try {
    322             char nextChar = lookAhead(0);
    323             return isTokenChar(nextChar);
    324         } catch (ParseException ex) {
    325             return false;
    326         }
    327     }
    328 
    329     public boolean startsSafeToken() {
    330         try {
    331             char nextChar = lookAhead(0);
    332             if (isAlphaDigit(nextChar)) {
    333                 return true;
    334             }
    335             else {
    336                 switch (nextChar) {
    337                     case '_':
    338                     case '+':
    339                     case '-':
    340                     case '!':
    341                     case '`':
    342                     case '\'':
    343                     case '.':
    344                     case '/':
    345                     case '}':
    346                     case '{':
    347                     case ']':
    348                     case '[':
    349                     case '^':
    350                     case '|':
    351                     case '~':
    352                     case '%': // bug fix by Bruno Konik, JvB copied here
    353                     case '#':
    354                     case '@':
    355                     case '$':
    356                     case ':':
    357                     case ';':
    358                     case '?':
    359                     case '\"':
    360                     case '*':
    361                     case '=': // Issue 155 on java.net
    362                         return true;
    363                     default:
    364                         return false;
    365                 }
    366             }
    367         } catch (ParseException ex) {
    368             return false;
    369         }
    370     }
    371 
    372     public String ttoken() {
    373         int startIdx = ptr;
    374         try {
    375             while (hasMoreChars()) {
    376                 char nextChar = lookAhead(0);
    377                 if ( isTokenChar(nextChar) ) {
    378                     consume(1);
    379                 } else {
    380                     break;
    381                 }
    382             }
    383             return buffer.substring(startIdx, ptr);
    384         } catch (ParseException ex) {
    385             return null;
    386         }
    387     }
    388 
    389     /* JvB: unreferenced
    390     public String ttokenAllowSpace() {
    391         int startIdx = ptr;
    392         try {
    393             while (hasMoreChars()) {
    394                 char nextChar = lookAhead(0);
    395                 if (isAlphaDigit(nextChar)) {
    396                     consume(1);
    397                 }
    398                 else {
    399                     boolean isValidChar = false;
    400                     switch (nextChar) {
    401                         case '_':
    402                         case '+':
    403                         case '-':
    404                         case '!':
    405                         case '`':
    406                         case '\'':
    407                         case '~':
    408                         case '%': // bug fix by Bruno Konik, JvB copied here
    409                         case '.':
    410                         case ' ':
    411                         case '\t':
    412                         case '*':
    413                             isValidChar = true;
    414                     }
    415                     if (isValidChar) {
    416                         consume(1);
    417                     }
    418                     else {
    419                         break;
    420                     }
    421                 }
    422 
    423             }
    424             return buffer.substring(startIdx, ptr);
    425         } catch (ParseException ex) {
    426             return null;
    427         }
    428     }*/
    429 
    430     public String ttokenSafe() {
    431         int startIdx = ptr;
    432         try {
    433             while (hasMoreChars()) {
    434                 char nextChar = lookAhead(0);
    435                 if (isAlphaDigit(nextChar)) {
    436                     consume(1);
    437                 }
    438                 else {
    439                     boolean isValidChar = false;
    440                     switch (nextChar) {
    441                         case '_':
    442                         case '+':
    443                         case '-':
    444                         case '!':
    445                         case '`':
    446                         case '\'':
    447                         case '.':
    448                         case '/':
    449                         case '}':
    450                         case '{':
    451                         case ']':
    452                         case '[':
    453                         case '^':
    454                         case '|':
    455                         case '~':
    456                         case '%': // bug fix by Bruno Konik, JvB copied here
    457                         case '#':
    458                         case '@':
    459                         case '$':
    460                         case ':':
    461                         case ';':
    462                         case '?':
    463                         case '\"':
    464                         case '*':
    465                             isValidChar = true;
    466                     }
    467                     if (isValidChar) {
    468                         consume(1);
    469                     }
    470                     else {
    471                         break;
    472                     }
    473                 }
    474             }
    475             return buffer.substring(startIdx, ptr);
    476         } catch (ParseException ex) {
    477             return null;
    478         }
    479     }
    480 
    481     static final char ALPHA_VALID_CHARS = Character.MAX_VALUE;
    482     static final char DIGIT_VALID_CHARS = Character.MAX_VALUE - 1;
    483     static final char ALPHADIGIT_VALID_CHARS = Character.MAX_VALUE - 2;
    484     public void consumeValidChars(char[] validChars) {
    485         int validCharsLength = validChars.length;
    486         try {
    487             while (hasMoreChars()) {
    488                 char nextChar = lookAhead(0);
    489                 boolean isValid = false;
    490                 for (int i = 0; i < validCharsLength; i++) {
    491                     char validChar = validChars[i];
    492                     switch(validChar) {
    493                         case ALPHA_VALID_CHARS:
    494                             isValid = isAlpha(nextChar);
    495                             break;
    496                         case DIGIT_VALID_CHARS:
    497                             isValid = isDigit(nextChar);
    498                             break;
    499                         case ALPHADIGIT_VALID_CHARS:
    500                             isValid = isAlphaDigit(nextChar);
    501                             break;
    502                         default:
    503                             isValid = nextChar == validChar;
    504                     }
    505                     if (isValid) {
    506                         break;
    507                     }
    508                 }
    509                 if (isValid) {
    510                     consume(1);
    511                 }
    512                 else {
    513                     break;
    514                 }
    515             }
    516         } catch (ParseException ex) {
    517 
    518         }
    519     }
    520 
    521     /** Parse a comment string cursor is at a ". Leave cursor at closing "
    522     *@return the substring containing the quoted string excluding the
    523     * closing quote.
    524     */
    525     public String quotedString() throws ParseException {
    526         int startIdx = ptr + 1;
    527         if (lookAhead(0) != '\"')
    528             return null;
    529         consume(1);
    530         while (true) {
    531             char next = getNextChar();
    532             if (next == '\"') {
    533                 // Got to the terminating quote.
    534                 break;
    535             } else if (next == '\0') {
    536                 throw new ParseException(
    537                     this.buffer + " :unexpected EOL",
    538                     this.ptr);
    539             } else if (next == '\\') {
    540                 consume(1);
    541             }
    542         }
    543         return buffer.substring(startIdx, ptr - 1);
    544     }
    545 
    546     /** Parse a comment string cursor is at a "(". Leave cursor at )
    547     *@return the substring containing the comment excluding the
    548     * closing brace.
    549     */
    550     public String comment() throws ParseException {
    551         StringBuffer retval = new StringBuffer();
    552         if (lookAhead(0) != '(')
    553             return null;
    554         consume(1);
    555         while (true) {
    556             char next = getNextChar();
    557             if (next == ')') {
    558                 break;
    559             } else if (next == '\0') {
    560                 throw new ParseException(
    561                     this.buffer + " :unexpected EOL",
    562                     this.ptr);
    563             } else if (next == '\\') {
    564                 retval.append(next);
    565                 next = getNextChar();
    566                 if (next == '\0')
    567                     throw new ParseException(
    568                         this.buffer + " : unexpected EOL",
    569                         this.ptr);
    570                 retval.append(next);
    571             } else {
    572                 retval.append(next);
    573             }
    574         }
    575         return retval.toString();
    576     }
    577 
    578     /** Return a substring containing no semicolons.
    579     *@return a substring containing no semicolons.
    580     */
    581     public String byteStringNoSemicolon() {
    582         StringBuffer retval = new StringBuffer();
    583         try {
    584             while (true) {
    585                 char next = lookAhead(0);
    586                 // bug fix from Ben Evans.
    587                 if (next == '\0' || next == '\n' || next == ';' || next == ',' ) {
    588                     break;
    589                 } else {
    590                     consume(1);
    591                     retval.append(next);
    592                 }
    593             }
    594         } catch (ParseException ex) {
    595             return retval.toString();
    596         }
    597         return retval.toString();
    598     }
    599 
    600     /**
    601      * Scan until you see a slash or an EOL.
    602      *
    603      * @return substring containing no slash.
    604      */
    605     public String byteStringNoSlash() {
    606         StringBuffer retval = new StringBuffer();
    607         try {
    608             while (true) {
    609                 char next = lookAhead(0);
    610                 // bug fix from Ben Evans.
    611                 if (next == '\0' || next == '\n' || next == '/'  ) {
    612                     break;
    613                 } else {
    614                     consume(1);
    615                     retval.append(next);
    616                 }
    617             }
    618         } catch (ParseException ex) {
    619             return retval.toString();
    620         }
    621         return retval.toString();
    622     }
    623 
    624     /** Return a substring containing no commas
    625     *@return a substring containing no commas.
    626     */
    627 
    628     public String byteStringNoComma() {
    629         StringBuffer retval = new StringBuffer();
    630         try {
    631             while (true) {
    632                 char next = lookAhead(0);
    633                 if (next == '\n' || next == ',') {
    634                     break;
    635                 } else {
    636                     consume(1);
    637                     retval.append(next);
    638                 }
    639             }
    640         } catch (ParseException ex) {
    641         }
    642         return retval.toString();
    643     }
    644 
    645     public static String charAsString(char ch) {
    646         return String.valueOf(ch);
    647     }
    648 
    649     /** Lookahead in the inputBuffer for n chars and return as a string.
    650      * Do not consume the input.
    651      */
    652     public String charAsString(int nchars) {
    653         return buffer.substring(ptr, ptr + nchars);
    654     }
    655 
    656     /** Get and consume the next number.
    657      *@return a substring corresponding to a number
    658      *(i.e. sequence of digits).
    659      */
    660     public String number() throws ParseException {
    661 
    662         int startIdx = ptr;
    663         try {
    664             if (!isDigit(lookAhead(0))) {
    665                 throw new ParseException(
    666                     buffer + ": Unexpected token at " + lookAhead(0),
    667                     ptr);
    668             }
    669             consume(1);
    670             while (true) {
    671                 char next = lookAhead(0);
    672                 if (isDigit(next)) {
    673                     consume(1);
    674                 } else
    675                     break;
    676             }
    677             return buffer.substring(startIdx, ptr);
    678         } catch (ParseException ex) {
    679             return buffer.substring(startIdx, ptr);
    680         }
    681     }
    682 
    683     /** Mark the position for backtracking.
    684      *@return the current location of the pointer.
    685      */
    686     public int markInputPosition() {
    687         return ptr;
    688     }
    689 
    690     /** Rewind the input ptr to the marked position.
    691      *@param position - the position to rewind the parser to.
    692      */
    693     public void rewindInputPosition(int position) {
    694         this.ptr = position;
    695     }
    696 
    697     /** Get the rest of the String
    698      * @return rest of the buffer.
    699      */
    700     public String getRest() {
    701         if (ptr >= buffer.length())
    702             return null;
    703         else
    704             return buffer.substring(ptr);
    705     }
    706 
    707     /** Get the sub-String until the character is encountered
    708      * @param c the character to match
    709      * @return the substring that matches.
    710      */
    711     public String getString(char c) throws ParseException {
    712         StringBuffer retval = new StringBuffer();
    713         while (true) {
    714             char next = lookAhead(0);
    715             //System.out.println(" next = [" + next + ']' + "ptr = " + ptr);
    716             //System.out.println(next == '\0');
    717 
    718             if (next == '\0') {
    719                 throw new ParseException(
    720                     this.buffer + "unexpected EOL",
    721                     this.ptr);
    722             } else if (next == c) {
    723                 consume(1);
    724                 break;
    725             } else if (next == '\\') {
    726                 consume(1);
    727                 char nextchar = lookAhead(0);
    728                 if (nextchar == '\0') {
    729                     throw new ParseException(
    730                         this.buffer + "unexpected EOL",
    731                         this.ptr);
    732                 } else {
    733                     consume(1);
    734                     retval.append(nextchar);
    735                 }
    736             } else {
    737                 consume(1);
    738                 retval.append(next);
    739             }
    740         }
    741         return retval.toString();
    742     }
    743 
    744     /** Get the read pointer.
    745      */
    746     public int getPtr() {
    747         return this.ptr;
    748     }
    749 
    750     /** Get the buffer.
    751      */
    752     public String getBuffer() {
    753         return this.buffer;
    754     }
    755 
    756     /** Create a parse exception.
    757      */
    758     public ParseException createParseException() {
    759         return new ParseException(this.buffer, this.ptr);
    760     }
    761 }
    762