Home | History | Annotate | Download | only in io
      1 /*
      2  *  Licensed to the Apache Software Foundation (ASF) under one or more
      3  *  contributor license agreements.  See the NOTICE file distributed with
      4  *  this work for additional information regarding copyright ownership.
      5  *  The ASF licenses this file to You under the Apache License, Version 2.0
      6  *  (the "License"); you may not use this file except in compliance with
      7  *  the License.  You may obtain a copy of the License at
      8  *
      9  *     http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  *  Unless required by applicable law or agreed to in writing, software
     12  *  distributed under the License is distributed on an "AS IS" BASIS,
     13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  *  See the License for the specific language governing permissions and
     15  *  limitations under the License.
     16  */
     17 
     18 package java.io;
     19 
     20 import java.util.Locale;
     21 
     22 /**
     23  * Parses a stream into a set of defined tokens, one at a time. The different
     24  * types of tokens that can be found are numbers, identifiers, quoted strings,
     25  * and different comment styles. The class can be used for limited processing
     26  * of source code of programming languages like Java, although it is nowhere
     27  * near a full parser.
     28  */
     29 public class StreamTokenizer {
     30     /**
     31      * Contains a number if the current token is a number ({@code ttype} ==
     32      * {@code TT_NUMBER}).
     33      */
     34     public double nval;
     35 
     36     /**
     37      * Contains a string if the current token is a word ({@code ttype} ==
     38      * {@code TT_WORD}).
     39      */
     40     public String sval;
     41 
     42     /**
     43      * The constant representing the end of the stream.
     44      */
     45     public static final int TT_EOF = -1;
     46 
     47     /**
     48      * The constant representing the end of the line.
     49      */
     50     public static final int TT_EOL = '\n';
     51 
     52     /**
     53      * The constant representing a number token.
     54      */
     55     public static final int TT_NUMBER = -2;
     56 
     57     /**
     58      * The constant representing a word token.
     59      */
     60     public static final int TT_WORD = -3;
     61 
     62     /**
     63      * Internal representation of unknown state.
     64      */
     65     private static final int TT_UNKNOWN = -4;
     66 
     67     /**
     68      * After calling {@code nextToken()}, {@code ttype} contains the type of
     69      * token that has been read. When a single character is read, its value
     70      * converted to an integer is stored in {@code ttype}. For a quoted string,
     71      * the value is the quoted character. Otherwise, its value is one of the
     72      * following:
     73      * <ul>
     74      * <li> {@code TT_WORD} - the token is a word.</li>
     75      * <li> {@code TT_NUMBER} - the token is a number.</li>
     76      * <li> {@code TT_EOL} - the end of line has been reached. Depends on
     77      * whether {@code eolIsSignificant} is {@code true}.</li>
     78      * <li> {@code TT_EOF} - the end of the stream has been reached.</li>
     79      * </ul>
     80      */
     81     public int ttype = TT_UNKNOWN;
     82 
     83     /**
     84      * Internal character meanings, 0 implies TOKEN_ORDINARY
     85      */
     86     private byte[] tokenTypes = new byte[256];
     87 
     88     private static final byte TOKEN_COMMENT = 1;
     89 
     90     private static final byte TOKEN_QUOTE = 2;
     91 
     92     private static final byte TOKEN_WHITE = 4;
     93 
     94     private static final byte TOKEN_WORD = 8;
     95 
     96     private static final byte TOKEN_DIGIT = 16;
     97 
     98     private int lineNumber = 1;
     99 
    100     private boolean forceLowercase;
    101 
    102     private boolean isEOLSignificant;
    103 
    104     private boolean slashStarComments;
    105 
    106     private boolean slashSlashComments;
    107 
    108     private boolean pushBackToken;
    109 
    110     private boolean lastCr;
    111 
    112     /* One of these will have the stream */
    113     private InputStream inStream;
    114 
    115     private Reader inReader;
    116 
    117     private int peekChar = -2;
    118 
    119     /**
    120      * Private constructor to initialize the default values according to the
    121      * specification.
    122      */
    123     private StreamTokenizer() {
    124         /*
    125          * Initialize the default state per specification. All byte values 'A'
    126          * through 'Z', 'a' through 'z', and '\u00A0' through '\u00FF' are
    127          * considered to be alphabetic.
    128          */
    129         wordChars('A', 'Z');
    130         wordChars('a', 'z');
    131         wordChars(160, 255);
    132         /**
    133          * All byte values '\u0000' through '\u0020' are considered to be white
    134          * space.
    135          */
    136         whitespaceChars(0, 32);
    137         /**
    138          * '/' is a comment character. Single quote '\'' and double quote '"'
    139          * are string quote characters.
    140          */
    141         commentChar('/');
    142         quoteChar('"');
    143         quoteChar('\'');
    144         /**
    145          * Numbers are parsed.
    146          */
    147         parseNumbers();
    148         /**
    149          * Ends of lines are treated as white space, not as separate tokens.
    150          * C-style and C++-style comments are not recognized. These are the
    151          * defaults and are not needed in constructor.
    152          */
    153     }
    154 
    155     /**
    156      * Constructs a new {@code StreamTokenizer} with {@code is} as source input
    157      * stream. This constructor is deprecated; instead, the constructor that
    158      * takes a {@code Reader} as an argument should be used.
    159      *
    160      * @param is
    161      *            the source stream from which to parse tokens.
    162      * @throws NullPointerException
    163      *             if {@code is} is {@code null}.
    164      * @deprecated Use {@link #StreamTokenizer(Reader)} instead.
    165      */
    166     @Deprecated
    167     public StreamTokenizer(InputStream is) {
    168         this();
    169         if (is == null) {
    170             throw new NullPointerException("is == null");
    171         }
    172         inStream = is;
    173     }
    174 
    175     /**
    176      * Constructs a new {@code StreamTokenizer} with {@code r} as source reader.
    177      * The tokenizer's initial state is as follows:
    178      * <ul>
    179      * <li>All byte values 'A' through 'Z', 'a' through 'z', and '&#92;u00A0'
    180      * through '&#92;u00FF' are considered to be alphabetic.</li>
    181      * <li>All byte values '&#92;u0000' through '&#92;u0020' are considered to
    182      * be white space. '/' is a comment character.</li>
    183      * <li>Single quote '\'' and double quote '"' are string quote characters.
    184      * </li>
    185      * <li>Numbers are parsed.</li>
    186      * <li>End of lines are considered to be white space rather than separate
    187      * tokens.</li>
    188      * <li>C-style and C++-style comments are not recognized.</LI>
    189      * </ul>
    190      *
    191      * @param r
    192      *            the source reader from which to parse tokens.
    193      */
    194     public StreamTokenizer(Reader r) {
    195         this();
    196         if (r == null) {
    197             throw new NullPointerException("r == null");
    198         }
    199         inReader = r;
    200     }
    201 
    202     /**
    203      * Specifies that the character {@code ch} shall be treated as a comment
    204      * character.
    205      *
    206      * @param ch
    207      *            the character to be considered a comment character.
    208      */
    209     public void commentChar(int ch) {
    210         if (ch >= 0 && ch < tokenTypes.length) {
    211             tokenTypes[ch] = TOKEN_COMMENT;
    212         }
    213     }
    214 
    215     /**
    216      * Specifies whether the end of a line is significant and should be returned
    217      * as {@code TT_EOF} in {@code ttype} by this tokenizer.
    218      *
    219      * @param flag
    220      *            {@code true} if EOL is significant, {@code false} otherwise.
    221      */
    222     public void eolIsSignificant(boolean flag) {
    223         isEOLSignificant = flag;
    224     }
    225 
    226     /**
    227      * Returns the current line number.
    228      *
    229      * @return this tokenizer's current line number.
    230      */
    231     public int lineno() {
    232         return lineNumber;
    233     }
    234 
    235     /**
    236      * Specifies whether word tokens should be converted to lower case when they
    237      * are stored in {@code sval}.
    238      *
    239      * @param flag
    240      *            {@code true} if {@code sval} should be converted to lower
    241      *            case, {@code false} otherwise.
    242      */
    243     public void lowerCaseMode(boolean flag) {
    244         forceLowercase = flag;
    245     }
    246 
    247     /**
    248      * Parses the next token from this tokenizer's source stream or reader. The
    249      * type of the token is stored in the {@code ttype} field, additional
    250      * information may be stored in the {@code nval} or {@code sval} fields.
    251      *
    252      * @return the value of {@code ttype}.
    253      * @throws IOException
    254      *             if an I/O error occurs while parsing the next token.
    255      */
    256     public int nextToken() throws IOException {
    257         if (pushBackToken) {
    258             pushBackToken = false;
    259             if (ttype != TT_UNKNOWN) {
    260                 return ttype;
    261             }
    262         }
    263         sval = null; // Always reset sval to null
    264         int currentChar = peekChar == -2 ? read() : peekChar;
    265 
    266         if (lastCr && currentChar == '\n') {
    267             lastCr = false;
    268             currentChar = read();
    269         }
    270         if (currentChar == -1) {
    271             return (ttype = TT_EOF);
    272         }
    273 
    274         byte currentType = currentChar > 255 ? TOKEN_WORD
    275                 : tokenTypes[currentChar];
    276         while ((currentType & TOKEN_WHITE) != 0) {
    277             /**
    278              * Skip over white space until we hit a new line or a real token
    279              */
    280             if (currentChar == '\r') {
    281                 lineNumber++;
    282                 if (isEOLSignificant) {
    283                     lastCr = true;
    284                     peekChar = -2;
    285                     return (ttype = TT_EOL);
    286                 }
    287                 if ((currentChar = read()) == '\n') {
    288                     currentChar = read();
    289                 }
    290             } else if (currentChar == '\n') {
    291                 lineNumber++;
    292                 if (isEOLSignificant) {
    293                     peekChar = -2;
    294                     return (ttype = TT_EOL);
    295                 }
    296                 currentChar = read();
    297             } else {
    298                 // Advance over this white space character and try again.
    299                 currentChar = read();
    300             }
    301             if (currentChar == -1) {
    302                 return (ttype = TT_EOF);
    303             }
    304             currentType = currentChar > 255 ? TOKEN_WORD
    305                     : tokenTypes[currentChar];
    306         }
    307 
    308         /**
    309          * Check for digits before checking for words since digits can be
    310          * contained within words.
    311          */
    312         if ((currentType & TOKEN_DIGIT) != 0) {
    313             StringBuilder digits = new StringBuilder(20);
    314             boolean haveDecimal = false, checkJustNegative = currentChar == '-';
    315             while (true) {
    316                 if (currentChar == '.') {
    317                     haveDecimal = true;
    318                 }
    319                 digits.append((char) currentChar);
    320                 currentChar = read();
    321                 if ((currentChar < '0' || currentChar > '9')
    322                         && (haveDecimal || currentChar != '.')) {
    323                     break;
    324                 }
    325             }
    326             peekChar = currentChar;
    327             if (checkJustNegative && digits.length() == 1) {
    328                 // Didn't get any other digits other than '-'
    329                 return (ttype = '-');
    330             }
    331             try {
    332                 nval = Double.valueOf(digits.toString()).doubleValue();
    333             } catch (NumberFormatException e) {
    334                 // Unsure what to do, will write test.
    335                 nval = 0;
    336             }
    337             return (ttype = TT_NUMBER);
    338         }
    339         // Check for words
    340         if ((currentType & TOKEN_WORD) != 0) {
    341             StringBuilder word = new StringBuilder(20);
    342             while (true) {
    343                 word.append((char) currentChar);
    344                 currentChar = read();
    345                 if (currentChar == -1
    346                         || (currentChar < 256 && (tokenTypes[currentChar] & (TOKEN_WORD | TOKEN_DIGIT)) == 0)) {
    347                     break;
    348                 }
    349             }
    350             peekChar = currentChar;
    351             sval = word.toString();
    352             if (forceLowercase) {
    353                 sval = sval.toLowerCase(Locale.getDefault());
    354             }
    355             return (ttype = TT_WORD);
    356         }
    357         // Check for quoted character
    358         if (currentType == TOKEN_QUOTE) {
    359             int matchQuote = currentChar;
    360             StringBuilder quoteString = new StringBuilder();
    361             int peekOne = read();
    362             while (peekOne >= 0 && peekOne != matchQuote && peekOne != '\r'
    363                     && peekOne != '\n') {
    364                 boolean readPeek = true;
    365                 if (peekOne == '\\') {
    366                     int c1 = read();
    367                     // Check for quoted octal IE: \377
    368                     if (c1 <= '7' && c1 >= '0') {
    369                         int digitValue = c1 - '0';
    370                         c1 = read();
    371                         if (c1 > '7' || c1 < '0') {
    372                             readPeek = false;
    373                         } else {
    374                             digitValue = digitValue * 8 + (c1 - '0');
    375                             c1 = read();
    376                             // limit the digit value to a byte
    377                             if (digitValue > 037 || c1 > '7' || c1 < '0') {
    378                                 readPeek = false;
    379                             } else {
    380                                 digitValue = digitValue * 8 + (c1 - '0');
    381                             }
    382                         }
    383                         if (!readPeek) {
    384                             // We've consumed one to many
    385                             quoteString.append((char) digitValue);
    386                             peekOne = c1;
    387                         } else {
    388                             peekOne = digitValue;
    389                         }
    390                     } else {
    391                         switch (c1) {
    392                             case 'a':
    393                                 peekOne = 0x7;
    394                                 break;
    395                             case 'b':
    396                                 peekOne = 0x8;
    397                                 break;
    398                             case 'f':
    399                                 peekOne = 0xc;
    400                                 break;
    401                             case 'n':
    402                                 peekOne = 0xA;
    403                                 break;
    404                             case 'r':
    405                                 peekOne = 0xD;
    406                                 break;
    407                             case 't':
    408                                 peekOne = 0x9;
    409                                 break;
    410                             case 'v':
    411                                 peekOne = 0xB;
    412                                 break;
    413                             default:
    414                                 peekOne = c1;
    415                         }
    416                     }
    417                 }
    418                 if (readPeek) {
    419                     quoteString.append((char) peekOne);
    420                     peekOne = read();
    421                 }
    422             }
    423             if (peekOne == matchQuote) {
    424                 peekOne = read();
    425             }
    426             peekChar = peekOne;
    427             ttype = matchQuote;
    428             sval = quoteString.toString();
    429             return ttype;
    430         }
    431         // Do comments, both "//" and "/*stuff*/"
    432         if (currentChar == '/' && (slashSlashComments || slashStarComments)) {
    433             if ((currentChar = read()) == '*' && slashStarComments) {
    434                 int peekOne = read();
    435                 while (true) {
    436                     currentChar = peekOne;
    437                     peekOne = read();
    438                     if (currentChar == -1) {
    439                         peekChar = -1;
    440                         return (ttype = TT_EOF);
    441                     }
    442                     if (currentChar == '\r') {
    443                         if (peekOne == '\n') {
    444                             peekOne = read();
    445                         }
    446                         lineNumber++;
    447                     } else if (currentChar == '\n') {
    448                         lineNumber++;
    449                     } else if (currentChar == '*' && peekOne == '/') {
    450                         peekChar = read();
    451                         return nextToken();
    452                     }
    453                 }
    454             } else if (currentChar == '/' && slashSlashComments) {
    455                 // Skip to EOF or new line then return the next token
    456                 while ((currentChar = read()) >= 0 && currentChar != '\r'
    457                         && currentChar != '\n') {
    458                     // Intentionally empty
    459                 }
    460                 peekChar = currentChar;
    461                 return nextToken();
    462             } else if (currentType != TOKEN_COMMENT) {
    463                 // Was just a slash by itself
    464                 peekChar = currentChar;
    465                 return (ttype = '/');
    466             }
    467         }
    468         // Check for comment character
    469         if (currentType == TOKEN_COMMENT) {
    470             // Skip to EOF or new line then return the next token
    471             while ((currentChar = read()) >= 0 && currentChar != '\r'
    472                     && currentChar != '\n') {
    473                 // Intentionally empty
    474             }
    475             peekChar = currentChar;
    476             return nextToken();
    477         }
    478 
    479         peekChar = read();
    480         return (ttype = currentChar);
    481     }
    482 
    483     /**
    484      * Specifies that the character {@code ch} shall be treated as an ordinary
    485      * character by this tokenizer. That is, it has no special meaning as a
    486      * comment character, word component, white space, string delimiter or
    487      * number.
    488      *
    489      * @param ch
    490      *            the character to be considered an ordinary character.
    491      */
    492     public void ordinaryChar(int ch) {
    493         if (ch >= 0 && ch < tokenTypes.length) {
    494             tokenTypes[ch] = 0;
    495         }
    496     }
    497 
    498     /**
    499      * Specifies that the characters in the range from {@code low} to {@code hi}
    500      * shall be treated as an ordinary character by this tokenizer. That is,
    501      * they have no special meaning as a comment character, word component,
    502      * white space, string delimiter or number.
    503      *
    504      * @param low
    505      *            the first character in the range of ordinary characters.
    506      * @param hi
    507      *            the last character in the range of ordinary characters.
    508      */
    509     public void ordinaryChars(int low, int hi) {
    510         if (low < 0) {
    511             low = 0;
    512         }
    513         if (hi > tokenTypes.length) {
    514             hi = tokenTypes.length - 1;
    515         }
    516         for (int i = low; i <= hi; i++) {
    517             tokenTypes[i] = 0;
    518         }
    519     }
    520 
    521     /**
    522      * Specifies that this tokenizer shall parse numbers.
    523      */
    524     public void parseNumbers() {
    525         for (int i = '0'; i <= '9'; i++) {
    526             tokenTypes[i] |= TOKEN_DIGIT;
    527         }
    528         tokenTypes['.'] |= TOKEN_DIGIT;
    529         tokenTypes['-'] |= TOKEN_DIGIT;
    530     }
    531 
    532     /**
    533      * Indicates that the current token should be pushed back and returned again
    534      * the next time {@code nextToken()} is called.
    535      */
    536     public void pushBack() {
    537         pushBackToken = true;
    538     }
    539 
    540     /**
    541      * Specifies that the character {@code ch} shall be treated as a quote
    542      * character.
    543      *
    544      * @param ch
    545      *            the character to be considered a quote character.
    546      */
    547     public void quoteChar(int ch) {
    548         if (ch >= 0 && ch < tokenTypes.length) {
    549             tokenTypes[ch] = TOKEN_QUOTE;
    550         }
    551     }
    552 
    553     private int read() throws IOException {
    554         // Call the read for the appropriate stream
    555         if (inStream == null) {
    556             return inReader.read();
    557         }
    558         return inStream.read();
    559     }
    560 
    561     /**
    562      * Specifies that all characters shall be treated as ordinary characters.
    563      */
    564     public void resetSyntax() {
    565         for (int i = 0; i < 256; i++) {
    566             tokenTypes[i] = 0;
    567         }
    568     }
    569 
    570     /**
    571      * Specifies whether "slash-slash" (C++-style) comments shall be recognized.
    572      * This kind of comment ends at the end of the line.
    573      *
    574      * @param flag
    575      *            {@code true} if {@code //} should be recognized as the start
    576      *            of a comment, {@code false} otherwise.
    577      */
    578     public void slashSlashComments(boolean flag) {
    579         slashSlashComments = flag;
    580     }
    581 
    582     /**
    583      * Specifies whether "slash-star" (C-style) comments shall be recognized.
    584      * Slash-star comments cannot be nested and end when a star-slash
    585      * combination is found.
    586      *
    587      * @param flag
    588      *            {@code true} if {@code /*} should be recognized as the start
    589      *            of a comment, {@code false} otherwise.
    590      */
    591     public void slashStarComments(boolean flag) {
    592         slashStarComments = flag;
    593     }
    594 
    595     /**
    596      * Returns the state of this tokenizer in a readable format.
    597      *
    598      * @return the current state of this tokenizer.
    599      */
    600     @Override
    601     public String toString() {
    602         // Values determined through experimentation
    603         StringBuilder result = new StringBuilder();
    604         result.append("Token[");
    605         switch (ttype) {
    606             case TT_EOF:
    607                 result.append("EOF");
    608                 break;
    609             case TT_EOL:
    610                 result.append("EOL");
    611                 break;
    612             case TT_NUMBER:
    613                 result.append("n=");
    614                 result.append(nval);
    615                 break;
    616             case TT_WORD:
    617                 result.append(sval);
    618                 break;
    619             default:
    620                 if (ttype == TT_UNKNOWN || tokenTypes[ttype] == TOKEN_QUOTE) {
    621                     result.append(sval);
    622                 } else {
    623                     result.append('\'');
    624                     result.append((char) ttype);
    625                     result.append('\'');
    626                 }
    627         }
    628         result.append("], line ");
    629         result.append(lineNumber);
    630         return result.toString();
    631     }
    632 
    633     /**
    634      * Specifies that the characters in the range from {@code low} to {@code hi}
    635      * shall be treated as whitespace characters by this tokenizer.
    636      *
    637      * @param low
    638      *            the first character in the range of whitespace characters.
    639      * @param hi
    640      *            the last character in the range of whitespace characters.
    641      */
    642     public void whitespaceChars(int low, int hi) {
    643         if (low < 0) {
    644             low = 0;
    645         }
    646         if (hi > tokenTypes.length) {
    647             hi = tokenTypes.length - 1;
    648         }
    649         for (int i = low; i <= hi; i++) {
    650             tokenTypes[i] = TOKEN_WHITE;
    651         }
    652     }
    653 
    654     /**
    655      * Specifies that the characters in the range from {@code low} to {@code hi}
    656      * shall be treated as word characters by this tokenizer. A word consists of
    657      * a word character followed by zero or more word or number characters.
    658      *
    659      * @param low
    660      *            the first character in the range of word characters.
    661      * @param hi
    662      *            the last character in the range of word characters.
    663      */
    664     public void wordChars(int low, int hi) {
    665         if (low < 0) {
    666             low = 0;
    667         }
    668         if (hi > tokenTypes.length) {
    669             hi = tokenTypes.length - 1;
    670         }
    671         for (int i = low; i <= hi; i++) {
    672             tokenTypes[i] |= TOKEN_WORD;
    673         }
    674     }
    675 }
    676