/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
     17 
     18 package java.io;
     19 
     20 /**
     21  * Parses a stream into a set of defined tokens, one at a time. The different
     22  * types of tokens that can be found are numbers, identifiers, quoted strings,
     23  * and different comment styles. The class can be used for limited processing
     24  * of source code of programming languages like Java, although it is nowhere
     25  * near a full parser.
     26  */
     27 public class StreamTokenizer {
     28     /**
     29      * Contains a number if the current token is a number ({@code ttype} ==
     30      * {@code TT_NUMBER}).
     31      */
     32     public double nval;
     33 
     34     /**
     35      * Contains a string if the current token is a word ({@code ttype} ==
     36      * {@code TT_WORD}).
     37      */
     38     public String sval;
     39 
     40     /**
     41      * The constant representing the end of the stream.
     42      */
     43     public static final int TT_EOF = -1;
     44 
     45     /**
     46      * The constant representing the end of the line.
     47      */
     48     public static final int TT_EOL = '\n';
     49 
     50     /**
     51      * The constant representing a number token.
     52      */
     53     public static final int TT_NUMBER = -2;
     54 
     55     /**
     56      * The constant representing a word token.
     57      */
     58     public static final int TT_WORD = -3;
     59 
     60     /**
     61      * Internal representation of unknown state.
     62      */
     63     private static final int TT_UNKNOWN = -4;
     64 
     65     /**
     66      * After calling {@code nextToken()}, {@code ttype} contains the type of
     67      * token that has been read. When a single character is read, its value
     68      * converted to an integer is stored in {@code ttype}. For a quoted string,
     69      * the value is the quoted character. Otherwise, its value is one of the
     70      * following:
     71      * <ul>
     72      * <li> {@code TT_WORD} - the token is a word.</li>
     73      * <li> {@code TT_NUMBER} - the token is a number.</li>
     74      * <li> {@code TT_EOL} - the end of line has been reached. Depends on
     75      * whether {@code eolIsSignificant} is {@code true}.</li>
     76      * <li> {@code TT_EOF} - the end of the stream has been reached.</li>
     77      * </ul>
     78      */
     79     public int ttype = TT_UNKNOWN;
     80 
     81     /**
     82      * Internal character meanings, 0 implies TOKEN_ORDINARY
     83      */
     84     private byte[] tokenTypes = new byte[256];
     85 
     86     private static final byte TOKEN_COMMENT = 1;
     87 
     88     private static final byte TOKEN_QUOTE = 2;
     89 
     90     private static final byte TOKEN_WHITE = 4;
     91 
     92     private static final byte TOKEN_WORD = 8;
     93 
     94     private static final byte TOKEN_DIGIT = 16;
     95 
     96     private int lineNumber = 1;
     97 
     98     private boolean forceLowercase;
     99 
    100     private boolean isEOLSignificant;
    101 
    102     private boolean slashStarComments;
    103 
    104     private boolean slashSlashComments;
    105 
    106     private boolean pushBackToken;
    107 
    108     private boolean lastCr;
    109 
    110     /* One of these will have the stream */
    111     private InputStream inStream;
    112 
    113     private Reader inReader;
    114 
    115     private int peekChar = -2;
    116 
    117     /**
    118      * Private constructor to initialize the default values according to the
    119      * specification.
    120      */
    121     private StreamTokenizer() {
    122         /*
    123          * Initialize the default state per specification. All byte values 'A'
    124          * through 'Z', 'a' through 'z', and '\u00A0' through '\u00FF' are
    125          * considered to be alphabetic.
    126          */
    127         wordChars('A', 'Z');
    128         wordChars('a', 'z');
    129         wordChars(160, 255);
    130         /**
    131          * All byte values '\u0000' through '\u0020' are considered to be white
    132          * space.
    133          */
    134         whitespaceChars(0, 32);
    135         /**
    136          * '/' is a comment character. Single quote '\'' and double quote '"'
    137          * are string quote characters.
    138          */
    139         commentChar('/');
    140         quoteChar('"');
    141         quoteChar('\'');
    142         /**
    143          * Numbers are parsed.
    144          */
    145         parseNumbers();
    146         /**
    147          * Ends of lines are treated as white space, not as separate tokens.
    148          * C-style and C++-style comments are not recognized. These are the
    149          * defaults and are not needed in constructor.
    150          */
    151     }
    152 
    153     /**
    154      * Constructs a new {@code StreamTokenizer} with {@code is} as source input
    155      * stream. This constructor is deprecated; instead, the constructor that
    156      * takes a {@code Reader} as an arugment should be used.
    157      *
    158      * @param is
    159      *            the source stream from which to parse tokens.
    160      * @throws NullPointerException
    161      *             if {@code is} is {@code null}.
    162      * @deprecated Use {@link #StreamTokenizer(Reader)}
    163      */
    164     @Deprecated
    165     public StreamTokenizer(InputStream is) {
    166         this();
    167         if (is == null) {
    168             throw new NullPointerException();
    169         }
    170         inStream = is;
    171     }
    172 
    173     /**
    174      * Constructs a new {@code StreamTokenizer} with {@code r} as source reader.
    175      * The tokenizer's initial state is as follows:
    176      * <ul>
    177      * <li>All byte values 'A' through 'Z', 'a' through 'z', and '&#92;u00A0'
    178      * through '&#92;u00FF' are considered to be alphabetic.</li>
    179      * <li>All byte values '&#92;u0000' through '&#92;u0020' are considered to
    180      * be white space. '/' is a comment character.</li>
    181      * <li>Single quote '\'' and double quote '"' are string quote characters.
    182      * </li>
    183      * <li>Numbers are parsed.</li>
    184      * <li>End of lines are considered to be white space rather than separate
    185      * tokens.</li>
    186      * <li>C-style and C++-style comments are not recognized.</LI>
    187      * </ul>
    188      *
    189      * @param r
    190      *            the source reader from which to parse tokens.
    191      */
    192     public StreamTokenizer(Reader r) {
    193         this();
    194         if (r == null) {
    195             throw new NullPointerException();
    196         }
    197         inReader = r;
    198     }
    199 
    200     /**
    201      * Specifies that the character {@code ch} shall be treated as a comment
    202      * character.
    203      *
    204      * @param ch
    205      *            the character to be considered a comment character.
    206      */
    207     public void commentChar(int ch) {
    208         if (0 <= ch && ch < tokenTypes.length) {
    209             tokenTypes[ch] = TOKEN_COMMENT;
    210         }
    211     }
    212 
    213     /**
    214      * Specifies whether the end of a line is significant and should be returned
    215      * as {@code TT_EOF} in {@code ttype} by this tokenizer.
    216      *
    217      * @param flag
    218      *            {@code true} if EOL is significant, {@code false} otherwise.
    219      */
    220     public void eolIsSignificant(boolean flag) {
    221         isEOLSignificant = flag;
    222     }
    223 
    224     /**
    225      * Returns the current line number.
    226      *
    227      * @return this tokenizer's current line number.
    228      */
    229     public int lineno() {
    230         return lineNumber;
    231     }
    232 
    233     /**
    234      * Specifies whether word tokens should be converted to lower case when they
    235      * are stored in {@code sval}.
    236      *
    237      * @param flag
    238      *            {@code true} if {@code sval} should be converted to lower
    239      *            case, {@code false} otherwise.
    240      */
    241     public void lowerCaseMode(boolean flag) {
    242         forceLowercase = flag;
    243     }
    244 
    245     /**
    246      * Parses the next token from this tokenizer's source stream or reader. The
    247      * type of the token is stored in the {@code ttype} field, additional
    248      * information may be stored in the {@code nval} or {@code sval} fields.
    249      *
    250      * @return the value of {@code ttype}.
    251      * @throws IOException
    252      *             if an I/O error occurs while parsing the next token.
    253      */
    254     public int nextToken() throws IOException {
    255         if (pushBackToken) {
    256             pushBackToken = false;
    257             if (ttype != TT_UNKNOWN) {
    258                 return ttype;
    259             }
    260         }
    261         sval = null; // Always reset sval to null
    262         int currentChar = peekChar == -2 ? read() : peekChar;
    263 
    264         if (lastCr && currentChar == '\n') {
    265             lastCr = false;
    266             currentChar = read();
    267         }
    268         if (currentChar == -1) {
    269             return (ttype = TT_EOF);
    270         }
    271 
    272         byte currentType = currentChar > 255 ? TOKEN_WORD
    273                 : tokenTypes[currentChar];
    274         while ((currentType & TOKEN_WHITE) != 0) {
    275             /**
    276              * Skip over white space until we hit a new line or a real token
    277              */
    278             if (currentChar == '\r') {
    279                 lineNumber++;
    280                 if (isEOLSignificant) {
    281                     lastCr = true;
    282                     peekChar = -2;
    283                     return (ttype = TT_EOL);
    284                 }
    285                 if ((currentChar = read()) == '\n') {
    286                     currentChar = read();
    287                 }
    288             } else if (currentChar == '\n') {
    289                 lineNumber++;
    290                 if (isEOLSignificant) {
    291                     peekChar = -2;
    292                     return (ttype = TT_EOL);
    293                 }
    294                 currentChar = read();
    295             } else {
    296                 // Advance over this white space character and try again.
    297                 currentChar = read();
    298             }
    299             if (currentChar == -1) {
    300                 return (ttype = TT_EOF);
    301             }
    302             currentType = currentChar > 255 ? TOKEN_WORD
    303                     : tokenTypes[currentChar];
    304         }
    305 
    306         /**
    307          * Check for digits before checking for words since digits can be
    308          * contained within words.
    309          */
    310         if ((currentType & TOKEN_DIGIT) != 0) {
    311             StringBuilder digits = new StringBuilder(20);
    312             boolean haveDecimal = false, checkJustNegative = currentChar == '-';
    313             while (true) {
    314                 if (currentChar == '.') {
    315                     haveDecimal = true;
    316                 }
    317                 digits.append((char) currentChar);
    318                 currentChar = read();
    319                 if ((currentChar < '0' || currentChar > '9')
    320                         && (haveDecimal || currentChar != '.')) {
    321                     break;
    322                 }
    323             }
    324             peekChar = currentChar;
    325             if (checkJustNegative && digits.length() == 1) {
    326                 // Didn't get any other digits other than '-'
    327                 return (ttype = '-');
    328             }
    329             try {
    330                 nval = Double.valueOf(digits.toString()).doubleValue();
    331             } catch (NumberFormatException e) {
    332                 // Unsure what to do, will write test.
    333                 nval = 0;
    334             }
    335             return (ttype = TT_NUMBER);
    336         }
    337         // Check for words
    338         if ((currentType & TOKEN_WORD) != 0) {
    339             StringBuilder word = new StringBuilder(20);
    340             while (true) {
    341                 word.append((char) currentChar);
    342                 currentChar = read();
    343                 if (currentChar == -1
    344                         || (currentChar < 256 && (tokenTypes[currentChar] & (TOKEN_WORD | TOKEN_DIGIT)) == 0)) {
    345                     break;
    346                 }
    347             }
    348             peekChar = currentChar;
    349             sval = forceLowercase ? word.toString().toLowerCase() : word
    350                     .toString();
    351             return (ttype = TT_WORD);
    352         }
    353         // Check for quoted character
    354         if (currentType == TOKEN_QUOTE) {
    355             int matchQuote = currentChar;
    356             StringBuilder quoteString = new StringBuilder();
    357             int peekOne = read();
    358             while (peekOne >= 0 && peekOne != matchQuote && peekOne != '\r'
    359                     && peekOne != '\n') {
    360                 boolean readPeek = true;
    361                 if (peekOne == '\\') {
    362                     int c1 = read();
    363                     // Check for quoted octal IE: \377
    364                     if (c1 <= '7' && c1 >= '0') {
    365                         int digitValue = c1 - '0';
    366                         c1 = read();
    367                         if (c1 > '7' || c1 < '0') {
    368                             readPeek = false;
    369                         } else {
    370                             digitValue = digitValue * 8 + (c1 - '0');
    371                             c1 = read();
    372                             // limit the digit value to a byte
    373                             if (digitValue > 037 || c1 > '7' || c1 < '0') {
    374                                 readPeek = false;
    375                             } else {
    376                                 digitValue = digitValue * 8 + (c1 - '0');
    377                             }
    378                         }
    379                         if (!readPeek) {
    380                             // We've consumed one to many
    381                             quoteString.append((char) digitValue);
    382                             peekOne = c1;
    383                         } else {
    384                             peekOne = digitValue;
    385                         }
    386                     } else {
    387                         switch (c1) {
    388                             case 'a':
    389                                 peekOne = 0x7;
    390                                 break;
    391                             case 'b':
    392                                 peekOne = 0x8;
    393                                 break;
    394                             case 'f':
    395                                 peekOne = 0xc;
    396                                 break;
    397                             case 'n':
    398                                 peekOne = 0xA;
    399                                 break;
    400                             case 'r':
    401                                 peekOne = 0xD;
    402                                 break;
    403                             case 't':
    404                                 peekOne = 0x9;
    405                                 break;
    406                             case 'v':
    407                                 peekOne = 0xB;
    408                                 break;
    409                             default:
    410                                 peekOne = c1;
    411                         }
    412                     }
    413                 }
    414                 if (readPeek) {
    415                     quoteString.append((char) peekOne);
    416                     peekOne = read();
    417                 }
    418             }
    419             if (peekOne == matchQuote) {
    420                 peekOne = read();
    421             }
    422             peekChar = peekOne;
    423             ttype = matchQuote;
    424             sval = quoteString.toString();
    425             return ttype;
    426         }
    427         // Do comments, both "//" and "/*stuff*/"
    428         if (currentChar == '/' && (slashSlashComments || slashStarComments)) {
    429             if ((currentChar = read()) == '*' && slashStarComments) {
    430                 int peekOne = read();
    431                 while (true) {
    432                     currentChar = peekOne;
    433                     peekOne = read();
    434                     if (currentChar == -1) {
    435                         peekChar = -1;
    436                         return (ttype = TT_EOF);
    437                     }
    438                     if (currentChar == '\r') {
    439                         if (peekOne == '\n') {
    440                             peekOne = read();
    441                         }
    442                         lineNumber++;
    443                     } else if (currentChar == '\n') {
    444                         lineNumber++;
    445                     } else if (currentChar == '*' && peekOne == '/') {
    446                         peekChar = read();
    447                         return nextToken();
    448                     }
    449                 }
    450             } else if (currentChar == '/' && slashSlashComments) {
    451                 // Skip to EOF or new line then return the next token
    452                 while ((currentChar = read()) >= 0 && currentChar != '\r'
    453                         && currentChar != '\n') {
    454                     // Intentionally empty
    455                 }
    456                 peekChar = currentChar;
    457                 return nextToken();
    458             } else if (currentType != TOKEN_COMMENT) {
    459                 // Was just a slash by itself
    460                 peekChar = currentChar;
    461                 return (ttype = '/');
    462             }
    463         }
    464         // Check for comment character
    465         if (currentType == TOKEN_COMMENT) {
    466             // Skip to EOF or new line then return the next token
    467             while ((currentChar = read()) >= 0 && currentChar != '\r'
    468                     && currentChar != '\n') {
    469                 // Intentionally empty
    470             }
    471             peekChar = currentChar;
    472             return nextToken();
    473         }
    474 
    475         peekChar = read();
    476         return (ttype = currentChar);
    477     }
    478 
    479     /**
    480      * Specifies that the character {@code ch} shall be treated as an ordinary
    481      * character by this tokenizer. That is, it has no special meaning as a
    482      * comment character, word component, white space, string delimiter or
    483      * number.
    484      *
    485      * @param ch
    486      *            the character to be considered an ordinary character.
    487      */
    488     public void ordinaryChar(int ch) {
    489         if (0 <= ch && ch < tokenTypes.length) {
    490             tokenTypes[ch] = 0;
    491         }
    492     }
    493 
    494     /**
    495      * Specifies that the characters in the range from {@code low} to {@code hi}
    496      * shall be treated as an ordinary character by this tokenizer. That is,
    497      * they have no special meaning as a comment character, word component,
    498      * white space, string delimiter or number.
    499      *
    500      * @param low
    501      *            the first character in the range of ordinary characters.
    502      * @param hi
    503      *            the last character in the range of ordinary characters.
    504      */
    505     public void ordinaryChars(int low, int hi) {
    506         if (low < 0) {
    507             low = 0;
    508         }
    509         if (hi > tokenTypes.length) {
    510             hi = tokenTypes.length - 1;
    511         }
    512         for (int i = low; i <= hi; i++) {
    513             tokenTypes[i] = 0;
    514         }
    515     }
    516 
    517     /**
    518      * Specifies that this tokenizer shall parse numbers.
    519      */
    520     public void parseNumbers() {
    521         for (int i = '0'; i <= '9'; i++) {
    522             tokenTypes[i] |= TOKEN_DIGIT;
    523         }
    524         tokenTypes['.'] |= TOKEN_DIGIT;
    525         tokenTypes['-'] |= TOKEN_DIGIT;
    526     }
    527 
    528     /**
    529      * Indicates that the current token should be pushed back and returned again
    530      * the next time {@code nextToken()} is called.
    531      */
    532     public void pushBack() {
    533         pushBackToken = true;
    534     }
    535 
    536     /**
    537      * Specifies that the character {@code ch} shall be treated as a quote
    538      * character.
    539      *
    540      * @param ch
    541      *            the character to be considered a quote character.
    542      */
    543     public void quoteChar(int ch) {
    544         if (0 <= ch && ch < tokenTypes.length) {
    545             tokenTypes[ch] = TOKEN_QUOTE;
    546         }
    547     }
    548 
    549     private int read() throws IOException {
    550         // Call the read for the appropriate stream
    551         if (inStream == null) {
    552             return inReader.read();
    553         }
    554         return inStream.read();
    555     }
    556 
    557     /**
    558      * Specifies that all characters shall be treated as ordinary characters.
    559      */
    560     public void resetSyntax() {
    561         for (int i = 0; i < 256; i++) {
    562             tokenTypes[i] = 0;
    563         }
    564     }
    565 
    566     /**
    567      * Specifies whether "slash-slash" (C++-style) comments shall be recognized.
    568      * This kind of comment ends at the end of the line.
    569      *
    570      * @param flag
    571      *            {@code true} if {@code //} should be recognized as the start
    572      *            of a comment, {@code false} otherwise.
    573      */
    574     public void slashSlashComments(boolean flag) {
    575         slashSlashComments = flag;
    576     }
    577 
    578     /**
    579      * Specifies whether "slash-star" (C-style) comments shall be recognized.
    580      * Slash-star comments cannot be nested and end when a star-slash
    581      * combination is found.
    582      *
    583      * @param flag
    584      *            {@code true} if {@code /*} should be recognized as the start
    585      *            of a comment, {@code false} otherwise.
    586      */
    587     public void slashStarComments(boolean flag) {
    588         slashStarComments = flag;
    589     }
    590 
    591     /**
    592      * Returns the state of this tokenizer in a readable format.
    593      *
    594      * @return the current state of this tokenizer.
    595      */
    596     @Override
    597     public String toString() {
    598         // Values determined through experimentation
    599         StringBuilder result = new StringBuilder();
    600         result.append("Token[");
    601         switch (ttype) {
    602             case TT_EOF:
    603                 result.append("EOF");
    604                 break;
    605             case TT_EOL:
    606                 result.append("EOL");
    607                 break;
    608             case TT_NUMBER:
    609                 result.append("n=");
    610                 result.append(nval);
    611                 break;
    612             case TT_WORD:
    613                 result.append(sval);
    614                 break;
    615             default:
    616                 if (ttype == TT_UNKNOWN || tokenTypes[ttype] == TOKEN_QUOTE) {
    617                     result.append(sval);
    618                 } else {
    619                     result.append('\'');
    620                     result.append((char) ttype);
    621                     result.append('\'');
    622                 }
    623         }
    624         result.append("], line ");
    625         result.append(lineNumber);
    626         return result.toString();
    627     }
    628 
    629     /**
    630      * Specifies that the characters in the range from {@code low} to {@code hi}
    631      * shall be treated as whitespace characters by this tokenizer.
    632      *
    633      * @param low
    634      *            the first character in the range of whitespace characters.
    635      * @param hi
    636      *            the last character in the range of whitespace characters.
    637      */
    638     public void whitespaceChars(int low, int hi) {
    639         if (low < 0) {
    640             low = 0;
    641         }
    642         if (hi > tokenTypes.length) {
    643             hi = tokenTypes.length - 1;
    644         }
    645         for (int i = low; i <= hi; i++) {
    646             tokenTypes[i] = TOKEN_WHITE;
    647         }
    648     }
    649 
    650     /**
    651      * Specifies that the characters in the range from {@code low} to {@code hi}
    652      * shall be treated as word characters by this tokenizer. A word consists of
    653      * a word character followed by zero or more word or number characters.
    654      *
    655      * @param low
    656      *            the first character in the range of word characters.
    657      * @param hi
    658      *            the last character in the range of word characters.
    659      */
    660     public void wordChars(int low, int hi) {
    661         if (low < 0) {
    662             low = 0;
    663         }
    664         if (hi > tokenTypes.length) {
    665             hi = tokenTypes.length - 1;
    666         }
    667         for (int i = low; i <= hi; i++) {
    668             tokenTypes[i] |= TOKEN_WORD;
    669         }
    670     }
    671 }
    672