/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package java.io;

import java.util.Locale;

/**
 * Parses a stream into a set of defined tokens, one at a time. The different
 * types of tokens that can be found are numbers, identifiers, quoted strings,
 * and different comment styles. The class can be used for limited processing
 * of source code of programming languages like Java, although it is nowhere
 * near a full parser.
 */
public class StreamTokenizer {
    /**
     * Contains a number if the current token is a number ({@code ttype} ==
     * {@code TT_NUMBER}).
     */
    public double nval;

    /**
     * Contains a string if the current token is a word ({@code ttype} ==
     * {@code TT_WORD}) or a quoted string ({@code ttype} is the quote
     * character). It is reset to {@code null} whenever a new token is parsed.
     */
    public String sval;

    /**
     * The constant representing the end of the stream.
     */
    public static final int TT_EOF = -1;

    /**
     * The constant representing the end of the line.
     */
    public static final int TT_EOL = '\n';

    /**
     * The constant representing a number token.
     */
    public static final int TT_NUMBER = -2;

    /**
     * The constant representing a word token.
     */
    public static final int TT_WORD = -3;

    /**
     * Internal representation of unknown state.
     */
    private static final int TT_UNKNOWN = -4;

    /**
     * Value of {@code peekChar} meaning that no character is buffered and the
     * next one has to be read from the source.
     */
    private static final int NO_PEEK = -2;

    /**
     * After calling {@code nextToken()}, {@code ttype} contains the type of
     * token that has been read. When a single character is read, its value
     * converted to an integer is stored in {@code ttype}. For a quoted string,
     * the value is the quoted character. Otherwise, its value is one of the
     * following:
     * <ul>
     * <li> {@code TT_WORD} - the token is a word.</li>
     * <li> {@code TT_NUMBER} - the token is a number.</li>
     * <li> {@code TT_EOL} - the end of line has been reached. Depends on
     * whether {@code eolIsSignificant} is {@code true}.</li>
     * <li> {@code TT_EOF} - the end of the stream has been reached.</li>
     * </ul>
     */
    public int ttype = TT_UNKNOWN;

    /**
     * Internal character meanings, indexed by character value (0 to 255).
     * A value of 0 means the character is ordinary; otherwise the entry is a
     * combination of the {@code TOKEN_*} flags below.
     */
    private final byte[] tokenTypes = new byte[256];

    private static final byte TOKEN_COMMENT = 1;

    private static final byte TOKEN_QUOTE = 2;

    private static final byte TOKEN_WHITE = 4;

    private static final byte TOKEN_WORD = 8;

    private static final byte TOKEN_DIGIT = 16;

    private int lineNumber = 1;

    private boolean forceLowercase;

    private boolean isEOLSignificant;

    private boolean slashStarComments;

    private boolean slashSlashComments;

    private boolean pushBackToken;

    /*
     * Set when a significant '\r' has just been reported as TT_EOL, so that
     * a '\n' immediately following it is not reported as a second EOL.
     */
    private boolean lastCr;

    /* One of these will have the stream */
    private InputStream inStream;

    private Reader inReader;

    /* The look-ahead character, or NO_PEEK when nothing is buffered. */
    private int peekChar = NO_PEEK;

    /**
     * Private constructor to initialize the default values according to the
     * specification.
     */
    private StreamTokenizer() {
        /*
         * Initialize the default state per specification. All byte values 'A'
         * through 'Z', 'a' through 'z', and '\u00A0' through '\u00FF' are
         * considered to be alphabetic.
         */
        wordChars('A', 'Z');
        wordChars('a', 'z');
        wordChars(160, 255);
        /*
         * All byte values '\u0000' through '\u0020' are considered to be white
         * space.
         */
        whitespaceChars(0, 32);
        /*
         * '/' is a comment character. Single quote '\'' and double quote '"'
         * are string quote characters.
         */
        commentChar('/');
        quoteChar('"');
        quoteChar('\'');
        /*
         * Numbers are parsed.
         */
        parseNumbers();
        /*
         * Ends of lines are treated as white space, not as separate tokens.
         * C-style and C++-style comments are not recognized. These are the
         * defaults and are not needed in constructor.
         */
    }

    /**
     * Constructs a new {@code StreamTokenizer} with {@code is} as source input
     * stream. This constructor is deprecated; instead, the constructor that
     * takes a {@code Reader} as an argument should be used.
     *
     * @param is
     *            the source stream from which to parse tokens.
     * @throws NullPointerException
     *             if {@code is} is {@code null}.
     * @deprecated Use {@link #StreamTokenizer(Reader)} instead.
     */
    @Deprecated
    public StreamTokenizer(InputStream is) {
        this();
        if (is == null) {
            throw new NullPointerException("is == null");
        }
        inStream = is;
    }

    /**
     * Constructs a new {@code StreamTokenizer} with {@code r} as source reader.
     * The tokenizer's initial state is as follows:
     * <ul>
     * <li>All byte values 'A' through 'Z', 'a' through 'z', and '\u00A0'
     * through '\u00FF' are considered to be alphabetic.</li>
     * <li>All byte values '\u0000' through '\u0020' are considered to
     * be white space. '/' is a comment character.</li>
     * <li>Single quote '\'' and double quote '"' are string quote characters.
     * </li>
     * <li>Numbers are parsed.</li>
     * <li>End of lines are considered to be white space rather than separate
     * tokens.</li>
     * <li>C-style and C++-style comments are not recognized.</li>
     * </ul>
     *
     * @param r
     *            the source reader from which to parse tokens.
     * @throws NullPointerException
     *             if {@code r} is {@code null}.
     */
    public StreamTokenizer(Reader r) {
        this();
        if (r == null) {
            throw new NullPointerException("r == null");
        }
        inReader = r;
    }

    /**
     * Specifies that the character {@code ch} shall be treated as a comment
     * character. Values outside the range 0 to 255 are ignored.
     *
     * @param ch
     *            the character to be considered a comment character.
     */
    public void commentChar(int ch) {
        if (ch >= 0 && ch < tokenTypes.length) {
            tokenTypes[ch] = TOKEN_COMMENT;
        }
    }

    /**
     * Specifies whether the end of a line is significant and should be returned
     * as {@code TT_EOL} in {@code ttype} by this tokenizer.
     *
     * @param flag
     *            {@code true} if EOL is significant, {@code false} otherwise.
     */
    public void eolIsSignificant(boolean flag) {
        isEOLSignificant = flag;
    }

    /**
     * Returns the current line number.
     *
     * @return this tokenizer's current line number.
     */
    public int lineno() {
        return lineNumber;
    }

    /**
     * Specifies whether word tokens should be converted to lower case when they
     * are stored in {@code sval}.
     *
     * @param flag
     *            {@code true} if {@code sval} should be converted to lower
     *            case, {@code false} otherwise.
     */
    public void lowerCaseMode(boolean flag) {
        forceLowercase = flag;
    }

    /**
     * Parses the next token from this tokenizer's source stream or reader. The
     * type of the token is stored in the {@code ttype} field, additional
     * information may be stored in the {@code nval} or {@code sval} fields.
     * If {@code pushBack()} was called after a token had been read, that token
     * is returned again without reading from the source.
     *
     * @return the value of {@code ttype}.
     * @throws IOException
     *             if an I/O error occurs while parsing the next token.
     */
    public int nextToken() throws IOException {
        if (pushBackToken) {
            pushBackToken = false;
            if (ttype != TT_UNKNOWN) {
                return ttype;
            }
        }
        sval = null; // Always reset sval to null
        int currentChar = peekChar == NO_PEEK ? read() : peekChar;

        if (lastCr) {
            // Only the character directly after a significant '\r' may be
            // swallowed as the second half of a "\r\n" pair. Clearing the flag
            // unconditionally keeps a later, unrelated '\n' from being lost.
            lastCr = false;
            if (currentChar == '\n') {
                currentChar = read();
            }
        }
        if (currentChar == -1) {
            return (ttype = TT_EOF);
        }

        byte currentType = typeOf(currentChar);
        while ((currentType & TOKEN_WHITE) != 0) {
            // Skip over white space until we hit a new line or a real token
            if (currentChar == '\r') {
                lineNumber++;
                if (isEOLSignificant) {
                    lastCr = true;
                    peekChar = NO_PEEK;
                    return (ttype = TT_EOL);
                }
                // Treat "\r\n" as a single line terminator.
                if ((currentChar = read()) == '\n') {
                    currentChar = read();
                }
            } else if (currentChar == '\n') {
                lineNumber++;
                if (isEOLSignificant) {
                    peekChar = NO_PEEK;
                    return (ttype = TT_EOL);
                }
                currentChar = read();
            } else {
                // Advance over this white space character and try again.
                currentChar = read();
            }
            if (currentChar == -1) {
                return (ttype = TT_EOF);
            }
            currentType = typeOf(currentChar);
        }

        // Check for digits before checking for words since digits can be
        // contained within words.
        if ((currentType & TOKEN_DIGIT) != 0) {
            StringBuilder digits = new StringBuilder(20);
            boolean haveDecimal = false;
            boolean checkJustNegative = currentChar == '-';
            while (true) {
                if (currentChar == '.') {
                    haveDecimal = true;
                }
                digits.append((char) currentChar);
                currentChar = read();
                // Stop at the first character that is neither a decimal digit
                // nor the first '.' of the number.
                if ((currentChar < '0' || currentChar > '9')
                        && (haveDecimal || currentChar != '.')) {
                    break;
                }
            }
            peekChar = currentChar;
            if (checkJustNegative && digits.length() == 1) {
                // Didn't get any other digits other than '-'
                return (ttype = '-');
            }
            try {
                nval = Double.parseDouble(digits.toString());
            } catch (NumberFormatException e) {
                // The collected text (for example a lone ".") is not a valid
                // number; it is still reported as a number token with value 0.
                nval = 0;
            }
            return (ttype = TT_NUMBER);
        }
        // Check for words
        if ((currentType & TOKEN_WORD) != 0) {
            StringBuilder word = new StringBuilder(20);
            while (true) {
                word.append((char) currentChar);
                currentChar = read();
                if (currentChar == -1
                        || (typeOf(currentChar) & (TOKEN_WORD | TOKEN_DIGIT)) == 0) {
                    break;
                }
            }
            peekChar = currentChar;
            sval = word.toString();
            if (forceLowercase) {
                sval = sval.toLowerCase(Locale.getDefault());
            }
            return (ttype = TT_WORD);
        }
        // Check for quoted character
        if (currentType == TOKEN_QUOTE) {
            int matchQuote = currentChar;
            StringBuilder quoteString = new StringBuilder();
            int peekOne = read();
            // The string ends at the matching quote, a line terminator or EOF.
            while (peekOne >= 0 && peekOne != matchQuote && peekOne != '\r'
                    && peekOne != '\n') {
                boolean readPeek = true;
                if (peekOne == '\\') {
                    int c1 = read();
                    // Check for quoted octal IE: \377
                    if (c1 <= '7' && c1 >= '0') {
                        int digitValue = c1 - '0';
                        c1 = read();
                        if (c1 > '7' || c1 < '0') {
                            readPeek = false;
                        } else {
                            digitValue = digitValue * 8 + (c1 - '0');
                            c1 = read();
                            // limit the digit value to a byte
                            if (digitValue > 037 || c1 > '7' || c1 < '0') {
                                readPeek = false;
                            } else {
                                digitValue = digitValue * 8 + (c1 - '0');
                            }
                        }
                        if (!readPeek) {
                            // We've consumed one too many: emit the octal value
                            // and re-examine the extra character in the loop.
                            quoteString.append((char) digitValue);
                            peekOne = c1;
                        } else {
                            peekOne = digitValue;
                        }
                    } else {
                        switch (c1) {
                            case 'a':
                                peekOne = 0x7;
                                break;
                            case 'b':
                                peekOne = 0x8;
                                break;
                            case 'f':
                                peekOne = 0xc;
                                break;
                            case 'n':
                                peekOne = 0xA;
                                break;
                            case 'r':
                                peekOne = 0xD;
                                break;
                            case 't':
                                peekOne = 0x9;
                                break;
                            case 'v':
                                peekOne = 0xB;
                                break;
                            default:
                                // Any other escaped character stands for itself.
                                peekOne = c1;
                        }
                    }
                }
                if (readPeek) {
                    quoteString.append((char) peekOne);
                    peekOne = read();
                }
            }
            if (peekOne == matchQuote) {
                // Consume the closing quote.
                peekOne = read();
            }
            peekChar = peekOne;
            ttype = matchQuote;
            sval = quoteString.toString();
            return ttype;
        }
        // Do comments, both "//" and "/*stuff*/"
        if (currentChar == '/' && (slashSlashComments || slashStarComments)) {
            if ((currentChar = read()) == '*' && slashStarComments) {
                int peekOne = read();
                while (true) {
                    currentChar = peekOne;
                    peekOne = read();
                    if (currentChar == -1) {
                        // Unterminated comment: report the end of the stream.
                        peekChar = -1;
                        return (ttype = TT_EOF);
                    }
                    if (currentChar == '\r') {
                        if (peekOne == '\n') {
                            peekOne = read();
                        }
                        lineNumber++;
                    } else if (currentChar == '\n') {
                        lineNumber++;
                    } else if (currentChar == '*' && peekOne == '/') {
                        peekChar = read();
                        return nextToken();
                    }
                }
            } else if (currentChar == '/' && slashSlashComments) {
                // Skip to EOF or new line then return the next token
                return skipRestOfLine();
            } else if (currentType != TOKEN_COMMENT) {
                // Was just a slash by itself
                peekChar = currentChar;
                return (ttype = '/');
            }
            // Otherwise '/' is itself a comment character: fall through to the
            // single-character comment handling below. The character that was
            // read after the slash is discarded as part of the comment.
        }
        // Check for comment character
        if (currentType == TOKEN_COMMENT) {
            // Skip to EOF or new line then return the next token
            return skipRestOfLine();
        }

        // An ordinary character is its own token.
        peekChar = read();
        return (ttype = currentChar);
    }

    /**
     * Returns the syntax flags of {@code ch}. Characters above 255 have no
     * entry in the table and are always treated as word characters.
     */
    private byte typeOf(int ch) {
        return ch > 255 ? TOKEN_WORD : tokenTypes[ch];
    }

    /**
     * Discards characters up to the next line terminator or the end of the
     * stream, buffers that terminating character and parses the next token.
     */
    private int skipRestOfLine() throws IOException {
        int ch;
        do {
            ch = read();
        } while (ch >= 0 && ch != '\r' && ch != '\n');
        peekChar = ch;
        return nextToken();
    }

    /**
     * Specifies that the character {@code ch} shall be treated as an ordinary
     * character by this tokenizer. That is, it has no special meaning as a
     * comment character, word component, white space, string delimiter or
     * number. Values outside the range 0 to 255 are ignored.
     *
     * @param ch
     *            the character to be considered an ordinary character.
     */
    public void ordinaryChar(int ch) {
        if (ch >= 0 && ch < tokenTypes.length) {
            tokenTypes[ch] = 0;
        }
    }

    /**
     * Specifies that the characters in the range from {@code low} to {@code hi}
     * shall be treated as an ordinary character by this tokenizer. That is,
     * they have no special meaning as a comment character, word component,
     * white space, string delimiter or number. The range is clipped to the
     * values 0 to 255.
     *
     * @param low
     *            the first character in the range of ordinary characters.
     * @param hi
     *            the last character in the range of ordinary characters.
     */
    public void ordinaryChars(int low, int hi) {
        int first = Math.max(low, 0);
        // Clamp to the last valid index; this also covers hi == table length.
        int last = Math.min(hi, tokenTypes.length - 1);
        for (int i = first; i <= last; i++) {
            tokenTypes[i] = 0;
        }
    }

    /**
     * Specifies that this tokenizer shall parse numbers. The characters '0'
     * through '9', '.' and '-' are marked as numeric in addition to whatever
     * meaning they already have.
     */
    public void parseNumbers() {
        for (int i = '0'; i <= '9'; i++) {
            tokenTypes[i] |= TOKEN_DIGIT;
        }
        tokenTypes['.'] |= TOKEN_DIGIT;
        tokenTypes['-'] |= TOKEN_DIGIT;
    }

    /**
     * Indicates that the current token should be pushed back and returned again
     * the next time {@code nextToken()} is called.
     */
    public void pushBack() {
        pushBackToken = true;
    }

    /**
     * Specifies that the character {@code ch} shall be treated as a quote
     * character. Values outside the range 0 to 255 are ignored.
     *
     * @param ch
     *            the character to be considered a quote character.
     */
    public void quoteChar(int ch) {
        if (ch >= 0 && ch < tokenTypes.length) {
            tokenTypes[ch] = TOKEN_QUOTE;
        }
    }

    /**
     * Reads one character from whichever source this tokenizer was created
     * with.
     */
    private int read() throws IOException {
        // Call the read for the appropriate stream
        if (inStream == null) {
            return inReader.read();
        }
        return inStream.read();
    }

    /**
     * Specifies that all characters shall be treated as ordinary characters.
     */
    public void resetSyntax() {
        for (int i = 0; i < tokenTypes.length; i++) {
            tokenTypes[i] = 0;
        }
    }

    /**
     * Specifies whether "slash-slash" (C++-style) comments shall be recognized.
     * This kind of comment ends at the end of the line.
     *
     * @param flag
     *            {@code true} if {@code //} should be recognized as the start
     *            of a comment, {@code false} otherwise.
     */
    public void slashSlashComments(boolean flag) {
        slashSlashComments = flag;
    }

    /**
     * Specifies whether "slash-star" (C-style) comments shall be recognized.
     * Slash-star comments cannot be nested and end when a star-slash
     * combination is found.
     *
     * @param flag
     *            {@code true} if {@code /*} should be recognized as the start
     *            of a comment, {@code false} otherwise.
     */
    public void slashStarComments(boolean flag) {
        slashStarComments = flag;
    }

    /**
     * Returns the state of this tokenizer in a readable format.
     *
     * @return the current state of this tokenizer.
     */
    @Override
    public String toString() {
        // Values determined through experimentation
        StringBuilder result = new StringBuilder();
        result.append("Token[");
        switch (ttype) {
            case TT_EOF:
                result.append("EOF");
                break;
            case TT_EOL:
                result.append("EOL");
                break;
            case TT_NUMBER:
                result.append("n=");
                result.append(nval);
                break;
            case TT_WORD:
                result.append(sval);
                break;
            default:
                // ttype is a public field, so only consult the syntax table
                // when the value is a valid index into it.
                boolean inTable = ttype >= 0 && ttype < tokenTypes.length;
                if (ttype == TT_UNKNOWN
                        || (inTable && tokenTypes[ttype] == TOKEN_QUOTE)) {
                    result.append(sval);
                } else {
                    result.append('\'');
                    result.append((char) ttype);
                    result.append('\'');
                }
        }
        result.append("], line ");
        result.append(lineNumber);
        return result.toString();
    }

    /**
     * Specifies that the characters in the range from {@code low} to {@code hi}
     * shall be treated as whitespace characters by this tokenizer. Any other
     * meaning of these characters is replaced. The range is clipped to the
     * values 0 to 255.
     *
     * @param low
     *            the first character in the range of whitespace characters.
     * @param hi
     *            the last character in the range of whitespace characters.
     */
    public void whitespaceChars(int low, int hi) {
        int first = Math.max(low, 0);
        // Clamp to the last valid index; this also covers hi == table length.
        int last = Math.min(hi, tokenTypes.length - 1);
        for (int i = first; i <= last; i++) {
            tokenTypes[i] = TOKEN_WHITE;
        }
    }

    /**
     * Specifies that the characters in the range from {@code low} to {@code hi}
     * shall be treated as word characters by this tokenizer. A word consists of
     * a word character followed by zero or more word or number characters.
     * The word meaning is added to whatever meaning the characters already
     * have. The range is clipped to the values 0 to 255.
     *
     * @param low
     *            the first character in the range of word characters.
     * @param hi
     *            the last character in the range of word characters.
     */
    public void wordChars(int low, int hi) {
        int first = Math.max(low, 0);
        // Clamp to the last valid index; this also covers hi == table length.
        int last = Math.min(hi, tokenTypes.length - 1);
        for (int i = first; i <= last; i++) {
            tokenTypes[i] |= TOKEN_WORD;
        }
    }
}