1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package java.io; 19 20 /** 21 * Parses a stream into a set of defined tokens, one at a time. The different 22 * types of tokens that can be found are numbers, identifiers, quoted strings, 23 * and different comment styles. The class can be used for limited processing 24 * of source code of programming languages like Java, although it is nowhere 25 * near a full parser. 26 */ 27 public class StreamTokenizer { 28 /** 29 * Contains a number if the current token is a number ({@code ttype} == 30 * {@code TT_NUMBER}). 31 */ 32 public double nval; 33 34 /** 35 * Contains a string if the current token is a word ({@code ttype} == 36 * {@code TT_WORD}). 37 */ 38 public String sval; 39 40 /** 41 * The constant representing the end of the stream. 42 */ 43 public static final int TT_EOF = -1; 44 45 /** 46 * The constant representing the end of the line. 47 */ 48 public static final int TT_EOL = '\n'; 49 50 /** 51 * The constant representing a number token. 52 */ 53 public static final int TT_NUMBER = -2; 54 55 /** 56 * The constant representing a word token. 57 */ 58 public static final int TT_WORD = -3; 59 60 /** 61 * Internal representation of unknown state. 62 */ 63 private static final int TT_UNKNOWN = -4; 64 65 /** 66 * After calling {@code nextToken()}, {@code ttype} contains the type of 67 * token that has been read. When a single character is read, its value 68 * converted to an integer is stored in {@code ttype}. For a quoted string, 69 * the value is the quoted character. Otherwise, its value is one of the 70 * following: 71 * <ul> 72 * <li> {@code TT_WORD} - the token is a word.</li> 73 * <li> {@code TT_NUMBER} - the token is a number.</li> 74 * <li> {@code TT_EOL} - the end of line has been reached. Depends on 75 * whether {@code eolIsSignificant} is {@code true}.</li> 76 * <li> {@code TT_EOF} - the end of the stream has been reached.</li> 77 * </ul> 78 */ 79 public int ttype = TT_UNKNOWN; 80 81 /** 82 * Internal character meanings, 0 implies TOKEN_ORDINARY 83 */ 84 private byte[] tokenTypes = new byte[256]; 85 86 private static final byte TOKEN_COMMENT = 1; 87 88 private static final byte TOKEN_QUOTE = 2; 89 90 private static final byte TOKEN_WHITE = 4; 91 92 private static final byte TOKEN_WORD = 8; 93 94 private static final byte TOKEN_DIGIT = 16; 95 96 private int lineNumber = 1; 97 98 private boolean forceLowercase; 99 100 private boolean isEOLSignificant; 101 102 private boolean slashStarComments; 103 104 private boolean slashSlashComments; 105 106 private boolean pushBackToken; 107 108 private boolean lastCr; 109 110 /* One of these will have the stream */ 111 private InputStream inStream; 112 113 private Reader inReader; 114 115 private int peekChar = -2; 116 117 /** 118 * Private constructor to initialize the default values according to the 119 * specification. 120 */ 121 private StreamTokenizer() { 122 /* 123 * Initialize the default state per specification. All byte values 'A' 124 * through 'Z', 'a' through 'z', and '\u00A0' through '\u00FF' are 125 * considered to be alphabetic. 126 */ 127 wordChars('A', 'Z'); 128 wordChars('a', 'z'); 129 wordChars(160, 255); 130 /** 131 * All byte values '\u0000' through '\u0020' are considered to be white 132 * space. 133 */ 134 whitespaceChars(0, 32); 135 /** 136 * '/' is a comment character. Single quote '\'' and double quote '"' 137 * are string quote characters. 138 */ 139 commentChar('/'); 140 quoteChar('"'); 141 quoteChar('\''); 142 /** 143 * Numbers are parsed. 144 */ 145 parseNumbers(); 146 /** 147 * Ends of lines are treated as white space, not as separate tokens. 148 * C-style and C++-style comments are not recognized. These are the 149 * defaults and are not needed in constructor. 150 */ 151 } 152 153 /** 154 * Constructs a new {@code StreamTokenizer} with {@code is} as source input 155 * stream. This constructor is deprecated; instead, the constructor that 156 * takes a {@code Reader} as an arugment should be used. 157 * 158 * @param is 159 * the source stream from which to parse tokens. 160 * @throws NullPointerException 161 * if {@code is} is {@code null}. 162 * @deprecated Use {@link #StreamTokenizer(Reader)} 163 */ 164 @Deprecated 165 public StreamTokenizer(InputStream is) { 166 this(); 167 if (is == null) { 168 throw new NullPointerException(); 169 } 170 inStream = is; 171 } 172 173 /** 174 * Constructs a new {@code StreamTokenizer} with {@code r} as source reader. 175 * The tokenizer's initial state is as follows: 176 * <ul> 177 * <li>All byte values 'A' through 'Z', 'a' through 'z', and '\u00A0' 178 * through '\u00FF' are considered to be alphabetic.</li> 179 * <li>All byte values '\u0000' through '\u0020' are considered to 180 * be white space. '/' is a comment character.</li> 181 * <li>Single quote '\'' and double quote '"' are string quote characters. 182 * </li> 183 * <li>Numbers are parsed.</li> 184 * <li>End of lines are considered to be white space rather than separate 185 * tokens.</li> 186 * <li>C-style and C++-style comments are not recognized.</LI> 187 * </ul> 188 * 189 * @param r 190 * the source reader from which to parse tokens. 191 */ 192 public StreamTokenizer(Reader r) { 193 this(); 194 if (r == null) { 195 throw new NullPointerException(); 196 } 197 inReader = r; 198 } 199 200 /** 201 * Specifies that the character {@code ch} shall be treated as a comment 202 * character. 203 * 204 * @param ch 205 * the character to be considered a comment character. 206 */ 207 public void commentChar(int ch) { 208 if (0 <= ch && ch < tokenTypes.length) { 209 tokenTypes[ch] = TOKEN_COMMENT; 210 } 211 } 212 213 /** 214 * Specifies whether the end of a line is significant and should be returned 215 * as {@code TT_EOF} in {@code ttype} by this tokenizer. 216 * 217 * @param flag 218 * {@code true} if EOL is significant, {@code false} otherwise. 219 */ 220 public void eolIsSignificant(boolean flag) { 221 isEOLSignificant = flag; 222 } 223 224 /** 225 * Returns the current line number. 226 * 227 * @return this tokenizer's current line number. 228 */ 229 public int lineno() { 230 return lineNumber; 231 } 232 233 /** 234 * Specifies whether word tokens should be converted to lower case when they 235 * are stored in {@code sval}. 236 * 237 * @param flag 238 * {@code true} if {@code sval} should be converted to lower 239 * case, {@code false} otherwise. 240 */ 241 public void lowerCaseMode(boolean flag) { 242 forceLowercase = flag; 243 } 244 245 /** 246 * Parses the next token from this tokenizer's source stream or reader. The 247 * type of the token is stored in the {@code ttype} field, additional 248 * information may be stored in the {@code nval} or {@code sval} fields. 249 * 250 * @return the value of {@code ttype}. 251 * @throws IOException 252 * if an I/O error occurs while parsing the next token. 253 */ 254 public int nextToken() throws IOException { 255 if (pushBackToken) { 256 pushBackToken = false; 257 if (ttype != TT_UNKNOWN) { 258 return ttype; 259 } 260 } 261 sval = null; // Always reset sval to null 262 int currentChar = peekChar == -2 ? read() : peekChar; 263 264 if (lastCr && currentChar == '\n') { 265 lastCr = false; 266 currentChar = read(); 267 } 268 if (currentChar == -1) { 269 return (ttype = TT_EOF); 270 } 271 272 byte currentType = currentChar > 255 ? TOKEN_WORD 273 : tokenTypes[currentChar]; 274 while ((currentType & TOKEN_WHITE) != 0) { 275 /** 276 * Skip over white space until we hit a new line or a real token 277 */ 278 if (currentChar == '\r') { 279 lineNumber++; 280 if (isEOLSignificant) { 281 lastCr = true; 282 peekChar = -2; 283 return (ttype = TT_EOL); 284 } 285 if ((currentChar = read()) == '\n') { 286 currentChar = read(); 287 } 288 } else if (currentChar == '\n') { 289 lineNumber++; 290 if (isEOLSignificant) { 291 peekChar = -2; 292 return (ttype = TT_EOL); 293 } 294 currentChar = read(); 295 } else { 296 // Advance over this white space character and try again. 297 currentChar = read(); 298 } 299 if (currentChar == -1) { 300 return (ttype = TT_EOF); 301 } 302 currentType = currentChar > 255 ? TOKEN_WORD 303 : tokenTypes[currentChar]; 304 } 305 306 /** 307 * Check for digits before checking for words since digits can be 308 * contained within words. 309 */ 310 if ((currentType & TOKEN_DIGIT) != 0) { 311 StringBuilder digits = new StringBuilder(20); 312 boolean haveDecimal = false, checkJustNegative = currentChar == '-'; 313 while (true) { 314 if (currentChar == '.') { 315 haveDecimal = true; 316 } 317 digits.append((char) currentChar); 318 currentChar = read(); 319 if ((currentChar < '0' || currentChar > '9') 320 && (haveDecimal || currentChar != '.')) { 321 break; 322 } 323 } 324 peekChar = currentChar; 325 if (checkJustNegative && digits.length() == 1) { 326 // Didn't get any other digits other than '-' 327 return (ttype = '-'); 328 } 329 try { 330 nval = Double.valueOf(digits.toString()).doubleValue(); 331 } catch (NumberFormatException e) { 332 // Unsure what to do, will write test. 333 nval = 0; 334 } 335 return (ttype = TT_NUMBER); 336 } 337 // Check for words 338 if ((currentType & TOKEN_WORD) != 0) { 339 StringBuilder word = new StringBuilder(20); 340 while (true) { 341 word.append((char) currentChar); 342 currentChar = read(); 343 if (currentChar == -1 344 || (currentChar < 256 && (tokenTypes[currentChar] & (TOKEN_WORD | TOKEN_DIGIT)) == 0)) { 345 break; 346 } 347 } 348 peekChar = currentChar; 349 sval = forceLowercase ? word.toString().toLowerCase() : word 350 .toString(); 351 return (ttype = TT_WORD); 352 } 353 // Check for quoted character 354 if (currentType == TOKEN_QUOTE) { 355 int matchQuote = currentChar; 356 StringBuilder quoteString = new StringBuilder(); 357 int peekOne = read(); 358 while (peekOne >= 0 && peekOne != matchQuote && peekOne != '\r' 359 && peekOne != '\n') { 360 boolean readPeek = true; 361 if (peekOne == '\\') { 362 int c1 = read(); 363 // Check for quoted octal IE: \377 364 if (c1 <= '7' && c1 >= '0') { 365 int digitValue = c1 - '0'; 366 c1 = read(); 367 if (c1 > '7' || c1 < '0') { 368 readPeek = false; 369 } else { 370 digitValue = digitValue * 8 + (c1 - '0'); 371 c1 = read(); 372 // limit the digit value to a byte 373 if (digitValue > 037 || c1 > '7' || c1 < '0') { 374 readPeek = false; 375 } else { 376 digitValue = digitValue * 8 + (c1 - '0'); 377 } 378 } 379 if (!readPeek) { 380 // We've consumed one to many 381 quoteString.append((char) digitValue); 382 peekOne = c1; 383 } else { 384 peekOne = digitValue; 385 } 386 } else { 387 switch (c1) { 388 case 'a': 389 peekOne = 0x7; 390 break; 391 case 'b': 392 peekOne = 0x8; 393 break; 394 case 'f': 395 peekOne = 0xc; 396 break; 397 case 'n': 398 peekOne = 0xA; 399 break; 400 case 'r': 401 peekOne = 0xD; 402 break; 403 case 't': 404 peekOne = 0x9; 405 break; 406 case 'v': 407 peekOne = 0xB; 408 break; 409 default: 410 peekOne = c1; 411 } 412 } 413 } 414 if (readPeek) { 415 quoteString.append((char) peekOne); 416 peekOne = read(); 417 } 418 } 419 if (peekOne == matchQuote) { 420 peekOne = read(); 421 } 422 peekChar = peekOne; 423 ttype = matchQuote; 424 sval = quoteString.toString(); 425 return ttype; 426 } 427 // Do comments, both "//" and "/*stuff*/" 428 if (currentChar == '/' && (slashSlashComments || slashStarComments)) { 429 if ((currentChar = read()) == '*' && slashStarComments) { 430 int peekOne = read(); 431 while (true) { 432 currentChar = peekOne; 433 peekOne = read(); 434 if (currentChar == -1) { 435 peekChar = -1; 436 return (ttype = TT_EOF); 437 } 438 if (currentChar == '\r') { 439 if (peekOne == '\n') { 440 peekOne = read(); 441 } 442 lineNumber++; 443 } else if (currentChar == '\n') { 444 lineNumber++; 445 } else if (currentChar == '*' && peekOne == '/') { 446 peekChar = read(); 447 return nextToken(); 448 } 449 } 450 } else if (currentChar == '/' && slashSlashComments) { 451 // Skip to EOF or new line then return the next token 452 while ((currentChar = read()) >= 0 && currentChar != '\r' 453 && currentChar != '\n') { 454 // Intentionally empty 455 } 456 peekChar = currentChar; 457 return nextToken(); 458 } else if (currentType != TOKEN_COMMENT) { 459 // Was just a slash by itself 460 peekChar = currentChar; 461 return (ttype = '/'); 462 } 463 } 464 // Check for comment character 465 if (currentType == TOKEN_COMMENT) { 466 // Skip to EOF or new line then return the next token 467 while ((currentChar = read()) >= 0 && currentChar != '\r' 468 && currentChar != '\n') { 469 // Intentionally empty 470 } 471 peekChar = currentChar; 472 return nextToken(); 473 } 474 475 peekChar = read(); 476 return (ttype = currentChar); 477 } 478 479 /** 480 * Specifies that the character {@code ch} shall be treated as an ordinary 481 * character by this tokenizer. That is, it has no special meaning as a 482 * comment character, word component, white space, string delimiter or 483 * number. 484 * 485 * @param ch 486 * the character to be considered an ordinary character. 487 */ 488 public void ordinaryChar(int ch) { 489 if (0 <= ch && ch < tokenTypes.length) { 490 tokenTypes[ch] = 0; 491 } 492 } 493 494 /** 495 * Specifies that the characters in the range from {@code low} to {@code hi} 496 * shall be treated as an ordinary character by this tokenizer. That is, 497 * they have no special meaning as a comment character, word component, 498 * white space, string delimiter or number. 499 * 500 * @param low 501 * the first character in the range of ordinary characters. 502 * @param hi 503 * the last character in the range of ordinary characters. 504 */ 505 public void ordinaryChars(int low, int hi) { 506 if (low < 0) { 507 low = 0; 508 } 509 if (hi > tokenTypes.length) { 510 hi = tokenTypes.length - 1; 511 } 512 for (int i = low; i <= hi; i++) { 513 tokenTypes[i] = 0; 514 } 515 } 516 517 /** 518 * Specifies that this tokenizer shall parse numbers. 519 */ 520 public void parseNumbers() { 521 for (int i = '0'; i <= '9'; i++) { 522 tokenTypes[i] |= TOKEN_DIGIT; 523 } 524 tokenTypes['.'] |= TOKEN_DIGIT; 525 tokenTypes['-'] |= TOKEN_DIGIT; 526 } 527 528 /** 529 * Indicates that the current token should be pushed back and returned again 530 * the next time {@code nextToken()} is called. 531 */ 532 public void pushBack() { 533 pushBackToken = true; 534 } 535 536 /** 537 * Specifies that the character {@code ch} shall be treated as a quote 538 * character. 539 * 540 * @param ch 541 * the character to be considered a quote character. 542 */ 543 public void quoteChar(int ch) { 544 if (0 <= ch && ch < tokenTypes.length) { 545 tokenTypes[ch] = TOKEN_QUOTE; 546 } 547 } 548 549 private int read() throws IOException { 550 // Call the read for the appropriate stream 551 if (inStream == null) { 552 return inReader.read(); 553 } 554 return inStream.read(); 555 } 556 557 /** 558 * Specifies that all characters shall be treated as ordinary characters. 559 */ 560 public void resetSyntax() { 561 for (int i = 0; i < 256; i++) { 562 tokenTypes[i] = 0; 563 } 564 } 565 566 /** 567 * Specifies whether "slash-slash" (C++-style) comments shall be recognized. 568 * This kind of comment ends at the end of the line. 569 * 570 * @param flag 571 * {@code true} if {@code //} should be recognized as the start 572 * of a comment, {@code false} otherwise. 573 */ 574 public void slashSlashComments(boolean flag) { 575 slashSlashComments = flag; 576 } 577 578 /** 579 * Specifies whether "slash-star" (C-style) comments shall be recognized. 580 * Slash-star comments cannot be nested and end when a star-slash 581 * combination is found. 582 * 583 * @param flag 584 * {@code true} if {@code /*} should be recognized as the start 585 * of a comment, {@code false} otherwise. 586 */ 587 public void slashStarComments(boolean flag) { 588 slashStarComments = flag; 589 } 590 591 /** 592 * Returns the state of this tokenizer in a readable format. 593 * 594 * @return the current state of this tokenizer. 595 */ 596 @Override 597 public String toString() { 598 // Values determined through experimentation 599 StringBuilder result = new StringBuilder(); 600 result.append("Token["); 601 switch (ttype) { 602 case TT_EOF: 603 result.append("EOF"); 604 break; 605 case TT_EOL: 606 result.append("EOL"); 607 break; 608 case TT_NUMBER: 609 result.append("n="); 610 result.append(nval); 611 break; 612 case TT_WORD: 613 result.append(sval); 614 break; 615 default: 616 if (ttype == TT_UNKNOWN || tokenTypes[ttype] == TOKEN_QUOTE) { 617 result.append(sval); 618 } else { 619 result.append('\''); 620 result.append((char) ttype); 621 result.append('\''); 622 } 623 } 624 result.append("], line "); 625 result.append(lineNumber); 626 return result.toString(); 627 } 628 629 /** 630 * Specifies that the characters in the range from {@code low} to {@code hi} 631 * shall be treated as whitespace characters by this tokenizer. 632 * 633 * @param low 634 * the first character in the range of whitespace characters. 635 * @param hi 636 * the last character in the range of whitespace characters. 637 */ 638 public void whitespaceChars(int low, int hi) { 639 if (low < 0) { 640 low = 0; 641 } 642 if (hi > tokenTypes.length) { 643 hi = tokenTypes.length - 1; 644 } 645 for (int i = low; i <= hi; i++) { 646 tokenTypes[i] = TOKEN_WHITE; 647 } 648 } 649 650 /** 651 * Specifies that the characters in the range from {@code low} to {@code hi} 652 * shall be treated as word characters by this tokenizer. A word consists of 653 * a word character followed by zero or more word or number characters. 654 * 655 * @param low 656 * the first character in the range of word characters. 657 * @param hi 658 * the last character in the range of word characters. 659 */ 660 public void wordChars(int low, int hi) { 661 if (low < 0) { 662 low = 0; 663 } 664 if (hi > tokenTypes.length) { 665 hi = tokenTypes.length - 1; 666 } 667 for (int i = low; i <= hi; i++) { 668 tokenTypes[i] |= TOKEN_WORD; 669 } 670 } 671 } 672