1 /* 2 * Copyright 2016 Google Inc. All Rights Reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.turbine.parse; 18 19 import static com.google.turbine.parse.UnicodeEscapePreprocessor.ASCII_SUB; 20 21 import com.google.common.base.Verify; 22 import com.google.turbine.diag.SourceFile; 23 import com.google.turbine.diag.TurbineError; 24 import com.google.turbine.diag.TurbineError.ErrorKind; 25 26 /** A {@link Lexer} that streams input from a {@link UnicodeEscapePreprocessor}. */ 27 public class StreamLexer implements Lexer { 28 29 private final UnicodeEscapePreprocessor reader; 30 31 /** The current input character. */ 32 private char ch; 33 34 /** The start position of the current token. */ 35 private int position; 36 37 /** The start position of the current numeric literal or identifier token. */ 38 private int readFrom; 39 40 /** The value of the current string or character literal token. */ 41 private String value = null; 42 43 public StreamLexer(UnicodeEscapePreprocessor reader) { 44 this.reader = reader; 45 eat(); 46 } 47 48 /** Records the value of a literal. */ 49 private void saveValue(String value) { 50 this.value = value; 51 } 52 53 /** Records the start position of a literal. */ 54 private void readFrom() { 55 value = null; 56 readFrom = reader.position(); 57 } 58 59 /** Consumes an input character. */ 60 private void eat() { 61 ch = reader.next(); 62 } 63 64 @Override 65 public String stringValue() { 66 if (value != null) { 67 return value; 68 } 69 return reader.readString(readFrom, reader.position()); 70 } 71 72 @Override 73 public int position() { 74 return position; 75 } 76 77 @Override 78 public SourceFile source() { 79 return reader.source(); 80 } 81 82 @Override 83 public Token next() { 84 OUTER: 85 while (true) { 86 position = reader.position(); 87 switch (ch) { 88 case '\r': 89 case '\n': 90 case ' ': 91 case '\t': 92 case '\f': 93 eat(); 94 continue OUTER; 95 96 case '/': 97 { 98 eat(); 99 switch (ch) { 100 case '/': 101 while (true) { 102 eat(); 103 switch (ch) { 104 case '\n': 105 case '\r': 106 eat(); 107 continue OUTER; 108 case ASCII_SUB: 109 if (reader.done()) { 110 return Token.EOF; 111 } 112 eat(); 113 break; 114 } 115 } 116 case '*': 117 boolean sawStar = false; 118 while (true) { 119 eat(); 120 switch (ch) { 121 case '*': 122 sawStar = true; 123 break; 124 case '/': 125 if (sawStar) { 126 eat(); 127 continue OUTER; 128 } 129 sawStar = false; 130 break; 131 case ASCII_SUB: 132 if (reader.done()) { 133 return Token.EOF; 134 } 135 eat(); 136 break; 137 default: 138 sawStar = false; 139 break; 140 } 141 } 142 default: 143 if (ch == '=') { 144 eat(); 145 return Token.DIVEQ; 146 } 147 return Token.DIV; 148 } 149 } 150 151 case 'a': 152 case 'b': 153 case 'c': 154 case 'd': 155 case 'e': 156 case 'f': 157 case 'g': 158 case 'h': 159 case 'i': 160 case 'j': 161 case 'k': 162 case 'l': 163 case 'm': 164 case 'n': 165 case 'o': 166 case 'p': 167 case 'q': 168 case 'r': 169 case 's': 170 case 't': 171 case 'u': 172 case 'v': 173 case 'w': 174 case 'x': 175 case 'y': 176 case 'z': 177 case 'A': 178 case 'B': 179 case 'C': 180 case 'D': 181 case 'E': 182 case 'F': 183 case 'G': 184 case 'H': 185 case 'I': 186 case 'J': 187 case 'K': 188 case 'L': 189 case 'M': 190 case 'N': 191 case 'O': 192 case 'P': 193 case 'Q': 194 case 'R': 195 case 'S': 196 case 'T': 197 case 'U': 198 case 'V': 199 case 'W': 200 case 'X': 201 case 'Y': 202 case 'Z': 203 case '_': 204 case '$': 205 return identifier(); 206 207 case ASCII_SUB: 208 Verify.verify(reader.done()); 209 return Token.EOF; 210 211 case '-': 212 case '=': 213 case '>': 214 case '<': 215 case '!': 216 case '~': 217 case '+': 218 case '?': 219 case ':': 220 case '*': 221 case '&': 222 case '|': 223 case '^': 224 case '%': 225 return operator(); 226 case '(': 227 eat(); 228 return Token.LPAREN; 229 case ')': 230 eat(); 231 return Token.RPAREN; 232 case '{': 233 eat(); 234 return Token.LBRACE; 235 case '}': 236 eat(); 237 return Token.RBRACE; 238 case '[': 239 eat(); 240 return Token.LBRACK; 241 case ']': 242 eat(); 243 return Token.RBRACK; 244 case ';': 245 eat(); 246 return Token.SEMI; 247 case ',': 248 eat(); 249 return Token.COMMA; 250 case '@': 251 eat(); 252 return Token.AT; // what about frac, etc.? 253 254 case '0': 255 { 256 readFrom(); 257 eat(); 258 switch (ch) { 259 case 'x': 260 case 'X': 261 eat(); 262 return hexLiteral(); 263 case 'b': 264 case 'B': 265 eat(); 266 return boolLiteral(); 267 case '0': 268 case '1': 269 case '2': 270 case '3': 271 case '4': 272 case '5': 273 case '6': 274 case '7': 275 case '_': 276 return octalLiteral(); 277 case '.': 278 eat(); 279 return floatLiteral(); 280 case 'f': 281 case 'F': 282 eat(); 283 return Token.FLOAT_LITERAL; 284 case 'd': 285 case 'D': 286 eat(); 287 return Token.DOUBLE_LITERAL; 288 case 'l': 289 case 'L': 290 eat(); 291 return Token.LONG_LITERAL; 292 default: 293 return Token.INT_LITERAL; 294 } 295 } 296 case '1': 297 case '2': 298 case '3': 299 case '4': 300 case '5': 301 case '6': 302 case '7': 303 case '8': 304 case '9': 305 readFrom(); 306 return decimalLiteral(); 307 case '.': 308 { 309 readFrom(); 310 eat(); 311 switch (ch) { 312 case '.': 313 { 314 eat(); 315 if (ch == '.') { 316 eat(); 317 return Token.ELLIPSIS; 318 } else { 319 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 320 } 321 } 322 case '0': 323 case '1': 324 case '2': 325 case '3': 326 case '4': 327 case '5': 328 case '6': 329 case '7': 330 case '8': 331 case '9': 332 return floatLiteral(); 333 default: 334 return Token.DOT; 335 } 336 } 337 338 case '\'': 339 { 340 eat(); 341 char value; 342 if (ch == '\\') { 343 eat(); 344 value = escape(); 345 } else { 346 value = ch; 347 eat(); 348 } 349 if (ch == '\'') { 350 saveValue(String.valueOf(value)); 351 eat(); 352 return Token.CHAR_LITERAL; 353 } 354 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 355 } 356 357 case '"': 358 { 359 eat(); 360 readFrom(); 361 StringBuilder sb = new StringBuilder(); 362 STRING: 363 while (true) { 364 switch (ch) { 365 case '\\': 366 eat(); 367 sb.append(escape()); 368 continue STRING; 369 case '"': 370 saveValue(sb.toString()); 371 eat(); 372 return Token.STRING_LITERAL; 373 case ASCII_SUB: 374 if (reader.done()) { 375 return Token.EOF; 376 } 377 // falls through 378 default: 379 sb.append(ch); 380 eat(); 381 continue STRING; 382 } 383 } 384 } 385 default: 386 if (Character.isJavaIdentifierStart(ch)) { 387 // TODO(cushon): the style guide disallows non-ascii identifiers 388 return identifier(); 389 } 390 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 391 } 392 } 393 } 394 395 private char escape() { 396 boolean zeroToThree = false; 397 switch (ch) { 398 case 'b': 399 eat(); 400 return '\b'; 401 case 't': 402 eat(); 403 return '\t'; 404 case 'n': 405 eat(); 406 return '\n'; 407 case 'f': 408 eat(); 409 return '\f'; 410 case 'r': 411 eat(); 412 return '\r'; 413 case '"': 414 eat(); 415 return '\"'; 416 case '\'': 417 eat(); 418 return '\''; 419 case '\\': 420 eat(); 421 return '\\'; 422 case '0': 423 case '1': 424 case '2': 425 case '3': 426 zeroToThree = true; 427 // falls through 428 case '4': 429 case '5': 430 case '6': 431 case '7': 432 { 433 char value = (char) (ch - '0'); 434 eat(); 435 switch (ch) { 436 case '0': 437 case '1': 438 case '2': 439 case '3': 440 case '4': 441 case '5': 442 case '6': 443 case '7': 444 { 445 value = (char) ((value << 3) | (ch - '0')); 446 eat(); 447 if (zeroToThree) { 448 switch (ch) { 449 case '0': 450 case '1': 451 case '2': 452 case '3': 453 case '4': 454 case '5': 455 case '6': 456 case '7': 457 value = (char) ((value << 3) | (ch - '0')); 458 eat(); 459 return value; 460 default: 461 return value; 462 } 463 } 464 } 465 // fall through 466 default: 467 return value; 468 } 469 } 470 default: 471 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 472 } 473 } 474 475 private Token decimalLiteral() { 476 readDigits(); 477 switch (ch) { 478 case 'e': 479 case 'E': 480 return floatLiteral(); 481 case '.': 482 eat(); 483 return floatLiteral(); 484 case 'f': 485 case 'F': 486 eat(); 487 return Token.FLOAT_LITERAL; 488 case 'd': 489 case 'D': 490 eat(); 491 return Token.DOUBLE_LITERAL; 492 case 'l': 493 case 'L': 494 eat(); 495 return Token.LONG_LITERAL; 496 default: 497 return Token.INT_LITERAL; 498 } 499 } 500 501 private Token hexFloatLiteral() { 502 readHexDigits(); 503 switch (ch) { 504 case 'p': 505 case 'P': 506 eat(); 507 signedInteger(); 508 break; 509 } 510 return floatTypeSuffix(); 511 } 512 513 private Token floatLiteral() { 514 if ('0' <= ch && ch <= '9') { 515 readDigits(); 516 } 517 switch (ch) { 518 case 'e': 519 case 'E': 520 eat(); 521 signedInteger(); 522 break; 523 } 524 return floatTypeSuffix(); 525 } 526 527 private Token floatTypeSuffix() { 528 switch (ch) { 529 case 'd': 530 case 'D': 531 eat(); 532 return Token.DOUBLE_LITERAL; 533 case 'f': 534 case 'F': 535 eat(); 536 return Token.FLOAT_LITERAL; 537 default: 538 return Token.DOUBLE_LITERAL; 539 } 540 } 541 542 private void signedInteger() { 543 switch (ch) { 544 case '-': 545 case '+': 546 eat(); 547 break; 548 default: 549 break; 550 } 551 readDigits(); 552 } 553 554 private void readHexDigits() { 555 switch (ch) { 556 case 'A': 557 case 'B': 558 case 'C': 559 case 'D': 560 case 'E': 561 case 'F': 562 case 'a': 563 case 'b': 564 case 'c': 565 case 'd': 566 case 'e': 567 case 'f': 568 case '0': 569 case '1': 570 case '2': 571 case '3': 572 case '4': 573 case '5': 574 case '6': 575 case '7': 576 case '8': 577 case '9': 578 eat(); 579 break; 580 default: 581 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 582 } 583 OUTER: 584 while (true) { 585 switch (ch) { 586 case '_': 587 { 588 do { 589 eat(); 590 } while (ch == '_'); 591 switch (ch) { 592 case 'A': 593 case 'B': 594 case 'C': 595 case 'D': 596 case 'E': 597 case 'F': 598 case 'a': 599 case 'b': 600 case 'c': 601 case 'd': 602 case 'e': 603 case 'f': 604 case '0': 605 case '1': 606 case '2': 607 case '3': 608 case '4': 609 case '5': 610 case '6': 611 case '7': 612 case '8': 613 case '9': 614 continue OUTER; 615 default: 616 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 617 } 618 } 619 case 'A': 620 case 'B': 621 case 'C': 622 case 'D': 623 case 'E': 624 case 'F': 625 case 'a': 626 case 'b': 627 case 'c': 628 case 'd': 629 case 'e': 630 case 'f': 631 case '0': 632 case '1': 633 case '2': 634 case '3': 635 case '4': 636 case '5': 637 case '6': 638 case '7': 639 case '8': 640 case '9': 641 eat(); 642 break; 643 default: 644 return; 645 } 646 } 647 } 648 649 private void readDigits() { 650 if ('0' <= ch && ch <= '9') { 651 eat(); 652 } else { 653 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 654 } 655 OUTER: 656 while (true) { 657 switch (ch) { 658 case '_': 659 do { 660 eat(); 661 } while (ch == '_'); 662 if ('0' <= ch && ch <= '9') { 663 continue OUTER; 664 } else { 665 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 666 } 667 case '0': 668 case '1': 669 case '2': 670 case '3': 671 case '4': 672 case '5': 673 case '6': 674 case '7': 675 case '8': 676 case '9': 677 eat(); 678 continue OUTER; 679 default: 680 return; 681 } 682 } 683 } 684 685 private Token boolLiteral() { 686 readBinaryDigits(); 687 switch (ch) { 688 case 'l': 689 case 'L': 690 eat(); 691 return Token.LONG_LITERAL; 692 default: 693 return Token.INT_LITERAL; 694 } 695 } 696 697 private void readBinaryDigits() { 698 switch (ch) { 699 case '0': 700 case '1': 701 eat(); 702 break; 703 default: 704 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 705 } 706 OUTER: 707 while (true) { 708 switch (ch) { 709 case '_': 710 do { 711 eat(); 712 } while (ch == '_'); 713 switch (ch) { 714 case '0': 715 case '1': 716 continue OUTER; 717 default: 718 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 719 } 720 case '0': 721 case '1': 722 eat(); 723 continue OUTER; 724 default: 725 return; 726 } 727 } 728 } 729 730 private Token octalLiteral() { 731 readOctalDigits(); 732 switch (ch) { 733 case 'l': 734 case 'L': 735 eat(); 736 return Token.LONG_LITERAL; 737 default: 738 return Token.INT_LITERAL; 739 } 740 } 741 742 private void readOctalDigits() { 743 switch (ch) { 744 case '0': 745 case '1': 746 case '2': 747 case '3': 748 case '4': 749 case '5': 750 case '6': 751 case '7': 752 case '_': 753 eat(); 754 break; 755 default: 756 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 757 } 758 OUTER: 759 while (true) { 760 switch (ch) { 761 case '_': 762 do { 763 eat(); 764 } while (ch == '_'); 765 switch (ch) { 766 case '0': 767 case '1': 768 case '2': 769 case '3': 770 case '4': 771 case '5': 772 case '6': 773 case '7': 774 continue OUTER; 775 default: 776 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 777 } 778 case '0': 779 case '1': 780 case '2': 781 case '3': 782 case '4': 783 case '5': 784 case '6': 785 case '7': 786 eat(); 787 continue OUTER; 788 default: 789 return; 790 } 791 } 792 } 793 794 private Token hexLiteral() { 795 readHexDigits(); 796 switch (ch) { 797 case '.': 798 eat(); 799 return hexFloatLiteral(); 800 case 'l': 801 case 'L': 802 eat(); 803 return Token.LONG_LITERAL; 804 case 'p': 805 case 'P': 806 eat(); 807 signedInteger(); 808 return floatTypeSuffix(); 809 default: 810 return Token.INT_LITERAL; 811 } 812 } 813 814 private Token operator() { 815 switch (ch) { 816 case '=': 817 eat(); 818 if (ch == '=') { 819 eat(); 820 return Token.EQ; 821 } else { 822 return Token.ASSIGN; 823 } 824 case '>': 825 eat(); 826 switch (ch) { 827 case '=': 828 eat(); 829 return Token.GTE; 830 case '>': 831 eat(); 832 switch (ch) { 833 case '>': 834 eat(); 835 if (ch == '=') { 836 eat(); 837 return Token.GTGTGTE; 838 } else { 839 return Token.GTGTGT; 840 } 841 case '=': 842 eat(); 843 return Token.GTGTE; 844 default: 845 return Token.GTGT; 846 } 847 default: 848 return Token.GT; 849 } 850 case '<': 851 eat(); 852 switch (ch) { 853 case '=': 854 eat(); 855 return Token.LTE; 856 case '<': 857 eat(); 858 if (ch == '=') { 859 eat(); 860 return Token.LTLTE; 861 } else { 862 return Token.LTLT; 863 } 864 default: 865 return Token.LT; 866 } 867 case '!': 868 eat(); 869 if (ch == '=') { 870 eat(); 871 return Token.NOTEQ; 872 } else { 873 return Token.NOT; 874 } 875 case '~': 876 eat(); 877 return Token.TILDE; 878 case '?': 879 eat(); 880 return Token.COND; 881 case ':': 882 eat(); 883 if (ch == ':') { 884 eat(); 885 return Token.COLONCOLON; 886 } else { 887 return Token.COLON; 888 } 889 case '-': 890 eat(); 891 switch (ch) { 892 case '>': 893 eat(); 894 return Token.ARROW; 895 case '-': 896 eat(); 897 return Token.DECR; 898 case '=': 899 eat(); 900 return Token.MINUSEQ; 901 default: 902 return Token.MINUS; 903 } 904 case '&': 905 eat(); 906 switch (ch) { 907 case '&': 908 eat(); 909 return Token.ANDAND; 910 case '=': 911 eat(); 912 return Token.ANDEQ; 913 default: 914 return Token.AND; 915 } 916 case '|': 917 eat(); 918 switch (ch) { 919 case '=': 920 eat(); 921 return Token.OREQ; 922 case '|': 923 eat(); 924 return Token.OROR; 925 default: 926 return Token.OR; 927 } 928 case '+': 929 eat(); 930 switch (ch) { 931 case '+': 932 eat(); 933 return Token.INCR; 934 case '=': 935 eat(); 936 return Token.PLUSEQ; 937 default: 938 return Token.PLUS; 939 } 940 case '*': 941 eat(); 942 if (ch == '=') { 943 eat(); 944 return Token.MULTEQ; 945 } else { 946 return Token.MULT; 947 } 948 case '/': 949 // handled with comments 950 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 951 952 case '%': 953 eat(); 954 if (ch == '=') { 955 eat(); 956 return Token.MODEQ; 957 } else { 958 return Token.MOD; 959 } 960 case '^': 961 eat(); 962 if (ch == '=') { 963 eat(); 964 return Token.XOREQ; 965 } else { 966 return Token.XOR; 967 } 968 default: 969 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 970 } 971 } 972 973 private Token identifier() { 974 readFrom(); 975 eat(); 976 // TODO(cushon): the style guide disallows non-ascii identifiers 977 while (Character.isJavaIdentifierPart(ch)) { 978 if (ch == ASCII_SUB && reader.done()) { 979 break; 980 } 981 eat(); 982 } 983 return makeIdent(stringValue()); 984 } 985 986 private Token makeIdent(String s) { 987 switch (s) { 988 case "abstract": 989 return Token.ABSTRACT; 990 case "assert": 991 return Token.ASSERT; 992 case "boolean": 993 return Token.BOOLEAN; 994 case "break": 995 return Token.BREAK; 996 case "byte": 997 return Token.BYTE; 998 case "case": 999 return Token.CASE; 1000 case "catch": 1001 return Token.CATCH; 1002 case "char": 1003 return Token.CHAR; 1004 case "class": 1005 return Token.CLASS; 1006 case "const": 1007 return Token.CONST; 1008 case "continue": 1009 return Token.CONTINUE; 1010 case "default": 1011 return Token.DEFAULT; 1012 case "do": 1013 return Token.DO; 1014 case "double": 1015 return Token.DOUBLE; 1016 case "else": 1017 return Token.ELSE; 1018 case "enum": 1019 return Token.ENUM; 1020 case "extends": 1021 return Token.EXTENDS; 1022 case "final": 1023 return Token.FINAL; 1024 case "finally": 1025 return Token.FINALLY; 1026 case "float": 1027 return Token.FLOAT; 1028 case "for": 1029 return Token.FOR; 1030 case "goto": 1031 return Token.GOTO; 1032 case "if": 1033 return Token.IF; 1034 case "implements": 1035 return Token.IMPLEMENTS; 1036 case "import": 1037 return Token.IMPORT; 1038 case "instanceof": 1039 return Token.INSTANCEOF; 1040 case "int": 1041 return Token.INT; 1042 case "interface": 1043 return Token.INTERFACE; 1044 case "long": 1045 return Token.LONG; 1046 case "native": 1047 return Token.NATIVE; 1048 case "new": 1049 return Token.NEW; 1050 case "package": 1051 return Token.PACKAGE; 1052 case "private": 1053 return Token.PRIVATE; 1054 case "protected": 1055 return Token.PROTECTED; 1056 case "public": 1057 return Token.PUBLIC; 1058 case "return": 1059 return Token.RETURN; 1060 case "short": 1061 return Token.SHORT; 1062 case "static": 1063 return Token.STATIC; 1064 case "strictfp": 1065 return Token.STRICTFP; 1066 case "super": 1067 return Token.SUPER; 1068 case "switch": 1069 return Token.SWITCH; 1070 case "synchronized": 1071 return Token.SYNCHRONIZED; 1072 case "this": 1073 return Token.THIS; 1074 case "throw": 1075 return Token.THROW; 1076 case "throws": 1077 return Token.THROWS; 1078 case "transient": 1079 return Token.TRANSIENT; 1080 case "try": 1081 return Token.TRY; 1082 case "void": 1083 return Token.VOID; 1084 case "volatile": 1085 return Token.VOLATILE; 1086 case "while": 1087 return Token.WHILE; 1088 case "true": 1089 return Token.TRUE; 1090 case "false": 1091 return Token.FALSE; 1092 case "null": 1093 return Token.NULL; 1094 default: 1095 return Token.IDENT; 1096 } 1097 } 1098 1099 private TurbineError error(ErrorKind kind, Object... args) { 1100 return TurbineError.format(reader.source(), reader.position(), kind, args); 1101 } 1102 } 1103