1 /* 2 * Copyright 2016 Google Inc. All Rights Reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.turbine.parse; 18 19 import com.google.turbine.diag.SourceFile; 20 21 /** Preprocesses Unicode escape characters in Java source code, as described in JLS 3.3. */ 22 public class UnicodeEscapePreprocessor { 23 24 public static final char ASCII_SUB = 0x1A; 25 26 private final SourceFile source; 27 private final String input; 28 29 private int idx = 0; 30 private char ch; 31 private boolean evenLeadingSlashes = true; 32 33 public UnicodeEscapePreprocessor(SourceFile source) { 34 this.source = source; 35 this.input = source.source(); 36 } 37 38 /** Returns the current position in the input. */ 39 public int position() { 40 return idx - 1; 41 } 42 43 /** Returns true if all input has been read. */ 44 public boolean done() { 45 return idx >= input.length(); 46 } 47 48 /** Returns the next unescaped Unicode input character. */ 49 public char next() { 50 eat(); 51 if (ch == '\\' && evenLeadingSlashes) { 52 unicodeEscape(); 53 } else { 54 evenLeadingSlashes = true; 55 } 56 return ch; 57 } 58 59 /** Returns a substring of the raw (escaped) input. */ 60 public String readString(int from, int to) { 61 return input.substring(from, to); 62 } 63 64 /** Consumes a Unicode escape. */ 65 private void unicodeEscape() { 66 eat(); 67 if (ch != 'u') { 68 idx--; 69 ch = '\\'; 70 evenLeadingSlashes = false; 71 return; 72 } 73 do { 74 eat(); 75 } while (ch == 'u'); 76 char acc = (char) ((hexDigit(ch) & 0xff) << 12); 77 eat(); 78 acc |= (char) ((hexDigit(ch) & 0xff) << 8); 79 eat(); 80 acc |= (char) ((hexDigit(ch) & 0xff) << 4); 81 eat(); 82 acc |= (char) (hexDigit(ch) & 0xff); 83 ch = acc; 84 evenLeadingSlashes = ch != '\\'; 85 } 86 87 /** Consumes a hex digit. */ 88 private static int hexDigit(char d) { 89 switch (d) { 90 case '0': 91 case '1': 92 case '2': 93 case '3': 94 case '4': 95 case '5': 96 case '6': 97 case '7': 98 case '8': 99 case '9': 100 return (d - '0'); 101 case 'A': 102 case 'B': 103 case 'C': 104 case 'D': 105 case 'E': 106 case 'F': 107 return ((d - 'A') + 10); 108 case 'a': 109 case 'b': 110 case 'c': 111 case 'd': 112 case 'e': 113 case 'f': 114 return ((d - 'a') + 10); 115 case ASCII_SUB: 116 throw new AssertionError("unexpected end of input"); 117 default: 118 throw new AssertionError(String.format("unexpected hex digit: 0x%x", (int) d)); 119 } 120 } 121 122 /** 123 * Consumes a raw input character. 124 * 125 * <p>Once the input is exhausted, {@code ch} will always be ASCII SUB (\u001a). JLS 3.5 requires 126 * ASCII SUB to be ignored if it is the last character in the escaped input stream, and assuming 127 * it terminates the input avoids some bounds checks in the lexer. 128 */ 129 private void eat() { 130 ch = done() ? ASCII_SUB : input.charAt(idx); 131 idx++; 132 } 133 134 public SourceFile source() { 135 return source; 136 } 137 } 138