/*
 * Copyright 2016 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
     16 
     17 package com.google.turbine.parse;
     18 
     19 import com.google.turbine.diag.SourceFile;
     20 
     21 /** Preprocesses Unicode escape characters in Java source code, as described in JLS 3.3. */
     22 public class UnicodeEscapePreprocessor {
     23 
     24   public static final char ASCII_SUB = 0x1A;
     25 
     26   private final SourceFile source;
     27   private final String input;
     28 
     29   private int idx = 0;
     30   private char ch;
     31   private boolean evenLeadingSlashes = true;
     32 
     33   public UnicodeEscapePreprocessor(SourceFile source) {
     34     this.source = source;
     35     this.input = source.source();
     36   }
     37 
     38   /** Returns the current position in the input. */
     39   public int position() {
     40     return idx - 1;
     41   }
     42 
     43   /** Returns true if all input has been read. */
     44   public boolean done() {
     45     return idx >= input.length();
     46   }
     47 
     48   /** Returns the next unescaped Unicode input character. */
     49   public char next() {
     50     eat();
     51     if (ch == '\\' && evenLeadingSlashes) {
     52       unicodeEscape();
     53     } else {
     54       evenLeadingSlashes = true;
     55     }
     56     return ch;
     57   }
     58 
     59   /** Returns a substring of the raw (escaped) input. */
     60   public String readString(int from, int to) {
     61     return input.substring(from, to);
     62   }
     63 
     64   /** Consumes a Unicode escape. */
     65   private void unicodeEscape() {
     66     eat();
     67     if (ch != 'u') {
     68       idx--;
     69       ch = '\\';
     70       evenLeadingSlashes = false;
     71       return;
     72     }
     73     do {
     74       eat();
     75     } while (ch == 'u');
     76     char acc = (char) ((hexDigit(ch) & 0xff) << 12);
     77     eat();
     78     acc |= (char) ((hexDigit(ch) & 0xff) << 8);
     79     eat();
     80     acc |= (char) ((hexDigit(ch) & 0xff) << 4);
     81     eat();
     82     acc |= (char) (hexDigit(ch) & 0xff);
     83     ch = acc;
     84     evenLeadingSlashes = ch != '\\';
     85   }
     86 
     87   /** Consumes a hex digit. */
     88   private static int hexDigit(char d) {
     89     switch (d) {
     90       case '0':
     91       case '1':
     92       case '2':
     93       case '3':
     94       case '4':
     95       case '5':
     96       case '6':
     97       case '7':
     98       case '8':
     99       case '9':
    100         return (d - '0');
    101       case 'A':
    102       case 'B':
    103       case 'C':
    104       case 'D':
    105       case 'E':
    106       case 'F':
    107         return ((d - 'A') + 10);
    108       case 'a':
    109       case 'b':
    110       case 'c':
    111       case 'd':
    112       case 'e':
    113       case 'f':
    114         return ((d - 'a') + 10);
    115       case ASCII_SUB:
    116         throw new AssertionError("unexpected end of input");
    117       default:
    118         throw new AssertionError(String.format("unexpected hex digit: 0x%x", (int) d));
    119     }
    120   }
    121 
    122   /**
    123    * Consumes a raw input character.
    124    *
    125    * <p>Once the input is exhausted, {@code ch} will always be ASCII SUB (\u001a). JLS 3.5 requires
    126    * ASCII SUB to be ignored if it is the last character in the escaped input stream, and assuming
    127    * it terminates the input avoids some bounds checks in the lexer.
    128    */
    129   private void eat() {
    130     ch = done() ? ASCII_SUB : input.charAt(idx);
    131     idx++;
    132   }
    133 
    134   public SourceFile source() {
    135     return source;
    136   }
    137 }
    138