Home | History | Annotate | Download | only in impl
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 // © 2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 2009-2015, Google, International Business Machines Corporation
      7  * and others. All Rights Reserved.
      8  *******************************************************************************
      9  */
     10 package android.icu.impl;
     11 
     12 import java.io.BufferedReader;
     13 import java.io.FileInputStream;
     14 import java.io.IOException;
     15 import java.io.InputStream;
     16 import java.io.InputStreamReader;
     17 import java.io.UnsupportedEncodingException;
     18 import java.text.ParsePosition;
     19 import java.util.Arrays;
     20 import java.util.Comparator;
     21 import java.util.LinkedHashSet;
     22 import java.util.List;
     23 import java.util.Map;
     24 import java.util.Map.Entry;
     25 import java.util.Set;
     26 import java.util.TreeMap;
     27 import java.util.regex.Pattern;
     28 
     29 import android.icu.text.StringTransform;
     30 import android.icu.text.SymbolTable;
     31 import android.icu.text.UnicodeSet;
     32 import android.icu.util.Freezable;
     33 
     34 /**
     35  * Contains utilities to supplement the JDK Regex, since it doesn't handle
     36  * Unicode well.
     37  *
     38  * <p>TODO: Move to android.icu.dev.somewhere.
     39  * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools.
     40  *
     41  * @author markdavis
     42  * @hide Only a subset of ICU is exposed in Android
     43  */
     44 public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
     45     // Note: we don't currently have any state, but intend to in the future,
     46     // particularly for the regex style supported.
     47 
     48     private SymbolTable symbolTable;
     49 
     50     /**
     51      * Set the symbol table for internal processing
     52      * @hide draft / provisional / internal are hidden on Android
     53      */
     54     public SymbolTable getSymbolTable() {
     55         return symbolTable;
     56     }
     57 
     58     /**
     59      * Get the symbol table for internal processing
     60      * @hide draft / provisional / internal are hidden on Android
     61      */
     62     public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
     63         this.symbolTable = symbolTable;
     64         return this;
     65     }
     66 
     67     /**
     68      * Adds full Unicode property support, with the latest version of Unicode,
     69      * to Java Regex, bringing it up to Level 1 (see
     70      * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
     71      * regex pattern string and interpreting the character classes (\p{...},
     72      * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
     73      * this utility, Java regex expressions can be updated to work with the
     74      * latest version of Unicode, and with all Unicode properties. Note that the
     75      * UnicodeSet syntax has not yet, however, been updated to be completely
     76      * consistent with Java regex, so be careful of the differences.
     77      * <p>Not thread-safe; create a separate copy for different threads.
     78      * <p>In the future, we may extend this to support other regex packages.
     79      *
     80      * @regex A modified Java regex pattern, as in the input to
     81      *        Pattern.compile(), except that all "character classes" are
     82      *        processed as if they were UnicodeSet patterns. Example:
     83      *        "abc[:bc=N:]. See UnicodeSet for the differences in syntax.
     84      * @return A processed Java regex pattern, suitable for input to
     85      *         Pattern.compile().
     86      */
     87     @Override
     88     public String transform(String regex) {
     89         StringBuilder result = new StringBuilder();
     90         UnicodeSet temp = new UnicodeSet();
     91         ParsePosition pos = new ParsePosition(0);
     92         int state = 0; // 1 = after \
     93 
     94         // We add each character unmodified to the output, unless we have a
     95         // UnicodeSet. Note that we don't worry about supplementary characters,
     96         // since none of the syntax uses them.
     97 
     98         for (int i = 0; i < regex.length(); ++i) {
     99             // look for UnicodeSets, allowing for quoting with \ and \Q
    100             char ch = regex.charAt(i);
    101             switch (state) {
    102             case 0: // we only care about \, and '['.
    103                 if (ch == '\\') {
    104                     if (UnicodeSet.resemblesPattern(regex, i)) {
    105                         // should only happen with \p
    106                         i = processSet(regex, i, result, temp, pos);
    107                         continue;
    108                     }
    109                     state = 1;
    110                 } else if (ch == '[') {
    111                     // if we have what looks like a UnicodeSet
    112                     if (UnicodeSet.resemblesPattern(regex, i)) {
    113                         i = processSet(regex, i, result, temp, pos);
    114                         continue;
    115                     }
    116                 }
    117                 break;
    118 
    119             case 1: // we are after a \
    120                 if (ch == 'Q') {
    121                     state = 1;
    122                 } else {
    123                     state = 0;
    124                 }
    125                 break;
    126 
    127             case 2: // we are in a \Q...
    128                 if (ch == '\\') {
    129                     state = 3;
    130                 }
    131                 break;
    132 
    133             case 3: // we are in at \Q...\
    134                 if (ch == 'E') {
    135                     state = 0;
    136                 }
    137                 state = 2;
    138                 break;
    139             }
    140             result.append(ch);
    141         }
    142         return result.toString();
    143     }
    144 
    145     /**
    146      * Convenience static function, using standard parameters.
    147      * @param regex as in process()
    148      * @return processed regex pattern, as in process()
    149      */
    150     public static String fix(String regex) {
    151         return STANDARD.transform(regex);
    152     }
    153 
    154     /**
    155      * Compile a regex string, after processing by fix(...).
    156      *
    157      * @param regex Raw regex pattern, as in fix(...).
    158      * @return Pattern
    159      */
    160     public static Pattern compile(String regex) {
    161         return Pattern.compile(STANDARD.transform(regex));
    162     }
    163 
    164     /**
    165      * Compile a regex string, after processing by fix(...).
    166      *
    167      * @param regex Raw regex pattern, as in fix(...).
    168      * @return Pattern
    169      */
    170     public static Pattern compile(String regex, int options) {
    171         return Pattern.compile(STANDARD.transform(regex), options);
    172     }
    173 
    174     /**
    175      * Compile a composed string from a set of BNF lines; see the List version for more information.
    176      *
    177      * @param bnfLines Series of BNF lines.
    178      * @return Pattern
    179      */
    180     public String compileBnf(String bnfLines) {
    181         return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
    182     }
    183 
    184     /**
    185      * Compile a composed string from a set of BNF lines, such as for composing a regex
    186      * expression. The lines can be in any order, but there must not be any
    187      * cycles. The result can be used as input for fix().
    188      * <p>
    189      * Example:
    190      * <pre>
    191      * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
    192      * scheme = reserved+;
    193      * host = // reserved+;
    194      * query = [\\=reserved]+;
    195      * fragment = reserved+;
    196      * reserved = [[:ascii:][:alphabetic:]];
    197      * </pre>
    198      * <p>
    199      * Caveats: at this point the parsing is simple; for example, # cannot be
    200      * quoted (use \\u0023); you can set it to null to disable.
    201      * The equality sign and a few others can be reset with
    202      * setBnfX().
    203      *
    204      * @param lines Series of lines that represent a BNF expression. The lines contain
    205      *          a series of statements that of the form x=y;. A statement can take
    206      *          multiple lines, but there can't be multiple statements on a line.
    207      *          A hash quotes to the end of the line.
    208      * @return Pattern
    209      */
    210     public String compileBnf(List<String> lines) {
    211         Map<String, String> variables = getVariables(lines);
    212         Set<String> unused = new LinkedHashSet<String>(variables.keySet());
    213         // brute force replacement; do twice to allow for different order
    214         // later on can optimize
    215         for (int i = 0; i < 2; ++i) {
    216             for (Entry<String, String> entry : variables.entrySet()) {
    217                 String variable   = entry.getKey(),
    218                        definition = entry.getValue();
    219 
    220                 for (Entry<String, String> entry2 : variables.entrySet()) {
    221                     String variable2 = entry2.getKey(),
    222                            definition2 = entry2.getValue();
    223                     if (variable.equals(variable2)) {
    224                         continue;
    225                     }
    226                     String altered2 = definition2.replace(variable, definition);
    227                     if (!altered2.equals(definition2)) {
    228                         unused.remove(variable);
    229                         variables.put(variable2, altered2);
    230 //                        if (log != null) {
    231 //                            try {
    232 //                                log.append(variable2 + "=" + altered2 + ";");
    233 //                            } catch (IOException e) {
    234 //                                throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
    235 //                            }
    236 //                        }
    237                     }
    238                 }
    239             }
    240         }
    241         if (unused.size() != 1) {
    242             throw new IllegalArgumentException("Not a single root: " + unused);
    243         }
    244         return variables.get(unused.iterator().next());
    245     }
    246 
    247     public String getBnfCommentString() {
    248         return bnfCommentString;
    249     }
    250 
    251     public void setBnfCommentString(String bnfCommentString) {
    252         this.bnfCommentString = bnfCommentString;
    253     }
    254 
    255     public String getBnfVariableInfix() {
    256         return bnfVariableInfix;
    257     }
    258 
    259     public void setBnfVariableInfix(String bnfVariableInfix) {
    260         this.bnfVariableInfix = bnfVariableInfix;
    261     }
    262 
    263     public String getBnfLineSeparator() {
    264         return bnfLineSeparator;
    265     }
    266 
    267     public void setBnfLineSeparator(String bnfLineSeparator) {
    268         this.bnfLineSeparator = bnfLineSeparator;
    269     }
    270 
    271     /**
    272      * Utility for loading lines from a file.
    273      * @param result The result of the appended lines.
    274      * @param file The file to have an input stream.
    275      * @param encoding if null, then UTF-8
    276      * @return filled list
    277      * @throws IOException If there were problems opening the file for input stream.
    278      */
    279     public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
    280         InputStream is = new FileInputStream(file);
    281         try {
    282             return appendLines(result, is, encoding);
    283         } finally {
    284             is.close();
    285         }
    286     }
    287 
    288     /**
    289      * Utility for loading lines from a UTF8 file.
    290      * @param result The result of the appended lines.
    291      * @param inputStream The input stream.
    292      * @param encoding if null, then UTF-8
    293      * @return filled list
    294      * @throws IOException  If there were problems opening the input stream for reading.
    295      */
    296     public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
    297             throws UnsupportedEncodingException, IOException {
    298         BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
    299         while (true) {
    300             String line = in.readLine();
    301             if (line == null) break;
    302             result.add(line);
    303         }
    304         return result;
    305     }
    306 
    307 
    308 
    309     /* (non-Javadoc)
    310      * @see android.icu.util.Freezable#cloneAsThawed()
    311      */
    312     @Override
    313     public UnicodeRegex cloneAsThawed() {
    314         // TODO Auto-generated method stub
    315         try {
    316             return (UnicodeRegex)clone();
    317         } catch (CloneNotSupportedException e) {
    318             throw new IllegalArgumentException(); // should never happen
    319         }
    320     }
    321 
    322     /* (non-Javadoc)
    323      * @see android.icu.util.Freezable#freeze()
    324      */
    325     @Override
    326     public UnicodeRegex freeze() {
    327         // no action needed now.
    328         return this;
    329     }
    330 
    331     /* (non-Javadoc)
    332      * @see android.icu.util.Freezable#isFrozen()
    333      */
    334     @Override
    335     public boolean isFrozen() {
    336         // at this point, always true
    337         return true;
    338     }
    339 
    340     // ===== PRIVATES =====
    341 
    342     private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
    343         try {
    344             pos.setIndex(i);
    345             UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
    346             x.complement().complement(); // hack to fix toPattern
    347             result.append(x.toPattern(false));
    348             i = pos.getIndex() - 1; // allow for the loop increment
    349             return i;
    350         } catch (Exception e) {
    351             throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
    352         }
    353     }
    354 
    355     private static final UnicodeRegex STANDARD = new UnicodeRegex();
    356     private String bnfCommentString = "#";
    357     private String bnfVariableInfix = "=";
    358     private String bnfLineSeparator = "\n";
    359 //    private Appendable log = null;
    360 
    361     private Comparator<Object> LongestFirst = new Comparator<Object>() {
    362         @Override
    363         public int compare(Object obj0, Object obj1) {
    364             String arg0 = obj0.toString();
    365             String arg1 = obj1.toString();
    366             int len0 = arg0.length();
    367             int len1 = arg1.length();
    368             if (len0 != len1) return len1 - len0;
    369             return arg0.compareTo(arg1);
    370         }
    371     };
    372 
    373     private Map<String, String> getVariables(List<String> lines) {
    374         Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
    375         String variable = null;
    376         StringBuffer definition = new StringBuffer();
    377         int count = 0;
    378         for (String line : lines) {
    379             ++count;
    380             // remove initial bom, comments
    381             if (line.length() == 0) continue;
    382             if (line.charAt(0) == '\uFEFF') line = line.substring(1);
    383 
    384             if (bnfCommentString != null) {
    385                 int hashPos = line.indexOf(bnfCommentString);
    386                 if (hashPos >= 0) line = line.substring(0, hashPos);
    387             }
    388             String trimline = line.trim();
    389             if (trimline.length() == 0) continue;
    390 
    391             // String[] lineParts = line.split(";");
    392             String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
    393             if (linePart.trim().length() == 0) continue;
    394             boolean terminated = trimline.endsWith(";");
    395             if (terminated) {
    396                 linePart = linePart.substring(0,linePart.lastIndexOf(';'));
    397             }
    398             int equalsPos = linePart.indexOf(bnfVariableInfix);
    399             if (equalsPos >= 0) {
    400                 if (variable != null) {
    401                     throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
    402                 }
    403                 variable = linePart.substring(0,equalsPos).trim();
    404                 if (variables.containsKey(variable)) {
    405                     throw new IllegalArgumentException("Duplicate variable definition in " + line);
    406                 }
    407                 definition.append(linePart.substring(equalsPos+1).trim());
    408             } else { // no equals, so
    409                 if (variable == null) {
    410                     throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
    411                 }
    412                 definition.append(bnfLineSeparator).append(linePart);
    413             }
    414             // we are terminated if i is not at the end, or the line ends with a ;
    415             if (terminated) {
    416                 variables.put(variable, definition.toString());
    417                 variable = null; // signal we have no variable
    418                 definition.setLength(0);
    419             }
    420         }
    421         if (variable != null) {
    422             throw new IllegalArgumentException("Missing ';' at end");
    423         }
    424         return variables;
    425     }
    426 }
    427