Home | History | Annotate | Download | only in impl
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2009-2015, Google, International Business Machines Corporation
      6  * and others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.impl;
     10 
     11 import java.io.BufferedReader;
     12 import java.io.FileInputStream;
     13 import java.io.IOException;
     14 import java.io.InputStream;
     15 import java.io.InputStreamReader;
     16 import java.io.UnsupportedEncodingException;
     17 import java.text.ParsePosition;
     18 import java.util.Arrays;
     19 import java.util.Comparator;
     20 import java.util.LinkedHashSet;
     21 import java.util.List;
     22 import java.util.Map;
     23 import java.util.Map.Entry;
     24 import java.util.Set;
     25 import java.util.TreeMap;
     26 import java.util.regex.Pattern;
     27 
     28 import com.ibm.icu.text.StringTransform;
     29 import com.ibm.icu.text.SymbolTable;
     30 import com.ibm.icu.text.UnicodeSet;
     31 import com.ibm.icu.util.Freezable;
     32 
     33 /**
     34  * Contains utilities to supplement the JDK Regex, since it doesn't handle
     35  * Unicode well.
     36  *
     37  * <p>TODO: Move to com.ibm.icu.dev.somewhere.
     38  * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools.
     39  *
     40  * @author markdavis
     41  */
     42 public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
     43     // Note: we don't currently have any state, but intend to in the future,
     44     // particularly for the regex style supported.
     45 
     46     private SymbolTable symbolTable;
     47 
     48     /**
     49      * Set the symbol table for internal processing
     50      * @internal
     51      */
     52     public SymbolTable getSymbolTable() {
     53         return symbolTable;
     54     }
     55 
     56     /**
     57      * Get the symbol table for internal processing
     58      * @internal
     59      */
     60     public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
     61         this.symbolTable = symbolTable;
     62         return this;
     63     }
     64 
     65     /**
     66      * Adds full Unicode property support, with the latest version of Unicode,
     67      * to Java Regex, bringing it up to Level 1 (see
     68      * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
     69      * regex pattern string and interpreting the character classes (\p{...},
     70      * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
     71      * this utility, Java regex expressions can be updated to work with the
     72      * latest version of Unicode, and with all Unicode properties. Note that the
     73      * UnicodeSet syntax has not yet, however, been updated to be completely
     74      * consistent with Java regex, so be careful of the differences.
     75      * <p>Not thread-safe; create a separate copy for different threads.
     76      * <p>In the future, we may extend this to support other regex packages.
     77      *
     78      * @regex A modified Java regex pattern, as in the input to
     79      *        Pattern.compile(), except that all "character classes" are
     80      *        processed as if they were UnicodeSet patterns. Example:
     81      *        "abc[:bc=N:]. See UnicodeSet for the differences in syntax.
     82      * @return A processed Java regex pattern, suitable for input to
     83      *         Pattern.compile().
     84      */
     85     @Override
     86     public String transform(String regex) {
     87         StringBuilder result = new StringBuilder();
     88         UnicodeSet temp = new UnicodeSet();
     89         ParsePosition pos = new ParsePosition(0);
     90         int state = 0; // 1 = after \
     91 
     92         // We add each character unmodified to the output, unless we have a
     93         // UnicodeSet. Note that we don't worry about supplementary characters,
     94         // since none of the syntax uses them.
     95 
     96         for (int i = 0; i < regex.length(); ++i) {
     97             // look for UnicodeSets, allowing for quoting with \ and \Q
     98             char ch = regex.charAt(i);
     99             switch (state) {
    100             case 0: // we only care about \, and '['.
    101                 if (ch == '\\') {
    102                     if (UnicodeSet.resemblesPattern(regex, i)) {
    103                         // should only happen with \p
    104                         i = processSet(regex, i, result, temp, pos);
    105                         continue;
    106                     }
    107                     state = 1;
    108                 } else if (ch == '[') {
    109                     // if we have what looks like a UnicodeSet
    110                     if (UnicodeSet.resemblesPattern(regex, i)) {
    111                         i = processSet(regex, i, result, temp, pos);
    112                         continue;
    113                     }
    114                 }
    115                 break;
    116 
    117             case 1: // we are after a \
    118                 if (ch == 'Q') {
    119                     state = 1;
    120                 } else {
    121                     state = 0;
    122                 }
    123                 break;
    124 
    125             case 2: // we are in a \Q...
    126                 if (ch == '\\') {
    127                     state = 3;
    128                 }
    129                 break;
    130 
    131             case 3: // we are in at \Q...\
    132                 if (ch == 'E') {
    133                     state = 0;
    134                 }
    135                 state = 2;
    136                 break;
    137             }
    138             result.append(ch);
    139         }
    140         return result.toString();
    141     }
    142 
    143     /**
    144      * Convenience static function, using standard parameters.
    145      * @param regex as in process()
    146      * @return processed regex pattern, as in process()
    147      */
    148     public static String fix(String regex) {
    149         return STANDARD.transform(regex);
    150     }
    151 
    152     /**
    153      * Compile a regex string, after processing by fix(...).
    154      *
    155      * @param regex Raw regex pattern, as in fix(...).
    156      * @return Pattern
    157      */
    158     public static Pattern compile(String regex) {
    159         return Pattern.compile(STANDARD.transform(regex));
    160     }
    161 
    162     /**
    163      * Compile a regex string, after processing by fix(...).
    164      *
    165      * @param regex Raw regex pattern, as in fix(...).
    166      * @return Pattern
    167      */
    168     public static Pattern compile(String regex, int options) {
    169         return Pattern.compile(STANDARD.transform(regex), options);
    170     }
    171 
    172     /**
    173      * Compile a composed string from a set of BNF lines; see the List version for more information.
    174      *
    175      * @param bnfLines Series of BNF lines.
    176      * @return Pattern
    177      */
    178     public String compileBnf(String bnfLines) {
    179         return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
    180     }
    181 
    182     /**
    183      * Compile a composed string from a set of BNF lines, such as for composing a regex
    184      * expression. The lines can be in any order, but there must not be any
    185      * cycles. The result can be used as input for fix().
    186      * <p>
    187      * Example:
    188      * <pre>
    189      * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
    190      * scheme = reserved+;
    191      * host = // reserved+;
    192      * query = [\\=reserved]+;
    193      * fragment = reserved+;
    194      * reserved = [[:ascii:][:alphabetic:]];
    195      * </pre>
    196      * <p>
    197      * Caveats: at this point the parsing is simple; for example, # cannot be
    198      * quoted (use \\u0023); you can set it to null to disable.
    199      * The equality sign and a few others can be reset with
    200      * setBnfX().
    201      *
    202      * @param lines Series of lines that represent a BNF expression. The lines contain
    203      *          a series of statements that of the form x=y;. A statement can take
    204      *          multiple lines, but there can't be multiple statements on a line.
    205      *          A hash quotes to the end of the line.
    206      * @return Pattern
    207      */
    208     public String compileBnf(List<String> lines) {
    209         Map<String, String> variables = getVariables(lines);
    210         Set<String> unused = new LinkedHashSet<String>(variables.keySet());
    211         // brute force replacement; do twice to allow for different order
    212         // later on can optimize
    213         for (int i = 0; i < 2; ++i) {
    214             for (Entry<String, String> entry : variables.entrySet()) {
    215                 String variable   = entry.getKey(),
    216                        definition = entry.getValue();
    217 
    218                 for (Entry<String, String> entry2 : variables.entrySet()) {
    219                     String variable2 = entry2.getKey(),
    220                            definition2 = entry2.getValue();
    221                     if (variable.equals(variable2)) {
    222                         continue;
    223                     }
    224                     String altered2 = definition2.replace(variable, definition);
    225                     if (!altered2.equals(definition2)) {
    226                         unused.remove(variable);
    227                         variables.put(variable2, altered2);
    228 //                        if (log != null) {
    229 //                            try {
    230 //                                log.append(variable2 + "=" + altered2 + ";");
    231 //                            } catch (IOException e) {
    232 //                                throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
    233 //                            }
    234 //                        }
    235                     }
    236                 }
    237             }
    238         }
    239         if (unused.size() != 1) {
    240             throw new IllegalArgumentException("Not a single root: " + unused);
    241         }
    242         return variables.get(unused.iterator().next());
    243     }
    244 
    245     public String getBnfCommentString() {
    246         return bnfCommentString;
    247     }
    248 
    249     public void setBnfCommentString(String bnfCommentString) {
    250         this.bnfCommentString = bnfCommentString;
    251     }
    252 
    253     public String getBnfVariableInfix() {
    254         return bnfVariableInfix;
    255     }
    256 
    257     public void setBnfVariableInfix(String bnfVariableInfix) {
    258         this.bnfVariableInfix = bnfVariableInfix;
    259     }
    260 
    261     public String getBnfLineSeparator() {
    262         return bnfLineSeparator;
    263     }
    264 
    265     public void setBnfLineSeparator(String bnfLineSeparator) {
    266         this.bnfLineSeparator = bnfLineSeparator;
    267     }
    268 
    269     /**
    270      * Utility for loading lines from a file.
    271      * @param result The result of the appended lines.
    272      * @param file The file to have an input stream.
    273      * @param encoding if null, then UTF-8
    274      * @return filled list
    275      * @throws IOException If there were problems opening the file for input stream.
    276      */
    277     public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
    278         InputStream is = new FileInputStream(file);
    279         try {
    280             return appendLines(result, is, encoding);
    281         } finally {
    282             is.close();
    283         }
    284     }
    285 
    286     /**
    287      * Utility for loading lines from a UTF8 file.
    288      * @param result The result of the appended lines.
    289      * @param inputStream The input stream.
    290      * @param encoding if null, then UTF-8
    291      * @return filled list
    292      * @throws IOException  If there were problems opening the input stream for reading.
    293      */
    294     public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
    295             throws UnsupportedEncodingException, IOException {
    296         BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
    297         while (true) {
    298             String line = in.readLine();
    299             if (line == null) break;
    300             result.add(line);
    301         }
    302         return result;
    303     }
    304 
    305 
    306 
    307     /* (non-Javadoc)
    308      * @see com.ibm.icu.util.Freezable#cloneAsThawed()
    309      */
    310     @Override
    311     public UnicodeRegex cloneAsThawed() {
    312         // TODO Auto-generated method stub
    313         try {
    314             return (UnicodeRegex)clone();
    315         } catch (CloneNotSupportedException e) {
    316             throw new IllegalArgumentException(); // should never happen
    317         }
    318     }
    319 
    320     /* (non-Javadoc)
    321      * @see com.ibm.icu.util.Freezable#freeze()
    322      */
    323     @Override
    324     public UnicodeRegex freeze() {
    325         // no action needed now.
    326         return this;
    327     }
    328 
    329     /* (non-Javadoc)
    330      * @see com.ibm.icu.util.Freezable#isFrozen()
    331      */
    332     @Override
    333     public boolean isFrozen() {
    334         // at this point, always true
    335         return true;
    336     }
    337 
    338     // ===== PRIVATES =====
    339 
    340     private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
    341         try {
    342             pos.setIndex(i);
    343             UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
    344             x.complement().complement(); // hack to fix toPattern
    345             result.append(x.toPattern(false));
    346             i = pos.getIndex() - 1; // allow for the loop increment
    347             return i;
    348         } catch (Exception e) {
    349             throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
    350         }
    351     }
    352 
    353     private static final UnicodeRegex STANDARD = new UnicodeRegex();
    354     private String bnfCommentString = "#";
    355     private String bnfVariableInfix = "=";
    356     private String bnfLineSeparator = "\n";
    357 //    private Appendable log = null;
    358 
    359     private Comparator<Object> LongestFirst = new Comparator<Object>() {
    360         @Override
    361         public int compare(Object obj0, Object obj1) {
    362             String arg0 = obj0.toString();
    363             String arg1 = obj1.toString();
    364             int len0 = arg0.length();
    365             int len1 = arg1.length();
    366             if (len0 != len1) return len1 - len0;
    367             return arg0.compareTo(arg1);
    368         }
    369     };
    370 
    371     private Map<String, String> getVariables(List<String> lines) {
    372         Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
    373         String variable = null;
    374         StringBuffer definition = new StringBuffer();
    375         int count = 0;
    376         for (String line : lines) {
    377             ++count;
    378             // remove initial bom, comments
    379             if (line.length() == 0) continue;
    380             if (line.charAt(0) == '\uFEFF') line = line.substring(1);
    381 
    382             if (bnfCommentString != null) {
    383                 int hashPos = line.indexOf(bnfCommentString);
    384                 if (hashPos >= 0) line = line.substring(0, hashPos);
    385             }
    386             String trimline = line.trim();
    387             if (trimline.length() == 0) continue;
    388 
    389             // String[] lineParts = line.split(";");
    390             String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
    391             if (linePart.trim().length() == 0) continue;
    392             boolean terminated = trimline.endsWith(";");
    393             if (terminated) {
    394                 linePart = linePart.substring(0,linePart.lastIndexOf(';'));
    395             }
    396             int equalsPos = linePart.indexOf(bnfVariableInfix);
    397             if (equalsPos >= 0) {
    398                 if (variable != null) {
    399                     throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
    400                 }
    401                 variable = linePart.substring(0,equalsPos).trim();
    402                 if (variables.containsKey(variable)) {
    403                     throw new IllegalArgumentException("Duplicate variable definition in " + line);
    404                 }
    405                 definition.append(linePart.substring(equalsPos+1).trim());
    406             } else { // no equals, so
    407                 if (variable == null) {
    408                     throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
    409                 }
    410                 definition.append(bnfLineSeparator).append(linePart);
    411             }
    412             // we are terminated if i is not at the end, or the line ends with a ;
    413             if (terminated) {
    414                 variables.put(variable, definition.toString());
    415                 variable = null; // signal we have no variable
    416                 definition.setLength(0);
    417             }
    418         }
    419         if (variable != null) {
    420             throw new IllegalArgumentException("Missing ';' at end");
    421         }
    422         return variables;
    423     }
    424 }
    425