Home | History | Annotate | Download | only in impl
      1 /*
      2  *******************************************************************************
      3  * Copyright (C) 2009-2015, Google, International Business Machines Corporation
      4  * and others. All Rights Reserved.
      5  *******************************************************************************
      6  */
      7 package com.ibm.icu.impl;
      8 
      9 import java.io.BufferedReader;
     10 import java.io.FileInputStream;
     11 import java.io.IOException;
     12 import java.io.InputStream;
     13 import java.io.InputStreamReader;
     14 import java.io.UnsupportedEncodingException;
     15 import java.text.ParsePosition;
     16 import java.util.Arrays;
     17 import java.util.Comparator;
     18 import java.util.LinkedHashSet;
     19 import java.util.List;
     20 import java.util.Map;
     21 import java.util.Map.Entry;
     22 import java.util.Set;
     23 import java.util.TreeMap;
     24 import java.util.regex.Pattern;
     25 
     26 import com.ibm.icu.text.StringTransform;
     27 import com.ibm.icu.text.SymbolTable;
     28 import com.ibm.icu.text.UnicodeSet;
     29 import com.ibm.icu.util.Freezable;
     30 
     31 /**
     32  * Contains utilities to supplement the JDK Regex, since it doesn't handle
     33  * Unicode well.
     34  *
     35  * @author markdavis
     36  */
     37 public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
    // Note: the only state so far is the optional symbol table and the BNF
    // parsing settings; we intend to add more in the future, particularly
    // for the regex style supported.
     40 
    // Symbol table handed to UnicodeSet.applyPattern() by processSet(); null unless set via setSymbolTable().
    private SymbolTable symbolTable;
     42 
    /**
     * Get the symbol table for internal processing.
     *
     * @return the symbol table currently stored on this object; null if none has been set
     * @internal
     */
    public SymbolTable getSymbolTable() {
        return symbolTable;
    }
     50 
    /**
     * Set the symbol table for internal processing.
     *
     * @param symbolTable the symbol table to store; it is passed to UnicodeSet when
     *        character classes are parsed
     * @return this object, to allow chaining
     * @internal
     */
    public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
        this.symbolTable = symbolTable;
        return this;
    }
     59 
     60     /**
     61      * Adds full Unicode property support, with the latest version of Unicode,
     62      * to Java Regex, bringing it up to Level 1 (see
     63      * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
     64      * regex pattern string and interpreting the character classes (\p{...},
     65      * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
     66      * this utility, Java regex expressions can be updated to work with the
     67      * latest version of Unicode, and with all Unicode properties. Note that the
     68      * UnicodeSet syntax has not yet, however, been updated to be completely
     69      * consistent with Java regex, so be careful of the differences.
     70      * <p>Not thread-safe; create a separate copy for different threads.
     71      * <p>In the future, we may extend this to support other regex packages.
     72      *
     73      * @regex A modified Java regex pattern, as in the input to
     74      *        Pattern.compile(), except that all "character classes" are
     75      *        processed as if they were UnicodeSet patterns. Example:
     76      *        "abc[:bc=N:]. See UnicodeSet for the differences in syntax.
     77      * @return A processed Java regex pattern, suitable for input to
     78      *         Pattern.compile().
     79      */
     80     public String transform(String regex) {
     81         StringBuilder result = new StringBuilder();
     82         UnicodeSet temp = new UnicodeSet();
     83         ParsePosition pos = new ParsePosition(0);
     84         int state = 0; // 1 = after \
     85 
     86         // We add each character unmodified to the output, unless we have a
     87         // UnicodeSet. Note that we don't worry about supplementary characters,
     88         // since none of the syntax uses them.
     89 
     90         for (int i = 0; i < regex.length(); ++i) {
     91             // look for UnicodeSets, allowing for quoting with \ and \Q
     92             char ch = regex.charAt(i);
     93             switch (state) {
     94             case 0: // we only care about \, and '['.
     95                 if (ch == '\\') {
     96                     if (UnicodeSet.resemblesPattern(regex, i)) {
     97                         // should only happen with \p
     98                         i = processSet(regex, i, result, temp, pos);
     99                         continue;
    100                     }
    101                     state = 1;
    102                 } else if (ch == '[') {
    103                     // if we have what looks like a UnicodeSet
    104                     if (UnicodeSet.resemblesPattern(regex, i)) {
    105                         i = processSet(regex, i, result, temp, pos);
    106                         continue;
    107                     }
    108                 }
    109                 break;
    110 
    111             case 1: // we are after a \
    112                 if (ch == 'Q') {
    113                     state = 1;
    114                 } else {
    115                     state = 0;
    116                 }
    117                 break;
    118 
    119             case 2: // we are in a \Q...
    120                 if (ch == '\\') {
    121                     state = 3;
    122                 }
    123                 break;
    124 
    125             case 3: // we are in at \Q...\
    126                 if (ch == 'E') {
    127                     state = 0;
    128                 }
    129                 state = 2;
    130                 break;
    131             }
    132             result.append(ch);
    133         }
    134         return result.toString();
    135     }
    136 
    137     /**
    138      * Convenience static function, using standard parameters.
    139      * @param regex as in process()
    140      * @return processed regex pattern, as in process()
    141      */
    142     public static String fix(String regex) {
    143         return STANDARD.transform(regex);
    144     }
    145 
    146     /**
    147      * Compile a regex string, after processing by fix(...).
    148      *
    149      * @param regex Raw regex pattern, as in fix(...).
    150      * @return Pattern
    151      */
    152     public static Pattern compile(String regex) {
    153         return Pattern.compile(STANDARD.transform(regex));
    154     }
    155 
    156     /**
    157      * Compile a regex string, after processing by fix(...).
    158      *
    159      * @param regex Raw regex pattern, as in fix(...).
    160      * @return Pattern
    161      */
    162     public static Pattern compile(String regex, int options) {
    163         return Pattern.compile(STANDARD.transform(regex), options);
    164     }
    165 
    166     /**
    167      * Compile a composed string from a set of BNF lines; see the List version for more information.
    168      *
    169      * @param bnfLines Series of BNF lines.
    170      * @return Pattern
    171      */
    172     public String compileBnf(String bnfLines) {
    173         return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
    174     }
    175 
    176     /**
    177      * Compile a composed string from a set of BNF lines, such as for composing a regex
    178      * expression. The lines can be in any order, but there must not be any
    179      * cycles. The result can be used as input for fix().
    180      * <p>
    181      * Example:
    182      * <pre>
    183      * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
    184      * scheme = reserved+;
    185      * host = // reserved+;
    186      * query = [\\=reserved]+;
    187      * fragment = reserved+;
    188      * reserved = [[:ascii:][:alphabetic:]];
    189      * </pre>
    190      * <p>
    191      * Caveats: at this point the parsing is simple; for example, # cannot be
    192      * quoted (use \\u0023); you can set it to null to disable.
    193      * The equality sign and a few others can be reset with
    194      * setBnfX().
    195      *
    196      * @param lines Series of lines that represent a BNF expression. The lines contain
    197      *          a series of statements that of the form x=y;. A statement can take
    198      *          multiple lines, but there can't be multiple statements on a line.
    199      *          A hash quotes to the end of the line.
    200      * @return Pattern
    201      */
    202     public String compileBnf(List<String> lines) {
    203         Map<String, String> variables = getVariables(lines);
    204         Set<String> unused = new LinkedHashSet<String>(variables.keySet());
    205         // brute force replacement; do twice to allow for different order
    206         // later on can optimize
    207         for (int i = 0; i < 2; ++i) {
    208             for (Entry<String, String> entry : variables.entrySet()) {
    209                 String variable   = entry.getKey(),
    210                        definition = entry.getValue();
    211 
    212                 for (Entry<String, String> entry2 : variables.entrySet()) {
    213                     String variable2 = entry2.getKey(),
    214                            definition2 = entry2.getValue();
    215                     if (variable.equals(variable2)) {
    216                         continue;
    217                     }
    218                     String altered2 = definition2.replace(variable, definition);
    219                     if (!altered2.equals(definition2)) {
    220                         unused.remove(variable);
    221                         variables.put(variable2, altered2);
    222                         if (log != null) {
    223                             try {
    224                                 log.append(variable2 + "=" + altered2 + ";");
    225                             } catch (IOException e) {
    226                                 throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
    227                             }
    228                         }
    229                     }
    230                 }
    231             }
    232         }
    233         if (unused.size() != 1) {
    234             throw new IllegalArgumentException("Not a single root: " + unused);
    235         }
    236         return variables.get(unused.iterator().next());
    237     }
    238 
    /**
     * Returns the string that starts a comment in BNF lines. When parsing,
     * everything from its first occurrence to the end of the line is dropped.
     *
     * @return the comment string; null means comment stripping is disabled
     */
    public String getBnfCommentString() {
        return bnfCommentString;
    }
    242 
    /**
     * Sets the string that starts a comment in BNF lines.
     *
     * @param bnfCommentString the new comment string; null disables comment stripping
     */
    public void setBnfCommentString(String bnfCommentString) {
        this.bnfCommentString = bnfCommentString;
    }
    246 
    /**
     * Returns the string that separates a variable name from its definition
     * in a BNF statement.
     *
     * @return the variable infix string
     */
    public String getBnfVariableInfix() {
        return bnfVariableInfix;
    }
    250 
    /**
     * Sets the string that separates a variable name from its definition
     * in a BNF statement.
     *
     * @param bnfVariableInfix the new variable infix string
     */
    public void setBnfVariableInfix(String bnfVariableInfix) {
        this.bnfVariableInfix = bnfVariableInfix;
    }
    254 
    /**
     * Returns the string inserted between the lines of a BNF definition that
     * spans more than one input line.
     *
     * @return the line separator string
     */
    public String getBnfLineSeparator() {
        return bnfLineSeparator;
    }
    258 
    /**
     * Sets the string inserted between the lines of a BNF definition that
     * spans more than one input line.
     *
     * @param bnfLineSeparator the new line separator string
     */
    public void setBnfLineSeparator(String bnfLineSeparator) {
        this.bnfLineSeparator = bnfLineSeparator;
    }
    262 
    263     /**
    264      * Utility for loading lines from a file.
    265      * @param result The result of the appended lines.
    266      * @param file The file to have an input stream.
    267      * @param encoding if null, then UTF-8
    268      * @return filled list
    269      * @throws IOException If there were problems opening the file for input stream.
    270      */
    271     public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
    272         InputStream is = new FileInputStream(file);
    273         try {
    274             return appendLines(result, is, encoding);
    275         } finally {
    276             is.close();
    277         }
    278     }
    279 
    280     /**
    281      * Utility for loading lines from a UTF8 file.
    282      * @param result The result of the appended lines.
    283      * @param inputStream The input stream.
    284      * @param encoding if null, then UTF-8
    285      * @return filled list
    286      * @throws IOException  If there were problems opening the input stream for reading.
    287      */
    288     public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
    289             throws UnsupportedEncodingException, IOException {
    290         BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
    291         while (true) {
    292             String line = in.readLine();
    293             if (line == null) break;
    294             result.add(line);
    295         }
    296         return result;
    297     }
    298 
    299 
    300 
    301     /* (non-Javadoc)
    302      * @see com.ibm.icu.util.Freezable#cloneAsThawed()
    303      */
    304     public UnicodeRegex cloneAsThawed() {
    305         // TODO Auto-generated method stub
    306         try {
    307             return (UnicodeRegex)clone();
    308         } catch (CloneNotSupportedException e) {
    309             throw new IllegalArgumentException(); // should never happen
    310         }
    311     }
    312 
    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#freeze()
     *
     * Currently a no-op that returns this object unchanged.
     */
    public UnicodeRegex freeze() {
        // no action needed now.
        return this;
    }
    320 
    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#isFrozen()
     *
     * Always reports true.
     * NOTE(review): the setters on this class assign their fields without
     * checking this value, so a "frozen" instance can still be modified.
     */
    public boolean isFrozen() {
        // at this point, always true
        return true;
    }
    328 
    329     // ===== PRIVATES =====
    330 
    331     private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
    332         try {
    333             pos.setIndex(i);
    334             UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
    335             x.complement().complement(); // hack to fix toPattern
    336             result.append(x.toPattern(false));
    337             i = pos.getIndex() - 1; // allow for the loop increment
    338             return i;
    339         } catch (Exception e) {
    340             throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
    341         }
    342     }
    343 
    // Shared instance with default settings, used by the static fix() and compile() methods.
    private static UnicodeRegex STANDARD = new UnicodeRegex();
    // Start-of-comment marker in BNF lines; null disables comment stripping.
    private String bnfCommentString = "#";
    // Separator between a variable name and its definition in a BNF statement.
    private String bnfVariableInfix = "=";
    // Inserted between the lines of a BNF definition that spans several input lines.
    private String bnfLineSeparator = "\n";
    // Optional sink for a trace of each substitution made by compileBnf(); never assigned in this file.
    private Appendable log = null;
    349 
    350     private Comparator<Object> LongestFirst = new Comparator<Object>() {
    351         public int compare(Object obj0, Object obj1) {
    352             String arg0 = obj0.toString();
    353             String arg1 = obj1.toString();
    354             int len0 = arg0.length();
    355             int len1 = arg1.length();
    356             if (len0 != len1) return len1 - len0;
    357             return arg0.compareTo(arg1);
    358         }
    359     };
    360 
    361     private Map<String, String> getVariables(List<String> lines) {
    362         Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
    363         String variable = null;
    364         StringBuffer definition = new StringBuffer();
    365         int count = 0;
    366         for (String line : lines) {
    367             ++count;
    368             // remove initial bom, comments
    369             if (line.length() == 0) continue;
    370             if (line.charAt(0) == '\uFEFF') line = line.substring(1);
    371 
    372             if (bnfCommentString != null) {
    373                 int hashPos = line.indexOf(bnfCommentString);
    374                 if (hashPos >= 0) line = line.substring(0, hashPos);
    375             }
    376             String trimline = line.trim();
    377             if (trimline.length() == 0) continue;
    378 
    379             // String[] lineParts = line.split(";");
    380             String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
    381             if (linePart.trim().length() == 0) continue;
    382             boolean terminated = trimline.endsWith(";");
    383             if (terminated) {
    384                 linePart = linePart.substring(0,linePart.lastIndexOf(';'));
    385             }
    386             int equalsPos = linePart.indexOf(bnfVariableInfix);
    387             if (equalsPos >= 0) {
    388                 if (variable != null) {
    389                     throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
    390                 }
    391                 variable = linePart.substring(0,equalsPos).trim();
    392                 if (variables.containsKey(variable)) {
    393                     throw new IllegalArgumentException("Duplicate variable definition in " + line);
    394                 }
    395                 definition.append(linePart.substring(equalsPos+1).trim());
    396             } else { // no equals, so
    397                 if (variable == null) {
    398                     throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
    399                 }
    400                 definition.append(bnfLineSeparator).append(linePart);
    401             }
    402             // we are terminated if i is not at the end, or the line ends with a ;
    403             if (terminated) {
    404                 variables.put(variable, definition.toString());
    405                 variable = null; // signal we have no variable
    406                 definition.setLength(0);
    407             }
    408         }
    409         if (variable != null) {
    410             throw new IllegalArgumentException("Missing ';' at end");
    411         }
    412         return variables;
    413     }
    414 }
    415