/*
 *******************************************************************************
 * Copyright (C) 2009-2015, Google, International Business Machines Corporation
 * and others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.impl;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.text.ParsePosition;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

import com.ibm.icu.text.StringTransform;
import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Freezable;

/**
 * Contains utilities to supplement the JDK Regex, since it doesn't handle
 * Unicode well.
 *
 * @author markdavis
 */
public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
    // Note: apart from the symbol table and the BNF settings, we don't currently
    // have any state, but intend to in the future, particularly for the regex
    // style supported.

    // States of the scanner in transform().
    private static final int STATE_NORMAL = 0;          // ordinary pattern text
    private static final int STATE_AFTER_BACKSLASH = 1; // just saw an unquoted backslash
    private static final int STATE_IN_QUOTE = 2;        // inside \Q ... \E
    private static final int STATE_IN_QUOTE_BACKSLASH = 3; // inside \Q ..., just saw a backslash

    private SymbolTable symbolTable;

    /**
     * Get the symbol table for internal processing
     * @internal
     */
    public SymbolTable getSymbolTable() {
        return symbolTable;
    }

    /**
     * Set the symbol table for internal processing
     * @return this object, to allow call chaining
     * @internal
     */
    public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
        this.symbolTable = symbolTable;
        return this;
    }

    /**
     * Adds full Unicode property support, with the latest version of Unicode,
     * to Java Regex, bringing it up to Level 1 (see
     * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
     * regex pattern string and interpreting the character classes (\p{...},
     * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
     * this utility, Java regex expressions can be updated to work with the
     * latest version of Unicode, and with all Unicode properties. Note that the
     * UnicodeSet syntax has not yet, however, been updated to be completely
     * consistent with Java regex, so be careful of the differences.
     * <p>Not thread-safe; create a separate copy for different threads.
     * <p>In the future, we may extend this to support other regex packages.
     * <p>Text that is escaped with a backslash, or that lies between \Q and \E,
     * is copied through unchanged.
     *
     * @param regex A modified Java regex pattern, as in the input to
     *         Pattern.compile(), except that all "character classes" are
     *         processed as if they were UnicodeSet patterns. Example:
     *         "abc[:bc=N:]". See UnicodeSet for the differences in syntax.
     * @return A processed Java regex pattern, suitable for input to
     *         Pattern.compile().
     * @throws IllegalArgumentException if a character class cannot be parsed
     *         as a UnicodeSet pattern.
     */
    public String transform(String regex) {
        StringBuilder result = new StringBuilder();
        UnicodeSet temp = new UnicodeSet();
        ParsePosition pos = new ParsePosition(0);
        int state = STATE_NORMAL;

        // We add each character unmodified to the output, unless we have a
        // UnicodeSet. Note that we don't worry about supplementary characters,
        // since none of the syntax uses them.

        for (int i = 0; i < regex.length(); ++i) {
            // look for UnicodeSets, allowing for quoting with \ and \Q
            char ch = regex.charAt(i);
            switch (state) {
            case STATE_NORMAL: // we only care about \, and '['.
                if (ch == '\\') {
                    if (UnicodeSet.resemblesPattern(regex, i)) {
                        // should only happen with \p
                        i = processSet(regex, i, result, temp, pos);
                        continue;
                    }
                    state = STATE_AFTER_BACKSLASH;
                } else if (ch == '[') {
                    // if we have what looks like a UnicodeSet
                    if (UnicodeSet.resemblesPattern(regex, i)) {
                        i = processSet(regex, i, result, temp, pos);
                        continue;
                    }
                }
                break;

            case STATE_AFTER_BACKSLASH: // we are after a \
                // \Q opens a quoted region; any other escaped character is
                // simply passed through and we return to normal scanning.
                state = (ch == 'Q') ? STATE_IN_QUOTE : STATE_NORMAL;
                break;

            case STATE_IN_QUOTE: // we are in a \Q...
                if (ch == '\\') {
                    state = STATE_IN_QUOTE_BACKSLASH;
                }
                break;

            case STATE_IN_QUOTE_BACKSLASH: // we are in a \Q...\
                if (ch == 'E') {
                    state = STATE_NORMAL; // \E closes the quoted region
                } else if (ch != '\\') {
                    // Not the terminator; a further backslash keeps us here,
                    // because it may itself be the start of \E.
                    state = STATE_IN_QUOTE;
                }
                break;
            }
            result.append(ch);
        }
        return result.toString();
    }

    /**
     * Convenience static function, using standard parameters.
     * @param regex as in transform()
     * @return processed regex pattern, as in transform()
     */
    public static String fix(String regex) {
        return STANDARD.transform(regex);
    }

    /**
     * Compile a regex string, after processing by fix(...).
     *
     * @param regex Raw regex pattern, as in fix(...).
     * @return Pattern
     */
    public static Pattern compile(String regex) {
        return Pattern.compile(STANDARD.transform(regex));
    }

    /**
     * Compile a regex string, after processing by fix(...).
     *
     * @param regex Raw regex pattern, as in fix(...).
     * @param options Match flags, passed unchanged to Pattern.compile().
     * @return Pattern
     */
    public static Pattern compile(String regex, int options) {
        return Pattern.compile(STANDARD.transform(regex), options);
    }

    /**
     * Compile a composed string from a set of BNF lines; see the List version for more information.
     *
     * @param bnfLines Series of BNF lines, separated by CR, LF, or CRLF.
     * @return The composed string for the single root variable.
     */
    public String compileBnf(String bnfLines) {
        return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
    }

    /**
     * Compile a composed string from a set of BNF lines, such as for composing a regex
     * expression. The lines can be in any order, but there must not be any
     * cycles. The result can be used as input for fix().
     * <p>
     * Example:
     * <pre>
     * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
     * scheme = reserved+;
     * host = // reserved+;
     * query = [\\=reserved]+;
     * fragment = reserved+;
     * reserved = [[:ascii:][:alphabetic:]];
     * </pre>
     * <p>
     * Caveats: at this point the parsing is simple; for example, # cannot be
     * quoted (use \\u0023); you can set it to null to disable.
     * The equality sign and a few others can be reset with
     * setBnfX().
     *
     * @param lines Series of lines that represent a BNF expression. The lines contain
     *            a series of statements that of the form x=y;. A statement can take
     *            multiple lines, but there can't be multiple statements on a line.
     *            A hash quotes to the end of the line.
     * @return The definition of the single root variable (the one variable that is
     *            not used in any other definition), with the other variables substituted.
     * @throws IllegalArgumentException if the lines are malformed, or if there is
     *            not exactly one root variable.
     */
    public String compileBnf(List<String> lines) {
        Map<String, String> variables = getVariables(lines);
        // Every variable starts out as a root candidate; one is removed as soon
        // as it is found inside some other variable's definition.
        Set<String> unused = new LinkedHashSet<String>(variables.keySet());
        // brute force replacement; do twice to allow for different order
        // later on can optimize
        for (int i = 0; i < 2; ++i) {
            for (Entry<String, String> entry : variables.entrySet()) {
                String variable = entry.getKey();
                String definition = entry.getValue();

                for (Entry<String, String> entry2 : variables.entrySet()) {
                    String variable2 = entry2.getKey();
                    String definition2 = entry2.getValue();
                    if (variable.equals(variable2)) {
                        continue;
                    }
                    String altered2 = definition2.replace(variable, definition);
                    if (!altered2.equals(definition2)) {
                        unused.remove(variable);
                        // The key already exists, so this only replaces the value.
                        variables.put(variable2, altered2);
                        if (log != null) {
                            try {
                                log.append(variable2 + "=" + altered2 + ";");
                            } catch (IOException e) {
                                throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
                            }
                        }
                    }
                }
            }
        }
        if (unused.size() != 1) {
            throw new IllegalArgumentException("Not a single root: " + unused);
        }
        return variables.get(unused.iterator().next());
    }

    /**
     * @return the string that starts a comment in BNF lines ("#" initially);
     *         null means comments are disabled.
     */
    public String getBnfCommentString() {
        return bnfCommentString;
    }

    /**
     * @param bnfCommentString the string that starts a comment in BNF lines,
     *            or null to disable comment stripping.
     */
    public void setBnfCommentString(String bnfCommentString) {
        this.bnfCommentString = bnfCommentString;
    }

    /**
     * @return the string that separates a variable from its definition in
     *         BNF lines ("=" initially).
     */
    public String getBnfVariableInfix() {
        return bnfVariableInfix;
    }

    /**
     * @param bnfVariableInfix the string that separates a variable from its
     *            definition in BNF lines.
     */
    public void setBnfVariableInfix(String bnfVariableInfix) {
        this.bnfVariableInfix = bnfVariableInfix;
    }

    /**
     * @return the string inserted between the lines of a multi-line BNF
     *         definition ("\n" initially).
     */
    public String getBnfLineSeparator() {
        return bnfLineSeparator;
    }

    /**
     * @param bnfLineSeparator the string inserted between the lines of a
     *            multi-line BNF definition.
     */
    public void setBnfLineSeparator(String bnfLineSeparator) {
        this.bnfLineSeparator = bnfLineSeparator;
    }

    /**
     * Utility for loading lines from a file.
     * @param result The result of the appended lines.
     * @param file The file to have an input stream.
     * @param encoding if null, then UTF-8
     * @return filled list
     * @throws IOException If there were problems opening the file for input stream.
     */
    public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
        InputStream is = new FileInputStream(file);
        try {
            return appendLines(result, is, encoding);
        } finally {
            is.close();
        }
    }

    /**
     * Utility for loading lines from an input stream. The stream is not closed
     * by this method.
     * @param result The result of the appended lines.
     * @param inputStream The input stream.
     * @param encoding if null, then UTF-8
     * @return filled list
     * @throws UnsupportedEncodingException If the named encoding is not supported.
     * @throws IOException If there were problems reading from the input stream.
     */
    public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
            throws UnsupportedEncodingException, IOException {
        BufferedReader in = new BufferedReader(
                new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
        for (String line = in.readLine(); line != null; line = in.readLine()) {
            result.add(line);
        }
        return result;
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#cloneAsThawed()
     */
    public UnicodeRegex cloneAsThawed() {
        try {
            return (UnicodeRegex) clone();
        } catch (CloneNotSupportedException e) {
            // should never happen, since this class implements Cloneable
            throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
        }
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#freeze()
     */
    public UnicodeRegex freeze() {
        // no action needed now.
        return this;
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#isFrozen()
     */
    public boolean isFrozen() {
        // at this point, always true
        return true;
    }

    // ===== PRIVATES =====

    /**
     * Parses the UnicodeSet pattern that starts at index i of regex and appends
     * its toPattern(false) form to result.
     *
     * @param regex the full pattern being transformed
     * @param i index in regex at which the set starts
     * @param result receives the rewritten set
     * @param temp scratch set; cleared and reused on each call
     * @param pos scratch parse position; reused on each call
     * @return the index of the last character consumed, so that the caller's
     *         loop increment moves to the first character after the set
     * @throws IllegalArgumentException if the set cannot be processed; the
     *         original exception is attached as the cause
     */
    private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
        try {
            pos.setIndex(i);
            UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
            x.complement().complement(); // hack to fix toPattern
            result.append(x.toPattern(false));
            return pos.getIndex() - 1; // allow for the loop increment
        } catch (Exception e) {
            throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
        }
    }

    private static final UnicodeRegex STANDARD = new UnicodeRegex();
    private String bnfCommentString = "#";
    private String bnfVariableInfix = "=";
    private String bnfLineSeparator = "\n";
    private Appendable log = null;

    /**
     * Orders by the length of the toString() form, longest first, and then by
     * String.compareTo. Used for the variable map, so that compileBnf
     * substitutes longer variable names before shorter ones.
     */
    private Comparator<Object> LongestFirst = new Comparator<Object>() {
        public int compare(Object obj0, Object obj1) {
            String arg0 = obj0.toString();
            String arg1 = obj1.toString();
            int len0 = arg0.length();
            int len1 = arg1.length();
            if (len0 != len1) return len1 - len0; // lengths are non-negative, so no overflow
            return arg0.compareTo(arg1);
        }
    };

    /**
     * Parses BNF lines into a map from variable name to (unsubstituted) definition.
     * Empty lines are skipped, an initial BOM is removed, and anything from the
     * comment string onwards is dropped. A statement starts on the line that
     * contains the variable infix and ends on the line whose trimmed form ends
     * with ';'; lines in between are joined with the BNF line separator.
     *
     * @param lines the BNF lines
     * @return map from variable to definition, ordered longest variable first
     * @throws IllegalArgumentException if a ';' or '=' is missing, or a variable
     *         is defined twice
     */
    private Map<String, String> getVariables(List<String> lines) {
        Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
        String variable = null; // non-null while we are inside a statement
        StringBuilder definition = new StringBuilder();
        int count = 0; // 1-based line number, for error messages
        for (String line : lines) {
            ++count;
            // remove initial bom, comments
            if (line.length() == 0) {
                continue;
            }
            if (line.charAt(0) == '\uFEFF') {
                line = line.substring(1);
            }

            if (bnfCommentString != null) {
                int hashPos = line.indexOf(bnfCommentString);
                if (hashPos >= 0) {
                    line = line.substring(0, hashPos);
                }
            }
            String trimline = line.trim();
            if (trimline.length() == 0) {
                continue;
            }

            String linePart = line;
            boolean terminated = trimline.endsWith(";");
            if (terminated) {
                linePart = linePart.substring(0, linePart.lastIndexOf(';'));
            }
            int equalsPos = linePart.indexOf(bnfVariableInfix);
            if (equalsPos >= 0) {
                if (variable != null) {
                    throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
                }
                variable = linePart.substring(0, equalsPos).trim();
                if (variables.containsKey(variable)) {
                    throw new IllegalArgumentException("Duplicate variable definition in " + line);
                }
                // Skip the whole infix, which may be longer than one character.
                definition.append(linePart.substring(equalsPos + bnfVariableInfix.length()).trim());
            } else { // no equals, so this line continues the current statement
                if (variable == null) {
                    throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
                }
                definition.append(bnfLineSeparator).append(linePart);
            }
            // the statement is complete if the line ends with a ;
            if (terminated) {
                variables.put(variable, definition.toString());
                variable = null; // signal we have no variable
                definition.setLength(0);
            }
        }
        if (variable != null) {
            throw new IllegalArgumentException("Missing ';' at end");
        }
        return variables;
    }
}