/* GENERATED SOURCE. DO NOT MODIFY. */
// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2009-2015, Google, International Business Machines Corporation
 * and others. All Rights Reserved.
 *******************************************************************************
 */
package android.icu.impl;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.text.ParsePosition;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

import android.icu.text.StringTransform;
import android.icu.text.SymbolTable;
import android.icu.text.UnicodeSet;
import android.icu.util.Freezable;

/**
 * Contains utilities to supplement the JDK Regex, since it doesn't handle
 * Unicode well.
 *
 * <p>TODO: Move to android.icu.dev.somewhere.
 * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools.
 *
 * @author markdavis
 * @hide Only a subset of ICU is exposed in Android
 */
public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
    // State held by an instance: the optional symbol table used when parsing
    // UnicodeSets, and the BNF syntax settings (comment marker, variable infix,
    // line separator). The regex style supported may become state in the future.

    // States of the scanner in transform().
    /** Outside any escape or quotation. */
    private static final int STATE_NORMAL = 0;
    /** Immediately after a backslash, outside a quotation. */
    private static final int STATE_AFTER_BACKSLASH = 1;
    /** Inside a \Q...\E quotation. */
    private static final int STATE_IN_QUOTE = 2;
    /** Inside a \Q...\E quotation, immediately after a backslash. */
    private static final int STATE_IN_QUOTE_AFTER_BACKSLASH = 3;

    private SymbolTable symbolTable;

    /**
     * Get the symbol table for internal processing
     * @hide draft / provisional / internal are hidden on Android
     */
    public SymbolTable getSymbolTable() {
        return symbolTable;
    }

    /**
     * Set the symbol table for internal processing
     * @return this object, to allow chaining
     * @hide draft / provisional / internal are hidden on Android
     */
    public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
        this.symbolTable = symbolTable;
        return this;
    }

    /**
     * Adds full Unicode property support, with the latest version of Unicode,
     * to Java Regex, bringing it up to Level 1 (see
     * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
     * regex pattern string and interpreting the character classes (\p{...},
     * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
     * this utility, Java regex expressions can be updated to work with the
     * latest version of Unicode, and with all Unicode properties. Note that the
     * UnicodeSet syntax has not yet, however, been updated to be completely
     * consistent with Java regex, so be careful of the differences.
     * <p>Character classes that are escaped with a backslash, or that occur
     * inside a \Q...\E quotation, are copied to the output unchanged.
     * <p>Not thread-safe; create a separate copy for different threads.
     * <p>In the future, we may extend this to support other regex packages.
     *
     * @param regex A modified Java regex pattern, as in the input to
     *         Pattern.compile(), except that all "character classes" are
     *         processed as if they were UnicodeSet patterns. Example:
     *         "abc[:bc=N:]". See UnicodeSet for the differences in syntax.
     * @return A processed Java regex pattern, suitable for input to
     *         Pattern.compile().
     * @throws IllegalArgumentException if a character class cannot be parsed
     *         as a UnicodeSet.
     */
    @Override
    public String transform(String regex) {
        StringBuilder result = new StringBuilder();
        UnicodeSet temp = new UnicodeSet();
        ParsePosition pos = new ParsePosition(0);
        int state = STATE_NORMAL;

        // We add each character unmodified to the output, unless we have a
        // UnicodeSet. Note that we don't worry about supplementary characters,
        // since none of the syntax uses them.

        for (int i = 0; i < regex.length(); ++i) {
            // look for UnicodeSets, allowing for quoting with \ and \Q
            char ch = regex.charAt(i);
            switch (state) {
            case STATE_NORMAL: // we only care about \, and '['.
                if (ch == '\\') {
                    if (UnicodeSet.resemblesPattern(regex, i)) {
                        // should only happen with \p
                        i = processSet(regex, i, result, temp, pos);
                        continue;
                    }
                    state = STATE_AFTER_BACKSLASH;
                } else if (ch == '[') {
                    // if we have what looks like a UnicodeSet
                    if (UnicodeSet.resemblesPattern(regex, i)) {
                        i = processSet(regex, i, result, temp, pos);
                        continue;
                    }
                }
                break;

            case STATE_AFTER_BACKSLASH:
                // \Q opens a quotation; any other character is simply escaped.
                state = (ch == 'Q') ? STATE_IN_QUOTE : STATE_NORMAL;
                break;

            case STATE_IN_QUOTE:
                if (ch == '\\') {
                    state = STATE_IN_QUOTE_AFTER_BACKSLASH;
                }
                break;

            case STATE_IN_QUOTE_AFTER_BACKSLASH:
                if (ch == 'E') {
                    state = STATE_NORMAL; // \E closes the quotation
                } else if (ch != '\\') {
                    state = STATE_IN_QUOTE;
                }
                // On another backslash we stay in this state: the quotation
                // ends at the first backslash-E pair, so that backslash may
                // itself be the start of the terminator.
                break;
            }
            result.append(ch);
        }
        return result.toString();
    }

    /**
     * Convenience static function, using standard parameters.
     * @param regex as in {@link #transform(String)}
     * @return processed regex pattern, as in {@link #transform(String)}
     */
    public static String fix(String regex) {
        return STANDARD.transform(regex);
    }

    /**
     * Compile a regex string, after processing by fix(...).
     *
     * @param regex Raw regex pattern, as in fix(...).
     * @return Pattern
     */
    public static Pattern compile(String regex) {
        return Pattern.compile(STANDARD.transform(regex));
    }

    /**
     * Compile a regex string, after processing by fix(...).
     *
     * @param regex Raw regex pattern, as in fix(...).
     * @param options Match flags, passed through to Pattern.compile(String, int).
     * @return Pattern
     */
    public static Pattern compile(String regex, int options) {
        return Pattern.compile(STANDARD.transform(regex), options);
    }

    /**
     * Compose a string from a set of BNF lines; see the List version for more information.
     * The input is split into lines at CR, LF, or CRLF.
     *
     * @param bnfLines Series of BNF lines.
     * @return The composed string (not a compiled Pattern).
     */
    public String compileBnf(String bnfLines) {
        return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
    }

    /**
     * Compose a string from a set of BNF lines, such as for composing a regex
     * expression. The lines can be in any order, but there must not be any
     * cycles. The result can be used as input for fix().
     * <p>
     * Example:
     * <pre>
     * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
     * scheme = reserved+;
     * host = // reserved+;
     * query = [\\=reserved]+;
     * fragment = reserved+;
     * reserved = [[:ascii:][:alphabetic:]];
     * </pre>
     * <p>
     * Caveats: at this point the parsing is simple; for example, # cannot be
     * quoted (use \\u0023); you can set it to null to disable.
     * The equality sign and a few others can be reset with
     * setBnfX().
     *
     * @param lines Series of lines that represent a BNF expression. The lines contain
     *            a series of statements that of the form x=y;. A statement can take
     *            multiple lines, but there can't be multiple statements on a line.
     *            A hash quotes to the end of the line.
     * @return The fully substituted definition of the single root variable
     *         (a String, not a compiled Pattern).
     * @throws IllegalArgumentException if the lines are malformed, or if there is
     *         not exactly one variable that is never used by another definition.
     */
    public String compileBnf(List<String> lines) {
        Map<String, String> variables = getVariables(lines);
        // A variable stays in this set until it is found inside another definition;
        // what remains at the end is the root.
        Set<String> unused = new LinkedHashSet<String>(variables.keySet());
        // brute force replacement; do twice to allow for different order
        // later on can optimize
        for (int i = 0; i < 2; ++i) {
            for (Entry<String, String> entry : variables.entrySet()) {
                String variable = entry.getKey(),
                    definition = entry.getValue();

                for (Entry<String, String> entry2 : variables.entrySet()) {
                    String variable2 = entry2.getKey(),
                        definition2 = entry2.getValue();
                    if (variable.equals(variable2)) {
                        continue;
                    }
                    String altered2 = definition2.replace(variable, definition);
                    if (!altered2.equals(definition2)) {
                        unused.remove(variable);
                        // Replaces the value of an existing key only, so the
                        // iteration over the entry set is not disturbed.
                        variables.put(variable2, altered2);
                    }
                }
            }
        }
        if (unused.size() != 1) {
            throw new IllegalArgumentException("Not a single root: " + unused);
        }
        return variables.get(unused.iterator().next());
    }

    /**
     * @return the string that starts a comment in BNF lines, or null if comments are disabled.
     */
    public String getBnfCommentString() {
        return bnfCommentString;
    }

    /**
     * @param bnfCommentString the string that starts a comment running to the end of a
     *            BNF line; null disables comment stripping.
     */
    public void setBnfCommentString(String bnfCommentString) {
        this.bnfCommentString = bnfCommentString;
    }

    /**
     * @return the string separating a variable name from its definition in BNF lines.
     */
    public String getBnfVariableInfix() {
        return bnfVariableInfix;
    }

    /**
     * @param bnfVariableInfix the string separating a variable name from its definition
     *            in BNF lines; it may be longer than one character.
     */
    public void setBnfVariableInfix(String bnfVariableInfix) {
        this.bnfVariableInfix = bnfVariableInfix;
    }

    /**
     * @return the string inserted between the lines of a multi-line BNF definition.
     */
    public String getBnfLineSeparator() {
        return bnfLineSeparator;
    }

    /**
     * @param bnfLineSeparator the string inserted between the lines of a multi-line
     *            BNF definition.
     */
    public void setBnfLineSeparator(String bnfLineSeparator) {
        this.bnfLineSeparator = bnfLineSeparator;
    }

    /**
     * Utility for loading lines from a file.
     * @param result The result of the appended lines.
     * @param file The file to have an input stream.
     * @param encoding if null, then UTF-8
     * @return filled list
     * @throws IOException If there were problems opening the file for input stream.
     */
    public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
        InputStream is = new FileInputStream(file);
        try {
            return appendLines(result, is, encoding);
        } finally {
            is.close();
        }
    }

    /**
     * Utility for loading lines from an input stream.
     * The stream is read to its end but is not closed by this method.
     * @param result The result of the appended lines.
     * @param inputStream The input stream.
     * @param encoding if null, then UTF-8
     * @return filled list
     * @throws UnsupportedEncodingException If the named encoding is not supported.
     * @throws IOException If there were problems reading from the input stream.
     */
    public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
            throws UnsupportedEncodingException, IOException {
        BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
        while (true) {
            String line = in.readLine();
            if (line == null) break;
            result.add(line);
        }
        return result;
    }

    /* (non-Javadoc)
     * @see android.icu.util.Freezable#cloneAsThawed()
     */
    @Override
    public UnicodeRegex cloneAsThawed() {
        try {
            return (UnicodeRegex)clone();
        } catch (CloneNotSupportedException e) {
            // should never happen, since this class is Cloneable; keep the cause anyway
            throw new IllegalArgumentException(e);
        }
    }

    /* (non-Javadoc)
     * @see android.icu.util.Freezable#freeze()
     */
    @Override
    public UnicodeRegex freeze() {
        // no action needed now.
        return this;
    }

    /* (non-Javadoc)
     * @see android.icu.util.Freezable#isFrozen()
     */
    @Override
    public boolean isFrozen() {
        // at this point, always true
        return true;
    }

    // ===== PRIVATES =====

    /**
     * Parses the UnicodeSet starting at index i of regex and appends its
     * pattern form to result.
     *
     * @param regex the pattern being transformed
     * @param i index of the first character of the set
     * @param result receives the set's pattern
     * @param temp scratch set, cleared and reused on each call
     * @param pos scratch parse position, reused on each call
     * @return the index of the last character consumed, so that the caller's
     *         loop increment moves to the first character after the set
     * @throws IllegalArgumentException if the set cannot be parsed
     */
    private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
        try {
            pos.setIndex(i);
            UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
            x.complement().complement(); // hack to fix toPattern
            result.append(x.toPattern(false));
            i = pos.getIndex() - 1; // allow for the loop increment
            return i;
        } catch (Exception e) {
            throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
        }
    }

    private static final UnicodeRegex STANDARD = new UnicodeRegex();

    /**
     * Orders by string form: longer strings first, ties broken by natural
     * String order. Used for the variable map so that a variable whose name
     * contains another variable's name is visited first.
     */
    private static final Comparator<Object> LONGEST_FIRST = new Comparator<Object>() {
        @Override
        public int compare(Object obj0, Object obj1) {
            String arg0 = obj0.toString();
            String arg1 = obj1.toString();
            int len0 = arg0.length();
            int len1 = arg1.length();
            // lengths are non-negative, so the subtraction cannot overflow
            if (len0 != len1) return len1 - len0;
            return arg0.compareTo(arg1);
        }
    };

    private String bnfCommentString = "#";
    private String bnfVariableInfix = "=";
    private String bnfLineSeparator = "\n";

    /**
     * Parses BNF lines into a map from variable name to raw definition.
     * A statement starts on the line containing the variable infix and ends
     * on the line whose trimmed text ends with ';'. Continuation lines are
     * joined with the BNF line separator.
     *
     * @param lines the BNF lines
     * @return map of variable to definition, ordered longest name first
     * @throws IllegalArgumentException on a missing ';' or infix, or on a
     *         duplicate variable
     */
    private Map<String, String> getVariables(List<String> lines) {
        Map<String, String> variables = new TreeMap<String, String>(LONGEST_FIRST);
        String variable = null;
        StringBuilder definition = new StringBuilder();
        int count = 0;
        for (String line : lines) {
            ++count;
            // remove initial bom, comments
            if (line.length() == 0) continue;
            if (line.charAt(0) == '\uFEFF') line = line.substring(1);

            if (bnfCommentString != null) {
                int hashPos = line.indexOf(bnfCommentString);
                if (hashPos >= 0) line = line.substring(0, hashPos);
            }
            String trimline = line.trim();
            if (trimline.length() == 0) continue;

            String linePart = line;
            boolean terminated = trimline.endsWith(";");
            if (terminated) {
                // drop the terminating ';' and anything (whitespace) after it
                linePart = linePart.substring(0, linePart.lastIndexOf(';'));
            }
            int equalsPos = linePart.indexOf(bnfVariableInfix);
            if (equalsPos >= 0) {
                if (variable != null) {
                    throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
                }
                variable = linePart.substring(0, equalsPos).trim();
                if (variables.containsKey(variable)) {
                    throw new IllegalArgumentException("Duplicate variable definition in " + line);
                }
                // skip the whole infix, which may be longer than one character
                definition.append(linePart.substring(equalsPos + bnfVariableInfix.length()).trim());
            } else { // no equals, so this continues the current definition
                if (variable == null) {
                    throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
                }
                definition.append(bnfLineSeparator).append(linePart);
            }
            // the statement is complete once a line ends with a ;
            if (terminated) {
                variables.put(variable, definition.toString());
                variable = null; // signal we have no variable
                definition.setLength(0);
            }
        }
        if (variable != null) {
            throw new IllegalArgumentException("Missing ';' at end");
        }
        return variables;
    }
}