// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2009-2015, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.dev.test.translit;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.UnicodeRegex;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UProperty.NameChoice;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * Tests for {@link UnicodeRegex}: rewriting of UnicodeSet syntax inside regular expressions,
 * compilation of the rewritten patterns, and BNF-style rule compilation.
 *
 * @author markdavis
 */
@RunWith(JUnit4.class)
public class RegexUtilitiesTest extends TestFmwk {
    /**
     * Check basic construction.
     *
     * <p>Each row holds a source pattern and, optionally, the expected output of
     * {@link UnicodeRegex#fix(String)}. A row with a single element expects the source to come
     * back unchanged.
     */
    @Test
    public void TestConstruction() {
        String[][] tests = {
                {"a"},
                {"a[a-z]b"},
                {"[ba-z]", "[a-z]"},
                {"q[ba-z]", "q[a-z]"},
                {"[ba-z]q", "[a-z]q"},
                {"a\\p{joincontrol}b", "a[\u200C\u200D]b"},
                {"a\\P{joincontrol}b", "a[^\u200C\u200D]b"},
                {"a[[:whitespace:]&[:Zl:]]b", "a[\\\u2028]b"},
                {"a [[:bc=cs:]&[:wspace:]] b", "a [\u00A0\u202F] b"},
        };
        for (int i = 0; i < tests.length; ++i) {
            final String source = tests[i][0];
            String expected = tests[i].length == 1 ? source : tests[i][1];
            String actual = UnicodeRegex.fix(source);
            assertEquals(source, expected, actual);
        }
    }

    /** Transliterator used to render characters readably in failure messages. */
    Transliterator hex = Transliterator.getInstance("hex");

    /**
     * Perform an exhaustive test on all Unicode characters to make sure that the UnicodeSet with each
     * character works.
     *
     * <p>For every code point tested, the set pattern containing the character twice must be
     * reduced by {@link UnicodeRegex#fix(String)} to a set containing it once, and the resulting
     * pattern must match that character but not the following code point. Unless the
     * exhaustiveness level is at least 10, only every 37th code point above 0xFF is tested.
     */
    @Test
    public void TestCharacters() {
        UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]");
        boolean skip = TestFmwk.getExhaustiveness() < 10;
        for (int cp = 0; cp < 0x110000; ++cp) {
            if (cp > 0xFF && skip && (cp % 37 != 0)) {
                continue;
            }
            String cpString = UTF16.valueOf(cp);
            // Characters with special meaning inside a set must be backslash-escaped.
            String s = requiresQuote.contains(cp) ? "\\" + cpString : cpString;
            String pattern = null;
            final String rawPattern = "[" + s + s + "]";
            try {
                pattern = UnicodeRegex.fix(rawPattern);
            } catch (Exception e) {
                // Report which pattern failed; the bare exception message may be null or
                // carry no hint about the offending character.
                errln("UnicodeRegex.fix failed for " + hex.transform(rawPattern) + ": " + e);
                continue;
            }
            final String expected = "[" + s + "]";
            assertEquals("Doubled character works" + hex.transform(s), expected, pattern);

            // verify that we can create a regex pattern and use as expected
            // (wraps around to U+0000 for the last code point)
            String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);
            checkCharPattern(Pattern.compile(pattern), pattern, cpString, shouldNotMatch);

            // verify that UnicodeRegex.compile works on the raw (unfixed) pattern
            checkCharPattern(UnicodeRegex.compile(rawPattern), pattern, cpString, shouldNotMatch);
        }
    }

    /**
     * Check all integer Unicode properties to make sure they work.
     *
     * <p>For each property value with a non-empty set, builds POSIX-style and Perl-style
     * patterns (positive and negative) and checks them against one character inside the set and
     * one outside it. Unless the exhaustiveness level is at least 10, only every fifth property
     * is tested, and only its minimum value.
     */
    @Test
    public void TestUnicodeProperties() {
        final boolean skip = TestFmwk.getExhaustiveness() < 10;
        UnicodeSet temp = new UnicodeSet();
        for (int propNum = UProperty.INT_START; propNum < UProperty.INT_LIMIT; ++propNum) {
            if (skip && (propNum % 5 != 0)) {
                continue;
            }
            String propName = UCharacter.getPropertyName(propNum, NameChoice.LONG);
            final int intPropertyMinValue = UCharacter.getIntPropertyMinValue(propNum);
            int intPropertyMaxValue = UCharacter.getIntPropertyMaxValue(propNum);
            if (skip) { // only test first if not exhaustive
                intPropertyMaxValue = intPropertyMinValue;
            }
            for (int valueNum = intPropertyMinValue; valueNum <= intPropertyMaxValue; ++valueNum) {
                // hack for getting property value name: fall back from the long name to the
                // short name, and finally to the numeric value
                String valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.LONG);
                if (valueName == null) {
                    valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.SHORT);
                    if (valueName == null) {
                        valueName = Integer.toString(valueNum);
                    }
                }
                temp.applyIntPropertyValue(propNum, valueNum);
                if (temp.size() == 0) {
                    continue;
                }
                final String prefix = "a";
                final String suffix = "b";
                String shouldMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;
                temp.complement();
                if (temp.size() == 0) {
                    // The value covers every code point, so there is no character to use for
                    // the negative check; charAt(0) would not yield a valid code point.
                    continue;
                }
                String shouldNotMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;

                // posix style pattern
                String rawPattern = prefix + "[:" + propName + "=" + valueName + ":]" + suffix;
                String rawNegativePattern = prefix + "[:^" + propName + "=" + valueName + ":]" + suffix;
                checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
                checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);

                // perl style pattern
                rawPattern = prefix + "\\p{" + propName + "=" + valueName + "}" + suffix;
                rawNegativePattern = prefix + "\\P{" + propName + "=" + valueName + "}" + suffix;
                checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
                checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);
            }
        }
    }

    /**
     * Check compilation of BNF-style rule sets.
     *
     * <p>Each row starts with either inline rules or the name of a {@code .txt} resource holding
     * them. A second element equal to {@code "Exception"} means compilation is expected to
     * throw; any other trailing elements are sample strings that are matched against the
     * compiled pattern, with the outcome written to the log.
     */
    @Test
    public void TestBnf() {
        UnicodeRegex regex = new UnicodeRegex();
        final String[][] tests = {
                {
                    "c = a wq;\n" +
                    "a = xyz;\n" +
                    "b = a a c;\n"
                },
                {
                    // c and b refer to each other; compilation is expected to be rejected
                    "c = a b;\n" +
                    "a = xyz;\n" +
                    "b = a a c;\n",
                    "Exception"
                },
                {
                    "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" +
                    "scheme = reserved+;\n" +
                    "host = // reserved+;\n" +
                    "query = [\\=reserved]+;\n" +
                    "fragment = reserved+;\n" +
                    "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n",
                    "http://\u03B1\u03B2\u03B3?huh=hi#there"},
                {
                    "langtagRegex.txt"
                }
        };
        for (int i = 0; i < tests.length; ++i) {
            String test = tests[i][0];
            final boolean expectException = tests[i].length >= 2 && tests[i][1].equals("Exception");
            try {
                String result;
                if (test.endsWith(".txt")) {
                    java.io.InputStream is = RegexUtilitiesTest.class.getResourceAsStream(test);
                    if (is == null) {
                        // getResourceAsStream returns null when the resource is absent; report
                        // that directly instead of failing later with an opaque
                        // NullPointerException.
                        errln("Could not find test resource " + test);
                        continue;
                    }
                    List<String> lines;
                    try {
                        lines = UnicodeRegex.appendLines(new ArrayList<String>(), is, "UTF-8");
                    } finally {
                        is.close();
                    }
                    result = regex.compileBnf(lines);
                } else {
                    result = regex.compileBnf(test);
                }
                if (expectException) {
                    errln("Expected exception for " + test);
                    continue;
                }
                result = result.replaceAll("[0-9]+%", ""); // just so we can use the language subtag stuff
                String resolved = regex.transform(result);
                logln(resolved);
                Matcher m = Pattern.compile(resolved, Pattern.COMMENTS).matcher("");
                StringBuilder checks = new StringBuilder();
                for (int j = 1; j < tests[i].length; ++j) {
                    String check = tests[i][j];
                    if (!m.reset(check).matches()) {
                        checks.append("Fails ").append(check).append("\n");
                    } else {
                        // Record every capture group so the log shows how the sample was split.
                        for (int k = 1; k <= m.groupCount(); ++k) {
                            checks.append("(").append(m.group(k)).append(")");
                        }
                        checks.append("\n");
                    }
                }
                logln("Result: " + result + "\n" + checks + "\n" + test);
            } catch (Exception e) {
                if (!expectException) {
                    errln(e.getClass().getName() + ": " + e.getMessage());
                }
            }
        }
    }

    /**
     * Utility for checking patterns: asserts that {@code pat} matches the whole of
     * {@code shouldMatch} and does not match the whole of {@code shouldNotMatch}.
     *
     * @param pat the compiled pattern under test
     * @param matchTitle description of the pattern, used as the prefix of failure messages
     * @param shouldMatch input the pattern must match in full
     * @param shouldNotMatch input the pattern must not match in full
     */
    private void checkCharPattern(Pattern pat, String matchTitle, String shouldMatch, String shouldNotMatch) {
        Matcher matcher = pat.matcher(shouldMatch);
        assertTrue(matchTitle + " and " + shouldMatch, matcher.matches());
        matcher.reset(shouldNotMatch);
        assertFalse(matchTitle + " and " + shouldNotMatch, matcher.matches());
    }
}