1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2009-2015, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package android.icu.dev.test.translit; 11 12 import java.util.ArrayList; 13 import java.util.List; 14 import java.util.regex.Matcher; 15 import java.util.regex.Pattern; 16 17 import org.junit.Test; 18 import org.junit.runner.RunWith; 19 import org.junit.runners.JUnit4; 20 21 import android.icu.dev.test.TestFmwk; 22 import android.icu.impl.UnicodeRegex; 23 import android.icu.lang.UCharacter; 24 import android.icu.lang.UProperty; 25 import android.icu.lang.UProperty.NameChoice; 26 import android.icu.text.Transliterator; 27 import android.icu.text.UTF16; 28 import android.icu.text.UnicodeSet; 29 import android.icu.testsharding.MainTestShard; 30 31 /** 32 * @author markdavis 33 */ 34 @MainTestShard 35 @RunWith(JUnit4.class) 36 public class RegexUtilitiesTest extends TestFmwk { 37 /** 38 * Check basic construction. 39 */ 40 @Test 41 public void TestConstruction() { 42 String[][] tests = { 43 {"a"}, 44 {"a[a-z]b"}, 45 {"[ba-z]", "[a-z]"}, 46 {"q[ba-z]", "q[a-z]"}, 47 {"[ba-z]q", "[a-z]q"}, 48 {"a\\p{joincontrol}b", "a[\u200C\u200D]b"}, 49 {"a\\P{joincontrol}b", "a[^\u200C\u200D]b"}, 50 {"a[[:whitespace:]&[:Zl:]]b", "a[\\\u2028]b"}, 51 {"a [[:bc=cs:]&[:wspace:]] b", "a [\u00A0\u202F] b"}, 52 }; 53 for (int i = 0; i < tests.length; ++i) { 54 final String source = tests[i][0]; 55 String expected = tests[i].length == 1 ? source : tests[i][1]; 56 String actual = UnicodeRegex.fix(source); 57 assertEquals(source, expected, actual); 58 } 59 } 60 61 Transliterator hex = Transliterator.getInstance("hex"); 62 63 /** 64 * Perform an exhaustive test on all Unicode characters to make sure that the UnicodeSet with each 65 * character works. 66 */ 67 @Test 68 public void TestCharacters() { 69 UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]"); 70 boolean skip = TestFmwk.getExhaustiveness() < 10; 71 for (int cp = 0; cp < 0x110000; ++cp) { 72 if (cp > 0xFF && skip && (cp % 37 != 0)) { 73 continue; 74 } 75 String cpString = UTF16.valueOf(cp); 76 String s = requiresQuote.contains(cp) ? "\\" + cpString : cpString; 77 String pattern = null; 78 final String rawPattern = "[" + s + s + "]"; 79 try { 80 pattern = UnicodeRegex.fix(rawPattern); 81 } catch (Exception e) { 82 errln(e.getMessage()); 83 continue; 84 } 85 final String expected = "[" + s + "]"; 86 assertEquals("Doubled character works" + hex.transform(s), expected, pattern); 87 88 // verify that we can create a regex pattern and use as expected 89 String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000); 90 checkCharPattern(Pattern.compile(pattern), pattern, cpString, shouldNotMatch); 91 92 // verify that the Pattern.compile works 93 checkCharPattern(UnicodeRegex.compile(rawPattern), pattern, cpString, shouldNotMatch); 94 } 95 } 96 97 /** 98 * Check all integer Unicode properties to make sure they work. 99 */ 100 @Test 101 public void TestUnicodeProperties() { 102 final boolean skip = TestFmwk.getExhaustiveness() < 10; 103 UnicodeSet temp = new UnicodeSet(); 104 for (int propNum = UProperty.INT_START; propNum < UProperty.INT_LIMIT; ++propNum) { 105 if (skip && (propNum % 5 != 0)) { 106 continue; 107 } 108 String propName = UCharacter.getPropertyName(propNum, NameChoice.LONG); 109 final int intPropertyMinValue = UCharacter.getIntPropertyMinValue(propNum); 110 int intPropertyMaxValue = UCharacter.getIntPropertyMaxValue(propNum); 111 if (skip) { // only test first if not exhaustive 112 intPropertyMaxValue = intPropertyMinValue; 113 } 114 for (int valueNum = intPropertyMinValue; valueNum <= intPropertyMaxValue; ++valueNum) { 115 // hack for getting property value name 116 String valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.LONG); 117 if (valueName == null) { 118 valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.SHORT); 119 if (valueName == null) { 120 valueName = Integer.toString(valueNum); 121 } 122 } 123 temp.applyIntPropertyValue(propNum, valueNum); 124 if (temp.size() == 0) { 125 continue; 126 } 127 final String prefix = "a"; 128 final String suffix = "b"; 129 String shouldMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix; 130 temp.complement(); 131 String shouldNotMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix; 132 133 // posix style pattern 134 String rawPattern = prefix + "[:" + propName + "=" + valueName + ":]" + suffix; 135 String rawNegativePattern = prefix + "[:^" + propName + "=" + valueName + ":]" + suffix; 136 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch); 137 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch); 138 139 // perl style pattern 140 rawPattern = prefix + "\\p{" + propName + "=" + valueName + "}" + suffix; 141 rawNegativePattern = prefix + "\\P{" + propName + "=" + valueName + "}" + suffix; 142 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch); 143 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch); 144 } 145 } 146 } 147 148 @Test 149 public void TestBnf() { 150 UnicodeRegex regex = new UnicodeRegex(); 151 final String[][] tests = { 152 { 153 "c = a wq;\n" + 154 "a = xyz;\n" + 155 "b = a a c;\n" 156 }, 157 { 158 "c = a b;\n" + 159 "a = xyz;\n" + 160 "b = a a c;\n", 161 "Exception" 162 }, 163 { 164 "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" + 165 "scheme = reserved+;\n" + 166 "host = // reserved+;\n" + 167 "query = [\\=reserved]+;\n" + 168 "fragment = reserved+;\n" + 169 "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n", 170 "http://\u03B1\u03B2\u03B3?huh=hi#there"}, 171 { 172 "langtagRegex.txt" 173 } 174 }; 175 for (int i = 0; i < tests.length; ++i) { 176 String test = tests[i][0]; 177 final boolean expectException = tests[i].length < 2 ? false : tests[i][1].equals("Exception"); 178 try { 179 String result; 180 if (test.endsWith(".txt")) { 181 java.io.InputStream is = RegexUtilitiesTest.class.getResourceAsStream(test); 182 List lines; 183 try { 184 lines = UnicodeRegex.appendLines(new ArrayList(), is, "UTF-8"); 185 } finally { 186 is.close(); 187 } 188 result = regex.compileBnf(lines); 189 } else { 190 result = regex.compileBnf(test); 191 } 192 if (expectException) { 193 errln("Expected exception for " + test); 194 continue; 195 } 196 result = result.replaceAll("[0-9]+%", ""); // just so we can use the language subtag stuff 197 String resolved = regex.transform(result); 198 logln(resolved); 199 Matcher m = Pattern.compile(resolved, Pattern.COMMENTS).matcher(""); 200 String checks = ""; 201 for (int j = 1; j < tests[i].length; ++j) { 202 String check = tests[i][j]; 203 if (!m.reset(check).matches()) { 204 checks = checks + "Fails " + check + "\n"; 205 } else { 206 for (int k = 1; k <= m.groupCount(); ++k) { 207 checks += "(" + m.group(k) + ")"; 208 } 209 checks += "\n"; 210 } 211 } 212 logln("Result: " + result + "\n" + checks + "\n" + test); 213 } catch (Exception e) { 214 if (!expectException) { 215 errln(e.getClass().getName() + ": " + e.getMessage()); 216 } 217 continue; 218 } 219 } 220 } 221 222 /** 223 * Utility for checking patterns 224 */ 225 private void checkCharPattern(Pattern pat, String matchTitle, String shouldMatch, String shouldNotMatch) { 226 Matcher matcher = pat.matcher(shouldMatch); 227 assertTrue(matchTitle + " and " + shouldMatch, matcher.matches()); 228 matcher.reset(shouldNotMatch); 229 assertFalse(matchTitle + " and " + shouldNotMatch, matcher.matches()); 230 } 231 } 232