1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2009-2015, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.dev.test.text; 10 11 import java.io.BufferedReader; 12 import java.io.IOException; 13 import java.io.Reader; 14 import java.io.StringReader; 15 import java.lang.reflect.Constructor; 16 import java.lang.reflect.InvocationTargetException; 17 import java.lang.reflect.Method; 18 import java.text.ParseException; 19 import java.util.BitSet; 20 import java.util.HashSet; 21 import java.util.LinkedHashSet; 22 import java.util.Locale; 23 import java.util.Set; 24 import java.util.regex.Matcher; 25 import java.util.regex.Pattern; 26 27 import org.junit.Test; 28 import org.junit.runner.RunWith; 29 import org.junit.runners.JUnit4; 30 31 import com.ibm.icu.dev.test.TestFmwk; 32 import com.ibm.icu.dev.test.TestUtil; 33 import com.ibm.icu.dev.test.TestUtil.JavaVendor; 34 import com.ibm.icu.impl.Utility; 35 import com.ibm.icu.lang.UScript; 36 import com.ibm.icu.text.Normalizer2; 37 import com.ibm.icu.text.SpoofChecker; 38 import com.ibm.icu.text.SpoofChecker.CheckResult; 39 import com.ibm.icu.text.SpoofChecker.RestrictionLevel; 40 import com.ibm.icu.text.UnicodeSet; 41 import com.ibm.icu.util.ULocale; 42 43 @RunWith(JUnit4.class) 44 public class SpoofCheckerTest extends TestFmwk { 45 /* 46 * Identifiers for verifying that spoof checking is minimally alive and working. 47 */ 48 char[] goodLatinChars = { (char) 0x75, (char) 0x7a }; 49 String goodLatin = new String(goodLatinChars); /* "uz", all ASCII */ 50 /* (not confusable) */ 51 char[] scMixedChars = { (char) 0x73, (char) 0x0441 }; 52 String scMixed = new String(scMixedChars); /* "sc", with Cyrillic 'c' */ 53 /* (mixed script, confusable */ 54 55 String scLatin = "sc"; /* "sc", plain ascii. */ 56 String goodCyrl = "\u0438\u043B"; // "Cyrillic small letter i and el" Plain lower case Cyrillic letters, no latin confusables 57 String goodGreek = "\u03c0\u03c6"; // "Greek small letter pi and phi" Plain lower case Greek letters 58 59 // Various 1 l I look-alikes 60 String lll_Latin_a = "lI1"; // small letter l, cap I, digit 1, all ASCII 61 // "\uFF29\u217C\u0196" Full-width I, Small Roman Numeral fifty, Latin Cap Letter IOTA 62 String lll_Latin_b = "\uff29\u217c\u0196"; 63 String lll_Cyrl = "\u0406\u04C0\u0031"; // "\u0406\u04C01" 64 /* The skeleton transform for all of the 'lll' lookalikes is ascii lower case letter l. */ 65 String lll_Skel = "lll"; 66 67 String han_Hiragana = "\u3086\u308A \u77F3\u7530"; // Hiragana, space, Han 68 69 70 /* 71 * Test basic constructor. 72 */ 73 @Test 74 public void TestUSpoof() { 75 SpoofChecker sc = new SpoofChecker.Builder().build(); 76 if (sc == null) { 77 errln("FAIL: null SpoofChecker"); 78 } 79 } 80 81 /* 82 * Test build from source rules. 83 */ 84 @Test 85 public void TestOpenFromSourceRules() { 86 if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) { 87 // Note: IBM Java 5 has a bug reading a large UTF-8 text contents 88 logln("Skip this test case because of the IBM Java 5 bug"); 89 return; 90 } 91 String fileName; 92 Reader confusables; 93 94 try { 95 SpoofChecker rsc = null; 96 97 fileName = "unicode/confusables.txt"; 98 confusables = TestUtil.getDataReader(fileName, "UTF-8"); 99 try { 100 rsc = new SpoofChecker.Builder().setData(confusables).build(); 101 } finally { 102 confusables.close(); 103 } 104 105 if (rsc == null) { 106 errln("FAIL: null SpoofChecker"); 107 return; 108 } 109 // Check that newly built-from-rules SpoofChecker is able to function. 110 checkSkeleton(rsc, "TestOpenFromSourceRules"); 111 112 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 113 rsc.failsChecks("Hello", result); 114 115 // The checker we just built from source rules should be equivalent to the 116 // default checker created from prebuilt rules baked into the ICU data. 117 SpoofChecker defaultChecker = new SpoofChecker.Builder().build(); 118 assertEquals("Checker built from rules equals default", defaultChecker, rsc); 119 assertEquals("Checker built from rules has same hash code as default", defaultChecker.hashCode(), rsc.hashCode()); 120 121 SpoofChecker optionChecker = new SpoofChecker.Builder(). 122 setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE).build(); 123 assertFalse("", optionChecker.equals(rsc)); 124 125 String stubConfusables = 126 "# Stub confusables data\n" + 127 "05AD ; 0596 ; MA # ( ) HEBREW ACCENT DEHI HEBREW ACCENT TIPEHA #\n"; 128 129 // Verify that re-using a builder doesn't alter SpoofCheckers that were 130 // previously created by that builder. (The builder could modify data 131 // being used by the existing checker) 132 133 SpoofChecker.Builder builder = new SpoofChecker.Builder(); 134 SpoofChecker testChecker1 = builder.build(); 135 assertTrue("", testChecker1.equals(defaultChecker)); 136 137 builder.setData(new StringReader(stubConfusables)); 138 builder.setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE); 139 builder.setChecks(SpoofChecker.SINGLE_SCRIPT_CONFUSABLE); 140 Set<ULocale>allowedLocales = new HashSet<ULocale>(); 141 allowedLocales.add(ULocale.JAPANESE); 142 allowedLocales.add(ULocale.FRENCH); 143 builder.setAllowedLocales(allowedLocales); 144 SpoofChecker testChecker2 = builder.build(); 145 SpoofChecker testChecker3 = builder.build(); 146 147 assertTrue("", testChecker1.equals(defaultChecker)); 148 assertFalse("", testChecker2.equals(defaultChecker)); 149 assertTrue("", testChecker2.equals(testChecker3)); 150 151 } catch (java.io.IOException e) { 152 errln(e.toString()); 153 } catch (ParseException e) { 154 errln(e.toString()); 155 } 156 } 157 158 /* 159 * Set & Get Check Flags 160 */ 161 @Test 162 public void TestGetSetChecks1() { 163 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build(); 164 int t; 165 t = sc.getChecks(); 166 assertEquals("", SpoofChecker.ALL_CHECKS, t); 167 168 sc = new SpoofChecker.Builder().setChecks(0).build(); 169 t = sc.getChecks(); 170 assertEquals("", 0, t); 171 172 int checks = SpoofChecker.WHOLE_SCRIPT_CONFUSABLE | SpoofChecker.MIXED_SCRIPT_CONFUSABLE 173 | SpoofChecker.ANY_CASE; 174 sc = new SpoofChecker.Builder().setChecks(checks).build(); 175 t = sc.getChecks(); 176 assertEquals("", checks, t); 177 } 178 179 /* 180 * get & setAllowedChars 181 */ 182 @Test 183 public void TestGetSetAllowedChars() { 184 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).build(); 185 UnicodeSet us; 186 UnicodeSet uset; 187 188 uset = sc.getAllowedChars(); 189 assertTrue("", uset.isFrozen()); 190 us = new UnicodeSet(0x41, 0x5A); /* [A-Z] */ 191 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedChars(us).build(); 192 assertEquals("", us, sc.getAllowedChars()); 193 } 194 195 /* 196 * get & set Checks 197 */ 198 @Test 199 public void TestGetSetChecks() { 200 SpoofChecker sc = new SpoofChecker.Builder().build(); 201 int checks; 202 int checks2; 203 boolean checkResults; 204 205 checks = sc.getChecks(); 206 assertEquals("", SpoofChecker.ALL_CHECKS, checks); 207 208 checks &= ~(SpoofChecker.SINGLE_SCRIPT | SpoofChecker.MIXED_SCRIPT_CONFUSABLE); 209 sc = new SpoofChecker.Builder().setChecks(checks).build(); 210 checks2 = sc.getChecks(); 211 assertEquals("", checks, checks2); 212 213 /* 214 * The checks that were disabled just above are the same ones that the "scMixed" test fails. So with those tests 215 * gone checking that Identifier should now succeed 216 */ 217 checkResults = sc.failsChecks(scMixed); 218 assertFalse("", checkResults); 219 } 220 221 /* 222 * AllowedLocales 223 */ 224 @Test 225 public void TestAllowedLocales() { 226 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).build(); 227 Set<ULocale> allowedLocales = null; 228 Set<Locale> allowedJavaLocales = null; 229 boolean checkResults; 230 231 /* Default allowed locales list should be empty */ 232 allowedLocales = sc.getAllowedLocales(); 233 assertTrue("Empty allowed locales", allowedLocales.isEmpty()); 234 235 allowedJavaLocales = sc.getAllowedJavaLocales(); 236 assertTrue("Empty allowed Java locales", allowedJavaLocales.isEmpty()); 237 238 /* Allow en and ru, which should enable Latin and Cyrillic only to pass */ 239 ULocale enloc = new ULocale("en"); 240 ULocale ruloc = new ULocale("ru_RU"); 241 allowedLocales = new HashSet<ULocale>(); 242 allowedLocales.add(enloc); 243 allowedLocales.add(ruloc); 244 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build(); 245 allowedLocales = sc.getAllowedLocales(); 246 assertTrue("en in allowed locales", allowedLocales.contains(enloc)); 247 assertTrue("ru_RU in allowed locales", allowedLocales.contains(ruloc)); 248 249 Locale frlocJ = new Locale("fr"); 250 allowedJavaLocales = new HashSet<Locale>(); 251 allowedJavaLocales.add(frlocJ); 252 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedJavaLocales(allowedJavaLocales).build(); 253 assertFalse("no en in allowed Java locales", allowedJavaLocales.contains(new Locale("en"))); 254 assertTrue("fr in allowed Java locales", allowedJavaLocales.contains(frlocJ)); 255 256 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build(); 257 258 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 259 checkResults = sc.failsChecks(goodLatin); 260 assertFalse("", checkResults); 261 262 checkResults = sc.failsChecks(goodGreek, result); 263 assertEquals("", SpoofChecker.CHAR_LIMIT, result.checks); 264 265 checkResults = sc.failsChecks(goodCyrl); 266 assertFalse("", checkResults); 267 268 /* Reset with an empty locale list, which should allow all characters to pass */ 269 allowedLocales = new LinkedHashSet<ULocale>(); 270 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build(); 271 272 checkResults = sc.failsChecks(goodGreek); 273 assertFalse("", checkResults); 274 } 275 276 /* 277 * AllowedChars set/get the UnicodeSet of allowed characters. 278 */ 279 @Test 280 public void TestAllowedChars() { 281 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).build(); 282 UnicodeSet set; 283 UnicodeSet tmpSet; 284 boolean checkResults; 285 286 /* By default, we should see no restriction; the UnicodeSet should allow all characters. */ 287 set = sc.getAllowedChars(); 288 tmpSet = new UnicodeSet(0, 0x10ffff); 289 assertEquals("", tmpSet, set); 290 291 /* Remove a character that is in our good Latin test identifier from the allowed chars set. */ 292 tmpSet.remove(goodLatin.charAt(1)); 293 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedChars(tmpSet).build(); 294 295 /* Latin Identifier should now fail; other non-latin test cases should still be OK */ 296 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 297 checkResults = sc.failsChecks(goodLatin, result); 298 assertTrue("", checkResults); 299 assertEquals("", SpoofChecker.CHAR_LIMIT, result.checks); 300 } 301 302 @Test 303 public void TestCheck() { 304 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build(); 305 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 306 boolean checkResults; 307 308 result.position = 666; 309 checkResults = sc.failsChecks(goodLatin, result); 310 assertFalse("", checkResults); 311 assertEquals("", 0, result.checks); 312 313 checkResults = sc.failsChecks(goodCyrl, result); 314 assertFalse("", checkResults); 315 assertEquals("", 0, result.checks); 316 317 result.position = 666; 318 checkResults = sc.failsChecks(scMixed, result); 319 assertTrue("", checkResults); 320 assertEquals("", SpoofChecker.RESTRICTION_LEVEL, result.checks); 321 322 result.position = 666; 323 checkResults = sc.failsChecks(han_Hiragana, result); 324 assertFalse("", checkResults); 325 assertEquals("", 0, result.checks); 326 } 327 328 @Test 329 public void TestAreConfusable1() { 330 SpoofChecker sc = new SpoofChecker.Builder().build(); 331 int checkResults; 332 checkResults = sc.areConfusable(scLatin, scMixed); 333 assertEquals("Latin/Mixed is not MIXED_SCRIPT_CONFUSABLE", SpoofChecker.MIXED_SCRIPT_CONFUSABLE, checkResults); 334 335 checkResults = sc.areConfusable(goodGreek, scLatin); 336 assertEquals("Greek/Latin is not unconfusable", 0, checkResults); 337 338 checkResults = sc.areConfusable(lll_Latin_a, lll_Latin_b); 339 assertEquals("Latin/Latin is not SINGLE_SCRIPT_CONFUSABLE", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResults); 340 } 341 342 @Test 343 public void TestGetSkeleton() { 344 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 345 String dest; 346 dest = sc.getSkeleton(SpoofChecker.ANY_CASE, lll_Latin_a); 347 assertEquals("", lll_Skel, dest); 348 } 349 350 /** 351 * IntlTestSpoof is the top level test class for the Unicode Spoof detection tests 352 */ 353 354 // Test the USpoofDetector API functions that require C++ 355 // The pure C part of the API, which is most of it, is tested in cintltst 356 /** 357 * IntlTestSpoof tests for USpoofDetector 358 */ 359 @Test 360 public void TestSpoofAPI() { 361 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build(); 362 String s = "xyz"; 363 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 364 result.position = 666; 365 boolean checkResults = sc.failsChecks(s, result); 366 assertFalse("", checkResults); 367 assertEquals("", 0, result.position); 368 369 sc = new SpoofChecker.Builder().build(); 370 String s1 = "cxs"; 371 String s2 = Utility.unescape("\\u0441\\u0445\\u0455"); // Cyrillic "cxs" 372 int checkResult = sc.areConfusable(s1, s2); 373 assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, checkResult); 374 375 sc = new SpoofChecker.Builder().build(); 376 s = "I1l0O"; 377 String dest = sc.getSkeleton(SpoofChecker.ANY_CASE, s); 378 assertEquals("", dest, "lllOO"); 379 } 380 381 @Test 382 public void TestSkeleton() { 383 SpoofChecker sc = new SpoofChecker.Builder().build(); 384 checkSkeleton(sc, "TestSkeleton"); 385 } 386 387 // testSkeleton. Spot check a number of confusable skeleton substitutions from the 388 // Unicode data file confusables.txt 389 // Test cases chosen for substitutions of various lengths, and 390 // membership in different mapping tables. 391 public void checkSkeleton(SpoofChecker sc, String testName) { 392 int ML = 0; 393 int SL = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE; 394 int MA = SpoofChecker.ANY_CASE; 395 int SA = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE; 396 397 checkSkeleton(sc, MA, "\\u02b9identifier'", "'identifier'", testName); 398 399 checkSkeleton(sc, SL, "nochange", "nochange", testName); 400 checkSkeleton(sc, SA, "nochange", "nochange", testName); 401 checkSkeleton(sc, ML, "nochange", "nochange", testName); 402 checkSkeleton(sc, MA, "nochange", "nochange", testName); 403 checkSkeleton(sc, MA, "love", "love", testName); 404 checkSkeleton(sc, MA, "1ove", "love", testName); // Digit 1 to letter l 405 checkSkeleton(sc, ML, "OOPS", "OOPS", testName); 406 checkSkeleton(sc, ML, "00PS", "OOPS", testName); 407 checkSkeleton(sc, MA, "OOPS", "OOPS", testName); 408 checkSkeleton(sc, MA, "00PS", "OOPS", testName); // Digit 0 to letter O 409 checkSkeleton(sc, SL, "\\u059c", "\\u0301", testName); 410 checkSkeleton(sc, SL, "\\u2A74", "\\u003A\\u003A\\u003D", testName); 411 checkSkeleton(sc, SL, "\\u247E", "(ll)", testName); 412 checkSkeleton(sc, SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u006c\\u0644\\u006f", testName); 413 414 // 0C83 mapping existed in the ML and MA tables, did not exist in SL, SA (Original Unicode 7) 415 // mapping exists in all tables (ICU 55). 416 // 0C83 ; 0983 ; ML # KANNADA SIGN VISARGA to 417 checkSkeleton(sc, SL, "\\u0C83", "\\u0983", testName); 418 checkSkeleton(sc, SA, "\\u0C83", "\\u0983", testName); 419 checkSkeleton(sc, ML, "\\u0C83", "\\u0983", testName); 420 checkSkeleton(sc, MA, "\\u0C83", "\\u0983", testName); 421 422 // 0391 mappings existed only in MA and SA tables (Original Unicode 7). 423 // mappings exist in all tables (ICU 55) 424 checkSkeleton(sc, MA, "\\u0391", "A", testName); 425 checkSkeleton(sc, SA, "\\u0391", "A", testName); 426 checkSkeleton(sc, ML, "\\u0391", "A", testName); 427 checkSkeleton(sc, SL, "\\u0391", "A", testName); 428 429 // 13CF Mappings in all four tables, different in MA (Original Unicode 7). 430 // Mapping same in all tables (ICU 55) 431 checkSkeleton(sc, ML, "\\u13CF", "b", testName); 432 checkSkeleton(sc, MA, "\\u13CF", "b", testName); 433 checkSkeleton(sc, SL, "\\u13CF", "b", testName); 434 checkSkeleton(sc, SA, "\\u13CF", "b", testName); 435 436 // 0022 ; 0027 0027 ; 437 // all tables 438 checkSkeleton(sc, SL, "\"", "\\u0027\\u0027", testName); 439 checkSkeleton(sc, SA, "\"", "\\u0027\\u0027", testName); 440 checkSkeleton(sc, ML, "\"", "\\u0027\\u0027", testName); 441 checkSkeleton(sc, MA, "\"", "\\u0027\\u0027", testName); 442 443 } 444 445 // Internal function to run a single skeleton test case. 446 // 447 // Run a single confusable skeleton transformation test case. 448 // 449 void checkSkeleton(SpoofChecker sc, int type, String input, String expected, String testName) { 450 String uInput = Utility.unescape(input); 451 String uExpected = Utility.unescape(expected); 452 String actual; 453 actual = sc.getSkeleton(type, uInput); 454 Throwable t = new Throwable(); 455 int lineNumberOfTest = t.getStackTrace()[1].getLineNumber(); 456 457 assertEquals(testName + " test at line " + lineNumberOfTest + " : Expected (escaped): " + expected, uExpected, actual); 458 } 459 460 @Test 461 public void TestAreConfusable() { 462 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 463 String s1 = "A long string that will overflow stack buffers. A long string that will overflow stack buffers. " 464 + "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "; 465 String s2 = "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. " 466 + "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "; 467 assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, sc.areConfusable(s1, s2)); 468 } 469 470 @Test 471 public void TestConfusableFlagVariants() { 472 // The spoof checker should only return those tests that the user requested. This test makes sure that 473 // the checker doesn't return anything the user doesn't want. This test started passing in ICU 58. 474 475 // NOTE: These strings are the same ones as in the documentation. If the confusables data changes 476 // and this test breaks, pick a new confusables pair, update it here, and also update it in the 477 // documentation of SpoofChecker.java. 478 String latn = "desparejado"; 479 String cyrl = ""; 480 String mixed = "dsrd"; 481 482 Object[][] tests = { 483 // string 1, string 2, checks for spoof checker, expected output 484 { latn, cyrl, 485 SpoofChecker.CONFUSABLE, 486 SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE }, 487 { latn, cyrl, 488 SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, 489 SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE }, 490 { latn, cyrl, 491 SpoofChecker.MIXED_SCRIPT_CONFUSABLE, 492 SpoofChecker.MIXED_SCRIPT_CONFUSABLE }, 493 { latn, cyrl, 494 SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, 495 SpoofChecker.WHOLE_SCRIPT_CONFUSABLE }, 496 { latn, cyrl, 497 SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, 498 0 }, 499 { latn, mixed, 500 SpoofChecker.CONFUSABLE, 501 SpoofChecker.MIXED_SCRIPT_CONFUSABLE }, 502 { latn, mixed, 503 SpoofChecker.MIXED_SCRIPT_CONFUSABLE, 504 SpoofChecker.MIXED_SCRIPT_CONFUSABLE }, 505 { latn, mixed, 506 SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, 507 SpoofChecker.MIXED_SCRIPT_CONFUSABLE }, 508 { latn, mixed, 509 SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, 510 0 }, 511 { latn, latn, 512 SpoofChecker.CONFUSABLE, 513 SpoofChecker.SINGLE_SCRIPT_CONFUSABLE }, 514 }; 515 516 for (Object[] test : tests) { 517 String s1 = (String) test[0]; 518 String s2 = (String) test[1]; 519 int checks = (Integer) test[2]; 520 int expectedResult = (Integer) test[3]; 521 522 // Sanity check: expectedResult should be a subset of checks 523 assertEquals("Invalid test case", expectedResult & checks, expectedResult); 524 525 SpoofChecker sc = new SpoofChecker.Builder().setChecks(checks).build(); 526 int actualResult = sc.areConfusable(s1, s2); 527 assertEquals("Comparing '" + s1 + "' and '" + s2 + "' with checks '" + checks + "'", 528 expectedResult, actualResult); 529 } 530 } 531 532 @Test 533 public void TestInvisible() { 534 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.INVISIBLE).build(); 535 String s = Utility.unescape("abcd\\u0301ef"); 536 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); 537 result.position = -42; 538 assertFalse("", sc.failsChecks(s, result)); 539 assertEquals("", 0, result.checks); 540 assertEquals("", result.position, 0); 541 542 String s2 = Utility.unescape("abcd\\u0301\\u0302\\u0301ef"); 543 assertTrue("", sc.failsChecks(s2, result)); 544 assertEquals("", SpoofChecker.INVISIBLE, result.checks); 545 assertEquals("", 0, result.position); 546 547 // Two acute accents, one from the composed a with acute accent, \u00e1, 548 // and one separate. 549 result.position = -42; 550 String s3 = Utility.unescape("abcd\\u00e1\\u0301xyz"); 551 assertTrue("", sc.failsChecks(s3, result)); 552 assertEquals("", SpoofChecker.INVISIBLE, result.checks); 553 assertEquals("", 0, result.position); 554 } 555 556 @Test 557 public void TestRestrictionLevel() { 558 Object[][] tests = { 559 {"a", RestrictionLevel.UNRESTRICTIVE}, 560 {"a", RestrictionLevel.ASCII}, 561 {"", RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE}, 562 {"a", RestrictionLevel.HIGHLY_RESTRICTIVE}, 563 {"a", RestrictionLevel.MODERATELY_RESTRICTIVE}, 564 {"a", RestrictionLevel.MINIMALLY_RESTRICTIVE}, 565 {"a", RestrictionLevel.UNRESTRICTIVE}, 566 {"a\u303c", RestrictionLevel.HIGHLY_RESTRICTIVE}, 567 {"a\u303c", RestrictionLevel.HIGHLY_RESTRICTIVE}, 568 {"a\u303c", RestrictionLevel.HIGHLY_RESTRICTIVE}, 569 { "a\u303c", RestrictionLevel.HIGHLY_RESTRICTIVE}, 570 {"a1", RestrictionLevel.MODERATELY_RESTRICTIVE}, 571 {"a1", RestrictionLevel.MODERATELY_RESTRICTIVE}, 572 {"\u303ca1", RestrictionLevel.MINIMALLY_RESTRICTIVE}, 573 {"a\u303c1", RestrictionLevel.MINIMALLY_RESTRICTIVE}, 574 }; 575 576 UnicodeSet allowedChars = new UnicodeSet(); 577 // Allowed Identifier Characters. In addition to the Recommended Set, 578 // allow u303c, which has an interesting script extension of Hani Hira Kana. 579 allowedChars.addAll(SpoofChecker.RECOMMENDED).add(0x303c); 580 581 CheckResult checkResult = new CheckResult(); 582 for (Object[] test : tests) { 583 String testString = (String) test[0]; 584 RestrictionLevel expectedLevel = (RestrictionLevel) test[1]; 585 for (RestrictionLevel levelSetInSpoofChecker : RestrictionLevel.values()) { 586 SpoofChecker sc = new SpoofChecker.Builder() 587 .setAllowedChars(allowedChars) 588 .setRestrictionLevel(levelSetInSpoofChecker) 589 .setChecks(SpoofChecker.RESTRICTION_LEVEL) // only check this 590 .build(); 591 boolean actualValue = sc.failsChecks(testString, checkResult); 592 assertEquals("Testing restriction level for '" + testString + "'", 593 expectedLevel, checkResult.restrictionLevel); 594 595 // we want to fail if the text is (say) MODERATE and the testLevel is ASCII 596 boolean expectedFailure = expectedLevel.compareTo(levelSetInSpoofChecker) > 0; 597 assertEquals("Testing spoof restriction level for '" + testString + "', " + levelSetInSpoofChecker, 598 expectedFailure, actualValue); 599 600 // Coverage for getRestrictionLevel 601 assertEquals("Restriction level on built SpoofChecker should be same as on builder", 602 levelSetInSpoofChecker, sc.getRestrictionLevel()); 603 } 604 } 605 } 606 607 @Test 608 public void TestMixedNumbers() { 609 Object[][] tests = { 610 {"1", "[0]"}, 611 {"", "[]"}, 612 {"1", "[0]"}, 613 {"", "[]"}, 614 {"a", "[]"}, 615 {"a\u303c", "[]"}, 616 {"a\u303c", "[]"}, 617 {"a\u303c", "[]"}, 618 { "a\u303c", "[]"}, 619 {"a1", "[0]"}, 620 {"a1", "[0]"}, 621 {"\u303ca1", "[0]"}, 622 {"a\u303c1", "[0]"}, 623 }; 624 CheckResult checkResult = new CheckResult(); 625 for (Object[] test : tests) { 626 String testString = (String) test[0]; 627 UnicodeSet expected = new UnicodeSet((String)test[1]); 628 629 SpoofChecker sc = new SpoofChecker.Builder() 630 .setChecks(SpoofChecker.MIXED_NUMBERS) // only check this 631 .build(); 632 boolean actualValue = sc.failsChecks(testString, checkResult); 633 assertEquals("", expected, checkResult.numerics); 634 assertEquals("Testing spoof mixed numbers for '" + testString + "', ", expected.size() > 1, actualValue); 635 } 636 } 637 638 @Test 639 public void TestBug11635() { 640 // The bug was an error in iterating through supplementary characters in IdentifierInfo. 641 // The three supplemental chars in the string are "123" from the mathematical bold digit range. 642 // Common script, Nd general category, and no other restrictions on allowed characters 643 // leaves "ABC123" as SINGLE_SCRIPT_RESTRICTIVE. 644 String identifier = Utility.unescape("ABC\\U0001D7CF\\U0001D7D0\\U0001D7D1"); 645 CheckResult checkResult = new CheckResult(); 646 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.RESTRICTION_LEVEL).build(); 647 sc.failsChecks(identifier, checkResult); 648 assertEquals("", RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE, checkResult.restrictionLevel); 649 } 650 651 private String parseHex(String in) { 652 StringBuilder sb = new StringBuilder(); 653 for (String oneCharAsHexString : in.split("\\s+")) { 654 if (oneCharAsHexString.length() > 0) { 655 sb.appendCodePoint(Integer.parseInt(oneCharAsHexString, 16)); 656 } 657 } 658 return sb.toString(); 659 } 660 661 private String escapeString(String in) { 662 StringBuilder out = new StringBuilder(); 663 for (int i = 0; i < in.length(); i++) { 664 int c = in.codePointAt(i); 665 if (c <= 0x7f) { 666 out.append((char) c); 667 } else if (c <= 0xffff) { 668 out.append(String.format("\\u%04x", c)); 669 } else { 670 out.append(String.format("\\U%06x", c)); 671 i++; 672 } 673 } 674 return out.toString(); 675 } 676 677 // Verify that each item from the Unicode confusables.txt file 678 // transforms into the expected skeleton. 679 @Test 680 public void testConfData() { 681 if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) { 682 // Note: IBM Java 5 has a bug reading a large UTF-8 text contents 683 logln("Skip this test case because of the IBM Java 5 bug"); 684 return; 685 } 686 try { 687 // Read in the confusables.txt file. (Distributed by Unicode.org) 688 String fileName = "unicode/confusables.txt"; 689 BufferedReader confusablesRdr = TestUtil.getDataReader(fileName, "UTF-8"); 690 691 // Create a default spoof checker to use in this test. 692 SpoofChecker sc = new SpoofChecker.Builder().build(); 693 694 // Parse lines from the confusables.txt file. Example Line: 695 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... 696 // Lines have three fields. The hex fields can contain more than one character, 697 // and each character may be more than 4 digits (for supplemntals) 698 // This regular expression matches lines and splits the fields into capture groups. 699 // Capture group 1: map from chars 700 // 2: map to chars 701 // 3: table type, SL, ML, SA or MA (deprecated) 702 // 4: Comment Lines Only 703 // 5: Error Lines Only 704 Matcher parseLine = Pattern.compile( 705 "\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)" 706 + "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line 707 matcher(""); 708 Normalizer2 normalizer = Normalizer2.getNFDInstance(); 709 int lineNum = 0; 710 String inputLine; 711 while ((inputLine = confusablesRdr.readLine()) != null) { 712 lineNum++; 713 parseLine.reset(inputLine); 714 if (!parseLine.matches()) { 715 errln("Syntax error in confusable data file at line " + lineNum); 716 errln(inputLine); 717 break; 718 } 719 if (parseLine.group(4) != null) { 720 continue; // comment line 721 } 722 String from = parseHex(parseLine.group(1)); 723 724 if (!normalizer.isNormalized(from)) { 725 // The source character was not NFD. 726 // Skip this case; the first step in obtaining a skeleton is to NFD the input, 727 // so the mapping in this line of confusables.txt will never be applied. 728 continue; 729 } 730 731 String rawExpected = parseHex(parseLine.group(2)); 732 String expected = normalizer.normalize(rawExpected); 733 734 String actual; 735 actual = sc.getSkeleton(from); 736 737 if (!actual.equals(expected)) { 738 errln("confusables.txt: " + lineNum + ": " + parseLine.group(0)); 739 errln("Actual: " + escapeString(actual)); 740 } 741 } 742 confusablesRdr.close(); 743 } catch (IOException e) { 744 errln(e.toString()); 745 } 746 } 747 748 @Test 749 public void TestCheckResultToString11447() { 750 CheckResult checkResult = new CheckResult(); 751 SpoofChecker sc = new SpoofChecker.Builder() 752 .setChecks(SpoofChecker.MIXED_NUMBERS) 753 .build(); 754 sc.failsChecks("1", checkResult); 755 assertTrue("CheckResult: ", checkResult.toString().contains("MIXED_NUMBERS")); 756 } 757 758 @Test 759 public void TestDeprecated() { 760 // getSkeleton 761 SpoofChecker sc = new SpoofChecker.Builder().build(); 762 assertEquals("Deprecated version of getSkeleton method does not work", 763 sc.getSkeleton(SpoofChecker.ANY_CASE, scMixed), 764 sc.getSkeleton(scMixed)); 765 766 // setData 767 try { 768 String fileName1 = "unicode/confusables.txt"; 769 String fileName2 = "unicode/confusablesWholeScript.txt"; 770 Reader reader1 = TestUtil.getDataReader(fileName1, "UTF-8"); 771 Reader reader2 = TestUtil.getDataReader(fileName2, "UTF-8"); 772 Reader reader3 = TestUtil.getDataReader(fileName1, "UTF-8"); 773 try { 774 SpoofChecker sc2 = new SpoofChecker.Builder() 775 .setData(reader1, reader2) 776 .build(); 777 SpoofChecker sc1 = new SpoofChecker.Builder() 778 .setData(reader3) 779 .build(); 780 assertEquals("Deprecated version of setData method does not work", sc1, sc2); 781 } finally { 782 reader1.close(); 783 reader2.close(); 784 reader3.close(); 785 } 786 } catch(IOException e) { 787 fail("Could not load confusables data"); 788 } catch (ParseException e) { 789 fail("Could not parse confusables data"); 790 } 791 } 792 793 @Test 794 public void testScriptSet() { 795 try { 796 Class ScriptSet = Class.forName("com.ibm.icu.text.SpoofChecker$ScriptSet"); 797 Constructor ctor = ScriptSet.getDeclaredConstructor(); 798 ctor.setAccessible(true); 799 BitSet ss = (BitSet) ctor.newInstance(); 800 801 ss.set(UScript.MYANMAR); 802 assertEquals("ScriptSet toString with Myanmar", "<ScriptSet { Mymr }>", ss.toString()); 803 ss.set(UScript.BENGALI); 804 ss.set(UScript.LATIN); 805 assertEquals("ScriptSet toString with Myanmar, Latin, and Bengali", "<ScriptSet { Beng Latn Mymr }>", ss.toString()); 806 807 Method and = ScriptSet.getDeclaredMethod("and", Integer.TYPE); 808 and.setAccessible(true); 809 and.invoke(ss, UScript.BENGALI); 810 assertEquals("ScriptSet toString with Bengali only", "<ScriptSet { Beng }>", ss.toString()); 811 812 Method setAll = ScriptSet.getDeclaredMethod("setAll"); 813 setAll.setAccessible(true); 814 setAll.invoke(ss); 815 assertEquals("ScriptSet toString with all scripts", "<ScriptSet { * }>", ss.toString()); 816 817 Method isFull = ScriptSet.getDeclaredMethod("isFull"); 818 isFull.setAccessible(true); 819 boolean result = (Boolean) isFull.invoke(ss); 820 assertEquals("ScriptSet should evaluate as full", true, result); 821 822 } catch (ClassNotFoundException e) { 823 fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage()); 824 } catch (InstantiationException e) { 825 fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage()); 826 } catch (IllegalAccessException e) { 827 fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage()); 828 } catch (SecurityException e) { 829 fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage()); 830 } catch (NoSuchMethodException e) { 831 fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage()); 832 } catch (IllegalArgumentException e) { 833 fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage()); 834 } catch (InvocationTargetException e) { 835 fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage()); 836 } 837 } 838 839 @Test 840 public void testCopyConstructor() { 841 SpoofChecker sc1 = new SpoofChecker.Builder() 842 .setAllowedChars(SpoofChecker.RECOMMENDED) 843 .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.INVISIBLE) 844 .build(); 845 SpoofChecker sc2 = new SpoofChecker.Builder(sc1).build(); 846 assertEquals("Copy constructor should produce identical instances", sc1, sc2); 847 } 848 } 849