Home | History | Annotate | Download | only in text
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2009-2015, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.dev.test.text;
     10 
     11 import java.io.BufferedReader;
     12 import java.io.IOException;
     13 import java.io.Reader;
     14 import java.io.StringReader;
     15 import java.lang.reflect.Constructor;
     16 import java.lang.reflect.InvocationTargetException;
     17 import java.lang.reflect.Method;
     18 import java.text.ParseException;
     19 import java.util.BitSet;
     20 import java.util.HashSet;
     21 import java.util.LinkedHashSet;
     22 import java.util.Locale;
     23 import java.util.Set;
     24 import java.util.regex.Matcher;
     25 import java.util.regex.Pattern;
     26 
     27 import org.junit.Test;
     28 import org.junit.runner.RunWith;
     29 import org.junit.runners.JUnit4;
     30 
     31 import com.ibm.icu.dev.test.TestFmwk;
     32 import com.ibm.icu.dev.test.TestUtil;
     33 import com.ibm.icu.dev.test.TestUtil.JavaVendor;
     34 import com.ibm.icu.impl.Utility;
     35 import com.ibm.icu.lang.UScript;
     36 import com.ibm.icu.text.Normalizer2;
     37 import com.ibm.icu.text.SpoofChecker;
     38 import com.ibm.icu.text.SpoofChecker.CheckResult;
     39 import com.ibm.icu.text.SpoofChecker.RestrictionLevel;
     40 import com.ibm.icu.text.UnicodeSet;
     41 import com.ibm.icu.util.ULocale;
     42 
     43 @RunWith(JUnit4.class)
     44 public class SpoofCheckerTest extends TestFmwk {
     45     /*
     46      * Identifiers for verifying that spoof checking is minimally alive and working.
     47      */
     48     char[] goodLatinChars = { (char) 0x75, (char) 0x7a };
     49     String goodLatin = new String(goodLatinChars); /* "uz", all ASCII */
     50     /* (not confusable) */
     51     char[] scMixedChars = { (char) 0x73, (char) 0x0441 };
     52     String scMixed = new String(scMixedChars); /* "sc", with Cyrillic 'c' */
     53     /* (mixed script, confusable */
     54 
     55     String scLatin = "sc";   /* "sc", plain ascii. */
     56     String goodCyrl = "\u0438\u043B";    // "Cyrillic small letter i and el"  Plain lower case Cyrillic letters, no latin confusables
     57     String goodGreek = "\u03c0\u03c6";   // "Greek small letter pi and phi"  Plain lower case Greek letters
     58 
     59     // Various 1 l I look-alikes
     60     String lll_Latin_a = "lI1";   // small letter l, cap I, digit 1, all ASCII
     61     //  "\uFF29\u217C\u0196"  Full-width I, Small Roman Numeral fifty, Latin Cap Letter IOTA
     62     String lll_Latin_b = "\uff29\u217c\u0196";
     63     String lll_Cyrl = "\u0406\u04C0\u0031";  // "\u0406\u04C01"
     64     /* The skeleton transform for all of the 'lll' lookalikes is ascii lower case letter l. */
     65     String lll_Skel = "lll";
     66 
     67     String han_Hiragana = "\u3086\u308A \u77F3\u7530";  // Hiragana, space, Han
     68 
     69 
     70     /*
     71      * Test basic constructor.
     72      */
     73     @Test
     74     public void TestUSpoof() {
     75         SpoofChecker sc = new SpoofChecker.Builder().build();
     76         if (sc == null) {
     77             errln("FAIL: null SpoofChecker");
     78         }
     79     }
     80 
     81     /*
     82      * Test build from source rules.
     83      */
     84     @Test
     85     public void TestOpenFromSourceRules() {
     86         if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) {
     87             // Note: IBM Java 5 has a bug reading a large UTF-8 text contents
     88             logln("Skip this test case because of the IBM Java 5 bug");
     89             return;
     90         }
     91         String fileName;
     92         Reader confusables;
     93 
     94         try {
     95             SpoofChecker rsc = null;
     96 
     97             fileName = "unicode/confusables.txt";
     98             confusables = TestUtil.getDataReader(fileName, "UTF-8");
     99             try {
    100                 rsc = new SpoofChecker.Builder().setData(confusables).build();
    101             } finally {
    102                 confusables.close();
    103             }
    104 
    105             if (rsc == null) {
    106                 errln("FAIL: null SpoofChecker");
    107                 return;
    108             }
    109             // Check that newly built-from-rules SpoofChecker is able to function.
    110             checkSkeleton(rsc, "TestOpenFromSourceRules");
    111 
    112             SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
    113             rsc.failsChecks("Hello", result);
    114 
    115             // The checker we just built from source rules should be equivalent to the
    116             //  default checker created from prebuilt rules baked into the ICU data.
    117             SpoofChecker defaultChecker = new SpoofChecker.Builder().build();
    118             assertEquals("Checker built from rules equals default", defaultChecker, rsc);
    119             assertEquals("Checker built from rules has same hash code as default", defaultChecker.hashCode(), rsc.hashCode());
    120 
    121             SpoofChecker optionChecker = new SpoofChecker.Builder().
    122                                     setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE).build();
    123             assertFalse("", optionChecker.equals(rsc));
    124 
    125             String stubConfusables =
    126                 "# Stub confusables data\n" +
    127                 "05AD ; 0596 ;  MA  # (    ) HEBREW ACCENT DEHI  HEBREW ACCENT TIPEHA   #\n";
    128 
    129             // Verify that re-using a builder doesn't alter SpoofCheckers that were
    130             //  previously created by that builder. (The builder could modify data
    131             //  being used by the existing checker)
    132 
    133             SpoofChecker.Builder builder = new SpoofChecker.Builder();
    134             SpoofChecker testChecker1 = builder.build();
    135             assertTrue("", testChecker1.equals(defaultChecker));
    136 
    137             builder.setData(new StringReader(stubConfusables));
    138             builder.setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE);
    139             builder.setChecks(SpoofChecker.SINGLE_SCRIPT_CONFUSABLE);
    140             Set<ULocale>allowedLocales = new HashSet<ULocale>();
    141             allowedLocales.add(ULocale.JAPANESE);
    142             allowedLocales.add(ULocale.FRENCH);
    143             builder.setAllowedLocales(allowedLocales);
    144             SpoofChecker testChecker2 = builder.build();
    145             SpoofChecker testChecker3 = builder.build();
    146 
    147             assertTrue("", testChecker1.equals(defaultChecker));
    148             assertFalse("", testChecker2.equals(defaultChecker));
    149             assertTrue("", testChecker2.equals(testChecker3));
    150 
    151         } catch (java.io.IOException e) {
    152             errln(e.toString());
    153         } catch (ParseException e) {
    154             errln(e.toString());
    155         }
    156     }
    157 
    158     /*
    159      * Set & Get Check Flags
    160      */
    161     @Test
    162     public void TestGetSetChecks1() {
    163         SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build();
    164         int t;
    165         t = sc.getChecks();
    166         assertEquals("", SpoofChecker.ALL_CHECKS, t);
    167 
    168         sc = new SpoofChecker.Builder().setChecks(0).build();
    169         t = sc.getChecks();
    170         assertEquals("", 0, t);
    171 
    172         int checks = SpoofChecker.WHOLE_SCRIPT_CONFUSABLE | SpoofChecker.MIXED_SCRIPT_CONFUSABLE
    173                 | SpoofChecker.ANY_CASE;
    174         sc = new SpoofChecker.Builder().setChecks(checks).build();
    175         t = sc.getChecks();
    176         assertEquals("", checks, t);
    177     }
    178 
    179     /*
    180      * get & setAllowedChars
    181      */
    182     @Test
    183     public void TestGetSetAllowedChars() {
    184         SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).build();
    185         UnicodeSet us;
    186         UnicodeSet uset;
    187 
    188         uset = sc.getAllowedChars();
    189         assertTrue("", uset.isFrozen());
    190         us = new UnicodeSet(0x41, 0x5A); /* [A-Z] */
    191         sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedChars(us).build();
    192         assertEquals("", us, sc.getAllowedChars());
    193     }
    194 
    195     /*
    196      * get & set Checks
    197      */
    198     @Test
    199     public void TestGetSetChecks() {
    200         SpoofChecker sc = new SpoofChecker.Builder().build();
    201         int checks;
    202         int checks2;
    203         boolean checkResults;
    204 
    205         checks = sc.getChecks();
    206         assertEquals("", SpoofChecker.ALL_CHECKS, checks);
    207 
    208         checks &= ~(SpoofChecker.SINGLE_SCRIPT | SpoofChecker.MIXED_SCRIPT_CONFUSABLE);
    209         sc = new SpoofChecker.Builder().setChecks(checks).build();
    210         checks2 = sc.getChecks();
    211         assertEquals("", checks, checks2);
    212 
    213         /*
    214          * The checks that were disabled just above are the same ones that the "scMixed" test fails. So with those tests
    215          * gone checking that Identifier should now succeed
    216          */
    217         checkResults = sc.failsChecks(scMixed);
    218         assertFalse("", checkResults);
    219     }
    220 
    221     /*
    222      * AllowedLocales
    223      */
    224     @Test
    225     public void TestAllowedLocales() {
    226         SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).build();
    227         Set<ULocale> allowedLocales = null;
    228         Set<Locale> allowedJavaLocales = null;
    229         boolean checkResults;
    230 
    231         /* Default allowed locales list should be empty */
    232         allowedLocales = sc.getAllowedLocales();
    233         assertTrue("Empty allowed locales", allowedLocales.isEmpty());
    234 
    235         allowedJavaLocales = sc.getAllowedJavaLocales();
    236         assertTrue("Empty allowed Java locales", allowedJavaLocales.isEmpty());
    237 
    238         /* Allow en and ru, which should enable Latin and Cyrillic only to pass */
    239         ULocale enloc = new ULocale("en");
    240         ULocale ruloc = new ULocale("ru_RU");
    241         allowedLocales = new HashSet<ULocale>();
    242         allowedLocales.add(enloc);
    243         allowedLocales.add(ruloc);
    244         sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build();
    245         allowedLocales = sc.getAllowedLocales();
    246         assertTrue("en in allowed locales", allowedLocales.contains(enloc));
    247         assertTrue("ru_RU in allowed locales", allowedLocales.contains(ruloc));
    248 
    249         Locale frlocJ = new Locale("fr");
    250         allowedJavaLocales = new HashSet<Locale>();
    251         allowedJavaLocales.add(frlocJ);
    252         sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedJavaLocales(allowedJavaLocales).build();
    253         assertFalse("no en in allowed Java locales", allowedJavaLocales.contains(new Locale("en")));
    254         assertTrue("fr in allowed Java locales", allowedJavaLocales.contains(frlocJ));
    255 
    256         sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build();
    257 
    258         SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
    259         checkResults = sc.failsChecks(goodLatin);
    260         assertFalse("", checkResults);
    261 
    262         checkResults = sc.failsChecks(goodGreek, result);
    263         assertEquals("", SpoofChecker.CHAR_LIMIT, result.checks);
    264 
    265         checkResults = sc.failsChecks(goodCyrl);
    266         assertFalse("", checkResults);
    267 
    268         /* Reset with an empty locale list, which should allow all characters to pass */
    269         allowedLocales = new LinkedHashSet<ULocale>();
    270         sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build();
    271 
    272         checkResults = sc.failsChecks(goodGreek);
    273         assertFalse("", checkResults);
    274     }
    275 
    276     /*
    277      * AllowedChars set/get the UnicodeSet of allowed characters.
    278      */
    279     @Test
    280     public void TestAllowedChars() {
    281         SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).build();
    282         UnicodeSet set;
    283         UnicodeSet tmpSet;
    284         boolean checkResults;
    285 
    286         /* By default, we should see no restriction; the UnicodeSet should allow all characters. */
    287         set = sc.getAllowedChars();
    288         tmpSet = new UnicodeSet(0, 0x10ffff);
    289         assertEquals("", tmpSet, set);
    290 
    291         /* Remove a character that is in our good Latin test identifier from the allowed chars set. */
    292         tmpSet.remove(goodLatin.charAt(1));
    293         sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedChars(tmpSet).build();
    294 
    295         /* Latin Identifier should now fail; other non-latin test cases should still be OK */
    296         SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
    297         checkResults = sc.failsChecks(goodLatin, result);
    298         assertTrue("", checkResults);
    299         assertEquals("", SpoofChecker.CHAR_LIMIT, result.checks);
    300     }
    301 
    302     @Test
    303     public void TestCheck() {
    304         SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build();
    305         SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
    306         boolean checkResults;
    307 
    308         result.position = 666;
    309         checkResults = sc.failsChecks(goodLatin, result);
    310         assertFalse("", checkResults);
    311         assertEquals("", 0, result.checks);
    312 
    313         checkResults = sc.failsChecks(goodCyrl, result);
    314         assertFalse("", checkResults);
    315         assertEquals("", 0, result.checks);
    316 
    317         result.position = 666;
    318         checkResults = sc.failsChecks(scMixed, result);
    319         assertTrue("", checkResults);
    320         assertEquals("", SpoofChecker.RESTRICTION_LEVEL, result.checks);
    321 
    322         result.position = 666;
    323         checkResults = sc.failsChecks(han_Hiragana, result);
    324         assertFalse("", checkResults);
    325         assertEquals("", 0, result.checks);
    326     }
    327 
    328     @Test
    329     public void TestAreConfusable1() {
    330         SpoofChecker sc = new SpoofChecker.Builder().build();
    331         int checkResults;
    332         checkResults = sc.areConfusable(scLatin, scMixed);
    333         assertEquals("Latin/Mixed is not MIXED_SCRIPT_CONFUSABLE", SpoofChecker.MIXED_SCRIPT_CONFUSABLE, checkResults);
    334 
    335         checkResults = sc.areConfusable(goodGreek, scLatin);
    336         assertEquals("Greek/Latin is not unconfusable", 0, checkResults);
    337 
    338         checkResults = sc.areConfusable(lll_Latin_a, lll_Latin_b);
    339         assertEquals("Latin/Latin is not SINGLE_SCRIPT_CONFUSABLE", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResults);
    340     }
    341 
    342     @Test
    343     public void TestGetSkeleton() {
    344         SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
    345         String dest;
    346         dest = sc.getSkeleton(SpoofChecker.ANY_CASE, lll_Latin_a);
    347         assertEquals("", lll_Skel, dest);
    348     }
    349 
    350     /**
    351      * IntlTestSpoof is the top level test class for the Unicode Spoof detection tests
    352      */
    353 
    354     // Test the USpoofDetector API functions that require C++
    355     // The pure C part of the API, which is most of it, is tested in cintltst
    356     /**
    357      * IntlTestSpoof tests for USpoofDetector
    358      */
    359     @Test
    360     public void TestSpoofAPI() {
    361         SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build();
    362         String s = "xyz";
    363         SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
    364         result.position = 666;
    365         boolean checkResults = sc.failsChecks(s, result);
    366         assertFalse("", checkResults);
    367         assertEquals("", 0, result.position);
    368 
    369         sc = new SpoofChecker.Builder().build();
    370         String s1 = "cxs";
    371         String s2 = Utility.unescape("\\u0441\\u0445\\u0455"); // Cyrillic "cxs"
    372         int checkResult = sc.areConfusable(s1, s2);
    373         assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, checkResult);
    374 
    375         sc = new SpoofChecker.Builder().build();
    376         s = "I1l0O";
    377         String dest = sc.getSkeleton(SpoofChecker.ANY_CASE, s);
    378         assertEquals("", dest, "lllOO");
    379     }
    380 
    381     @Test
    382     public void TestSkeleton() {
    383         SpoofChecker sc = new SpoofChecker.Builder().build();
    384         checkSkeleton(sc, "TestSkeleton");
    385     }
    386 
    387     // testSkeleton. Spot check a number of confusable skeleton substitutions from the
    388     // Unicode data file confusables.txt
    389     // Test cases chosen for substitutions of various lengths, and
    390     // membership in different mapping tables.
    391     public void checkSkeleton(SpoofChecker sc, String testName) {
    392         int ML = 0;
    393         int SL = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE;
    394         int MA = SpoofChecker.ANY_CASE;
    395         int SA = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE;
    396 
    397         checkSkeleton(sc, MA, "\\u02b9identifier'",  "'identifier'",  testName);
    398 
    399         checkSkeleton(sc, SL, "nochange", "nochange", testName);
    400         checkSkeleton(sc, SA, "nochange", "nochange", testName);
    401         checkSkeleton(sc, ML, "nochange", "nochange", testName);
    402         checkSkeleton(sc, MA, "nochange", "nochange", testName);
    403         checkSkeleton(sc, MA, "love", "love", testName);
    404         checkSkeleton(sc, MA, "1ove", "love", testName);   // Digit 1 to letter l
    405         checkSkeleton(sc, ML, "OOPS", "OOPS", testName);
    406         checkSkeleton(sc, ML, "00PS", "OOPS", testName);
    407         checkSkeleton(sc, MA, "OOPS", "OOPS", testName);
    408         checkSkeleton(sc, MA, "00PS", "OOPS", testName);   // Digit 0 to letter O
    409         checkSkeleton(sc, SL, "\\u059c", "\\u0301", testName);
    410         checkSkeleton(sc, SL, "\\u2A74", "\\u003A\\u003A\\u003D", testName);
    411         checkSkeleton(sc, SL, "\\u247E", "(ll)", testName);
    412         checkSkeleton(sc, SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u006c\\u0644\\u006f", testName);
    413 
    414         // 0C83 mapping existed in the ML and MA tables, did not exist in SL, SA (Original Unicode 7)
    415         //   mapping exists in all tables (ICU 55).
    416         // 0C83 ; 0983 ; ML #  KANNADA SIGN VISARGA to
    417         checkSkeleton(sc, SL, "\\u0C83", "\\u0983", testName);
    418         checkSkeleton(sc, SA, "\\u0C83", "\\u0983", testName);
    419         checkSkeleton(sc, ML, "\\u0C83", "\\u0983", testName);
    420         checkSkeleton(sc, MA, "\\u0C83", "\\u0983", testName);
    421 
    422         // 0391 mappings existed only in MA and SA tables (Original Unicode 7).
    423         //      mappings exist in all tables (ICU 55)
    424         checkSkeleton(sc, MA, "\\u0391", "A", testName);
    425         checkSkeleton(sc, SA, "\\u0391", "A", testName);
    426         checkSkeleton(sc, ML, "\\u0391", "A", testName);
    427         checkSkeleton(sc, SL, "\\u0391", "A", testName);
    428 
    429         // 13CF Mappings in all four tables, different in MA (Original Unicode 7).
    430         //      Mapping same in all tables (ICU 55)
    431         checkSkeleton(sc, ML, "\\u13CF", "b", testName);
    432         checkSkeleton(sc, MA, "\\u13CF", "b", testName);
    433         checkSkeleton(sc, SL, "\\u13CF", "b", testName);
    434         checkSkeleton(sc, SA, "\\u13CF", "b", testName);
    435 
    436         // 0022 ; 0027 0027 ;
    437         // all tables
    438         checkSkeleton(sc, SL, "\"", "\\u0027\\u0027", testName);
    439         checkSkeleton(sc, SA, "\"", "\\u0027\\u0027", testName);
    440         checkSkeleton(sc, ML, "\"", "\\u0027\\u0027", testName);
    441         checkSkeleton(sc, MA, "\"", "\\u0027\\u0027", testName);
    442 
    443     }
    444 
    445     // Internal function to run a single skeleton test case.
    446     //
    447     // Run a single confusable skeleton transformation test case.
    448     //
    449     void checkSkeleton(SpoofChecker sc, int type, String input, String expected, String testName) {
    450         String uInput = Utility.unescape(input);
    451         String uExpected = Utility.unescape(expected);
    452         String actual;
    453         actual = sc.getSkeleton(type, uInput);
    454         Throwable t = new Throwable();
    455         int lineNumberOfTest = t.getStackTrace()[1].getLineNumber();
    456 
    457         assertEquals(testName + " test at line " + lineNumberOfTest + " :  Expected (escaped): " + expected, uExpected, actual);
    458     }
    459 
    460     @Test
    461     public void TestAreConfusable() {
    462         SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
    463         String s1 = "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "
    464                 + "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ";
    465         String s2 = "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "
    466                 + "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ";
    467         assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, sc.areConfusable(s1, s2));
    468     }
    469 
    470     @Test
    471     public void TestConfusableFlagVariants() {
    472         // The spoof checker should only return those tests that the user requested.  This test makes sure that
    473         // the checker doesn't return anything the user doesn't want.  This test started passing in ICU 58.
    474 
    475         // NOTE: These strings are the same ones as in the documentation.  If the confusables data changes
    476         // and this test breaks, pick a new confusables pair, update it here, and also update it in the
    477         // documentation of SpoofChecker.java.
    478         String latn = "desparejado";
    479         String cyrl = "";
    480         String mixed = "dsrd";
    481 
    482         Object[][] tests = {
    483                 // string 1, string 2, checks for spoof checker, expected output
    484                 { latn, cyrl,
    485                     SpoofChecker.CONFUSABLE,
    486                     SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE },
    487                 { latn, cyrl,
    488                     SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE,
    489                     SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE },
    490                 { latn, cyrl,
    491                     SpoofChecker.MIXED_SCRIPT_CONFUSABLE,
    492                     SpoofChecker.MIXED_SCRIPT_CONFUSABLE },
    493                 { latn, cyrl,
    494                     SpoofChecker.WHOLE_SCRIPT_CONFUSABLE,
    495                     SpoofChecker.WHOLE_SCRIPT_CONFUSABLE },
    496                 { latn, cyrl,
    497                     SpoofChecker.SINGLE_SCRIPT_CONFUSABLE,
    498                     0 },
    499                 { latn, mixed,
    500                     SpoofChecker.CONFUSABLE,
    501                     SpoofChecker.MIXED_SCRIPT_CONFUSABLE },
    502                 { latn, mixed,
    503                     SpoofChecker.MIXED_SCRIPT_CONFUSABLE,
    504                     SpoofChecker.MIXED_SCRIPT_CONFUSABLE },
    505                 { latn, mixed,
    506                     SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE,
    507                     SpoofChecker.MIXED_SCRIPT_CONFUSABLE },
    508                 { latn, mixed,
    509                     SpoofChecker.WHOLE_SCRIPT_CONFUSABLE,
    510                     0 },
    511                 { latn, latn,
    512                     SpoofChecker.CONFUSABLE,
    513                     SpoofChecker.SINGLE_SCRIPT_CONFUSABLE },
    514         };
    515 
    516         for (Object[] test : tests) {
    517             String s1 = (String) test[0];
    518             String s2 = (String) test[1];
    519             int checks = (Integer) test[2];
    520             int expectedResult = (Integer) test[3];
    521 
    522             // Sanity check: expectedResult should be a subset of checks
    523             assertEquals("Invalid test case", expectedResult & checks, expectedResult);
    524 
    525             SpoofChecker sc = new SpoofChecker.Builder().setChecks(checks).build();
    526             int actualResult = sc.areConfusable(s1, s2);
    527             assertEquals("Comparing '" + s1 + "' and '" + s2 + "' with checks '" + checks + "'",
    528                     expectedResult, actualResult);
    529         }
    530     }
    531 
    532     @Test
    533     public void TestInvisible() {
    534         SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.INVISIBLE).build();
    535         String s = Utility.unescape("abcd\\u0301ef");
    536         SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
    537         result.position = -42;
    538         assertFalse("", sc.failsChecks(s, result));
    539         assertEquals("", 0, result.checks);
    540         assertEquals("", result.position, 0);
    541 
    542         String s2 = Utility.unescape("abcd\\u0301\\u0302\\u0301ef");
    543         assertTrue("", sc.failsChecks(s2, result));
    544         assertEquals("", SpoofChecker.INVISIBLE, result.checks);
    545         assertEquals("", 0, result.position);
    546 
    547         // Two acute accents, one from the composed a with acute accent, \u00e1,
    548         // and one separate.
    549         result.position = -42;
    550         String s3 = Utility.unescape("abcd\\u00e1\\u0301xyz");
    551         assertTrue("", sc.failsChecks(s3, result));
    552         assertEquals("", SpoofChecker.INVISIBLE, result.checks);
    553         assertEquals("", 0, result.position);
    554     }
    555 
    556     @Test
    557     public void TestRestrictionLevel() {
    558         Object[][] tests = {
    559                 {"a", RestrictionLevel.UNRESTRICTIVE},
    560                 {"a", RestrictionLevel.ASCII},
    561                 {"", RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE},
    562                 {"a", RestrictionLevel.HIGHLY_RESTRICTIVE},
    563                 {"a", RestrictionLevel.MODERATELY_RESTRICTIVE},
    564                 {"a", RestrictionLevel.MINIMALLY_RESTRICTIVE},
    565                 {"a", RestrictionLevel.UNRESTRICTIVE},
    566                 {"a\u303c", RestrictionLevel.HIGHLY_RESTRICTIVE},
    567                 {"a\u303c", RestrictionLevel.HIGHLY_RESTRICTIVE},
    568                 {"a\u303c", RestrictionLevel.HIGHLY_RESTRICTIVE},
    569                 { "a\u303c", RestrictionLevel.HIGHLY_RESTRICTIVE},
    570                 {"a1", RestrictionLevel.MODERATELY_RESTRICTIVE},
    571                 {"a1", RestrictionLevel.MODERATELY_RESTRICTIVE},
    572                 {"\u303ca1", RestrictionLevel.MINIMALLY_RESTRICTIVE},
    573                 {"a\u303c1", RestrictionLevel.MINIMALLY_RESTRICTIVE},
    574         };
    575 
    576         UnicodeSet allowedChars = new UnicodeSet();
    577         // Allowed Identifier Characters. In addition to the Recommended Set,
    578         //    allow u303c, which has an interesting script extension of Hani Hira Kana.
    579         allowedChars.addAll(SpoofChecker.RECOMMENDED).add(0x303c);
    580 
    581         CheckResult checkResult = new CheckResult();
    582         for (Object[] test : tests) {
    583             String testString = (String) test[0];
    584             RestrictionLevel expectedLevel = (RestrictionLevel) test[1];
    585             for (RestrictionLevel levelSetInSpoofChecker : RestrictionLevel.values()) {
    586                 SpoofChecker sc = new SpoofChecker.Builder()
    587                         .setAllowedChars(allowedChars)
    588                         .setRestrictionLevel(levelSetInSpoofChecker)
    589                         .setChecks(SpoofChecker.RESTRICTION_LEVEL) // only check this
    590                         .build();
    591                 boolean actualValue = sc.failsChecks(testString, checkResult);
    592                 assertEquals("Testing restriction level for '" + testString + "'",
    593                         expectedLevel, checkResult.restrictionLevel);
    594 
    595                 // we want to fail if the text is (say) MODERATE and the testLevel is ASCII
    596                 boolean expectedFailure = expectedLevel.compareTo(levelSetInSpoofChecker) > 0;
    597                 assertEquals("Testing spoof restriction level for '" + testString + "', " + levelSetInSpoofChecker,
    598                         expectedFailure, actualValue);
    599 
    600                 // Coverage for getRestrictionLevel
    601                 assertEquals("Restriction level on built SpoofChecker should be same as on builder",
    602                         levelSetInSpoofChecker, sc.getRestrictionLevel());
    603             }
    604         }
    605     }
    606 
    607     @Test
    608     public void TestMixedNumbers() {
    609         Object[][] tests = {
    610                 {"1", "[0]"},
    611                 {"", "[]"},
    612                 {"1", "[0]"},
    613                 {"", "[]"},
    614                 {"a", "[]"},
    615                 {"a\u303c", "[]"},
    616                 {"a\u303c", "[]"},
    617                 {"a\u303c", "[]"},
    618                 { "a\u303c", "[]"},
    619                 {"a1", "[0]"},
    620                 {"a1", "[0]"},
    621                 {"\u303ca1", "[0]"},
    622                 {"a\u303c1", "[0]"},
    623         };
    624         CheckResult checkResult = new CheckResult();
    625         for (Object[] test : tests) {
    626             String testString = (String) test[0];
    627             UnicodeSet expected = new UnicodeSet((String)test[1]);
    628 
    629             SpoofChecker sc = new SpoofChecker.Builder()
    630             .setChecks(SpoofChecker.MIXED_NUMBERS) // only check this
    631             .build();
    632             boolean actualValue = sc.failsChecks(testString, checkResult);
    633             assertEquals("", expected, checkResult.numerics);
    634             assertEquals("Testing spoof mixed numbers for '" + testString + "', ", expected.size() > 1, actualValue);
    635         }
    636     }
    637 
    638     @Test
    639     public void TestBug11635() {
    640         // The bug was an error in iterating through supplementary characters in IdentifierInfo.
    641         //  The three supplemental chars in the string are "123" from the mathematical bold digit range.
    642         //  Common script, Nd general category, and no other restrictions on allowed characters
    643         //  leaves "ABC123" as SINGLE_SCRIPT_RESTRICTIVE.
    644         String identifier = Utility.unescape("ABC\\U0001D7CF\\U0001D7D0\\U0001D7D1");
    645         CheckResult checkResult = new CheckResult();
    646         SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.RESTRICTION_LEVEL).build();
    647         sc.failsChecks(identifier, checkResult);
    648         assertEquals("", RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE, checkResult.restrictionLevel);
    649     }
    650 
    651     private String parseHex(String in) {
    652         StringBuilder sb = new StringBuilder();
    653         for (String oneCharAsHexString : in.split("\\s+")) {
    654             if (oneCharAsHexString.length() > 0) {
    655                 sb.appendCodePoint(Integer.parseInt(oneCharAsHexString, 16));
    656             }
    657         }
    658         return sb.toString();
    659     }
    660 
    661     private String escapeString(String in) {
    662         StringBuilder out = new StringBuilder();
    663         for (int i = 0; i < in.length(); i++) {
    664             int c = in.codePointAt(i);
    665             if (c <= 0x7f) {
    666                 out.append((char) c);
    667             } else if (c <= 0xffff) {
    668                 out.append(String.format("\\u%04x", c));
    669             } else {
    670                 out.append(String.format("\\U%06x", c));
    671                 i++;
    672             }
    673         }
    674         return out.toString();
    675     }
    676 
    677     // Verify that each item from the Unicode confusables.txt file
    678     // transforms into the expected skeleton.
    679     @Test
    680     public void testConfData() {
    681         if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) {
    682             // Note: IBM Java 5 has a bug reading a large UTF-8 text contents
    683             logln("Skip this test case because of the IBM Java 5 bug");
    684             return;
    685         }
    686         try {
    687             // Read in the confusables.txt file. (Distributed by Unicode.org)
    688             String fileName = "unicode/confusables.txt";
    689             BufferedReader confusablesRdr = TestUtil.getDataReader(fileName, "UTF-8");
    690 
    691             // Create a default spoof checker to use in this test.
    692             SpoofChecker sc = new SpoofChecker.Builder().build();
    693 
    694             // Parse lines from the confusables.txt file. Example Line:
    695             // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
    696             // Lines have three fields. The hex fields can contain more than one character,
    697             // and each character may be more than 4 digits (for supplemntals)
    698             // This regular expression matches lines and splits the fields into capture groups.
    699             // Capture group 1: map from chars
    700             // 2: map to chars
    701             // 3: table type, SL, ML, SA or MA (deprecated)
    702             // 4: Comment Lines Only
    703             // 5: Error Lines Only
    704             Matcher parseLine = Pattern.compile(
    705                     "\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)"
    706                             + "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line
    707                             matcher("");
    708             Normalizer2 normalizer = Normalizer2.getNFDInstance();
    709             int lineNum = 0;
    710             String inputLine;
    711             while ((inputLine = confusablesRdr.readLine()) != null) {
    712                 lineNum++;
    713                 parseLine.reset(inputLine);
    714                 if (!parseLine.matches()) {
    715                     errln("Syntax error in confusable data file at line " + lineNum);
    716                     errln(inputLine);
    717                     break;
    718                 }
    719                 if (parseLine.group(4) != null) {
    720                     continue; // comment line
    721                 }
    722                 String from = parseHex(parseLine.group(1));
    723 
    724                 if (!normalizer.isNormalized(from)) {
    725                     // The source character was not NFD.
    726                     // Skip this case; the first step in obtaining a skeleton is to NFD the input,
    727                     // so the mapping in this line of confusables.txt will never be applied.
    728                     continue;
    729                 }
    730 
    731                 String rawExpected = parseHex(parseLine.group(2));
    732                 String expected = normalizer.normalize(rawExpected);
    733 
    734                 String actual;
    735                 actual = sc.getSkeleton(from);
    736 
    737                 if (!actual.equals(expected)) {
    738                     errln("confusables.txt: " + lineNum + ": " + parseLine.group(0));
    739                     errln("Actual: " + escapeString(actual));
    740                 }
    741             }
    742             confusablesRdr.close();
    743         } catch (IOException e) {
    744             errln(e.toString());
    745         }
    746     }
    747 
    748     @Test
    749     public void TestCheckResultToString11447() {
    750         CheckResult checkResult = new CheckResult();
    751         SpoofChecker sc = new SpoofChecker.Builder()
    752                 .setChecks(SpoofChecker.MIXED_NUMBERS)
    753                 .build();
    754         sc.failsChecks("1", checkResult);
    755         assertTrue("CheckResult: ", checkResult.toString().contains("MIXED_NUMBERS"));
    756     }
    757 
    758     @Test
    759     public void TestDeprecated() {
    760         // getSkeleton
    761         SpoofChecker sc = new SpoofChecker.Builder().build();
    762         assertEquals("Deprecated version of getSkeleton method does not work",
    763                 sc.getSkeleton(SpoofChecker.ANY_CASE, scMixed),
    764                 sc.getSkeleton(scMixed));
    765 
    766         // setData
    767         try {
    768             String fileName1 = "unicode/confusables.txt";
    769             String fileName2 = "unicode/confusablesWholeScript.txt";
    770             Reader reader1 = TestUtil.getDataReader(fileName1, "UTF-8");
    771             Reader reader2 = TestUtil.getDataReader(fileName2, "UTF-8");
    772             Reader reader3 = TestUtil.getDataReader(fileName1, "UTF-8");
    773             try {
    774                 SpoofChecker sc2 = new SpoofChecker.Builder()
    775                         .setData(reader1, reader2)
    776                         .build();
    777                 SpoofChecker sc1 = new SpoofChecker.Builder()
    778                         .setData(reader3)
    779                         .build();
    780                 assertEquals("Deprecated version of setData method does not work", sc1, sc2);
    781             } finally {
    782                 reader1.close();
    783                 reader2.close();
    784                 reader3.close();
    785             }
    786         } catch(IOException e) {
    787             fail("Could not load confusables data");
    788         } catch (ParseException e) {
    789             fail("Could not parse confusables data");
    790         }
    791     }
    792 
    793     @Test
    794     public void testScriptSet() {
    795         try {
    796             Class ScriptSet = Class.forName("com.ibm.icu.text.SpoofChecker$ScriptSet");
    797             Constructor ctor = ScriptSet.getDeclaredConstructor();
    798             ctor.setAccessible(true);
    799             BitSet ss = (BitSet) ctor.newInstance();
    800 
    801             ss.set(UScript.MYANMAR);
    802             assertEquals("ScriptSet toString with Myanmar", "<ScriptSet { Mymr }>", ss.toString());
    803             ss.set(UScript.BENGALI);
    804             ss.set(UScript.LATIN);
    805             assertEquals("ScriptSet toString with Myanmar, Latin, and Bengali", "<ScriptSet { Beng Latn Mymr }>", ss.toString());
    806 
    807             Method and = ScriptSet.getDeclaredMethod("and", Integer.TYPE);
    808             and.setAccessible(true);
    809             and.invoke(ss, UScript.BENGALI);
    810             assertEquals("ScriptSet toString with Bengali only", "<ScriptSet { Beng }>", ss.toString());
    811 
    812             Method setAll = ScriptSet.getDeclaredMethod("setAll");
    813             setAll.setAccessible(true);
    814             setAll.invoke(ss);
    815             assertEquals("ScriptSet toString with all scripts", "<ScriptSet { * }>", ss.toString());
    816 
    817             Method isFull = ScriptSet.getDeclaredMethod("isFull");
    818             isFull.setAccessible(true);
    819             boolean result = (Boolean) isFull.invoke(ss);
    820             assertEquals("ScriptSet should evaluate as full", true, result);
    821 
    822         } catch (ClassNotFoundException e) {
    823             fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage());
    824         } catch (InstantiationException e) {
    825             fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage());
    826         } catch (IllegalAccessException e) {
    827             fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage());
    828         } catch (SecurityException e) {
    829             fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage());
    830         } catch (NoSuchMethodException e) {
    831             fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage());
    832         } catch (IllegalArgumentException e) {
    833             fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage());
    834         } catch (InvocationTargetException e) {
    835             fail("Failed while testing ScriptSet: " + e.getClass() + ": " + e.getMessage());
    836         }
    837     }
    838 
    839     @Test
    840     public void testCopyConstructor() {
    841         SpoofChecker sc1 = new SpoofChecker.Builder()
    842                 .setAllowedChars(SpoofChecker.RECOMMENDED)
    843                 .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.INVISIBLE)
    844                 .build();
    845         SpoofChecker sc2 = new SpoofChecker.Builder(sc1).build();
    846         assertEquals("Copy constructor should produce identical instances", sc1, sc2);
    847     }
    848 }
    849