Home | History | Annotate | Download | only in intltest
      1 /*
      2 **********************************************************************
      3 * Copyright (C) 2011, International Business Machines Corporation
      4 * and others.  All Rights Reserved.
      5 **********************************************************************
      6 */
      7 /**
      8  * IntlTestSpoof tests for USpoofDetector
      9  */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
     14 
     15 #include "itspoof.h"
     16 #include "unicode/uspoof.h"
     17 #include "unicode/unistr.h"
     18 #include "unicode/regex.h"
     19 #include "unicode/normlzr.h"
     20 #include "cstring.h"
     21 #include <stdlib.h>
     22 #include <stdio.h>
     23 
     24 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
     25     errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}
     26 
     27 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
     28     errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}
     29 
     30 #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
     31     errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
     32              __FILE__, __LINE__, #a, (a), #b, (b)); }}
     33 
     34 #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
     35     errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
     36              __FILE__, __LINE__, #a, (a), #b, (b)); }}
     37 
     38 /*
     39  *   TEST_SETUP and TEST_TEARDOWN
     40  *         macros to handle the boilerplate around setting up test case.
     41  *         Put arbitrary test code between SETUP and TEARDOWN.
     42  *         "sc" is the ready-to-go  SpoofChecker for use in the tests.
     43  */
     44 #define TEST_SETUP {  \
     45     UErrorCode status = U_ZERO_ERROR; \
     46     USpoofChecker *sc;     \
     47     sc = uspoof_open(&status);  \
     48     TEST_ASSERT_SUCCESS(status);   \
     49     if (U_SUCCESS(status)){
     50 
     51 #define TEST_TEARDOWN  \
     52     }  \
     53     TEST_ASSERT_SUCCESS(status);  \
     54     uspoof_close(sc);  \
     55 }
     56 
     57 
     58 
     59 
     60 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     61 {
     62     if (exec) logln("TestSuite spoof: ");
     63     switch (index) {
     64         case 0:
     65             name = "TestSpoofAPI";
     66             if (exec) {
     67                 testSpoofAPI();
     68             }
     69             break;
     70          case 1:
     71             name = "TestSkeleton";
     72             if (exec) {
     73                 testSkeleton();
     74             }
     75             break;
     76          case 2:
     77             name = "TestAreConfusable";
     78             if (exec) {
     79                 testAreConfusable();
     80             }
     81             break;
     82           case 3:
     83             name = "TestInvisible";
     84             if (exec) {
     85                 testInvisible();
     86             }
     87             break;
     88           case 4:
     89             name = "testConfData";
     90             if (exec) {
     91                 testConfData();
     92             }
     93             break;
     94           case 5:
     95             name = "testBug8654";
     96             if (exec) {
     97                 testBug8654();
     98             }
     99             break;
    100          default: name=""; break;
    101     }
    102 }
    103 
    104 void IntlTestSpoof::testSpoofAPI() {
    105 
    106     TEST_SETUP
    107         UnicodeString s("xyz");  // Many latin ranges are whole-script confusable with other scripts.
    108                                  // If this test starts failing, consult confusablesWholeScript.txt
    109         int32_t position = 666;
    110         int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
    111         TEST_ASSERT_SUCCESS(status);
    112         TEST_ASSERT_EQ(0, checkResults);
    113         TEST_ASSERT_EQ(666, position);
    114     TEST_TEARDOWN;
    115 
    116     TEST_SETUP
    117         UnicodeString s1("cxs");
    118         UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape();  // Cyrillic "cxs"
    119         int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
    120         TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
    121 
    122     TEST_TEARDOWN;
    123 
    124     TEST_SETUP
    125         UnicodeString s("I1l0O");
    126         UnicodeString dest;
    127         UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status);
    128         TEST_ASSERT_SUCCESS(status);
    129         TEST_ASSERT(UnicodeString("lllOO") == dest);
    130         TEST_ASSERT(&dest == &retStr);
    131     TEST_TEARDOWN;
    132 }
    133 
    134 
    135 #define CHECK_SKELETON(type, input, expected) { \
    136     checkSkeleton(sc, type, input, expected, __LINE__); \
    137     }
    138 
    139 
    140 // testSkeleton.   Spot check a number of confusable skeleton substitutions from the
    141 //                 Unicode data file confusables.txt
    142 //                 Test cases chosen for substitutions of various lengths, and
    143 //                 membership in different mapping tables.
    144 void IntlTestSpoof::testSkeleton() {
    145     const uint32_t ML = 0;
    146     const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    147     const uint32_t MA = USPOOF_ANY_CASE;
    148     const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
    149 
    150     TEST_SETUP
    151         // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
    152         CHECK_SKELETON(SL, " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations."
    153                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
    154                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
    155                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
    156 
    157                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
    158                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
    159                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
    160                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
    161 
    162         CHECK_SKELETON(SL, "nochange", "nochange");
    163         CHECK_SKELETON(MA, "love", "love");
    164         CHECK_SKELETON(MA, "1ove", "love");   // Digit 1 to letter l
    165         CHECK_SKELETON(ML, "OOPS", "OOPS");
    166         CHECK_SKELETON(ML, "00PS", "00PS");   // Digit 0 unchanged in lower case mode.
    167         CHECK_SKELETON(MA, "OOPS", "OOPS");
    168         CHECK_SKELETON(MA, "00PS", "OOPS");   // Digit 0 to letter O in any case mode only
    169         CHECK_SKELETON(SL, "\\u059c", "\\u0301");
    170         CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D");
    171         CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029");  // "(ll)"
    172         CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");
    173 
    174         // This mapping exists in the ML and MA tables, does not exist in SL, SA
    175         //0C83 ;	0C03 ;
    176         CHECK_SKELETON(SL, "\\u0C83", "\\u0C83");
    177         CHECK_SKELETON(SA, "\\u0C83", "\\u0C83");
    178         CHECK_SKELETON(ML, "\\u0C83", "\\u0983");
    179         CHECK_SKELETON(MA, "\\u0C83", "\\u0983");
    180 
    181         // 0391 ; 0041 ;
    182         // This mapping exists only in the MA table.
    183         CHECK_SKELETON(MA, "\\u0391", "A");
    184         CHECK_SKELETON(SA, "\\u0391", "\\u0391");
    185         CHECK_SKELETON(ML, "\\u0391", "\\u0391");
    186         CHECK_SKELETON(SL, "\\u0391", "\\u0391");
    187 
    188         // 13CF ;  0062 ;
    189         // This mapping exists in the ML and MA tables
    190         CHECK_SKELETON(ML, "\\u13CF", "b");
    191         CHECK_SKELETON(MA, "\\u13CF", "b");
    192         CHECK_SKELETON(SL, "\\u13CF", "\\u13CF");
    193         CHECK_SKELETON(SA, "\\u13CF", "\\u13CF");
    194 
    195         // 0022 ;  0027 0027 ;
    196         // all tables.
    197         CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027");
    198         CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027");
    199         CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027");
    200         CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027");
    201 
    202         // 017F ;  0066 ;
    203         // This mapping exists in the SA and MA tables
    204         CHECK_SKELETON(MA, "\\u017F", "f");
    205         CHECK_SKELETON(SA, "\\u017F", "f");
    206 
    207     TEST_TEARDOWN;
    208 }
    209 
    210 
    211 //
    212 //  Run a single confusable skeleton transformation test case.
    213 //
    214 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
    215                                   const char *input, const char *expected, int32_t lineNum) {
    216     UnicodeString uInput = UnicodeString(input).unescape();
    217     UnicodeString uExpected = UnicodeString(expected).unescape();
    218 
    219     UErrorCode status = U_ZERO_ERROR;
    220     UnicodeString actual;
    221     uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status);
    222     if (U_FAILURE(status)) {
    223         errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
    224               u_errorName(status));
    225         return;
    226     }
    227     if (uExpected != actual) {
    228         errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
    229                __FILE__, __LINE__, lineNum);
    230         errln(UnicodeString(" Actual   Skeleton: \"") + actual + UnicodeString("\"\n") +
    231               UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
    232     }
    233 }
    234 
    235 void IntlTestSpoof::testAreConfusable() {
    236     TEST_SETUP
    237         UnicodeString s1("A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "
    238                          "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ");
    239         UnicodeString s2("A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "
    240                          "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ");
    241         TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
    242         TEST_ASSERT_SUCCESS(status);
    243 
    244     TEST_TEARDOWN;
    245 }
    246 
    247 void IntlTestSpoof::testInvisible() {
    248     TEST_SETUP
    249         UnicodeString  s = UnicodeString("abcd\\u0301ef").unescape();
    250         int32_t position = -42;
    251         TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
    252         TEST_ASSERT_SUCCESS(status);
    253         TEST_ASSERT(position == -42);
    254 
    255         UnicodeString  s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
    256         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
    257         TEST_ASSERT_SUCCESS(status);
    258         TEST_ASSERT_EQ(7, position);
    259 
    260         // Two acute accents, one from the composed a with acute accent, \u00e1,
    261         // and one separate.
    262         position = -42;
    263         UnicodeString  s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
    264         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
    265         TEST_ASSERT_SUCCESS(status);
    266         TEST_ASSERT_EQ(7, position);
    267     TEST_TEARDOWN;
    268 }
    269 
    270 void IntlTestSpoof::testBug8654() {
    271     TEST_SETUP
    272         UnicodeString s = UnicodeString("B\\u00c1\\u0301").unescape();
    273         int32_t position = -42;
    274         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s, &position, &status) & USPOOF_INVISIBLE );
    275         TEST_ASSERT_SUCCESS(status);
    276         TEST_ASSERT_EQ(3, position);
    277     TEST_TEARDOWN;
    278 }
    279 
    280 static UnicodeString parseHex(const UnicodeString &in) {
    281     // Convert a series of hex numbers in a Unicode String to a string with the
    282     // corresponding characters.
    283     // The conversion is _really_ annoying.  There must be some function to just do it.
    284     UnicodeString result;
    285     UChar32 cc = 0;
    286     for (int32_t i=0; i<in.length(); i++) {
    287         UChar c = in.charAt(i);
    288         if (c == 0x20) {   // Space
    289             if (cc > 0) {
    290                result.append(cc);
    291                cc = 0;
    292             }
    293         } else if (c>=0x30 && c<=0x39) {
    294             cc = (cc<<4) + (c - 0x30);
    295         } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) {
    296             cc = (cc<<4) + (c & 0x0f)+9;
    297         }
    298         // else do something with bad input.
    299     }
    300     if (cc > 0) {
    301         result.append(cc);
    302     }
    303     return result;
    304 }
    305 
    306 
    307 //
    308 // Append the hex form of a UChar32 to a UnicodeString.
    309 // Used in formatting error messages.
    310 // Match the formatting of numbers in confusables.txt
    311 // Minimum of 4 digits, no leading zeroes for positions 5 and up.
    312 //
    313 static void appendHexUChar(UnicodeString &dest, UChar32 c) {
    314     UBool   doZeroes = FALSE;
    315     for (int bitNum=28; bitNum>=0; bitNum-=4) {
    316         if (bitNum <= 12) {
    317             doZeroes = TRUE;
    318         }
    319         int hexDigit = (c>>bitNum) & 0x0f;
    320         if (hexDigit != 0 || doZeroes) {
    321             doZeroes = TRUE;
    322             dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41));
    323         }
    324     }
    325     dest.append((UChar)0x20);
    326 }
    327 
    328 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
    329 
    330 //  testConfData - Check each data item from the Unicode confusables.txt file,
    331 //                 verify that it transforms correctly in a skeleton.
    332 //
    333 void IntlTestSpoof::testConfData() {
    334     UErrorCode status = U_ZERO_ERROR;
    335 
    336     const char *testDataDir = IntlTest::getSourceTestData(status);
    337     TEST_ASSERT_SUCCESS(status);
    338     char buffer[2000];
    339     uprv_strcpy(buffer, testDataDir);
    340     uprv_strcat(buffer, "confusables.txt");
    341 
    342     LocalStdioFilePointer f(fopen(buffer, "rb"));
    343     if (f.isNull()) {
    344         errln("Skipping test spoof/testConfData.  File confusables.txt not accessible.");
    345         return;
    346     }
    347     fseek(f.getAlias(), 0, SEEK_END);
    348     int32_t  fileSize = ftell(f.getAlias());
    349     LocalArray<char> fileBuf(new char[fileSize]);
    350     fseek(f.getAlias(), 0, SEEK_SET);
    351     int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias());
    352     TEST_ASSERT_EQ(amt_read, fileSize);
    353     TEST_ASSERT(fileSize>0);
    354     if (amt_read != fileSize || fileSize <=0) {
    355         return;
    356     }
    357     UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize));
    358 
    359     LocalUSpoofCheckerPointer sc(uspoof_open(&status));
    360     TEST_ASSERT_SUCCESS(status);
    361 
    362     // Parse lines from the confusables.txt file.  Example Line:
    363     // FF44 ;	0064 ;	SL	# ( d -> d ) FULLWIDTH ....
    364     // Three fields.  The hex fields can contain more than one character,
    365     //                and each character may be more than 4 digits (for supplemntals)
    366     // This regular expression matches lines and splits the fields into capture groups.
    367     RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status);
    368     TEST_ASSERT_SUCCESS(status);
    369     while (parseLine.find()) {
    370         UnicodeString from = parseHex(parseLine.group(1, status));
    371         if (!Normalizer::isNormalized(from, UNORM_NFD, status)) {
    372             // The source character was not NFD.
    373             // Skip this case; the first step in obtaining a skeleton is to NFD the input,
    374             //  so the mapping in this line of confusables.txt will never be applied.
    375             continue;
    376         }
    377 
    378         UnicodeString rawExpected = parseHex(parseLine.group(2, status));
    379         UnicodeString expected;
    380         Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status);
    381         TEST_ASSERT_SUCCESS(status);
    382 
    383         int32_t skeletonType = 0;
    384         UnicodeString tableType = parseLine.group(3, status);
    385         TEST_ASSERT_SUCCESS(status);
    386         if (tableType.indexOf("SL") >= 0) {
    387             skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    388         } else if (tableType.indexOf("SA") >= 0) {
    389             skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
    390         } else if (tableType.indexOf("ML") >= 0) {
    391             skeletonType = 0;
    392         } else if (tableType.indexOf("MA") >= 0) {
    393             skeletonType = USPOOF_ANY_CASE;
    394         }
    395 
    396         UnicodeString actual;
    397         uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status);
    398         TEST_ASSERT_SUCCESS(status);
    399         TEST_ASSERT(actual == expected);
    400         if (actual != expected) {
    401             errln(parseLine.group(0, status));
    402             UnicodeString line = "Actual: ";
    403             int i = 0;
    404             while (i < actual.length()) {
    405                 appendHexUChar(line, actual.char32At(i));
    406                 i = actual.moveIndex32(i, 1);
    407             }
    408             errln(line);
    409         }
    410         if (U_FAILURE(status)) {
    411             break;
    412         }
    413     }
    414 }
    415 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
    416 
    417