Home | History | Annotate | Download | only in intltest
      1 /*
      2 **********************************************************************
      3 * Copyright (C) 2010, International Business Machines Corporation
      4 * and others.  All Rights Reserved.
      5 **********************************************************************
      6 */
      7 /**
      8  * IntlTestSpoof tests for USpoofDetector
      9  */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
     14 
     15 #include "itspoof.h"
     16 #include "unicode/uspoof.h"
     17 #include "unicode/unistr.h"
     18 #include "unicode/regex.h"
     19 #include "unicode/normlzr.h"
     20 #include "cstring.h"
     21 #include <stdlib.h>
     22 #include <stdio.h>
     23 
        // Assertion helpers for the tests in this file.  Each macro expands to a
        // brace-enclosed block that reports a failure together with __FILE__ and
        // __LINE__; none of them returns from or aborts the enclosing test, so the
        // code following a failed assertion still runs.
        //
        // TEST_ASSERT_SUCCESS(status): reports through errcheckln(), including
        // u_errorName(status), when U_FAILURE(status) is true.
     24 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
     25     errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}
     26 
        // TEST_ASSERT(expr): reports through errln() when expr compares equal to FALSE.
        // The text of expr is included in the message.
     27 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
     28     errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}
     29 
        // TEST_ASSERT_EQ(a, b): reports through errln() when a != b.
        // Both values are formatted with %d.  The arguments appear twice in the
        // expansion (once in the comparison, once in the message), so an argument
        // with side effects is evaluated a second time when the assertion fails.
     30 #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
     31     errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
     32              __FILE__, __LINE__, #a, (a), #b, (b)); }}
     33 
        // TEST_ASSERT_NE(a, b): reports through errln() when a == b.
        // Same %d formatting and double-evaluation caveat as TEST_ASSERT_EQ.
     34 #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
     35     errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
     36              __FILE__, __LINE__, #a, (a), #b, (b)); }}
     37 
     38 /*
     39  *   TEST_SETUP and TEST_TEARDOWN
     40  *         macros to handle the boilerplate around setting up test case.
     41  *         Put arbitrary test code between SETUP and TEARDOWN.
     42  *         "sc" is the ready-to-go  SpoofChecker for use in the tests.
         *
         *         TEST_SETUP opens a new brace scope, declares "status" (initialized
         *         to U_ZERO_ERROR) and "sc" (opened with uspoof_open), reports any
         *         open failure, and then opens an "if (U_SUCCESS(status))" block so
         *         that the test code is skipped when the checker could not be opened.
         *
         *         TEST_TEARDOWN closes that "if" block, reports a failing "status",
         *         closes "sc" with uspoof_close, and closes the scope.
         *
         *         Because "status" and "sc" live in their own scope, a function may
         *         contain several SETUP/TEARDOWN pairs.  The checker is only closed
         *         in TEST_TEARDOWN, so test code placed between the two macros must
         *         fall through to it rather than return early.
     43  */
     44 #define TEST_SETUP {  \
     45     UErrorCode status = U_ZERO_ERROR; \
     46     USpoofChecker *sc;     \
     47     sc = uspoof_open(&status);  \
     48     TEST_ASSERT_SUCCESS(status);   \
     49     if (U_SUCCESS(status)){
     50 
     51 #define TEST_TEARDOWN  \
     52     }  \
     53     TEST_ASSERT_SUCCESS(status);  \
     54     uspoof_close(sc);  \
     55 }
     56 
     57 
     58 
     59 
     60 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     61 {
     62     if (exec) logln("TestSuite spoof: ");
     63     switch (index) {
     64         case 0:
     65             name = "TestSpoofAPI";
     66             if (exec) {
     67                 testSpoofAPI();
     68             }
     69             break;
     70          case 1:
     71             name = "TestSkeleton";
     72             if (exec) {
     73                 testSkeleton();
     74             }
     75             break;
     76          case 2:
     77             name = "TestAreConfusable";
     78             if (exec) {
     79                 testAreConfusable();
     80             }
     81             break;
     82           case 3:
     83             name = "TestInvisible";
     84             if (exec) {
     85                 testInvisible();
     86             }
     87             break;
     88           case 4:
     89             name = "testConfData";
     90             if (exec) {
     91                 testConfData();
     92             }
     93             break;
     94         default: name=""; break;
     95     }
     96 }
     97 
        // testSpoofAPI.  Exercise the UnicodeString flavors of the three main entry
        //                points once each: uspoof_checkUnicodeString,
        //                uspoof_areConfusableUnicodeString and
        //                uspoof_getSkeletonUnicodeString.
        //                Each section gets its own checker from TEST_SETUP.
     98 void IntlTestSpoof::testSpoofAPI() {
     99 
        // Section 1: "xyz" is expected to produce no check results, and the
        //            position output is expected to keep its sentinel value (666).
    100     TEST_SETUP
    101         UnicodeString s("xyz");  // Many latin ranges are whole-script confusable with other scripts.
    102                                  // If this test starts failing, consult confusablesWholeScript.txt
    103         int32_t position = 666;
    104         int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
    105         TEST_ASSERT_SUCCESS(status);
    106         TEST_ASSERT_EQ(0, checkResults);
    107         TEST_ASSERT_EQ(666, position);
    108     TEST_TEARDOWN;
    109 
        // Section 2: Latin "cxs" against its Cyrillic look-alike is expected to be
        //            reported as both mixed-script and whole-script confusable.
        //            The status is checked by TEST_TEARDOWN.
    110     TEST_SETUP
    111         UnicodeString s1("cxs");
    112         UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape();  // Cyrillic "cxs"
    113         int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
    114         TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
    115 
    116     TEST_TEARDOWN;
    117 
        // Section 3: the USPOOF_ANY_CASE skeleton of "I1l0O" is expected to be "lllOO",
        //            and the returned reference must be the caller-supplied dest string.
    118     TEST_SETUP
    119         UnicodeString s("I1l0O");
    120         UnicodeString dest;
    121         UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status);
    122         TEST_ASSERT_SUCCESS(status);
    123         TEST_ASSERT(UnicodeString("lllOO") == dest);
    124         TEST_ASSERT(&dest == &retStr);
    125     TEST_TEARDOWN;
    126 }
    127 
    128 
        // CHECK_SKELETON(type, input, expected)
        //     Forwards to checkSkeleton(), adding the line number of the call site so
        //     that a failure can be traced back to the individual test case.
        //     Uses the variable "sc" from the enclosing scope (as set up by TEST_SETUP).
        //     Expands to a brace-enclosed block, so a trailing semicolon is optional.
    129 #define CHECK_SKELETON(type, input, expected) { \
    130     checkSkeleton(sc, type, input, expected, __LINE__); \
    131     }
    132 
    133 
    134 // testSkeleton.   Spot check a number of confusable skeleton substitutions from the
    135 //                 Unicode data file confusables.txt
    136 //                 Test cases chosen for substitutions of various lengths, and
    137 //                 membership in different mapping tables.
        //                 Input and expected strings are passed through unescape() by
        //                 checkSkeleton(), so \\uXXXX sequences denote single characters.
    138 void IntlTestSpoof::testSkeleton() {
        // Skeleton type flags, named after the table tags used in the comments below:
        // first letter S = USPOOF_SINGLE_SCRIPT_CONFUSABLE set, M = not set;
        // second letter A = USPOOF_ANY_CASE set, L = not set.
    139     const uint32_t ML = 0;
    140     const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    141     const uint32_t MA = USPOOF_ANY_CASE;
    142     const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
    143 
    144     TEST_SETUP
    145         // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
        // Only the first repetition of the input uses \u02b9 in place of the opening
        // apostrophe; the expected skeleton has a plain apostrophe in all four.
    146         CHECK_SKELETON(SL, " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations."
    147                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
    148                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
    149                            " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
    150 
    151                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
    152                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
    153                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
    154                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
    155 
    156         // FC5F ;	FE74 0651 ;   ML  #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to
    157         //                                ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA
    158         //    This character NFKD normalizes to \u0020 \u064d \u0651, so its confusable mapping
    159         //    is never used in creating a skeleton.
    160         CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651");
    161 
    162         CHECK_SKELETON(SL, "nochange", "nochange");
    163         CHECK_SKELETON(MA, "love", "love");
    164         CHECK_SKELETON(MA, "1ove", "love");   // Digit 1 to letter l
    165         CHECK_SKELETON(ML, "OOPS", "OOPS");
    166         CHECK_SKELETON(ML, "00PS", "00PS");   // Digit 0 unchanged in lower case mode.
    167         CHECK_SKELETON(MA, "OOPS", "OOPS");
    168         CHECK_SKELETON(MA, "00PS", "OOPS");   // Digit 0 to letter O in any case mode only
        // Replacements of one, three, four and eight characters.
    169         CHECK_SKELETON(SL, "\\u059c", "\\u0301");
    170         CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D");
    171         CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029");  // "(ll)"
    172         CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");
    173 
    174         // This mapping exists in the ML and MA tables, does not exist in SL, SA
    175         //0C83 ;	0C03 ;
        // NOTE(review): the line above cites 0C03, but the ML/MA expectations below
        // use \u0983 - confirm which value confusables.txt actually lists.
    176         CHECK_SKELETON(SL, "\\u0C83", "\\u0C83");
    177         CHECK_SKELETON(SA, "\\u0C83", "\\u0C83");
    178         CHECK_SKELETON(ML, "\\u0C83", "\\u0983");
    179         CHECK_SKELETON(MA, "\\u0C83", "\\u0983");
    180 
    181         // 0391 ; 0041 ;
    182         // This mapping exists only in the MA table.
    183         CHECK_SKELETON(MA, "\\u0391", "A");
    184         CHECK_SKELETON(SA, "\\u0391", "\\u0391");
    185         CHECK_SKELETON(ML, "\\u0391", "\\u0391");
    186         CHECK_SKELETON(SL, "\\u0391", "\\u0391");
    187 
    188         // 13CF ;  0062 ;
    189         // This mapping exists in the ML and MA tables
    190         CHECK_SKELETON(ML, "\\u13CF", "b");
    191         CHECK_SKELETON(MA, "\\u13CF", "b");
    192         CHECK_SKELETON(SL, "\\u13CF", "\\u13CF");
    193         CHECK_SKELETON(SA, "\\u13CF", "\\u13CF");
    194 
    195         // 0022 ;  0027 0027 ;
    196         // all tables.
    197         CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027");
    198         CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027");
    199         CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027");
    200         CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027");
    201 
    202     TEST_TEARDOWN;
    203 }
    204 
    205 
    206 //
    207 //  Run a single confusable skeleton transformation test case.
    208 //
    209 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
    210                                   const char *input, const char *expected, int32_t lineNum) {
    211     UnicodeString uInput = UnicodeString(input).unescape();
    212     UnicodeString uExpected = UnicodeString(expected).unescape();
    213 
    214     UErrorCode status = U_ZERO_ERROR;
    215     UnicodeString actual;
    216     uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status);
    217     if (U_FAILURE(status)) {
    218         errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
    219               u_errorName(status));
    220         return;
    221     }
    222     if (uExpected != actual) {
    223         errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
    224                __FILE__, __LINE__, lineNum);
    225         errln(UnicodeString(" Actual   Skeleton: \"") + actual + UnicodeString("\"\n") +
    226               UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
    227     }
    228 }
    229 
        // testAreConfusable.  Compare two long strings that differ only in "will"
        //                     versus "wi11" (digit one for letter l) and expect them
        //                     to be reported as single-script confusable.
        //                     The strings are deliberately long; per their own text
        //                     they are meant to overflow stack buffers.
    230 void IntlTestSpoof::testAreConfusable() {
    231     TEST_SETUP
    232         UnicodeString s1("A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "
    233                          "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ");
    234         UnicodeString s2("A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "
    235                          "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ");
    236         TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
    237         TEST_ASSERT_SUCCESS(status);
    238 
    239     TEST_TEARDOWN;
    240 }
    241 
        // testInvisible.  Check the USPOOF_INVISIBLE result of uspoof_checkUnicodeString
        //                 for strings containing combining marks, together with the
        //                 value written to the position output.
    242 void IntlTestSpoof::testInvisible() {
    243     TEST_SETUP
        // A single combining acute accent: no check results expected, and the
        // position output must keep its sentinel value.
    244         UnicodeString  s = UnicodeString("abcd\\u0301ef").unescape();
    245         int32_t position = -42;
    246         TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
    247         TEST_ASSERT_SUCCESS(status);
    248         TEST_ASSERT(position == -42);
    249 
        // The same accent (\u0301) appears twice in one run of combining marks:
        // USPOOF_INVISIBLE expected, with position set to 7.
    250         UnicodeString  s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
    251         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
    252         TEST_ASSERT_SUCCESS(status);
    253         TEST_ASSERT_EQ(7, position);
    254 
    255         // Two acute accents, one from the composed a with acute accent, \u00e1,
    256         // and one separate.
    257         position = -42;
    258         UnicodeString  s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
    259         TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
    260         TEST_ASSERT_SUCCESS(status);
    261         TEST_ASSERT_EQ(7, position);
    262     TEST_TEARDOWN;
    263 }
    264 
    265 
    266 static UnicodeString parseHex(const UnicodeString &in) {
    267     // Convert a series of hex numbers in a Unicode String to a string with the
    268     // corresponding characters.
    269     // The conversion is _really_ annoying.  There must be some function to just do it.
    270     UnicodeString result;
    271     UChar32 cc = 0;
    272     for (int32_t i=0; i<in.length(); i++) {
    273         UChar c = in.charAt(i);
    274         if (c == 0x20) {   // Space
    275             if (cc > 0) {
    276                result.append(cc);
    277                cc = 0;
    278             }
    279         } else if (c>=0x30 && c<=0x39) {
    280             cc = (cc<<4) + (c - 0x30);
    281         } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) {
    282             cc = (cc<<4) + (c & 0x0f)+9;
    283         }
    284         // else do something with bad input.
    285     }
    286     if (cc > 0) {
    287         result.append(cc);
    288     }
    289     return result;
    290 }
    291 
    292 
    293 //
    294 // Append the hex form of a UChar32 to a UnicodeString.
    295 // Used in formatting error messages.
    296 // Match the formatting of numbers in confusables.txt
    297 // Minimum of 4 digits, no leading zeroes for positions 5 and up.
    298 //
    299 static void appendHexUChar(UnicodeString &dest, UChar32 c) {
    300     UBool   doZeroes = FALSE;
    301     for (int bitNum=28; bitNum>=0; bitNum-=4) {
    302         if (bitNum <= 12) {
    303             doZeroes = TRUE;
    304         }
    305         int hexDigit = (c>>bitNum) & 0x0f;
    306         if (hexDigit != 0 || doZeroes) {
    307             doZeroes = TRUE;
    308             dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41));
    309         }
    310     }
    311     dest.append((UChar)0x20);
    312 }
    313 
        // Defines LocalStdioFilePointer, a local-pointer wrapper for a stdio FILE that is
        // associated with fclose.  testConfData() uses it to hold the confusables.txt
        // file handle and accesses it through isNull() and getAlias().
        // NOTE(review): presumably the wrapper calls fclose when it goes out of
        // scope - confirm against the U_DEFINE_LOCAL_OPEN_POINTER documentation.
    314 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
    315 
    316 //  testConfData - Check each data item from the Unicode confusables.txt file,
    317 //                 verify that it transforms correctly in a skeleton.
    318 //
    319 void IntlTestSpoof::testConfData() {
    320     UErrorCode status = U_ZERO_ERROR;
    321 
    322     const char *testDataDir = IntlTest::getSourceTestData(status);
    323     TEST_ASSERT_SUCCESS(status);
    324     char buffer[2000];
    325     uprv_strcpy(buffer, testDataDir);
    326     uprv_strcat(buffer, "confusables.txt");
    327 
    328     LocalStdioFilePointer f(fopen(buffer, "rb"));
    329     if (f.isNull()) {
    330         errln("Skipping test spoof/testConfData.  File confusables.txt not accessible.");
    331         return;
    332     }
    333     fseek(f.getAlias(), 0, SEEK_END);
    334     int32_t  fileSize = ftell(f.getAlias());
    335     LocalArray<char> fileBuf(new char[fileSize]);
    336     fseek(f.getAlias(), 0, SEEK_SET);
    337     int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias());
    338     TEST_ASSERT_EQ(amt_read, fileSize);
    339     TEST_ASSERT(fileSize>0);
    340     if (amt_read != fileSize || fileSize <=0) {
    341         return;
    342     }
    343     UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize));
    344 
    345     LocalUSpoofCheckerPointer sc(uspoof_open(&status));
    346     TEST_ASSERT_SUCCESS(status);
    347 
    348     // Parse lines from the confusables.txt file.  Example Line:
    349     // FF44 ;	0064 ;	SL	# ( d -> d ) FULLWIDTH ....
    350     // Three fields.  The hex fields can contain more than one character,
    351     //                and each character may be more than 4 digits (for supplemntals)
    352     // This regular expression matches lines and splits the fields into capture groups.
    353     RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status);
    354     TEST_ASSERT_SUCCESS(status);
    355     while (parseLine.find()) {
    356         UnicodeString from = parseHex(parseLine.group(1, status));
    357         if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) {
    358             // The source character was not NFKD.
    359             // Skip this case; the first step in obtaining a skeleton is to NFKD the input,
    360             //  so the mapping in this line of confusables.txt will never be applied.
    361             continue;
    362         }
    363 
    364         UnicodeString rawExpected = parseHex(parseLine.group(2, status));
    365         UnicodeString expected;
    366         Normalizer::decompose(rawExpected, TRUE, 0, expected, status);
    367         TEST_ASSERT_SUCCESS(status);
    368 
    369         int32_t skeletonType = 0;
    370         UnicodeString tableType = parseLine.group(3, status);
    371         TEST_ASSERT_SUCCESS(status);
    372         if (tableType.indexOf("SL") >= 0) {
    373             skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    374         } else if (tableType.indexOf("SA") >= 0) {
    375             skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
    376         } else if (tableType.indexOf("ML") >= 0) {
    377             skeletonType = 0;
    378         } else if (tableType.indexOf("MA") >= 0) {
    379             skeletonType = USPOOF_ANY_CASE;
    380         }
    381 
    382         UnicodeString actual;
    383         uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status);
    384         TEST_ASSERT_SUCCESS(status);
    385         TEST_ASSERT(actual == expected);
    386         if (actual != expected) {
    387             errln(parseLine.group(0, status));
    388             UnicodeString line = "Actual: ";
    389             int i = 0;
    390             while (i < actual.length()) {
    391                 appendHexUChar(line, actual.char32At(i));
    392                 i = actual.moveIndex32(i, 1);
    393             }
    394             errln(line);
    395         }
    396         if (U_FAILURE(status)) {
    397             break;
    398         }
    399     }
    400 }
    401 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
    402 
    403