Home | History | Annotate | Download | only in cintltst
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 2009-2010, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 /********************************************************************************
      7 *
      8 * File spooftest.c
      9 *
     10 *********************************************************************************/
     11 /* C API TEST for the uspoof Unicode Identifier Spoofing and Security API */
     12 /**
     13 *   This is an API test for ICU spoof detection in plain C.  It doesn't test very many cases, and doesn't
     14 *   try to test the full functionality.  It just calls each function and verifies that it
     15 *   works on a basic level.
     16 *
     17 *   More complete testing of spoof detection functionality is done with the C++ tests.
     18 **/
     19 
     20 #include "unicode/utypes.h"
     21 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION
     22 
     23 #include <stdlib.h>
     24 #include <stdio.h>
     25 #include <string.h>
     26 #include "unicode/uspoof.h"
     27 #include "unicode/ustring.h"
     28 #include "unicode/uset.h"
     29 #include "cintltst.h"
     30 
     31 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
     32     log_err_status(status, "Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status));}}
     33 
     34 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
     35 log_err("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}
     36 
     37 #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
     38     log_err("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
     39              __FILE__, __LINE__, #a, (a), #b, (b)); }}
     40 
     41 #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
     42     log_err("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
     43              __FILE__, __LINE__, #a, (a), #b, (b)); }}
     44 
     45 
     46 /*
     47  *   TEST_SETUP and TEST_TEARDOWN
     48  *         macros to handle the boilerplate around setting up test case.
     49  *         Put arbitrary test code between SETUP and TEARDOWN.
     50  *         "sc" is the ready-to-go  SpoofChecker for use in the tests.
     51  */
     52 #define TEST_SETUP {  \
     53     UErrorCode status = U_ZERO_ERROR; \
     54     USpoofChecker *sc;     \
     55     sc = uspoof_open(&status);  \
     56     TEST_ASSERT_SUCCESS(status);   \
     57     if (U_SUCCESS(status)){
     58 
     59 #define TEST_TEARDOWN  \
     60     }  \
     61     TEST_ASSERT_SUCCESS(status);  \
     62     uspoof_close(sc);  \
     63 }
     64 
     65 
     66 static void TestUSpoofCAPI(void);
     67 
     68 void addUSpoofTest(TestNode** root);
     69 
     70 void addUSpoofTest(TestNode** root)
     71 {
     72 #if !UCONFIG_NO_FILE_IO
     73     addTest(root, &TestUSpoofCAPI, "uspoof/TestUSpoofCAPI");
     74 #endif
     75 }
     76 
     77 /*
     78  *  Identifiers for verifying that spoof checking is minimally alive and working.
     79  */
     80 const UChar goodLatin[] = {(UChar)0x75, (UChar)0x7a, 0};    /* "uz", all ASCII             */
     81                                                             /*   (not confusable)          */
     82 const UChar scMixed[]  = {(UChar)0x73, (UChar)0x0441, 0};   /* "sc", with Cyrillic 'c'     */
     83                                                             /*   (mixed script, confusable */
     84 
     85 const UChar scLatin[]  = {(UChar)0x73,  (UChar)0x63, 0};    /* "sc", plain ascii.        */
     86 const UChar goodCyrl[] = {(UChar)0x438, (UChar)0x43B, 0};   /* Plain lower case Cyrillic letters,
     87                                                                no latin confusables         */
     88 
     89 const UChar goodGreek[]   = {(UChar)0x3c0, (UChar)0x3c6, 0};   /* Plain lower case Greek letters */
     90 
     91 const UChar lll_Latin_a[] = {(UChar)0x6c, (UChar)0x49, (UChar)0x31, 0};   /* lI1, all ASCII */
     92 
     93                              /*  Full-width I, Small Roman Numeral fifty, Latin Cap Letter IOTA*/
     94 const UChar lll_Latin_b[] = {(UChar)0xff29, (UChar)0x217c, (UChar)0x196, 0};
     95 
     96 const UChar lll_Cyrl[]    = {(UChar)0x0406, (UChar)0x04C0, (UChar)0x31, 0};
     97 
     98 /* The skeleton transform for all of thes 'lll' lookalikes is all lower case l. */
     99 const UChar lll_Skel[]    = {(UChar)0x6c, (UChar)0x6c, (UChar)0x6c, 0};
    100 
    101 /* Provide better code coverage */
    102 const char goodLatinUTF8[]    = {0x75, 0x77, 0};
    103 /*
    104  *   Spoof Detction C API Tests
    105  */
    106 static void TestUSpoofCAPI(void) {
    107 
    108     /*
    109      *  basic uspoof_open().
    110      */
    111     {
    112         USpoofChecker *sc;
    113         UErrorCode  status = U_ZERO_ERROR;
    114         sc = uspoof_open(&status);
    115         TEST_ASSERT_SUCCESS(status);
    116         if (U_FAILURE(status)) {
    117             /* If things are so broken that we can't even open a default spoof checker,  */
    118             /*   don't even try the rest of the tests.  They would all fail.             */
    119             return;
    120         }
    121         uspoof_close(sc);
    122     }
    123 
    124 
    125 
    126     /*
    127      *  Test Open from source rules.
    128     */
    129     TEST_SETUP
    130     const char *dataSrcDir;
    131     char       *fileName;
    132     char       *confusables;
    133     int         confusablesLength;
    134     char       *confusablesWholeScript;
    135     int         confusablesWholeScriptLength;
    136     FILE       *f;
    137     UParseError pe;
    138     int32_t     errType;
    139     USpoofChecker *rsc;
    140 
    141     dataSrcDir = ctest_dataSrcDir();
    142     fileName = malloc(strlen(dataSrcDir) + 100);
    143     strcpy(fileName, dataSrcDir);
    144     strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusables.txt");
    145     f = fopen(fileName, "r");
    146     TEST_ASSERT_NE(f, NULL);
    147     confusables = malloc(3000000);
    148     confusablesLength = fread(confusables, 1, 3000000, f);
    149     fclose(f);
    150 
    151 
    152     strcpy(fileName, dataSrcDir);
    153     strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusablesWholeScript.txt");
    154     f = fopen(fileName, "r");
    155     TEST_ASSERT_NE(f, NULL);
    156     confusablesWholeScript = malloc(1000000);
    157     confusablesWholeScriptLength = fread(confusablesWholeScript, 1, 1000000, f);
    158     fclose(f);
    159 
    160     rsc = uspoof_openFromSource(confusables, confusablesLength,
    161                                               confusablesWholeScript, confusablesWholeScriptLength,
    162                                               &errType, &pe, &status);
    163     TEST_ASSERT_SUCCESS(status);
    164 
    165     free(confusablesWholeScript);
    166     free(confusables);
    167     free(fileName);
    168     uspoof_close(rsc);
    169     /*  printf("ParseError Line is %d\n", pe.line);  */
    170     TEST_TEARDOWN;
    171 
    172 
    173     /*
    174      * openFromSerialized and serialize
    175     */
    176     TEST_SETUP
    177         int32_t        serializedSize = 0;
    178         int32_t        actualLength = 0;
    179         char           *buf;
    180         USpoofChecker  *sc2;
    181         int32_t         checkResults;
    182 
    183 
    184         serializedSize = uspoof_serialize(sc, NULL, 0, &status);
    185         TEST_ASSERT_EQ(status, U_BUFFER_OVERFLOW_ERROR);
    186         TEST_ASSERT(serializedSize > 0);
    187 
    188         /* Serialize the default spoof checker */
    189         status = U_ZERO_ERROR;
    190         buf = (char *)malloc(serializedSize + 10);
    191         TEST_ASSERT(buf != NULL);
    192         buf[serializedSize] = 42;
    193         uspoof_serialize(sc, buf, serializedSize, &status);
    194         TEST_ASSERT_SUCCESS(status);
    195         TEST_ASSERT_EQ(42, buf[serializedSize]);
    196 
    197         /* Create a new spoof checker from the freshly serialized data */
    198         sc2 = uspoof_openFromSerialized(buf, serializedSize+10, &actualLength, &status);
    199         TEST_ASSERT_SUCCESS(status);
    200         TEST_ASSERT_NE(NULL, sc2);
    201         TEST_ASSERT_EQ(serializedSize, actualLength);
    202 
    203         /* Verify that the new spoof checker at least wiggles */
    204         checkResults = uspoof_check(sc2, goodLatin, -1, NULL, &status);
    205         TEST_ASSERT_SUCCESS(status);
    206         TEST_ASSERT_EQ(0, checkResults);
    207 
    208         checkResults = uspoof_check(sc2, scMixed, -1, NULL, &status);
    209         TEST_ASSERT_SUCCESS(status);
    210         TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
    211 
    212         uspoof_close(sc2);
    213         free(buf);
    214     TEST_TEARDOWN;
    215 
    216 
    217 
    218     /*
    219      * Set & Get Check Flags
    220     */
    221     TEST_SETUP
    222         int32_t t;
    223         uspoof_setChecks(sc, USPOOF_ALL_CHECKS, &status);
    224         TEST_ASSERT_SUCCESS(status);
    225         t = uspoof_getChecks(sc, &status);
    226         TEST_ASSERT_EQ(t, USPOOF_ALL_CHECKS);
    227 
    228         uspoof_setChecks(sc, 0, &status);
    229         TEST_ASSERT_SUCCESS(status);
    230         t = uspoof_getChecks(sc, &status);
    231         TEST_ASSERT_EQ(0, t);
    232 
    233         uspoof_setChecks(sc,
    234                         USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE,
    235                         &status);
    236         TEST_ASSERT_SUCCESS(status);
    237         t = uspoof_getChecks(sc, &status);
    238         TEST_ASSERT_SUCCESS(status);
    239         TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE, t);
    240     TEST_TEARDOWN;
    241 
    242     /*
    243     * get & setAllowedChars
    244     */
    245     TEST_SETUP
    246         USet *us;
    247         const USet *uset;
    248 
    249         uset = uspoof_getAllowedChars(sc, &status);
    250         TEST_ASSERT_SUCCESS(status);
    251         TEST_ASSERT(uset_isFrozen(uset));
    252         us = uset_open((UChar32)0x41, (UChar32)0x5A);   /*  [A-Z]  */
    253         uspoof_setAllowedChars(sc, us, &status);
    254         TEST_ASSERT_SUCCESS(status);
    255         TEST_ASSERT_NE(us, uspoof_getAllowedChars(sc, &status));
    256         TEST_ASSERT(uset_equals(us, uspoof_getAllowedChars(sc, &status)));
    257         TEST_ASSERT_SUCCESS(status);
    258         uset_close(us);
    259     TEST_TEARDOWN;
    260 
    261     /*
    262     *  clone()
    263     */
    264 
    265     TEST_SETUP
    266         USpoofChecker *clone1 = NULL;
    267         USpoofChecker *clone2 = NULL;
    268         int32_t        checkResults = 0;
    269 
    270         clone1 = uspoof_clone(sc, &status);
    271         TEST_ASSERT_SUCCESS(status);
    272         TEST_ASSERT_NE(clone1, sc);
    273 
    274         clone2 = uspoof_clone(clone1, &status);
    275         TEST_ASSERT_SUCCESS(status);
    276         TEST_ASSERT_NE(clone2, clone1);
    277 
    278         uspoof_close(clone1);
    279 
    280         /* Verify that the cloned spoof checker is alive */
    281         checkResults = uspoof_check(clone2, goodLatin, -1, NULL, &status);
    282         TEST_ASSERT_SUCCESS(status);
    283         TEST_ASSERT_EQ(0, checkResults);
    284 
    285         checkResults = uspoof_check(clone2, scMixed, -1, NULL, &status);
    286         TEST_ASSERT_SUCCESS(status);
    287         TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
    288         uspoof_close(clone2);
    289     TEST_TEARDOWN;
    290 
    291     /*
    292      *  get & set Checks
    293     */
    294     TEST_SETUP
    295         int32_t   checks;
    296         int32_t   checks2;
    297         int32_t   checkResults;
    298 
    299         checks = uspoof_getChecks(sc, &status);
    300         TEST_ASSERT_SUCCESS(status);
    301         TEST_ASSERT_EQ(USPOOF_ALL_CHECKS, checks);
    302 
    303         checks &= ~(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE);
    304         uspoof_setChecks(sc, checks, &status);
    305         TEST_ASSERT_SUCCESS(status);
    306         checks2 = uspoof_getChecks(sc, &status);
    307         TEST_ASSERT_EQ(checks, checks2);
    308 
    309         /* The checks that were disabled just above are the same ones that the "scMixed" test fails.
    310             So with those tests gone checking that Identifier should now succeed */
    311         checkResults = uspoof_check(sc, scMixed, -1, NULL, &status);
    312         TEST_ASSERT_SUCCESS(status);
    313         TEST_ASSERT_EQ(0, checkResults);
    314     TEST_TEARDOWN;
    315 
    316     /*
    317      *  AllowedLoacles
    318      */
    319 
    320     TEST_SETUP
    321         const char  *allowedLocales;
    322         int32_t  checkResults;
    323 
    324         /* Default allowed locales list should be empty */
    325         allowedLocales = uspoof_getAllowedLocales(sc, &status);
    326         TEST_ASSERT_SUCCESS(status);
    327         TEST_ASSERT(strcmp("", allowedLocales) == 0)
    328 
    329         /* Allow en and ru, which should enable Latin and Cyrillic only to pass */
    330         uspoof_setAllowedLocales(sc, "en, ru_RU", &status);
    331         TEST_ASSERT_SUCCESS(status);
    332         allowedLocales = uspoof_getAllowedLocales(sc, &status);
    333         TEST_ASSERT_SUCCESS(status);
    334         TEST_ASSERT(strstr(allowedLocales, "en") != NULL);
    335         TEST_ASSERT(strstr(allowedLocales, "ru") != NULL);
    336 
    337         /* Limit checks to USPOOF_CHAR_LIMIT.  Some of the test data has whole script confusables also,
    338          * which we don't want to see in this test. */
    339         uspoof_setChecks(sc, USPOOF_CHAR_LIMIT, &status);
    340         TEST_ASSERT_SUCCESS(status);
    341 
    342         checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
    343         TEST_ASSERT_SUCCESS(status);
    344         TEST_ASSERT_EQ(0, checkResults);
    345 
    346         checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
    347         TEST_ASSERT_SUCCESS(status);
    348         TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults);
    349 
    350         checkResults = uspoof_check(sc, goodCyrl, -1, NULL, &status);
    351         TEST_ASSERT_SUCCESS(status);
    352         TEST_ASSERT_EQ(0, checkResults);
    353 
    354         /* Reset with an empty locale list, which should allow all characters to pass */
    355         uspoof_setAllowedLocales(sc, " ", &status);
    356         TEST_ASSERT_SUCCESS(status);
    357 
    358         checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
    359         TEST_ASSERT_SUCCESS(status);
    360         TEST_ASSERT_EQ(0, checkResults);
    361     TEST_TEARDOWN;
    362 
    363     /*
    364      * AllowedChars   set/get the USet of allowed characters.
    365      */
    366     TEST_SETUP
    367         const USet  *set;
    368         USet        *tmpSet;
    369         int32_t      checkResults;
    370 
    371         /* By default, we should see no restriction; the USet should allow all characters. */
    372         set = uspoof_getAllowedChars(sc, &status);
    373         TEST_ASSERT_SUCCESS(status);
    374         tmpSet = uset_open(0, 0x10ffff);
    375         TEST_ASSERT(uset_equals(tmpSet, set));
    376 
    377         /* Setting the allowed chars should enable the check. */
    378         uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CHAR_LIMIT, &status);
    379         TEST_ASSERT_SUCCESS(status);
    380 
    381         /* Remove a character that is in our good Latin test identifier from the allowed chars set. */
    382         uset_remove(tmpSet, goodLatin[1]);
    383         uspoof_setAllowedChars(sc, tmpSet, &status);
    384         TEST_ASSERT_SUCCESS(status);
    385         uset_close(tmpSet);
    386 
    387         /* Latin Identifier should now fail; other non-latin test cases should still be OK */
    388         checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
    389         TEST_ASSERT_SUCCESS(status);
    390         TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults);
    391 
    392         checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
    393         TEST_ASSERT_SUCCESS(status);
    394         TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
    395     TEST_TEARDOWN;
    396 
    397     /*
    398      * check UTF-8
    399      */
    400     TEST_SETUP
    401         char    utf8buf[200];
    402         int32_t checkResults;
    403         int32_t position;
    404 
    405         u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status);
    406         TEST_ASSERT_SUCCESS(status);
    407         position = 666;
    408         checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
    409         TEST_ASSERT_SUCCESS(status);
    410         TEST_ASSERT_EQ(0, checkResults);
    411         TEST_ASSERT_EQ(666, position);
    412 
    413         u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodCyrl, -1, &status);
    414         TEST_ASSERT_SUCCESS(status);
    415         checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
    416         TEST_ASSERT_SUCCESS(status);
    417         TEST_ASSERT_EQ(0, checkResults);
    418 
    419         u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, scMixed, -1, &status);
    420         TEST_ASSERT_SUCCESS(status);
    421         position = 666;
    422         checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
    423         TEST_ASSERT_SUCCESS(status);
    424         TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults);
    425         TEST_ASSERT_EQ(2, position);
    426 
    427     TEST_TEARDOWN;
    428 
    429     /*
    430      * uspoof_areConfusable()
    431      */
    432     TEST_SETUP
    433         int32_t  checkResults;
    434 
    435         checkResults = uspoof_areConfusable(sc, scLatin, -1, scMixed, -1, &status);
    436         TEST_ASSERT_SUCCESS(status);
    437         TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
    438 
    439         checkResults = uspoof_areConfusable(sc, goodGreek, -1, scLatin, -1, &status);
    440         TEST_ASSERT_SUCCESS(status);
    441         TEST_ASSERT_EQ(0, checkResults);
    442 
    443         checkResults = uspoof_areConfusable(sc, lll_Latin_a, -1, lll_Latin_b, -1, &status);
    444         TEST_ASSERT_SUCCESS(status);
    445         TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
    446 
    447     TEST_TEARDOWN;
    448 
    449     /*
    450      * areConfusableUTF8
    451      */
    452     TEST_SETUP
    453         int32_t checkResults;
    454         char s1[200];
    455         char s2[200];
    456 
    457 
    458         u_strToUTF8(s1, sizeof(s1), NULL, scLatin, -1, &status);
    459         u_strToUTF8(s2, sizeof(s2), NULL, scMixed, -1, &status);
    460         TEST_ASSERT_SUCCESS(status);
    461         checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
    462         TEST_ASSERT_SUCCESS(status);
    463         TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
    464 
    465         u_strToUTF8(s1, sizeof(s1), NULL, goodGreek, -1, &status);
    466         u_strToUTF8(s2, sizeof(s2), NULL, scLatin, -1, &status);
    467         TEST_ASSERT_SUCCESS(status);
    468         checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
    469         TEST_ASSERT_SUCCESS(status);
    470         TEST_ASSERT_EQ(0, checkResults);
    471 
    472         u_strToUTF8(s1, sizeof(s1), NULL, lll_Latin_a, -1, &status);
    473         u_strToUTF8(s2, sizeof(s2), NULL, lll_Latin_b, -1, &status);
    474         TEST_ASSERT_SUCCESS(status);
    475         checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
    476         TEST_ASSERT_SUCCESS(status);
    477         TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
    478 
    479     TEST_TEARDOWN;
    480 
    481 
    482   /*
    483    * getSkeleton
    484    */
    485 
    486     TEST_SETUP
    487         UChar dest[100];
    488         int32_t   skelLength;
    489 
    490         skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, dest, sizeof(dest)/sizeof(UChar), &status);
    491         TEST_ASSERT_SUCCESS(status);
    492         TEST_ASSERT_EQ(0, u_strcmp(lll_Skel, dest));
    493         TEST_ASSERT_EQ(u_strlen(lll_Skel), skelLength);
    494 
    495         skelLength = uspoof_getSkeletonUTF8(sc, USPOOF_ANY_CASE, goodLatinUTF8, -1, (char*)dest,
    496                                             sizeof(dest)/sizeof(UChar), &status);
    497         TEST_ASSERT_SUCCESS(status);
    498 
    499         skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, NULL, 0, &status);
    500         TEST_ASSERT_EQ(U_BUFFER_OVERFLOW_ERROR, status);
    501         TEST_ASSERT_EQ(3, skelLength);
    502         status = U_ZERO_ERROR;
    503 
    504     TEST_TEARDOWN;
    505 }
    506 
    507 #endif  /* UCONFIG_NO_REGULAR_EXPRESSIONS */
    508