1 /* 2 ********************************************************************** 3 * Copyright (C) 2010, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 /** 8 * IntlTestSpoof tests for USpoofDetector 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO 14 15 #include "itspoof.h" 16 #include "unicode/uspoof.h" 17 #include "unicode/unistr.h" 18 #include "unicode/regex.h" 19 #include "unicode/normlzr.h" 20 #include "cstring.h" 21 #include <stdlib.h> 22 #include <stdio.h> 23 24 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \ 25 errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}} 26 27 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \ 28 errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};} 29 30 #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \ 31 errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \ 32 __FILE__, __LINE__, #a, (a), #b, (b)); }} 33 34 #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \ 35 errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \ 36 __FILE__, __LINE__, #a, (a), #b, (b)); }} 37 38 /* 39 * TEST_SETUP and TEST_TEARDOWN 40 * macros to handle the boilerplate around setting up test case. 41 * Put arbitrary test code between SETUP and TEARDOWN. 42 * "sc" is the ready-to-go SpoofChecker for use in the tests. 43 */ 44 #define TEST_SETUP { \ 45 UErrorCode status = U_ZERO_ERROR; \ 46 USpoofChecker *sc; \ 47 sc = uspoof_open(&status); \ 48 TEST_ASSERT_SUCCESS(status); \ 49 if (U_SUCCESS(status)){ 50 51 #define TEST_TEARDOWN \ 52 } \ 53 TEST_ASSERT_SUCCESS(status); \ 54 uspoof_close(sc); \ 55 } 56 57 58 59 60 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 61 { 62 if (exec) logln("TestSuite spoof: "); 63 switch (index) { 64 case 0: 65 name = "TestSpoofAPI"; 66 if (exec) { 67 testSpoofAPI(); 68 } 69 break; 70 case 1: 71 name = "TestSkeleton"; 72 if (exec) { 73 testSkeleton(); 74 } 75 break; 76 case 2: 77 name = "TestAreConfusable"; 78 if (exec) { 79 testAreConfusable(); 80 } 81 break; 82 case 3: 83 name = "TestInvisible"; 84 if (exec) { 85 testInvisible(); 86 } 87 break; 88 case 4: 89 name = "testConfData"; 90 if (exec) { 91 testConfData(); 92 } 93 break; 94 default: name=""; break; 95 } 96 } 97 98 void IntlTestSpoof::testSpoofAPI() { 99 100 TEST_SETUP 101 UnicodeString s("xyz"); // Many latin ranges are whole-script confusable with other scripts. 102 // If this test starts failing, consult confusablesWholeScript.txt 103 int32_t position = 666; 104 int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status); 105 TEST_ASSERT_SUCCESS(status); 106 TEST_ASSERT_EQ(0, checkResults); 107 TEST_ASSERT_EQ(666, position); 108 TEST_TEARDOWN; 109 110 TEST_SETUP 111 UnicodeString s1("cxs"); 112 UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs" 113 int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status); 114 TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults); 115 116 TEST_TEARDOWN; 117 118 TEST_SETUP 119 UnicodeString s("I1l0O"); 120 UnicodeString dest; 121 UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status); 122 TEST_ASSERT_SUCCESS(status); 123 TEST_ASSERT(UnicodeString("lllOO") == dest); 124 TEST_ASSERT(&dest == &retStr); 125 TEST_TEARDOWN; 126 } 127 128 129 #define CHECK_SKELETON(type, input, expected) { \ 130 checkSkeleton(sc, type, input, expected, __LINE__); \ 131 } 132 133 134 // testSkeleton. Spot check a number of confusable skeleton substitutions from the 135 // Unicode data file confusables.txt 136 // Test cases chosen for substitutions of various lengths, and 137 // membership in different mapping tables. 138 void IntlTestSpoof::testSkeleton() { 139 const uint32_t ML = 0; 140 const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 141 const uint32_t MA = USPOOF_ANY_CASE; 142 const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; 143 144 TEST_SETUP 145 // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations. 146 CHECK_SKELETON(SL, " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations." 147 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations." 148 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations." 149 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.", 150 151 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 152 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 153 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 154 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.") 155 156 // FC5F ; FE74 0651 ; ML #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to 157 // ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA 158 // This character NFKD normalizes to \u0020 \u064d \u0651, so its confusable mapping 159 // is never used in creating a skeleton. 160 CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651"); 161 162 CHECK_SKELETON(SL, "nochange", "nochange"); 163 CHECK_SKELETON(MA, "love", "love"); 164 CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l 165 CHECK_SKELETON(ML, "OOPS", "OOPS"); 166 CHECK_SKELETON(ML, "00PS", "00PS"); // Digit 0 unchanged in lower case mode. 167 CHECK_SKELETON(MA, "OOPS", "OOPS"); 168 CHECK_SKELETON(MA, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only 169 CHECK_SKELETON(SL, "\\u059c", "\\u0301"); 170 CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D"); 171 CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029"); // "(ll)" 172 CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647"); 173 174 // This mapping exists in the ML and MA tables, does not exist in SL, SA 175 //0C83 ; 0C03 ; 176 CHECK_SKELETON(SL, "\\u0C83", "\\u0C83"); 177 CHECK_SKELETON(SA, "\\u0C83", "\\u0C83"); 178 CHECK_SKELETON(ML, "\\u0C83", "\\u0983"); 179 CHECK_SKELETON(MA, "\\u0C83", "\\u0983"); 180 181 // 0391 ; 0041 ; 182 // This mapping exists only in the MA table. 183 CHECK_SKELETON(MA, "\\u0391", "A"); 184 CHECK_SKELETON(SA, "\\u0391", "\\u0391"); 185 CHECK_SKELETON(ML, "\\u0391", "\\u0391"); 186 CHECK_SKELETON(SL, "\\u0391", "\\u0391"); 187 188 // 13CF ; 0062 ; 189 // This mapping exists in the ML and MA tables 190 CHECK_SKELETON(ML, "\\u13CF", "b"); 191 CHECK_SKELETON(MA, "\\u13CF", "b"); 192 CHECK_SKELETON(SL, "\\u13CF", "\\u13CF"); 193 CHECK_SKELETON(SA, "\\u13CF", "\\u13CF"); 194 195 // 0022 ; 0027 0027 ; 196 // all tables. 197 CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027"); 198 CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027"); 199 CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027"); 200 CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027"); 201 202 TEST_TEARDOWN; 203 } 204 205 206 // 207 // Run a single confusable skeleton transformation test case. 208 // 209 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type, 210 const char *input, const char *expected, int32_t lineNum) { 211 UnicodeString uInput = UnicodeString(input).unescape(); 212 UnicodeString uExpected = UnicodeString(expected).unescape(); 213 214 UErrorCode status = U_ZERO_ERROR; 215 UnicodeString actual; 216 uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status); 217 if (U_FAILURE(status)) { 218 errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum, 219 u_errorName(status)); 220 return; 221 } 222 if (uExpected != actual) { 223 errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.", 224 __FILE__, __LINE__, lineNum); 225 errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("\"\n") + 226 UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\"")); 227 } 228 } 229 230 void IntlTestSpoof::testAreConfusable() { 231 TEST_SETUP 232 UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. " 233 "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "); 234 UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. " 235 "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "); 236 TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status)); 237 TEST_ASSERT_SUCCESS(status); 238 239 TEST_TEARDOWN; 240 } 241 242 void IntlTestSpoof::testInvisible() { 243 TEST_SETUP 244 UnicodeString s = UnicodeString("abcd\\u0301ef").unescape(); 245 int32_t position = -42; 246 TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status)); 247 TEST_ASSERT_SUCCESS(status); 248 TEST_ASSERT(position == -42); 249 250 UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape(); 251 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status)); 252 TEST_ASSERT_SUCCESS(status); 253 TEST_ASSERT_EQ(7, position); 254 255 // Tow acute accents, one from the composed a with acute accent, \u00e1, 256 // and one separate. 257 position = -42; 258 UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape(); 259 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status)); 260 TEST_ASSERT_SUCCESS(status); 261 TEST_ASSERT_EQ(7, position); 262 TEST_TEARDOWN; 263 } 264 265 266 static UnicodeString parseHex(const UnicodeString &in) { 267 // Convert a series of hex numbers in a Unicode String to a string with the 268 // corresponding characters. 269 // The conversion is _really_ annoying. There must be some function to just do it. 270 UnicodeString result; 271 UChar32 cc = 0; 272 for (int32_t i=0; i<in.length(); i++) { 273 UChar c = in.charAt(i); 274 if (c == 0x20) { // Space 275 if (cc > 0) { 276 result.append(cc); 277 cc = 0; 278 } 279 } else if (c>=0x30 && c<=0x39) { 280 cc = (cc<<4) + (c - 0x30); 281 } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) { 282 cc = (cc<<4) + (c & 0x0f)+9; 283 } 284 // else do something with bad input. 285 } 286 if (cc > 0) { 287 result.append(cc); 288 } 289 return result; 290 } 291 292 293 // 294 // Append the hex form of a UChar32 to a UnicodeString. 295 // Used in formatting error messages. 296 // Match the formatting of numbers in confusables.txt 297 // Minimum of 4 digits, no leading zeroes for positions 5 and up. 298 // 299 static void appendHexUChar(UnicodeString &dest, UChar32 c) { 300 UBool doZeroes = FALSE; 301 for (int bitNum=28; bitNum>=0; bitNum-=4) { 302 if (bitNum <= 12) { 303 doZeroes = TRUE; 304 } 305 int hexDigit = (c>>bitNum) & 0x0f; 306 if (hexDigit != 0 || doZeroes) { 307 doZeroes = TRUE; 308 dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41)); 309 } 310 } 311 dest.append((UChar)0x20); 312 } 313 314 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); 315 316 // testConfData - Check each data item from the Unicode confusables.txt file, 317 // verify that it transforms correctly in a skeleton. 318 // 319 void IntlTestSpoof::testConfData() { 320 UErrorCode status = U_ZERO_ERROR; 321 322 const char *testDataDir = IntlTest::getSourceTestData(status); 323 TEST_ASSERT_SUCCESS(status); 324 char buffer[2000]; 325 uprv_strcpy(buffer, testDataDir); 326 uprv_strcat(buffer, "confusables.txt"); 327 328 LocalStdioFilePointer f(fopen(buffer, "rb")); 329 if (f.isNull()) { 330 errln("Skipping test spoof/testConfData. File confusables.txt not accessible."); 331 return; 332 } 333 fseek(f.getAlias(), 0, SEEK_END); 334 int32_t fileSize = ftell(f.getAlias()); 335 LocalArray<char> fileBuf(new char[fileSize]); 336 fseek(f.getAlias(), 0, SEEK_SET); 337 int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias()); 338 TEST_ASSERT_EQ(amt_read, fileSize); 339 TEST_ASSERT(fileSize>0); 340 if (amt_read != fileSize || fileSize <=0) { 341 return; 342 } 343 UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize)); 344 345 LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 346 TEST_ASSERT_SUCCESS(status); 347 348 // Parse lines from the confusables.txt file. Example Line: 349 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... 350 // Three fields. The hex fields can contain more than one character, 351 // and each character may be more than 4 digits (for supplemntals) 352 // This regular expression matches lines and splits the fields into capture groups. 353 RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status); 354 TEST_ASSERT_SUCCESS(status); 355 while (parseLine.find()) { 356 UnicodeString from = parseHex(parseLine.group(1, status)); 357 if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) { 358 // The source character was not NFKD. 359 // Skip this case; the first step in obtaining a skeleton is to NFKD the input, 360 // so the mapping in this line of confusables.txt will never be applied. 361 continue; 362 } 363 364 UnicodeString rawExpected = parseHex(parseLine.group(2, status)); 365 UnicodeString expected; 366 Normalizer::decompose(rawExpected, TRUE, 0, expected, status); 367 TEST_ASSERT_SUCCESS(status); 368 369 int32_t skeletonType = 0; 370 UnicodeString tableType = parseLine.group(3, status); 371 TEST_ASSERT_SUCCESS(status); 372 if (tableType.indexOf("SL") >= 0) { 373 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 374 } else if (tableType.indexOf("SA") >= 0) { 375 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; 376 } else if (tableType.indexOf("ML") >= 0) { 377 skeletonType = 0; 378 } else if (tableType.indexOf("MA") >= 0) { 379 skeletonType = USPOOF_ANY_CASE; 380 } 381 382 UnicodeString actual; 383 uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status); 384 TEST_ASSERT_SUCCESS(status); 385 TEST_ASSERT(actual == expected); 386 if (actual != expected) { 387 errln(parseLine.group(0, status)); 388 UnicodeString line = "Actual: "; 389 int i = 0; 390 while (i < actual.length()) { 391 appendHexUChar(line, actual.char32At(i)); 392 i = actual.moveIndex32(i, 1); 393 } 394 errln(line); 395 } 396 if (U_FAILURE(status)) { 397 break; 398 } 399 } 400 } 401 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS 402 403