1 /* 2 ********************************************************************** 3 * Copyright (C) 2011, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 /** 8 * IntlTestSpoof tests for USpoofDetector 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO 14 15 #include "itspoof.h" 16 #include "unicode/uspoof.h" 17 #include "unicode/unistr.h" 18 #include "unicode/regex.h" 19 #include "unicode/normlzr.h" 20 #include "cstring.h" 21 #include <stdlib.h> 22 #include <stdio.h> 23 24 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \ 25 errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}} 26 27 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \ 28 errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};} 29 30 #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \ 31 errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \ 32 __FILE__, __LINE__, #a, (a), #b, (b)); }} 33 34 #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \ 35 errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \ 36 __FILE__, __LINE__, #a, (a), #b, (b)); }} 37 38 /* 39 * TEST_SETUP and TEST_TEARDOWN 40 * macros to handle the boilerplate around setting up test case. 41 * Put arbitrary test code between SETUP and TEARDOWN. 42 * "sc" is the ready-to-go SpoofChecker for use in the tests. 43 */ 44 #define TEST_SETUP { \ 45 UErrorCode status = U_ZERO_ERROR; \ 46 USpoofChecker *sc; \ 47 sc = uspoof_open(&status); \ 48 TEST_ASSERT_SUCCESS(status); \ 49 if (U_SUCCESS(status)){ 50 51 #define TEST_TEARDOWN \ 52 } \ 53 TEST_ASSERT_SUCCESS(status); \ 54 uspoof_close(sc); \ 55 } 56 57 58 59 60 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 61 { 62 if (exec) logln("TestSuite spoof: "); 63 switch (index) { 64 case 0: 65 name = "TestSpoofAPI"; 66 if (exec) { 67 testSpoofAPI(); 68 } 69 break; 70 case 1: 71 name = "TestSkeleton"; 72 if (exec) { 73 testSkeleton(); 74 } 75 break; 76 case 2: 77 name = "TestAreConfusable"; 78 if (exec) { 79 testAreConfusable(); 80 } 81 break; 82 case 3: 83 name = "TestInvisible"; 84 if (exec) { 85 testInvisible(); 86 } 87 break; 88 case 4: 89 name = "testConfData"; 90 if (exec) { 91 testConfData(); 92 } 93 break; 94 case 5: 95 name = "testBug8654"; 96 if (exec) { 97 testBug8654(); 98 } 99 break; 100 default: name=""; break; 101 } 102 } 103 104 void IntlTestSpoof::testSpoofAPI() { 105 106 TEST_SETUP 107 UnicodeString s("xyz"); // Many latin ranges are whole-script confusable with other scripts. 108 // If this test starts failing, consult confusablesWholeScript.txt 109 int32_t position = 666; 110 int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status); 111 TEST_ASSERT_SUCCESS(status); 112 TEST_ASSERT_EQ(0, checkResults); 113 TEST_ASSERT_EQ(666, position); 114 TEST_TEARDOWN; 115 116 TEST_SETUP 117 UnicodeString s1("cxs"); 118 UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs" 119 int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status); 120 TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults); 121 122 TEST_TEARDOWN; 123 124 TEST_SETUP 125 UnicodeString s("I1l0O"); 126 UnicodeString dest; 127 UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status); 128 TEST_ASSERT_SUCCESS(status); 129 TEST_ASSERT(UnicodeString("lllOO") == dest); 130 TEST_ASSERT(&dest == &retStr); 131 TEST_TEARDOWN; 132 } 133 134 135 #define CHECK_SKELETON(type, input, expected) { \ 136 checkSkeleton(sc, type, input, expected, __LINE__); \ 137 } 138 139 140 // testSkeleton. Spot check a number of confusable skeleton substitutions from the 141 // Unicode data file confusables.txt 142 // Test cases chosen for substitutions of various lengths, and 143 // membership in different mapping tables. 144 void IntlTestSpoof::testSkeleton() { 145 const uint32_t ML = 0; 146 const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 147 const uint32_t MA = USPOOF_ANY_CASE; 148 const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; 149 150 TEST_SETUP 151 // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations. 152 CHECK_SKELETON(SL, " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations." 153 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations." 154 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations." 155 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.", 156 157 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 158 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 159 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 160 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.") 161 162 CHECK_SKELETON(SL, "nochange", "nochange"); 163 CHECK_SKELETON(MA, "love", "love"); 164 CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l 165 CHECK_SKELETON(ML, "OOPS", "OOPS"); 166 CHECK_SKELETON(ML, "00PS", "00PS"); // Digit 0 unchanged in lower case mode. 167 CHECK_SKELETON(MA, "OOPS", "OOPS"); 168 CHECK_SKELETON(MA, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only 169 CHECK_SKELETON(SL, "\\u059c", "\\u0301"); 170 CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D"); 171 CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029"); // "(ll)" 172 CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647"); 173 174 // This mapping exists in the ML and MA tables, does not exist in SL, SA 175 //0C83 ; 0C03 ; 176 CHECK_SKELETON(SL, "\\u0C83", "\\u0C83"); 177 CHECK_SKELETON(SA, "\\u0C83", "\\u0C83"); 178 CHECK_SKELETON(ML, "\\u0C83", "\\u0983"); 179 CHECK_SKELETON(MA, "\\u0C83", "\\u0983"); 180 181 // 0391 ; 0041 ; 182 // This mapping exists only in the MA table. 183 CHECK_SKELETON(MA, "\\u0391", "A"); 184 CHECK_SKELETON(SA, "\\u0391", "\\u0391"); 185 CHECK_SKELETON(ML, "\\u0391", "\\u0391"); 186 CHECK_SKELETON(SL, "\\u0391", "\\u0391"); 187 188 // 13CF ; 0062 ; 189 // This mapping exists in the ML and MA tables 190 CHECK_SKELETON(ML, "\\u13CF", "b"); 191 CHECK_SKELETON(MA, "\\u13CF", "b"); 192 CHECK_SKELETON(SL, "\\u13CF", "\\u13CF"); 193 CHECK_SKELETON(SA, "\\u13CF", "\\u13CF"); 194 195 // 0022 ; 0027 0027 ; 196 // all tables. 197 CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027"); 198 CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027"); 199 CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027"); 200 CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027"); 201 202 // 017F ; 0066 ; 203 // This mapping exists in the SA and MA tables 204 CHECK_SKELETON(MA, "\\u017F", "f"); 205 CHECK_SKELETON(SA, "\\u017F", "f"); 206 207 TEST_TEARDOWN; 208 } 209 210 211 // 212 // Run a single confusable skeleton transformation test case. 213 // 214 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type, 215 const char *input, const char *expected, int32_t lineNum) { 216 UnicodeString uInput = UnicodeString(input).unescape(); 217 UnicodeString uExpected = UnicodeString(expected).unescape(); 218 219 UErrorCode status = U_ZERO_ERROR; 220 UnicodeString actual; 221 uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status); 222 if (U_FAILURE(status)) { 223 errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum, 224 u_errorName(status)); 225 return; 226 } 227 if (uExpected != actual) { 228 errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.", 229 __FILE__, __LINE__, lineNum); 230 errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("\"\n") + 231 UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\"")); 232 } 233 } 234 235 void IntlTestSpoof::testAreConfusable() { 236 TEST_SETUP 237 UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. " 238 "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "); 239 UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. " 240 "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "); 241 TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status)); 242 TEST_ASSERT_SUCCESS(status); 243 244 TEST_TEARDOWN; 245 } 246 247 void IntlTestSpoof::testInvisible() { 248 TEST_SETUP 249 UnicodeString s = UnicodeString("abcd\\u0301ef").unescape(); 250 int32_t position = -42; 251 TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status)); 252 TEST_ASSERT_SUCCESS(status); 253 TEST_ASSERT(position == -42); 254 255 UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape(); 256 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status)); 257 TEST_ASSERT_SUCCESS(status); 258 TEST_ASSERT_EQ(7, position); 259 260 // Two acute accents, one from the composed a with acute accent, \u00e1, 261 // and one separate. 262 position = -42; 263 UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape(); 264 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status)); 265 TEST_ASSERT_SUCCESS(status); 266 TEST_ASSERT_EQ(7, position); 267 TEST_TEARDOWN; 268 } 269 270 void IntlTestSpoof::testBug8654() { 271 TEST_SETUP 272 UnicodeString s = UnicodeString("B\\u00c1\\u0301").unescape(); 273 int32_t position = -42; 274 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s, &position, &status) & USPOOF_INVISIBLE ); 275 TEST_ASSERT_SUCCESS(status); 276 TEST_ASSERT_EQ(3, position); 277 TEST_TEARDOWN; 278 } 279 280 static UnicodeString parseHex(const UnicodeString &in) { 281 // Convert a series of hex numbers in a Unicode String to a string with the 282 // corresponding characters. 283 // The conversion is _really_ annoying. There must be some function to just do it. 284 UnicodeString result; 285 UChar32 cc = 0; 286 for (int32_t i=0; i<in.length(); i++) { 287 UChar c = in.charAt(i); 288 if (c == 0x20) { // Space 289 if (cc > 0) { 290 result.append(cc); 291 cc = 0; 292 } 293 } else if (c>=0x30 && c<=0x39) { 294 cc = (cc<<4) + (c - 0x30); 295 } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) { 296 cc = (cc<<4) + (c & 0x0f)+9; 297 } 298 // else do something with bad input. 299 } 300 if (cc > 0) { 301 result.append(cc); 302 } 303 return result; 304 } 305 306 307 // 308 // Append the hex form of a UChar32 to a UnicodeString. 309 // Used in formatting error messages. 310 // Match the formatting of numbers in confusables.txt 311 // Minimum of 4 digits, no leading zeroes for positions 5 and up. 312 // 313 static void appendHexUChar(UnicodeString &dest, UChar32 c) { 314 UBool doZeroes = FALSE; 315 for (int bitNum=28; bitNum>=0; bitNum-=4) { 316 if (bitNum <= 12) { 317 doZeroes = TRUE; 318 } 319 int hexDigit = (c>>bitNum) & 0x0f; 320 if (hexDigit != 0 || doZeroes) { 321 doZeroes = TRUE; 322 dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41)); 323 } 324 } 325 dest.append((UChar)0x20); 326 } 327 328 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); 329 330 // testConfData - Check each data item from the Unicode confusables.txt file, 331 // verify that it transforms correctly in a skeleton. 332 // 333 void IntlTestSpoof::testConfData() { 334 UErrorCode status = U_ZERO_ERROR; 335 336 const char *testDataDir = IntlTest::getSourceTestData(status); 337 TEST_ASSERT_SUCCESS(status); 338 char buffer[2000]; 339 uprv_strcpy(buffer, testDataDir); 340 uprv_strcat(buffer, "confusables.txt"); 341 342 LocalStdioFilePointer f(fopen(buffer, "rb")); 343 if (f.isNull()) { 344 errln("Skipping test spoof/testConfData. File confusables.txt not accessible."); 345 return; 346 } 347 fseek(f.getAlias(), 0, SEEK_END); 348 int32_t fileSize = ftell(f.getAlias()); 349 LocalArray<char> fileBuf(new char[fileSize]); 350 fseek(f.getAlias(), 0, SEEK_SET); 351 int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias()); 352 TEST_ASSERT_EQ(amt_read, fileSize); 353 TEST_ASSERT(fileSize>0); 354 if (amt_read != fileSize || fileSize <=0) { 355 return; 356 } 357 UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize)); 358 359 LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 360 TEST_ASSERT_SUCCESS(status); 361 362 // Parse lines from the confusables.txt file. Example Line: 363 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... 364 // Three fields. The hex fields can contain more than one character, 365 // and each character may be more than 4 digits (for supplemntals) 366 // This regular expression matches lines and splits the fields into capture groups. 367 RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status); 368 TEST_ASSERT_SUCCESS(status); 369 while (parseLine.find()) { 370 UnicodeString from = parseHex(parseLine.group(1, status)); 371 if (!Normalizer::isNormalized(from, UNORM_NFD, status)) { 372 // The source character was not NFD. 373 // Skip this case; the first step in obtaining a skeleton is to NFD the input, 374 // so the mapping in this line of confusables.txt will never be applied. 375 continue; 376 } 377 378 UnicodeString rawExpected = parseHex(parseLine.group(2, status)); 379 UnicodeString expected; 380 Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status); 381 TEST_ASSERT_SUCCESS(status); 382 383 int32_t skeletonType = 0; 384 UnicodeString tableType = parseLine.group(3, status); 385 TEST_ASSERT_SUCCESS(status); 386 if (tableType.indexOf("SL") >= 0) { 387 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 388 } else if (tableType.indexOf("SA") >= 0) { 389 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; 390 } else if (tableType.indexOf("ML") >= 0) { 391 skeletonType = 0; 392 } else if (tableType.indexOf("MA") >= 0) { 393 skeletonType = USPOOF_ANY_CASE; 394 } 395 396 UnicodeString actual; 397 uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status); 398 TEST_ASSERT_SUCCESS(status); 399 TEST_ASSERT(actual == expected); 400 if (actual != expected) { 401 errln(parseLine.group(0, status)); 402 UnicodeString line = "Actual: "; 403 int i = 0; 404 while (i < actual.length()) { 405 appendHexUChar(line, actual.char32At(i)); 406 i = actual.moveIndex32(i, 1); 407 } 408 errln(line); 409 } 410 if (U_FAILURE(status)) { 411 break; 412 } 413 } 414 } 415 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS 416 417