1 /* 2 ********************************************************************** 3 * Copyright (C) 2009, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 /** 8 * IntlTestSpoof tests for USpoofDetector 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO 14 15 #include "itspoof.h" 16 #include "unicode/uspoof.h" 17 #include "unicode/unistr.h" 18 #include "unicode/regex.h" 19 #include "unicode/normlzr.h" 20 #include "cstring.h" 21 #include <stdlib.h> 22 #include <stdio.h> 23 24 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \ 25 errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}} 26 27 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \ 28 errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};} 29 30 #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \ 31 errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \ 32 __FILE__, __LINE__, #a, (a), #b, (b)); }} 33 34 #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \ 35 errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \ 36 __FILE__, __LINE__, #a, (a), #b, (b)); }} 37 38 /* 39 * TEST_SETUP and TEST_TEARDOWN 40 * macros to handle the boilerplate around setting up test case. 41 * Put arbitrary test code between SETUP and TEARDOWN. 42 * "sc" is the ready-to-go SpoofChecker for use in the tests. 43 */ 44 #define TEST_SETUP { \ 45 UErrorCode status = U_ZERO_ERROR; \ 46 USpoofChecker *sc; \ 47 sc = uspoof_open(&status); \ 48 TEST_ASSERT_SUCCESS(status); \ 49 if (U_SUCCESS(status)){ 50 51 #define TEST_TEARDOWN \ 52 } \ 53 TEST_ASSERT_SUCCESS(status); \ 54 uspoof_close(sc); \ 55 } 56 57 58 59 60 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 61 { 62 if (exec) logln("TestSuite spoof: "); 63 switch (index) { 64 case 0: 65 name = "TestSpoofAPI"; 66 if (exec) { 67 testSpoofAPI(); 68 } 69 break; 70 case 1: 71 name = "TestSkeleton"; 72 if (exec) { 73 testSkeleton(); 74 } 75 break; 76 case 2: 77 name = "TestAreConfusable"; 78 if (exec) { 79 testAreConfusable(); 80 } 81 break; 82 case 3: 83 name = "TestInvisible"; 84 if (exec) { 85 testInvisible(); 86 } 87 break; 88 case 4: 89 name = "testConfData"; 90 if (exec) { 91 testConfData(); 92 } 93 break; 94 default: name=""; break; 95 } 96 } 97 98 void IntlTestSpoof::testSpoofAPI() { 99 100 TEST_SETUP 101 UnicodeString s("uvw"); 102 int32_t position = 666; 103 int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status); 104 TEST_ASSERT_SUCCESS(status); 105 TEST_ASSERT_EQ(0, checkResults); 106 TEST_ASSERT_EQ(666, position); 107 TEST_TEARDOWN; 108 109 TEST_SETUP 110 UnicodeString s1("cxs"); 111 UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs" 112 int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status); 113 TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults); 114 115 TEST_TEARDOWN; 116 117 TEST_SETUP 118 UnicodeString s("I1l0O"); 119 UnicodeString dest; 120 UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status); 121 TEST_ASSERT_SUCCESS(status); 122 TEST_ASSERT(UnicodeString("11100") == dest); 123 TEST_ASSERT(&dest == &retStr); 124 TEST_TEARDOWN; 125 } 126 127 128 #define CHECK_SKELETON(type, input, expected) { \ 129 checkSkeleton(sc, type, input, expected, __LINE__); \ 130 } 131 132 133 // testSkeleton. Spot check a number of confusable skeleton substitutions from the 134 // Unicode data file confusables.txt 135 // Test cases chosen for substitutions of various lengths, and 136 // membership in different mapping tables. 137 void IntlTestSpoof::testSkeleton() { 138 const uint32_t ML = 0; 139 const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 140 const uint32_t MA = USPOOF_ANY_CASE; 141 const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; 142 143 TEST_SETUP 144 // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations. 145 CHECK_SKELETON(SL, " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations." 146 " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations." 147 " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations." 148 " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations.", 149 150 " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations." 151 " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations." 152 " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations." 153 " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations.") 154 155 // FC5F ; FE74 0651 ; ML #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to 156 // ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA 157 // This character NFKD normalizes to \u0020 \u064d \u0651, so its confusable mapping 158 // is never used in creating a skeleton. 159 CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651"); 160 161 CHECK_SKELETON(SL, "nochange", "nochange"); 162 CHECK_SKELETON(MA, "love", "1ove"); // lower case l to digit 1 163 CHECK_SKELETON(ML, "OOPS", "OOPS"); 164 CHECK_SKELETON(MA, "OOPS", "00PS"); // Letter O to digit 0 in any case mode only 165 CHECK_SKELETON(SL, "\\u059c", "\\u0301"); 166 CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D"); 167 CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u0031\\u0031\\u0029"); 168 CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647"); 169 170 // This mapping exists in the ML and MA tables, does not exist in SL, SA 171 //0C83 ; 0C03 ; 172 CHECK_SKELETON(SL, "\\u0C83", "\\u0C83"); 173 CHECK_SKELETON(SA, "\\u0C83", "\\u0C83"); 174 CHECK_SKELETON(ML, "\\u0C83", "\\u0C03"); 175 CHECK_SKELETON(MA, "\\u0C83", "\\u0C03"); 176 177 // 0391 ; 0041 ; 178 // This mapping exists only in the MA table. 179 CHECK_SKELETON(MA, "\\u0391", "A"); 180 CHECK_SKELETON(SA, "\\u0391", "\\u0391"); 181 CHECK_SKELETON(ML, "\\u0391", "\\u0391"); 182 CHECK_SKELETON(SL, "\\u0391", "\\u0391"); 183 184 // 13CF ; 0062 ; 185 // This mapping exists in the ML and MA tables 186 CHECK_SKELETON(ML, "\\u13CF", "b"); 187 CHECK_SKELETON(MA, "\\u13CF", "b"); 188 CHECK_SKELETON(SL, "\\u13CF", "\\u13CF"); 189 CHECK_SKELETON(SA, "\\u13CF", "\\u13CF"); 190 191 // 0022 ; 02B9 02B9 ; 192 // all tables. 193 CHECK_SKELETON(SL, "\\u0022", "\\u02B9\\u02B9"); 194 CHECK_SKELETON(SA, "\\u0022", "\\u02B9\\u02B9"); 195 CHECK_SKELETON(ML, "\\u0022", "\\u02B9\\u02B9"); 196 CHECK_SKELETON(MA, "\\u0022", "\\u02B9\\u02B9"); 197 198 TEST_TEARDOWN; 199 } 200 201 202 // 203 // Run a single confusable skeleton transformation test case. 204 // 205 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type, 206 const char *input, const char *expected, int32_t lineNum) { 207 UnicodeString uInput = UnicodeString(input).unescape(); 208 UnicodeString uExpected = UnicodeString(expected).unescape(); 209 210 UErrorCode status = U_ZERO_ERROR; 211 UnicodeString actual; 212 uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status); 213 if (U_FAILURE(status)) { 214 errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum, 215 u_errorName(status)); 216 return; 217 } 218 if (uExpected != actual) { 219 errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.", 220 __FILE__, __LINE__, lineNum); 221 errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("\"\n") + 222 UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\"")); 223 } 224 } 225 226 void IntlTestSpoof::testAreConfusable() { 227 TEST_SETUP 228 UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. " 229 "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "); 230 UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. " 231 "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "); 232 TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status)); 233 TEST_ASSERT_SUCCESS(status); 234 235 TEST_TEARDOWN; 236 } 237 238 void IntlTestSpoof::testInvisible() { 239 TEST_SETUP 240 UnicodeString s = UnicodeString("abcd\\u0301ef").unescape(); 241 int32_t position = -42; 242 TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status)); 243 TEST_ASSERT_SUCCESS(status); 244 TEST_ASSERT(position == -42); 245 246 UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape(); 247 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status)); 248 TEST_ASSERT_SUCCESS(status); 249 TEST_ASSERT_EQ(7, position); 250 251 // Tow acute accents, one from the composed a with acute accent, \u00e1, 252 // and one separate. 253 position = -42; 254 UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape(); 255 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status)); 256 TEST_ASSERT_SUCCESS(status); 257 TEST_ASSERT_EQ(7, position); 258 TEST_TEARDOWN; 259 } 260 261 262 static UnicodeString parseHex(const UnicodeString &in) { 263 // Convert a series of hex numbers in a Unicode String to a string with the 264 // corresponding characters. 265 // The conversion is _really_ annoying. There must be some function to just do it. 266 UnicodeString result; 267 UChar32 cc = 0; 268 for (int32_t i=0; i<in.length(); i++) { 269 UChar c = in.charAt(i); 270 if (c == 0x20) { // Space 271 if (cc > 0) { 272 result.append(cc); 273 cc = 0; 274 } 275 } else if (c>=0x30 && c<=0x39) { 276 cc = (cc<<4) + (c - 0x30); 277 } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) { 278 cc = (cc<<4) + (c & 0x0f)+9; 279 } 280 // else do something with bad input. 281 } 282 if (cc > 0) { 283 result.append(cc); 284 } 285 return result; 286 } 287 288 289 // 290 // Append the hex form of a UChar32 to a UnicodeString. 291 // Used in formatting error messages. 292 // Match the formatting of numbers in confusables.txt 293 // Minimum of 4 digits, no leading zeroes for positions 5 and up. 294 // 295 static void appendHexUChar(UnicodeString &dest, UChar32 c) { 296 UBool doZeroes = FALSE; 297 for (int bitNum=28; bitNum>=0; bitNum-=4) { 298 if (bitNum <= 12) { 299 doZeroes = TRUE; 300 } 301 int hexDigit = (c>>bitNum) & 0x0f; 302 if (hexDigit != 0 || doZeroes) { 303 doZeroes = TRUE; 304 dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41)); 305 } 306 } 307 dest.append((UChar)0x20); 308 } 309 310 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); 311 312 // testConfData - Check each data item from the Unicode confusables.txt file, 313 // verify that it transforms correctly in a skeleton. 314 // 315 void IntlTestSpoof::testConfData() { 316 UErrorCode status = U_ZERO_ERROR; 317 318 const char *testDataDir = IntlTest::getSourceTestData(status); 319 TEST_ASSERT_SUCCESS(status); 320 char buffer[2000]; 321 uprv_strcpy(buffer, testDataDir); 322 uprv_strcat(buffer, "confusables.txt"); 323 324 LocalStdioFilePointer f(fopen(buffer, "rb")); 325 if (f.isNull()) { 326 errln("Skipping test spoof/testConfData. File confusables.txt not accessible."); 327 return; 328 } 329 fseek(f.getAlias(), 0, SEEK_END); 330 int32_t fileSize = ftell(f.getAlias()); 331 LocalArray<char> fileBuf(new char[fileSize]); 332 fseek(f.getAlias(), 0, SEEK_SET); 333 int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias()); 334 TEST_ASSERT_EQ(amt_read, fileSize); 335 TEST_ASSERT(fileSize>0); 336 if (amt_read != fileSize || fileSize <=0) { 337 return; 338 } 339 UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize)); 340 341 LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 342 TEST_ASSERT_SUCCESS(status); 343 344 // Parse lines from the confusables.txt file. Example Line: 345 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... 346 // Three fields. The hex fields can contain more than one character, 347 // and each character may be more than 4 digits (for supplemntals) 348 // This regular expression matches lines and splits the fields into capture groups. 349 RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status); 350 TEST_ASSERT_SUCCESS(status); 351 while (parseLine.find()) { 352 UnicodeString from = parseHex(parseLine.group(1, status)); 353 if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) { 354 // The source character was not NFKD. 355 // Skip this case; the first step in obtaining a skeleton is to NFKD the input, 356 // so the mapping in this line of confusables.txt will never be applied. 357 continue; 358 } 359 360 UnicodeString rawExpected = parseHex(parseLine.group(2, status)); 361 UnicodeString expected; 362 Normalizer::decompose(rawExpected, TRUE, 0, expected, status); 363 TEST_ASSERT_SUCCESS(status); 364 365 int32_t skeletonType = 0; 366 UnicodeString tableType = parseLine.group(3, status); 367 TEST_ASSERT_SUCCESS(status); 368 if (tableType.indexOf("SL") >= 0) { 369 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 370 } else if (tableType.indexOf("SA") >= 0) { 371 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; 372 } else if (tableType.indexOf("ML") >= 0) { 373 skeletonType = 0; 374 } else if (tableType.indexOf("MA") >= 0) { 375 skeletonType = USPOOF_ANY_CASE; 376 } 377 378 UnicodeString actual; 379 uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status); 380 TEST_ASSERT_SUCCESS(status); 381 TEST_ASSERT(actual == expected); 382 if (actual != expected) { 383 errln(parseLine.group(0, status)); 384 UnicodeString line = "Actual: "; 385 int i = 0; 386 while (i < actual.length()) { 387 appendHexUChar(line, actual.char32At(i)); 388 i = actual.moveIndex32(i, 1); 389 } 390 errln(line); 391 } 392 if (U_FAILURE(status)) { 393 break; 394 } 395 } 396 } 397 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS 398 399