1 /* 2 ********************************************************************** 3 * Copyright (C) 2011-2013, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 /** 8 * IntlTestSpoof tests for USpoofDetector 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO 14 15 #include "itspoof.h" 16 17 #include "unicode/normlzr.h" 18 #include "unicode/regex.h" 19 #include "unicode/unistr.h" 20 #include "unicode/uscript.h" 21 #include "unicode/uspoof.h" 22 23 #include "cstring.h" 24 #include "identifier_info.h" 25 #include "scriptset.h" 26 #include "uhash.h" 27 28 #include <stdlib.h> 29 #include <stdio.h> 30 31 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \ 32 errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}} 33 34 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \ 35 errln("Test Failure at file %s, line %d: \"%s\" is false.", __FILE__, __LINE__, #expr);};} 36 37 #define TEST_ASSERT_MSG(expr, msg) {if ((expr)==FALSE) { \ 38 dataerrln("Test Failure at file %s, line %d, %s: \"%s\" is false.", __FILE__, __LINE__, msg, #expr);};} 39 40 #define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \ 41 errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d)", \ 42 __FILE__, __LINE__, #a, (a), #b, (b)); }} 43 44 #define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \ 45 errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d)", \ 46 __FILE__, __LINE__, #a, (a), #b, (b)); }} 47 48 #define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0]))) 49 50 /* 51 * TEST_SETUP and TEST_TEARDOWN 52 * macros to handle the boilerplate around setting up test case. 53 * Put arbitrary test code between SETUP and TEARDOWN. 54 * "sc" is the ready-to-go SpoofChecker for use in the tests. 55 */ 56 #define TEST_SETUP { \ 57 UErrorCode status = U_ZERO_ERROR; \ 58 USpoofChecker *sc; \ 59 sc = uspoof_open(&status); \ 60 TEST_ASSERT_SUCCESS(status); \ 61 if (U_SUCCESS(status)){ 62 63 #define TEST_TEARDOWN \ 64 } \ 65 TEST_ASSERT_SUCCESS(status); \ 66 uspoof_close(sc); \ 67 } 68 69 70 71 72 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 73 { 74 if (exec) logln("TestSuite spoof: "); 75 switch (index) { 76 case 0: 77 name = "TestSpoofAPI"; 78 if (exec) { 79 testSpoofAPI(); 80 } 81 break; 82 case 1: 83 name = "TestSkeleton"; 84 if (exec) { 85 testSkeleton(); 86 } 87 break; 88 case 2: 89 name = "TestAreConfusable"; 90 if (exec) { 91 testAreConfusable(); 92 } 93 break; 94 case 3: 95 name = "TestInvisible"; 96 if (exec) { 97 testInvisible(); 98 } 99 break; 100 case 4: 101 name = "testConfData"; 102 if (exec) { 103 testConfData(); 104 } 105 break; 106 case 5: 107 name = "testBug8654"; 108 if (exec) { 109 testBug8654(); 110 } 111 break; 112 case 6: 113 name = "testIdentifierInfo"; 114 if (exec) { 115 testIdentifierInfo(); 116 } 117 break; 118 case 7: 119 name = "testScriptSet"; 120 if (exec) { 121 testScriptSet(); 122 } 123 break; 124 case 8: 125 name = "testRestrictionLevel"; 126 if (exec) { 127 testRestrictionLevel(); 128 } 129 break; 130 case 9: 131 name = "testMixedNumbers"; 132 if (exec) { 133 testMixedNumbers(); 134 } 135 break; 136 137 138 default: name=""; break; 139 } 140 } 141 142 void IntlTestSpoof::testSpoofAPI() { 143 144 TEST_SETUP 145 UnicodeString s("xyz"); // Many latin ranges are whole-script confusable with other scripts. 146 // If this test starts failing, consult confusablesWholeScript.txt 147 int32_t position = 666; 148 int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status); 149 TEST_ASSERT_SUCCESS(status); 150 TEST_ASSERT_EQ(0, checkResults); 151 TEST_ASSERT_EQ(0, position); 152 TEST_TEARDOWN; 153 154 TEST_SETUP 155 UnicodeString s1("cxs"); 156 UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs" 157 int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status); 158 TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults); 159 160 TEST_TEARDOWN; 161 162 TEST_SETUP 163 UnicodeString s("I1l0O"); 164 UnicodeString dest; 165 UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status); 166 TEST_ASSERT_SUCCESS(status); 167 TEST_ASSERT(UnicodeString("lllOO") == dest); 168 TEST_ASSERT(&dest == &retStr); 169 TEST_TEARDOWN; 170 } 171 172 173 #define CHECK_SKELETON(type, input, expected) { \ 174 checkSkeleton(sc, type, input, expected, __LINE__); \ 175 } 176 177 178 // testSkeleton. Spot check a number of confusable skeleton substitutions from the 179 // Unicode data file confusables.txt 180 // Test cases chosen for substitutions of various lengths, and 181 // membership in different mapping tables. 182 void IntlTestSpoof::testSkeleton() { 183 const uint32_t ML = 0; 184 const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 185 const uint32_t MA = USPOOF_ANY_CASE; 186 const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; 187 188 TEST_SETUP 189 // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations. 190 CHECK_SKELETON(SL, " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations." 191 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations." 192 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations." 193 " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.", 194 195 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 196 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 197 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 198 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.") 199 200 CHECK_SKELETON(SL, "nochange", "nochange"); 201 CHECK_SKELETON(MA, "love", "love"); 202 CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l 203 CHECK_SKELETON(ML, "OOPS", "OOPS"); 204 CHECK_SKELETON(ML, "00PS", "00PS"); // Digit 0 unchanged in lower case mode. 205 CHECK_SKELETON(MA, "OOPS", "OOPS"); 206 CHECK_SKELETON(MA, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only 207 CHECK_SKELETON(SL, "\\u059c", "\\u0301"); 208 CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D"); 209 CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029"); // "(ll)" 210 CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647"); 211 212 // This mapping exists in the ML and MA tables, does not exist in SL, SA 213 //0C83 ; 0C03 ; 214 CHECK_SKELETON(SL, "\\u0C83", "\\u0C83"); 215 CHECK_SKELETON(SA, "\\u0C83", "\\u0C83"); 216 CHECK_SKELETON(ML, "\\u0C83", "\\u0983"); 217 CHECK_SKELETON(MA, "\\u0C83", "\\u0983"); 218 219 // 0391 ; 0041 ; 220 // This mapping exists only in the MA table. 221 CHECK_SKELETON(MA, "\\u0391", "A"); 222 CHECK_SKELETON(SA, "\\u0391", "\\u0391"); 223 CHECK_SKELETON(ML, "\\u0391", "\\u0391"); 224 CHECK_SKELETON(SL, "\\u0391", "\\u0391"); 225 226 // 13CF ; 0062 ; 227 // This mapping exists in the ML and MA tables 228 CHECK_SKELETON(ML, "\\u13CF", "b"); 229 CHECK_SKELETON(MA, "\\u13CF", "b"); 230 CHECK_SKELETON(SL, "\\u13CF", "\\u13CF"); 231 CHECK_SKELETON(SA, "\\u13CF", "\\u13CF"); 232 233 // 0022 ; 0027 0027 ; 234 // all tables. 235 CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027"); 236 CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027"); 237 CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027"); 238 CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027"); 239 240 // 017F ; 0066 ; 241 // This mapping exists in the SA and MA tables 242 CHECK_SKELETON(MA, "\\u017F", "f"); 243 CHECK_SKELETON(SA, "\\u017F", "f"); 244 245 TEST_TEARDOWN; 246 } 247 248 249 // 250 // Run a single confusable skeleton transformation test case. 251 // 252 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type, 253 const char *input, const char *expected, int32_t lineNum) { 254 UnicodeString uInput = UnicodeString(input).unescape(); 255 UnicodeString uExpected = UnicodeString(expected).unescape(); 256 257 UErrorCode status = U_ZERO_ERROR; 258 UnicodeString actual; 259 uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status); 260 if (U_FAILURE(status)) { 261 errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum, 262 u_errorName(status)); 263 return; 264 } 265 if (uExpected != actual) { 266 errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.", 267 __FILE__, __LINE__, lineNum); 268 errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("\"\n") + 269 UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\"")); 270 } 271 } 272 273 void IntlTestSpoof::testAreConfusable() { 274 TEST_SETUP 275 UnicodeString s1("A long string that will overflow stack buffers. A long string that will overflow stack buffers. " 276 "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "); 277 UnicodeString s2("A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. " 278 "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "); 279 TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status)); 280 TEST_ASSERT_SUCCESS(status); 281 282 TEST_TEARDOWN; 283 } 284 285 void IntlTestSpoof::testInvisible() { 286 TEST_SETUP 287 UnicodeString s = UnicodeString("abcd\\u0301ef").unescape(); 288 int32_t position = -42; 289 TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status)); 290 TEST_ASSERT_SUCCESS(status); 291 TEST_ASSERT(0 == position); 292 293 UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape(); 294 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status)); 295 TEST_ASSERT_SUCCESS(status); 296 TEST_ASSERT_EQ(0, position); 297 298 // Two acute accents, one from the composed a with acute accent, \u00e1, 299 // and one separate. 300 position = -42; 301 UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape(); 302 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status)); 303 TEST_ASSERT_SUCCESS(status); 304 TEST_ASSERT_EQ(0, position); 305 TEST_TEARDOWN; 306 } 307 308 void IntlTestSpoof::testBug8654() { 309 TEST_SETUP 310 UnicodeString s = UnicodeString("B\\u00c1\\u0301").unescape(); 311 int32_t position = -42; 312 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s, &position, &status) & USPOOF_INVISIBLE ); 313 TEST_ASSERT_SUCCESS(status); 314 TEST_ASSERT_EQ(0, position); 315 TEST_TEARDOWN; 316 } 317 318 static UnicodeString parseHex(const UnicodeString &in) { 319 // Convert a series of hex numbers in a Unicode String to a string with the 320 // corresponding characters. 321 // The conversion is _really_ annoying. There must be some function to just do it. 322 UnicodeString result; 323 UChar32 cc = 0; 324 for (int32_t i=0; i<in.length(); i++) { 325 UChar c = in.charAt(i); 326 if (c == 0x20) { // Space 327 if (cc > 0) { 328 result.append(cc); 329 cc = 0; 330 } 331 } else if (c>=0x30 && c<=0x39) { 332 cc = (cc<<4) + (c - 0x30); 333 } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) { 334 cc = (cc<<4) + (c & 0x0f)+9; 335 } 336 // else do something with bad input. 337 } 338 if (cc > 0) { 339 result.append(cc); 340 } 341 return result; 342 } 343 344 345 // 346 // Append the hex form of a UChar32 to a UnicodeString. 347 // Used in formatting error messages. 348 // Match the formatting of numbers in confusables.txt 349 // Minimum of 4 digits, no leading zeroes for positions 5 and up. 350 // 351 static void appendHexUChar(UnicodeString &dest, UChar32 c) { 352 UBool doZeroes = FALSE; 353 for (int bitNum=28; bitNum>=0; bitNum-=4) { 354 if (bitNum <= 12) { 355 doZeroes = TRUE; 356 } 357 int hexDigit = (c>>bitNum) & 0x0f; 358 if (hexDigit != 0 || doZeroes) { 359 doZeroes = TRUE; 360 dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x41)); 361 } 362 } 363 dest.append((UChar)0x20); 364 } 365 366 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); 367 368 // testConfData - Check each data item from the Unicode confusables.txt file, 369 // verify that it transforms correctly in a skeleton. 370 // 371 void IntlTestSpoof::testConfData() { 372 UErrorCode status = U_ZERO_ERROR; 373 374 const char *testDataDir = IntlTest::getSourceTestData(status); 375 TEST_ASSERT_SUCCESS(status); 376 char buffer[2000]; 377 uprv_strcpy(buffer, testDataDir); 378 uprv_strcat(buffer, "confusables.txt"); 379 380 LocalStdioFilePointer f(fopen(buffer, "rb")); 381 if (f.isNull()) { 382 errln("Skipping test spoof/testConfData. File confusables.txt not accessible."); 383 return; 384 } 385 fseek(f.getAlias(), 0, SEEK_END); 386 int32_t fileSize = ftell(f.getAlias()); 387 LocalArray<char> fileBuf(new char[fileSize]); 388 fseek(f.getAlias(), 0, SEEK_SET); 389 int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias()); 390 TEST_ASSERT_EQ(amt_read, fileSize); 391 TEST_ASSERT(fileSize>0); 392 if (amt_read != fileSize || fileSize <=0) { 393 return; 394 } 395 UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize)); 396 397 LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 398 TEST_ASSERT_SUCCESS(status); 399 400 // Parse lines from the confusables.txt file. Example Line: 401 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... 402 // Three fields. The hex fields can contain more than one character, 403 // and each character may be more than 4 digits (for supplemntals) 404 // This regular expression matches lines and splits the fields into capture groups. 405 RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status); 406 TEST_ASSERT_SUCCESS(status); 407 while (parseLine.find()) { 408 UnicodeString from = parseHex(parseLine.group(1, status)); 409 if (!Normalizer::isNormalized(from, UNORM_NFD, status)) { 410 // The source character was not NFD. 411 // Skip this case; the first step in obtaining a skeleton is to NFD the input, 412 // so the mapping in this line of confusables.txt will never be applied. 413 continue; 414 } 415 416 UnicodeString rawExpected = parseHex(parseLine.group(2, status)); 417 UnicodeString expected; 418 Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status); 419 TEST_ASSERT_SUCCESS(status); 420 421 int32_t skeletonType = 0; 422 UnicodeString tableType = parseLine.group(3, status); 423 TEST_ASSERT_SUCCESS(status); 424 if (tableType.indexOf("SL") >= 0) { 425 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 426 } else if (tableType.indexOf("SA") >= 0) { 427 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; 428 } else if (tableType.indexOf("ML") >= 0) { 429 skeletonType = 0; 430 } else if (tableType.indexOf("MA") >= 0) { 431 skeletonType = USPOOF_ANY_CASE; 432 } 433 434 UnicodeString actual; 435 uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status); 436 TEST_ASSERT_SUCCESS(status); 437 TEST_ASSERT(actual == expected); 438 if (actual != expected) { 439 errln(parseLine.group(0, status)); 440 UnicodeString line = "Actual: "; 441 int i = 0; 442 while (i < actual.length()) { 443 appendHexUChar(line, actual.char32At(i)); 444 i = actual.moveIndex32(i, 1); 445 } 446 errln(line); 447 } 448 if (U_FAILURE(status)) { 449 break; 450 } 451 } 452 } 453 454 // testIdentifierInfo. Note that IdentifierInfo is not public ICU API at this time 455 void IntlTestSpoof::testIdentifierInfo() { 456 UErrorCode status = U_ZERO_ERROR; 457 ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status); 458 ScriptSet bitset2; bitset2.set(USCRIPT_HANGUL, status); 459 TEST_ASSERT(bitset12.contains(bitset2)); 460 TEST_ASSERT(bitset12.contains(bitset12)); 461 TEST_ASSERT(!bitset2.contains(bitset12)); 462 463 ScriptSet arabSet; arabSet.set(USCRIPT_ARABIC, status); 464 ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status); 465 UElement arabEl; arabEl.pointer = &arabSet; 466 UElement latinEl; latinEl.pointer = &latinSet; 467 TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0); 468 TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0); 469 470 UnicodeString scriptString; 471 bitset12.displayScripts(scriptString); 472 TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString); 473 474 status = U_ZERO_ERROR; 475 UHashtable *alternates = uhash_open(uhash_hashScriptSet ,uhash_compareScriptSet, NULL, &status); 476 uhash_puti(alternates, &bitset12, 1, &status); 477 uhash_puti(alternates, &bitset2, 1, &status); 478 UnicodeString alternatesString; 479 IdentifierInfo::displayAlternates(alternatesString, alternates, status); 480 TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang; Hang Latn") == alternatesString); 481 TEST_ASSERT_SUCCESS(status); 482 483 status = U_ZERO_ERROR; 484 ScriptSet tScriptSet; 485 tScriptSet.parseScripts(scriptString, status); 486 TEST_ASSERT_SUCCESS(status); 487 TEST_ASSERT(bitset12 == tScriptSet); 488 UnicodeString ss; 489 ss.remove(); 490 uhash_close(alternates); 491 492 struct Test { 493 const char *fTestString; 494 URestrictionLevel fRestrictionLevel; 495 const char *fNumerics; 496 const char *fScripts; 497 const char *fAlternates; 498 const char *fCommonAlternates; 499 } tests[] = { 500 {"\\u0061\\u2665", USPOOF_UNRESTRICTIVE, "[]", "Latn", "", ""}, 501 {"\\u0061\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"}, 502 {"\\u0061\\u30FC\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hira Kana", "Hira Kana"}, 503 {"\\u0061\\u30FC\\u3006\\u30A2", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""}, 504 {"\\u30A2\\u0061\\u30FC\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""}, 505 {"\\u0061\\u0031\\u0661", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660]", "Latn", "Arab Thaa", "Arab Thaa"}, 506 {"\\u0061\\u0031\\u0661\\u06F1", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660\\u06F0]", "Latn Arab", "", ""}, 507 {"\\u0661\\u30FC\\u3006\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_UNRESTRICTIVE, 508 "[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab Deva", "", ""}, 509 {"\\u0061\\u30A2\\u30FC\\u3006\\u0031\\u0967\\u0661\\u06F1", USPOOF_UNRESTRICTIVE, 510 "[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab Deva", "", ""} 511 }; 512 513 int testNum; 514 for (testNum = 0; testNum < LENGTHOF(tests); testNum++) { 515 char testNumStr[40]; 516 sprintf(testNumStr, "testNum = %d", testNum); 517 Test &test = tests[testNum]; 518 status = U_ZERO_ERROR; 519 UnicodeString testString(test.fTestString); // Note: may do charset conversion. 520 testString = testString.unescape(); 521 IdentifierInfo idInfo(status); 522 TEST_ASSERT_SUCCESS(status); 523 idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status)); 524 idInfo.setIdentifier(testString, status); 525 TEST_ASSERT_MSG(*idInfo.getIdentifier() == testString, testNumStr); 526 527 URestrictionLevel restrictionLevel = test.fRestrictionLevel; 528 TEST_ASSERT_MSG(restrictionLevel == idInfo.getRestrictionLevel(status), testNumStr); 529 530 status = U_ZERO_ERROR; 531 UnicodeSet numerics(UnicodeString(test.fNumerics).unescape(), status); 532 TEST_ASSERT_SUCCESS(status); 533 TEST_ASSERT_MSG(numerics == *idInfo.getNumerics(), testNumStr); 534 535 ScriptSet scripts; 536 scripts.parseScripts(UnicodeString(test.fScripts), status); 537 TEST_ASSERT_MSG(scripts == *idInfo.getScripts(), testNumStr); 538 539 UnicodeString alternatesStr; 540 IdentifierInfo::displayAlternates(alternatesStr, idInfo.getAlternates(), status); 541 TEST_ASSERT_MSG(UnicodeString(test.fAlternates) == alternatesStr, testNumStr); 542 543 ScriptSet commonAlternates; 544 commonAlternates.parseScripts(UnicodeString(test.fCommonAlternates), status); 545 TEST_ASSERT_MSG(commonAlternates == *idInfo.getCommonAmongAlternates(), testNumStr); 546 } 547 548 // Test of getScriptCount() 549 // Script and or Script Extension for chars used in the tests 550 // \\u3013 ; Bopo Hang Hani Hira Kana # So GETA MARK 551 // \\uA838 ; Deva Gujr Guru Kthi Takr # Sc NORTH INDIC RUPEE MARK 552 // \\u0951 ; Deva Latn # Mn DEVANAGARI STRESS SIGN UDATTA 553 // 554 // \\u0370 ; Greek # L GREEK CAPITAL LETTER HETA 555 // \\u0481 ; Cyrillic # L& CYRILLIC SMALL LETTER KOPPA 556 // \\u0904 ; Devanagari # Lo DEVANAGARI LETTER SHORT A 557 // \\u3041 ; Hiragana # Lo HIRAGANA LETTER SMALL A 558 // 1234 ; Common # ascii digits 559 // \\u0300 ; Inherited # Mn COMBINING GRAVE ACCENT 560 561 struct ScriptTest { 562 const char *fTestString; 563 int32_t fScriptCount; 564 } scriptTests[] = { 565 {"Hello", 1}, 566 {"Hello\\u0370", 2}, 567 {"1234", 0}, 568 {"Hello1234\\u0300", 1}, // Common and Inherited are ignored. 569 {"\\u0030", 0}, 570 {"abc\\u0951", 1}, 571 {"abc\\u3013", 2}, 572 {"\\uA838\\u0951", 1}, // Triggers commonAmongAlternates path. 573 {"\\u3013\\uA838", 2} 574 }; 575 576 status = U_ZERO_ERROR; 577 IdentifierInfo identifierInfo(status); 578 for (testNum=0; testNum<LENGTHOF(scriptTests); testNum++) { 579 ScriptTest &test = scriptTests[testNum]; 580 char msgBuf[100]; 581 sprintf(msgBuf, "testNum = %d ", testNum); 582 UnicodeString testString = UnicodeString(test.fTestString).unescape(); 583 584 status = U_ZERO_ERROR; 585 identifierInfo.setIdentifier(testString, status); 586 int32_t scriptCount = identifierInfo.getScriptCount(); 587 TEST_ASSERT_MSG(test.fScriptCount == scriptCount, msgBuf); 588 } 589 } 590 591 void IntlTestSpoof::testScriptSet() { 592 ScriptSet s1; 593 ScriptSet s2; 594 UErrorCode status = U_ZERO_ERROR; 595 596 TEST_ASSERT(s1 == s2); 597 s1.set(USCRIPT_ARABIC,status); 598 TEST_ASSERT_SUCCESS(status); 599 TEST_ASSERT(!(s1 == s2)); 600 TEST_ASSERT(s1.test(USCRIPT_ARABIC, status)); 601 TEST_ASSERT(s1.test(USCRIPT_GREEK, status) == FALSE); 602 603 status = U_ZERO_ERROR; 604 s1.reset(USCRIPT_ARABIC, status); 605 TEST_ASSERT(s1 == s2); 606 607 status = U_ZERO_ERROR; 608 s1.setAll(); 609 TEST_ASSERT(s1.test(USCRIPT_COMMON, status)); 610 TEST_ASSERT(s1.test(USCRIPT_ETHIOPIC, status)); 611 TEST_ASSERT(s1.test(USCRIPT_CODE_LIMIT, status)); 612 s1.resetAll(); 613 TEST_ASSERT(!s1.test(USCRIPT_COMMON, status)); 614 TEST_ASSERT(!s1.test(USCRIPT_ETHIOPIC, status)); 615 TEST_ASSERT(!s1.test(USCRIPT_CODE_LIMIT, status)); 616 617 status = U_ZERO_ERROR; 618 s1.set(USCRIPT_TAKRI, status); 619 s1.set(USCRIPT_BLISSYMBOLS, status); 620 s2.setAll(); 621 TEST_ASSERT(s2.contains(s1)); 622 TEST_ASSERT(!s1.contains(s2)); 623 TEST_ASSERT(s2.intersects(s1)); 624 TEST_ASSERT(s1.intersects(s2)); 625 s2.reset(USCRIPT_TAKRI, status); 626 TEST_ASSERT(!s2.contains(s1)); 627 TEST_ASSERT(!s1.contains(s2)); 628 TEST_ASSERT(s1.intersects(s2)); 629 TEST_ASSERT(s2.intersects(s1)); 630 TEST_ASSERT_SUCCESS(status); 631 632 status = U_ZERO_ERROR; 633 s1.resetAll(); 634 s1.set(USCRIPT_NKO, status); 635 s1.set(USCRIPT_COMMON, status); 636 s2 = s1; 637 TEST_ASSERT(s2 == s1); 638 TEST_ASSERT_EQ(2, s2.countMembers()); 639 s2.intersect(s1); 640 TEST_ASSERT(s2 == s1); 641 s2.setAll(); 642 TEST_ASSERT(!(s2 == s1)); 643 TEST_ASSERT(s2.countMembers() >= USCRIPT_CODE_LIMIT); 644 s2.intersect(s1); 645 TEST_ASSERT(s2 == s1); 646 647 s2.setAll(); 648 s2.reset(USCRIPT_COMMON, status); 649 s2.intersect(s1); 650 TEST_ASSERT(s2.countMembers() == 1); 651 652 s1.resetAll(); 653 s1.set(USCRIPT_AFAKA, status); 654 s1.set(USCRIPT_VAI, status); 655 s1.set(USCRIPT_INHERITED, status); 656 int32_t n = -1; 657 for (int32_t i=0; i<4; i++) { 658 n = s1.nextSetBit(n+1); 659 switch (i) { 660 case 0: TEST_ASSERT_EQ(USCRIPT_INHERITED, n); break; 661 case 1: TEST_ASSERT_EQ(USCRIPT_VAI, n); break; 662 case 2: TEST_ASSERT_EQ(USCRIPT_AFAKA, n); break; 663 case 3: TEST_ASSERT_EQ(-1, (int32_t)n); break; 664 default: TEST_ASSERT(FALSE); 665 } 666 } 667 TEST_ASSERT_SUCCESS(status); 668 } 669 670 671 void IntlTestSpoof::testRestrictionLevel() { 672 struct Test { 673 const char *fId; 674 URestrictionLevel fExpectedRestrictionLevel; 675 } tests[] = { 676 {"\\u0061\\u03B3\\u2665", USPOOF_UNRESTRICTIVE}, 677 {"a", USPOOF_ASCII}, 678 {"\\u03B3", USPOOF_HIGHLY_RESTRICTIVE}, 679 {"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE}, 680 {"\\u0061\\u0904", USPOOF_MODERATELY_RESTRICTIVE}, 681 {"\\u0061\\u03B3", USPOOF_MINIMALLY_RESTRICTIVE} 682 }; 683 char msgBuffer[100]; 684 685 URestrictionLevel restrictionLevels[] = { USPOOF_ASCII, USPOOF_HIGHLY_RESTRICTIVE, 686 USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE, USPOOF_UNRESTRICTIVE}; 687 688 UErrorCode status = U_ZERO_ERROR; 689 IdentifierInfo idInfo(status); 690 TEST_ASSERT_SUCCESS(status); 691 idInfo.setIdentifierProfile(*uspoof_getRecommendedUnicodeSet(&status)); 692 TEST_ASSERT_SUCCESS(status); 693 for (int32_t testNum=0; testNum < LENGTHOF(tests); testNum++) { 694 status = U_ZERO_ERROR; 695 const Test &test = tests[testNum]; 696 UnicodeString testString = UnicodeString(test.fId).unescape(); 697 URestrictionLevel expectedLevel = test.fExpectedRestrictionLevel; 698 idInfo.setIdentifier(testString, status); 699 sprintf(msgBuffer, "testNum = %d ", testNum); 700 TEST_ASSERT_SUCCESS(status); 701 TEST_ASSERT_MSG(expectedLevel == idInfo.getRestrictionLevel(status), msgBuffer); 702 for (int levelIndex=0; levelIndex<LENGTHOF(restrictionLevels); levelIndex++) { 703 status = U_ZERO_ERROR; 704 URestrictionLevel levelSetInSpoofChecker = restrictionLevels[levelIndex]; 705 USpoofChecker *sc = uspoof_open(&status); 706 uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status); 707 uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status); 708 uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker); 709 UBool actualValue = uspoof_checkUnicodeString(sc, testString, NULL, &status) != 0; 710 711 // we want to fail if the text is (say) MODERATE and the testLevel is ASCII 712 UBool expectedFailure = expectedLevel > levelSetInSpoofChecker || 713 !uspoof_getRecommendedUnicodeSet(&status)->containsAll(testString); 714 sprintf(msgBuffer, "testNum = %d, levelIndex = %d", testNum, levelIndex); 715 TEST_ASSERT_MSG(expectedFailure == actualValue, msgBuffer); 716 TEST_ASSERT_SUCCESS(status); 717 uspoof_close(sc); 718 } 719 } 720 } 721 722 723 void IntlTestSpoof::testMixedNumbers() { 724 struct Test { 725 const char *fTestString; 726 const char *fExpectedSet; 727 } tests[] = { 728 {"1", "[0]"}, 729 {"\\u0967", "[\\u0966]"}, 730 {"1\\u0967", "[0\\u0966]"}, 731 {"\\u0661\\u06F1", "[\\u0660\\u06F0]"} 732 }; 733 UErrorCode status = U_ZERO_ERROR; 734 IdentifierInfo idInfo(status); 735 for (int32_t testNum=0; testNum < LENGTHOF(tests); testNum++) { 736 char msgBuf[100]; 737 sprintf(msgBuf, "testNum = %d ", testNum); 738 Test &test = tests[testNum]; 739 740 status = U_ZERO_ERROR; 741 UnicodeString testString = UnicodeString(test.fTestString).unescape(); 742 UnicodeSet expectedSet(UnicodeString(test.fExpectedSet).unescape(), status); 743 idInfo.setIdentifier(testString, status); 744 TEST_ASSERT_SUCCESS(status); 745 TEST_ASSERT_MSG(expectedSet == *idInfo.getNumerics(), msgBuf); 746 747 status = U_ZERO_ERROR; 748 USpoofChecker *sc = uspoof_open(&status); 749 uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this 750 int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status); 751 UBool mixedNumberFailure = ((result & USPOOF_MIXED_NUMBERS) != 0); 752 TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf); 753 uspoof_close(sc); 754 } 755 } 756 757 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO */ 758