1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 *************************************************************************** 5 * Copyright (C) 2008-2016, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 *************************************************************************** 8 * file name: uspoof.h 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2008Feb13 14 * created by: Andy Heninger 15 * 16 * Unicode Spoof Detection 17 */ 18 19 #ifndef USPOOF_H 20 #define USPOOF_H 21 22 #include "unicode/utypes.h" 23 #include "unicode/uset.h" 24 #include "unicode/parseerr.h" 25 #include "unicode/localpointer.h" 26 27 #if !UCONFIG_NO_NORMALIZATION 28 29 30 #if U_SHOW_CPLUSPLUS_API 31 #include "unicode/unistr.h" 32 #include "unicode/uniset.h" 33 #endif 34 35 36 /** 37 * \file 38 * \brief Unicode Security and Spoofing Detection, C API. 39 * 40 * <p> 41 * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and 42 * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions: 43 * 44 * <ol> 45 * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and 46 * "Ηarvest", where the second string starts with the Greek capital letter Eta.</li> 47 * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof 48 * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li> 49 * </ol> 50 * 51 * <p> 52 * Although originally designed as a method for flagging suspicious identifier strings such as URLs, 53 * <code>USpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word 54 * content filters. 
55 * 56 * <p> 57 * The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++. 58 * 59 * <h2>Confusables</h2> 60 * 61 * <p> 62 * The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings: 63 * 64 * \code{.c} 65 * UErrorCode status = U_ZERO_ERROR; 66 * UChar* str1 = (UChar*) u"Harvest"; 67 * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA 68 * 69 * USpoofChecker* sc = uspoof_open(&status); 70 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 71 * 72 * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status); 73 * UBool result = bitmask != 0; 74 * // areConfusable: 1 (status: U_ZERO_ERROR) 75 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); 76 * uspoof_close(sc); 77 * \endcode 78 * 79 * <p> 80 * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks} 81 * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the 82 * confusability test; and the following line extracts the result out of the return value. For best performance, 83 * the instance should be created once (e.g., upon application startup), and the efficient 84 * {@link uspoof_areConfusable} method can be used at runtime. 85 * 86 * <p> 87 * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call 88 * {@link uspoof_close} when the object goes out of scope: 89 * 90 * \code{.cpp} 91 * UErrorCode status = U_ZERO_ERROR; 92 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 93 * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status); 94 * // ... 95 * \endcode 96 * 97 * <p> 98 * UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can 99 * be thought of as a "hash code". 
{@link uspoof_getSkeleton} computes the skeleton for a particular string, so 100 * the following snippet is equivalent to the example above: 101 * 102 * \code{.c} 103 * UErrorCode status = U_ZERO_ERROR; 104 * UChar* str1 = (UChar*) u"Harvest"; 105 * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA 106 * 107 * USpoofChecker* sc = uspoof_open(&status); 108 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 109 * 110 * // Get skeleton 1 111 * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status); 112 * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar)); 113 * status = U_ZERO_ERROR; 114 * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status); 115 * 116 * // Get skeleton 2 117 * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status); 118 * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar)); 119 * status = U_ZERO_ERROR; 120 * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status); 121 * 122 * // Are the skeletons the same? 
123 * UBool result = u_strcmp(skel1, skel2) == 0; 124 * // areConfusable: 1 (status: U_ZERO_ERROR) 125 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); 126 * uspoof_close(sc); 127 * free(skel1); 128 * free(skel2); 129 * \endcode 130 * 131 * <p> 132 * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling 133 * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below: 134 * 135 * \code{.c} 136 * UErrorCode status = U_ZERO_ERROR; 137 * #define DICTIONARY_LENGTH 2 138 * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" }; 139 * UChar* skeletons[DICTIONARY_LENGTH]; 140 * UChar* str = (UChar*) u"1orern"; 141 * 142 * // Setup: 143 * USpoofChecker* sc = uspoof_open(&status); 144 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 145 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 146 * UChar* word = dictionary[i]; 147 * int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status); 148 * skeletons[i] = (UChar*) malloc(++len * sizeof(UChar)); 149 * status = U_ZERO_ERROR; 150 * uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status); 151 * } 152 * 153 * // Live Check: 154 * { 155 * int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status); 156 * UChar* skel = (UChar*) malloc(++len * sizeof(UChar)); 157 * status = U_ZERO_ERROR; 158 * uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status); 159 * UBool result = FALSE; 160 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 161 * result = u_strcmp(skel, skeletons[i]) == 0; 162 * if (result == TRUE) { break; } 163 * } 164 * // Has confusable in dictionary: 1 (status: U_ZERO_ERROR) 165 * printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status)); 166 * free(skel); 167 * } 168 * 169 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 170 * free(skeletons[i]); 171 * } 172 * uspoof_close(sc); 173 * \endcode 174 * 175 * <p> 
176 * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em> 177 * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons 178 * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons. 179 * 180 * <h2>Spoof Detection</h2> 181 * 182 * <p> 183 * The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a 184 * string: 185 * 186 * \code{.c} 187 * UErrorCode status = U_ZERO_ERROR; 188 * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A 189 * 190 * // Get the default set of allowable characters: 191 * USet* allowed = uset_openEmpty(); 192 * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); 193 * uset_addAll(allowed, uspoof_getInclusionSet(&status)); 194 * 195 * USpoofChecker* sc = uspoof_open(&status); 196 * uspoof_setAllowedChars(sc, allowed, &status); 197 * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); 198 * 199 * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status); 200 * UBool result = bitmask != 0; 201 * // fails checks: 1 (status: U_ZERO_ERROR) 202 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); 203 * uspoof_close(sc); 204 * uset_close(allowed); 205 * \endcode 206 * 207 * <p> 208 * As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at 209 * startup, and call the cheaper {@link uspoof_check} online. We specify the set of 210 * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. 211 * 212 * <p> 213 * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings, 214 * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers. 
215 * 216 * <p> 217 * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks 218 * is available in the returned bitmask. For complete information, use the {@link uspoof_check2} class of functions 219 * with a {@link USpoofCheckResult} parameter: 220 * 221 * \code{.c} 222 * UErrorCode status = U_ZERO_ERROR; 223 * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A 224 * 225 * // Get the default set of allowable characters: 226 * USet* allowed = uset_openEmpty(); 227 * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); 228 * uset_addAll(allowed, uspoof_getInclusionSet(&status)); 229 * 230 * USpoofChecker* sc = uspoof_open(&status); 231 * uspoof_setAllowedChars(sc, allowed, &status); 232 * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); 233 * 234 * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status); 235 * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status); 236 * 237 * int32_t failures1 = bitmask; 238 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status); 239 * assert(failures1 == failures2); 240 * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) 241 * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); 242 * 243 * // Cleanup: 244 * uspoof_close(sc); 245 * uset_close(allowed); 246 * uspoof_closeCheckResult(checkResult); 247 * \endcode 248 * 249 * C++ users can take advantage of a few syntactical conveniences. 
The following snippet is functionally 250 * equivalent to the one above: 251 * 252 * \code{.cpp} 253 * UErrorCode status = U_ZERO_ERROR; 254 * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A 255 * 256 * // Get the default set of allowable characters: 257 * UnicodeSet allowed; 258 * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); 259 * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); 260 * 261 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 262 * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); 263 * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); 264 * 265 * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); 266 * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); 267 * 268 * int32_t failures1 = bitmask; 269 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status); 270 * assert(failures1 == failures2); 271 * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) 272 * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); 273 * 274 * // Explicit cleanup not necessary. 275 * \endcode 276 * 277 * <p> 278 * The return value is a bitmask of the checks that failed. In this case, there was one check that failed: 279 * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). 
The possible checks are: 280 * 281 * <ul> 282 * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the 283 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS 284 * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li> 285 * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character 286 * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li> 287 * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable 288 * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li> 289 * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li> 290 * </ul> 291 * 292 * <p> 293 * These checks can be enabled independently of each other. 
For example, if you were interested in checking for only the 294 * INVISIBLE and MIXED_NUMBERS conditions, you could do: 295 * 296 * \code{.c} 297 * UErrorCode status = U_ZERO_ERROR; 298 * UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR 299 * 300 * USpoofChecker* sc = uspoof_open(&status); 301 * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status); 302 * 303 * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status); 304 * UBool result = bitmask != 0; 305 * // fails checks: 1 (status: U_ZERO_ERROR) 306 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); 307 * uspoof_close(sc); 308 * \endcode 309 * 310 * <p> 311 * Here is an example in C++ showing how to compute the restriction level of a string: 312 * 313 * \code{.cpp} 314 * UErrorCode status = U_ZERO_ERROR; 315 * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A 316 * 317 * // Get the default set of allowable characters: 318 * UnicodeSet allowed; 319 * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); 320 * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); 321 * 322 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 323 * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); 324 * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); 325 * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status); 326 * 327 * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); 328 * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); 329 * 330 * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status); 331 * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask: 332 * assert((restrictionLevel & bitmask) == restrictionLevel); 333 * // Restriction level: 0x50000000 (status: 
U_ZERO_ERROR)
 * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
 * \endcode
 *
 * <p>
 * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since
 * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
 *
 * <p>
 * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
 * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
 * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
 * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
 * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
 * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of
 * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code
 * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
 * scripts.
 *
 * <h2>Additional Information</h2>
 *
 * <p>
 * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
 *
 * <p>
 * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
 * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
 * using the same USpoofChecker instance.
 *
 * <p>
 * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
 * thread safe. Those that take a non-const USpoofChecker are not thread safe.
 *
 * @stable ICU 4.6
 */

struct USpoofChecker;
/**
 * Opaque spoof checker handle. The structure is only forward-declared in this
 * header; instances are obtained from the uspoof_open family of functions and
 * released with uspoof_close().
 *
 * @stable ICU 4.2
 */
typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */

struct USpoofCheckResult;
/**
 * Opaque handle for the detailed result of a check. The structure is only
 * forward-declared in this header.
 *
 * @see uspoof_openCheckResult
 * @stable ICU 58
 */
typedef struct USpoofCheckResult USpoofCheckResult;

/**
 * Enum for the kinds of checks that USpoofChecker can perform.
 * These enum values are used both to select the set of checks that
 * will be performed, and to report results from the check function.
 * Each check is a distinct bit, so values can be combined with bitwise OR.
 *
 * @stable ICU 4.2
 */
typedef enum USpoofChecks {
    /**
     * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
     * 4.
     *
     * @see uspoof_areConfusable
     * @stable ICU 4.2
     */
    USPOOF_SINGLE_SCRIPT_CONFUSABLE = 1,

    /**
     * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
     * 39 section 4.
     *
     * @see uspoof_areConfusable
     * @stable ICU 4.2
     */
    USPOOF_MIXED_SCRIPT_CONFUSABLE = 2,

    /**
     * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are not from the same script but both of them are
     * single-script strings, according to UTS 39 section 4.
     *
     * @see uspoof_areConfusable
     * @stable ICU 4.2
     */
    USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4,

    /**
     * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables. You may set
     * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
     * make {@link uspoof_areConfusable} return only those types of confusables.
     *
     * @see uspoof_areConfusable
     * @see uspoof_getSkeleton
     * @stable ICU 58
     */
    USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,

#ifndef U_HIDE_DEPRECATED_API
    /**
     * This flag is deprecated and no longer affects the behavior of SpoofChecker.
     *
     * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was deprecated.
     */
    USPOOF_ANY_CASE = 8,
#endif  /* U_HIDE_DEPRECATED_API */

    /**
     * Check that an identifier is no looser than the specified RestrictionLevel.
     * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE.
     *
     * If USPOOF_AUX_INFO is enabled the actual restriction level of the
     * identifier being tested will also be returned by uspoof_check().
     *
     * @see URestrictionLevel
     * @see uspoof_setRestrictionLevel
     * @see USPOOF_AUX_INFO
     *
     * @stable ICU 51
     */
    USPOOF_RESTRICTION_LEVEL = 16,

#ifndef U_HIDE_DEPRECATED_API
    /** Check that an identifier contains only characters from a
      * single script (plus chars from the common and inherited scripts.)
      * Applies to checks of a single identifier check only.
      * This constant is an alias: it has the same value as USPOOF_RESTRICTION_LEVEL.
      * @deprecated ICU 51 Use RESTRICTION_LEVEL instead.
      */
    USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL,
#endif  /* U_HIDE_DEPRECATED_API */

    /** Check an identifier for the presence of invisible characters,
      * such as zero-width spaces, or character sequences that are
      * likely not to display, such as multiple occurrences of the same
      * non-spacing mark. This check does not test the input string as a whole
      * for conformance to any particular syntax for identifiers.
      */
    USPOOF_INVISIBLE = 32,

    /** Check that an identifier contains only characters from a specified set
      * of acceptable characters. See {@link uspoof_setAllowedChars} and
      * {@link uspoof_setAllowedLocales}. Note that a string that fails this check
      * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
      */
    USPOOF_CHAR_LIMIT = 64,

    /**
     * Check that an identifier does not mix numbers from different numbering systems.
     * For more information, see UTS 39 section 5.3.
     *
     * @stable ICU 51
     */
    USPOOF_MIXED_NUMBERS = 128,

    /**
     * Enable all spoof checks.
     *
     * This mask covers only the low 16 bits, so it does not include
     * {@link USPOOF_AUX_INFO} (0x40000000), which must be requested separately.
     *
     * @stable ICU 4.6
     */
    USPOOF_ALL_CHECKS = 0xFFFF,

    /**
     * Enable the return of auxiliary (non-error) information in the
     * upper bits of the check results value.
     *
     * If this "check" is not enabled, the results of {@link uspoof_check} will be
     * zero when an identifier passes all of the enabled checks.
     *
     * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will
     * be zero when an identifier passes all checks.
     *
     * @stable ICU 51
     */
    USPOOF_AUX_INFO = 0x40000000

} USpoofChecks;


/**
 * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
 * for returned identifier restriction levels in check results.
 *
 * The six level constants are declared from most restrictive (USPOOF_ASCII)
 * to least restrictive (USPOOF_UNRESTRICTIVE), with increasing numeric values;
 * each level also accepts every string accepted by the level before it.
 *
 * @stable ICU 51
 *
 * @see uspoof_setRestrictionLevel
 * @see uspoof_check
 */
typedef enum URestrictionLevel {
    /**
     * All characters in the string are in the identifier profile and all characters in the string are in the
     * ASCII range.
     *
     * @stable ICU 51
     */
    USPOOF_ASCII = 0x10000000,
    /**
     * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and
     * the string is single-script, according to the definition in UTS 39 section 5.1.
     *
     * @stable ICU 53
     */
    USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000,
    /**
     * The string classifies as Single Script, or all characters in the string are in the identifier profile and
     * the string is covered by any of the following sets of scripts, according to the definition in UTS 39
     * section 5.1:
     * <ul>
     *   <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
     *   <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
     *   <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
     * </ul>
     * This is the default restriction in ICU.
     *
     * @stable ICU 51
     */
    USPOOF_HIGHLY_RESTRICTIVE = 0x30000000,
    /**
     * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
     * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
     * Greek, and Cherokee.
     *
     * @stable ICU 51
     */
    USPOOF_MODERATELY_RESTRICTIVE = 0x40000000,
    /**
     * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts.
     *
     * @stable ICU 51
     */
    USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000,
    /**
     * Any valid identifiers, including characters outside of the Identifier Profile.
     *
     * @stable ICU 51
     */
    USPOOF_UNRESTRICTIVE = 0x60000000,
    /**
     * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}.
     *
     * @stable ICU 53
     */
    USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000,
#ifndef U_HIDE_INTERNAL_API
    /**
     * An undefined restriction level.
     * @internal
     */
    USPOOF_UNDEFINED_RESTRICTIVE = -1
#endif  /* U_HIDE_INTERNAL_API */
} URestrictionLevel;

/**
 * Create a Unicode Spoof Checker, configured to perform all
 * checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT.
 * Note that additional checks may be added in the future,
 * resulting in changes to the default checking behavior.
 *
 * Note: USPOOF_LOCALE_LIMIT is not among the USpoofChecks constants
 * declared in this header; only USPOOF_CHAR_LIMIT can be named in code.
 *
 * @param status  The error code, set if this function encounters a problem.
 * @return        the newly created Spoof Checker
 * @stable ICU 4.2
 */
U_STABLE USpoofChecker * U_EXPORT2
uspoof_open(UErrorCode *status);


/**
 * Open a Spoof checker from its serialized form, stored in 32-bit-aligned memory.
 * Inverse of uspoof_serialize().
 * The memory containing the serialized data must remain valid and unchanged
 * as long as the spoof checker, or any cloned copies of the spoof checker,
 * are in use. Ownership of the memory remains with the caller.
 * The spoof checker (and any clones) must be closed prior to deleting the
 * serialized data.
 *
 * @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data
 * @param length the number of bytes available at data;
 *               can be more than necessary
 * @param pActualLength receives the actual number of bytes at data taken up by the data;
 *                      can be NULL
 * @param pErrorCode ICU error code
 * @return the spoof checker.
 *
 * @see uspoof_open
 * @see uspoof_serialize
 * @stable ICU 4.2
 */
U_STABLE USpoofChecker * U_EXPORT2
uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
                          UErrorCode *pErrorCode);

/**
 * Open a Spoof Checker from the source form of the spoof data.
 * The input corresponds to the Unicode data file confusables.txt
 * as described in Unicode UTS #39. The syntax of the source data
 * is as described in UTS #39 for this file, and the content of
 * this file is acceptable input.
 *
 * The character encoding of the (char *) input text is UTF-8.
 *
 * @param confusables a pointer to the confusable characters definitions,
 *                    as found in file confusables.txt from unicode.org.
 * @param confusablesLen The length of the confusables text, or -1 if the
 *                    input string is zero terminated.
 * @param confusablesWholeScript
 *                    Deprecated in ICU 58.  No longer used.
 * @param confusablesWholeScriptLen
 *                    Deprecated in ICU 58.  No longer used.
 * @param errType     In the event of an error in the input, indicates
 *                    which of the input files contains the error.
 *                    The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or
 *                    USPOOF_WHOLE_SCRIPT_CONFUSABLE, or
 *                    zero if no errors are found.
 * @param pe          In the event of an error in the input, receives the position
 *                    in the input text (line, offset) of the error.
 * @param status      an in/out ICU UErrorCode.  Among the possible errors is
 *                    U_PARSE_ERROR, which is used to report syntax errors
 *                    in the input.
 * @return            A spoof checker that uses the rules from the input files.
 * @stable ICU 4.2
 */
U_STABLE USpoofChecker * U_EXPORT2
uspoof_openFromSource(const char *confusables,  int32_t confusablesLen,
                      const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
                      int32_t *errType, UParseError *pe, UErrorCode *status);


/**
 * Close a Spoof Checker, freeing any memory that was being held by
 * its implementation.
 *
 * @param sc  The spoof checker to be closed.
 * @stable ICU 4.2
 */
U_STABLE void U_EXPORT2
uspoof_close(USpoofChecker *sc);

#if U_SHOW_CPLUSPLUS_API

U_NAMESPACE_BEGIN

/**
 * \class LocalUSpoofCheckerPointer
 * "Smart pointer" class, closes a USpoofChecker via uspoof_close().
 * For most methods see the LocalPointerBase base class.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.4
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckerPointer, USpoofChecker, uspoof_close);

U_NAMESPACE_END

#endif  /* U_SHOW_CPLUSPLUS_API */

/**
 * Clone a Spoof Checker.  The clone will be set to perform the same checks
 * as the original source.
 *
 * @param sc       The source USpoofChecker
 * @param status   The error code, set if this function encounters a problem.
 * @return         The clone of the source spoof checker.
 * @stable ICU 4.2
 */
U_STABLE USpoofChecker * U_EXPORT2
uspoof_clone(const USpoofChecker *sc, UErrorCode *status);


/**
 * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method
 * overwrites any checks that may have already been enabled. By default, all checks are enabled.
 *
 * To enable specific checks and disable all others, the "whitelisted" checks should be ORed together.
 * For example, to fail strings containing characters outside of the set specified by {@link uspoof_setAllowedChars} and
 * also strings that contain digits from mixed numbering systems:
 *
 * <pre>
 * {@code
 * uspoof_setChecks(sc, USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS, &status);
 * }
 * </pre>
 *
 * To disable specific checks and enable all others, the "blacklisted" checks should be ANDed away from
 * ALL_CHECKS. For example, if you are not planning to use the {@link uspoof_areConfusable} functionality,
 * it is good practice to disable the CONFUSABLE check:
 *
 * <pre>
 * {@code
 * uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE, &status);
 * }
 * </pre>
 *
 * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and
 * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
 * enable onto the existing bitmask specified by this method. For more details, see the documentation of those
 * methods.
 *
 * @param sc       The USpoofChecker
 * @param checks   The set of checks that this spoof checker will perform.
 *                 The value is a bit set, obtained by OR-ing together
 *                 values from enum USpoofChecks.
 * @param status   The error code, set if this function encounters a problem.
 * @stable ICU 4.2
 *
 */
U_STABLE void U_EXPORT2
uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status);

/**
 * Get the set of checks that this Spoof Checker has been configured to perform.
 *
 * @param sc       The USpoofChecker
 * @param status   The error code, set if this function encounters a problem.
 * @return         The set of checks that this spoof checker will perform.
 *                 The value is a bit set, obtained by OR-ing together
 *                 values from enum USpoofChecks.
 * @stable ICU 4.2
 *
 */
U_STABLE int32_t U_EXPORT2
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);

/* NOTE(review): the section numbers cited below ("Sections 5.1 and 5.2 of UTS 39")
 * do not match the "UTS 39 section 5.3" cited on USPOOF_MIXED_NUMBERS in this
 * header; confirm both against the current UTS 39 text. */
/**
 * Set the loosest restriction level allowed for strings. The default if this is not called is
 * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
 * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
 * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
 *
 * @param sc       The USpoofChecker
 * @param restrictionLevel The loosest restriction level allowed.
 * @see URestrictionLevel
 * @stable ICU 51
 */
U_STABLE void U_EXPORT2
uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);


/**
 * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}.
 *
 * @param sc       The USpoofChecker
 * @return The restriction level
 * @see URestrictionLevel
 * @stable ICU 51
 */
U_STABLE URestrictionLevel U_EXPORT2
uspoof_getRestrictionLevel(const USpoofChecker *sc);

/**
 * Limit characters that are acceptable in identifiers being checked to those
 * normally used with the languages associated with the specified locales.
 * Any previously specified list of locales is replaced by the new settings.
 *
 * A set of languages is determined from the locale(s), and
 * from those a set of acceptable Unicode scripts is determined.
 * Characters from this set of scripts, along with characters from
 * the "common" and "inherited" Unicode Script categories
 * will be permitted.
 *
 * Supplying an empty string removes all restrictions;
 * characters from any script will be allowed.
 *
 * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this
 * USpoofChecker when calling this function with a non-empty list
 * of locales.
796 * 797 * The Unicode Set of characters that will be allowed is accessible 798 * via the uspoof_getAllowedChars() function. uspoof_setAllowedLocales() 799 * will <i>replace</i> any previously applied set of allowed characters. 800 * 801 * Adjustments, such as additions or deletions of certain classes of characters, 802 * can be made to the result of uspoof_setAllowedLocales() by 803 * fetching the resulting set with uspoof_getAllowedChars(), 804 * manipulating it with the Unicode Set API, then resetting the 805 * spoof detector's limits with uspoof_setAllowedChars(). 806 * 807 * @param sc The USpoofChecker 808 * @param localesList A list of locales, from which the language 809 * and associated script are extracted. The locales 810 * are comma-separated if there is more than one. 811 * White space may not appear within an individual locale, 812 * but is ignored otherwise. 813 * The locales are syntactically like those from the 814 * HTTP Accept-Language header. 815 * If the localesList is empty, no restrictions will be placed on 816 * the allowed characters. 817 * 818 * @param status The error code, set if this function encounters a problem. 819 * @stable ICU 4.2 820 */ 821 U_STABLE void U_EXPORT2 822 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status); 823 824 /** 825 * Get a list of locales for the scripts that are acceptable in strings 826 * to be checked. If no limitations on scripts have been specified, 827 * an empty string will be returned. 828 * 829 * uspoof_setAllowedChars() will reset the list of allowed locales to be empty. 830 * 831 * The format of the returned list is the same as that supplied to 832 * uspoof_setAllowedLocales(), but the returned list may not be identical 833 * to the originally specified string; the string may be reformatted, 834 * and information other than languages from 835 * the originally specified locales may be omitted.
836 * 837 * @param sc The USpoofChecker 838 * @param status The error code, set if this function encounters a problem. 839 * @return A string containing a list of locales corresponding 840 * to the acceptable scripts, formatted like an 841 * HTTP Accept-Language value. 842 * 843 * @stable ICU 4.2 844 */ 845 U_STABLE const char * U_EXPORT2 846 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status); 847 848 849 /** 850 * Limit the acceptable characters to those specified by a Unicode Set. 851 * Any previously specified character limit 852 * is replaced by the new settings. This includes limits on 853 * characters that were set with the uspoof_setAllowedLocales() function. 854 * 855 * The USPOOF_CHAR_LIMIT test is automatically enabled for this 856 * USpoofChecker by this function. 857 * 858 * @param sc The USpoofChecker 859 * @param chars A Unicode Set containing the list of 860 * characters that are permitted. Ownership of the set 861 * remains with the caller. The incoming set is cloned by 862 * this function, so there are no restrictions on modifying 863 * or deleting the USet after calling this function. 864 * @param status The error code, set if this function encounters a problem. 865 * @stable ICU 4.2 866 */ 867 U_STABLE void U_EXPORT2 868 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status); 869 870 871 /** 872 * Get a USet for the characters permitted in an identifier. 873 * This corresponds to the limits imposed by the Set Allowed Characters 874 * functions. Limitations imposed by other checks will not be 875 * reflected in the set returned by this function. 876 * 877 * The returned set will be frozen, meaning that it cannot be modified 878 * by the caller. 879 * 880 * Ownership of the returned set remains with the Spoof Detector. The 881 * returned set will become invalid if the spoof detector is closed, 882 * or if a new set of allowed characters is specified.
883 * 884 * 885 * @param sc The USpoofChecker 886 * @param status The error code, set if this function encounters a problem. 887 * @return A USet containing the characters that are permitted by 888 * the USPOOF_CHAR_LIMIT test. 889 * @stable ICU 4.2 890 */ 891 U_STABLE const USet * U_EXPORT2 892 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status); 893 894 895 #if U_SHOW_CPLUSPLUS_API 896 /** 897 * Limit the acceptable characters to those specified by a Unicode Set. 898 * Any previously specified character limit 899 * is replaced by the new settings. This includes limits on 900 * characters that were set with the uspoof_setAllowedLocales() function. 901 * 902 * The USPOOF_CHAR_LIMIT test is automatically enabled for this 903 * USpoofChecker by this function. 904 * 905 * @param sc The USpoofChecker 906 * @param chars A Unicode Set containing the list of 907 * characters that are permitted. Ownership of the set 908 * remains with the caller. The incoming set is cloned by 909 * this function, so there are no restrictions on modifying 910 * or deleting the UnicodeSet after calling this function. 911 * @param status The error code, set if this function encounters a problem. 912 * @stable ICU 4.2 913 */ 914 U_STABLE void U_EXPORT2 915 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const icu::UnicodeSet *chars, UErrorCode *status); 916 917 918 /** 919 * Get a UnicodeSet for the characters permitted in an identifier. 920 * This corresponds to the limits imposed by the Set Allowed Characters / 921 * UnicodeSet functions. Limitations imposed by other checks will not be 922 * reflected in the set returned by this function. 923 * 924 * The returned set will be frozen, meaning that it cannot be modified 925 * by the caller. 926 * 927 * Ownership of the returned set remains with the Spoof Detector. The 928 * returned set will become invalid if the spoof detector is closed, 929 * or if a new set of allowed characters is specified.
930 * 931 * 932 * @param sc The USpoofChecker 933 * @param status The error code, set if this function encounters a problem. 934 * @return A UnicodeSet containing the characters that are permitted by 935 * the USPOOF_CHAR_LIMIT test. 936 * @stable ICU 4.2 937 */ 938 U_STABLE const icu::UnicodeSet * U_EXPORT2 939 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status); 940 #endif 941 942 943 /** 944 * Check the specified string for possible security issues. 945 * The text to be checked will typically be an identifier of some sort. 946 * The set of checks to be performed is specified with uspoof_setChecks(). 947 * 948 * \note 949 * Consider using the newer API, {@link uspoof_check2}, instead. 950 * The newer API exposes additional information from the check procedure 951 * and is otherwise identical to this method. 952 * 953 * @param sc The USpoofChecker 954 * @param id The identifier to be checked for possible security issues, 955 * in UTF-16 format. 956 * @param length the length of the string to be checked, expressed in 957 * 16 bit UTF-16 code units, or -1 if the string is 958 * zero terminated. 959 * @param position Deprecated in ICU 51. Always returns zero. 960 * Originally, an out parameter for the index of the first 961 * string position that failed a check. 962 * This parameter may be NULL. 963 * @param status The error code, set if an error occurred while attempting to 964 * perform the check. 965 * Spoofing or security issues detected with the input string are 966 * not reported here, but through the function's return value. 967 * @return An integer value with bits set for any potential security 968 * or spoofing issues detected. The bits are defined by 969 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 970 * will be zero if the input string passes all of the 971 * enabled checks. 
972 * @see uspoof_check2 973 * @stable ICU 4.2 974 */ 975 U_STABLE int32_t U_EXPORT2 976 uspoof_check(const USpoofChecker *sc, 977 const UChar *id, int32_t length, 978 int32_t *position, 979 UErrorCode *status); 980 981 982 /** 983 * Check the specified string for possible security issues. 984 * The text to be checked will typically be an identifier of some sort. 985 * The set of checks to be performed is specified with uspoof_setChecks(). 986 * 987 * \note 988 * Consider using the newer API, {@link uspoof_check2UTF8}, instead. 989 * The newer API exposes additional information from the check procedure 990 * and is otherwise identical to this method. 991 * 992 * @param sc The USpoofChecker 993 * @param id An identifier to be checked for possible security issues, in UTF-8 format. 994 * @param length the length of the string to be checked, or -1 if the string is 995 * zero terminated. 996 * @param position Deprecated in ICU 51. Always returns zero. 997 * Originally, an out parameter for the index of the first 998 * string position that failed a check. 999 * This parameter may be NULL. 1000 * @param status The error code, set if an error occurred while attempting to 1001 * perform the check. 1002 * Spoofing or security issues detected with the input string are 1003 * not reported here, but through the function's return value. 1004 * If the input contains invalid UTF-8 sequences, 1005 * a status of U_INVALID_CHAR_FOUND will be returned. 1006 * @return An integer value with bits set for any potential security 1007 * or spoofing issues detected. The bits are defined by 1008 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1009 * will be zero if the input string passes all of the 1010 * enabled checks.
1011 * @see uspoof_check2UTF8 1012 * @stable ICU 4.2 1013 */ 1014 U_STABLE int32_t U_EXPORT2 1015 uspoof_checkUTF8(const USpoofChecker *sc, 1016 const char *id, int32_t length, 1017 int32_t *position, 1018 UErrorCode *status); 1019 1020 1021 #if U_SHOW_CPLUSPLUS_API 1022 /** 1023 * Check the specified string for possible security issues. 1024 * The text to be checked will typically be an identifier of some sort. 1025 * The set of checks to be performed is specified with uspoof_setChecks(). 1026 * 1027 * \note 1028 * Consider using the newer API, {@link uspoof_check2UnicodeString}, instead. 1029 * The newer API exposes additional information from the check procedure 1030 * and is otherwise identical to this method. 1031 * 1032 * @param sc The USpoofChecker 1033 * @param id An identifier to be checked for possible security issues. 1034 * @param position Deprecated in ICU 51. Always returns zero. 1035 * Originally, an out parameter for the index of the first 1036 * string position that failed a check. 1037 * This parameter may be NULL. 1038 * @param status The error code, set if an error occurred while attempting to 1039 * perform the check. 1040 * Spoofing or security issues detected with the input string are 1041 * not reported here, but through the function's return value. 1042 * @return An integer value with bits set for any potential security 1043 * or spoofing issues detected. The bits are defined by 1044 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1045 * will be zero if the input string passes all of the 1046 * enabled checks. 1047 * @see uspoof_check2UnicodeString 1048 * @stable ICU 4.2 1049 */ 1050 U_STABLE int32_t U_EXPORT2 1051 uspoof_checkUnicodeString(const USpoofChecker *sc, 1052 const icu::UnicodeString &id, 1053 int32_t *position, 1054 UErrorCode *status); 1055 #endif 1056 1057 1058 /** 1059 * Check the specified string for possible security issues. 1060 * The text to be checked will typically be an identifier of some sort.
1061 * The set of checks to be performed is specified with uspoof_setChecks(). 1062 * 1063 * @param sc The USpoofChecker 1064 * @param id The identifier to be checked for possible security issues, 1065 * in UTF-16 format. 1066 * @param length the length of the string to be checked, or -1 if the string is 1067 * zero terminated. 1068 * @param checkResult An instance of USpoofCheckResult to be filled with 1069 * details about the identifier. Can be NULL. 1070 * @param status The error code, set if an error occurred while attempting to 1071 * perform the check. 1072 * Spoofing or security issues detected with the input string are 1073 * not reported here, but through the function's return value. 1074 * @return An integer value with bits set for any potential security 1075 * or spoofing issues detected. The bits are defined by 1076 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1077 * will be zero if the input string passes all of the 1078 * enabled checks. Any information in this bitmask will be 1079 * consistent with the information saved in the optional 1080 * checkResult parameter. 1081 * @see uspoof_openCheckResult 1082 * @see uspoof_check2UTF8 1083 * @see uspoof_check2UnicodeString 1084 * @stable ICU 58 1085 */ 1086 U_STABLE int32_t U_EXPORT2 1087 uspoof_check2(const USpoofChecker *sc, 1088 const UChar* id, int32_t length, 1089 USpoofCheckResult* checkResult, 1090 UErrorCode *status); 1091 1092 /** 1093 * Check the specified string for possible security issues. 1094 * The text to be checked will typically be an identifier of some sort. 1095 * The set of checks to be performed is specified with uspoof_setChecks(). 1096 * 1097 * This version of {@link uspoof_check} accepts a USpoofCheckResult, which 1098 * returns additional information about the identifier. For more 1099 * information, see {@link uspoof_openCheckResult}. 1100 * 1101 * @param sc The USpoofChecker 1102 * @param id An identifier to be checked for possible security issues, in UTF-8 format.
1103 * @param length the length of the string to be checked, or -1 if the string is 1104 * zero terminated. 1105 * @param checkResult An instance of USpoofCheckResult to be filled with 1106 * details about the identifier. Can be NULL. 1107 * @param status The error code, set if an error occurred while attempting to 1108 * perform the check. 1109 * Spoofing or security issues detected with the input string are 1110 * not reported here, but through the function's return value. 1111 * @return An integer value with bits set for any potential security 1112 * or spoofing issues detected. The bits are defined by 1113 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1114 * will be zero if the input string passes all of the 1115 * enabled checks. Any information in this bitmask will be 1116 * consistent with the information saved in the optional 1117 * checkResult parameter. 1118 * @see uspoof_openCheckResult 1119 * @see uspoof_check2 1120 * @see uspoof_check2UnicodeString 1121 * @stable ICU 58 1122 */ 1123 U_STABLE int32_t U_EXPORT2 1124 uspoof_check2UTF8(const USpoofChecker *sc, 1125 const char *id, int32_t length, 1126 USpoofCheckResult* checkResult, 1127 UErrorCode *status); 1128 1129 #if U_SHOW_CPLUSPLUS_API 1130 /** 1131 * Check the specified string for possible security issues. 1132 * The text to be checked will typically be an identifier of some sort. 1133 * The set of checks to be performed is specified with uspoof_setChecks(). 1134 * 1135 * @param sc The USpoofChecker 1136 * @param id An identifier to be checked for possible security issues. 1137 * @param checkResult An instance of USpoofCheckResult to be filled with 1138 * details about the identifier. Can be NULL. 1139 * @param status The error code, set if an error occurred while attempting to 1140 * perform the check. 1141 * Spoofing or security issues detected with the input string are 1142 * not reported here, but through the function's return value.
1143 * @return An integer value with bits set for any potential security 1144 * or spoofing issues detected. The bits are defined by 1145 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1146 * will be zero if the input string passes all of the 1147 * enabled checks. Any information in this bitmask will be 1148 * consistent with the information saved in the optional 1149 * checkResult parameter. 1150 * @see uspoof_openCheckResult 1151 * @see uspoof_check2 1152 * @see uspoof_check2UTF8 1153 * @stable ICU 58 1154 */ 1155 U_STABLE int32_t U_EXPORT2 1156 uspoof_check2UnicodeString(const USpoofChecker *sc, 1157 const icu::UnicodeString &id, 1158 USpoofCheckResult* checkResult, 1159 UErrorCode *status); 1160 #endif 1161 1162 /** 1163 * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return 1164 * information about the identifier. Information includes: 1165 * <ul> 1166 * <li>A bitmask of the checks that failed</li> 1167 * <li>The identifier's restriction level (UTS 39 section 5.2)</li> 1168 * <li>The set of numerics in the string (UTS 39 section 5.3)</li> 1169 * </ul> 1170 * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call 1171 * of {@link uspoof_check2}. 1172 * 1173 * @param status The error code, set if this function encounters a problem. 1174 * @return the newly created USpoofCheckResult 1175 * @see uspoof_check2 1176 * @see uspoof_check2UTF8 1177 * @see uspoof_check2UnicodeString 1178 * @stable ICU 58 1179 */ 1180 U_STABLE USpoofCheckResult* U_EXPORT2 1181 uspoof_openCheckResult(UErrorCode *status); 1182 1183 /** 1184 * Close a USpoofCheckResult, freeing any memory that was being held by 1185 * its implementation. 
1186 * 1187 * @param checkResult The instance of USpoofCheckResult to close 1188 * @stable ICU 58 1189 */ 1190 U_STABLE void U_EXPORT2 1191 uspoof_closeCheckResult(USpoofCheckResult *checkResult); 1192 1193 #if U_SHOW_CPLUSPLUS_API 1194 1195 U_NAMESPACE_BEGIN 1196 1197 /** 1198 * \class LocalUSpoofCheckResultPointer 1199 * "Smart pointer" class, closes a USpoofCheckResult via {@link uspoof_closeCheckResult}. 1200 * For most methods see the LocalPointerBase base class. 1201 * 1202 * @see LocalPointerBase 1203 * @see LocalPointer 1204 * @stable ICU 58 1205 */ 1206 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult); 1207 1208 U_NAMESPACE_END 1209 1210 #endif 1211 1212 /** 1213 * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests 1214 * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on. 1215 * 1216 * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} 1217 * @param status The error code, set if an error occurred. 1218 * @return An integer value with bits set for any potential security 1219 * or spoofing issues detected. The bits are defined by 1220 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1221 * will be zero if the input string passes all of the 1222 * enabled checks. 1223 * @see uspoof_setChecks 1224 * @stable ICU 58 1225 */ 1226 U_STABLE int32_t U_EXPORT2 1227 uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status); 1228 1229 /** 1230 * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check 1231 * was enabled; otherwise, undefined. 1232 * 1233 * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} 1234 * @param status The error code, set if an error occurred. 
1235 * @return The restriction level contained in the USpoofCheckResult 1236 * @see uspoof_setRestrictionLevel 1237 * @stable ICU 58 1238 */ 1239 U_STABLE URestrictionLevel U_EXPORT2 1240 uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status); 1241 1242 /** 1243 * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled; 1244 * otherwise, undefined. The set will contain the zero digit from each decimal number system found 1245 * in the input string. Ownership of the returned USet remains with the USpoofCheckResult. 1246 * The USet will be freed when {@link uspoof_closeCheckResult} is called. 1247 * 1248 * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} 1249 * @return The set of numerics contained in the USpoofCheckResult 1250 * @param status The error code, set if an error occurred. 1251 * @stable ICU 58 1252 */ 1253 U_STABLE const USet* U_EXPORT2 1254 uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status); 1255 1256 1257 /** 1258 * Check whether two specified strings are visually confusable. 1259 * 1260 * If the strings are confusable, the return value will be nonzero, as long as 1261 * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks(). 1262 * 1263 * The bits in the return value correspond to flags for each of the classes of 1264 * confusables applicable to the two input strings. According to UTS 39 1265 * section 4, the possible flags are: 1266 * 1267 * <ul> 1268 * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li> 1269 * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li> 1270 * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li> 1271 * </ul> 1272 * 1273 * If one or more of the above flags were not listed in uspoof_setChecks(), this 1274 * function will never report that class of confusable. The check 1275 * {@link USPOOF_CONFUSABLE} enables all three flags.
1276 * 1277 * 1278 * @param sc The USpoofChecker 1279 * @param id1 The first of the two identifiers to be compared for 1280 * confusability. The strings are in UTF-16 format. 1281 * @param length1 the length of the first identifier, expressed in 1282 * 16 bit UTF-16 code units, or -1 if the string is 1283 * nul terminated. 1284 * @param id2 The second of the two identifiers to be compared for 1285 * confusability. The identifiers are in UTF-16 format. 1286 * @param length2 The length of the second identifier, expressed in 1287 * 16 bit UTF-16 code units, or -1 if the string is 1288 * nul terminated. 1289 * @param status The error code, set if an error occurred while attempting to 1290 * perform the check. 1291 * Confusability of the identifiers is not reported here, 1292 * but through this function's return value. 1293 * @return An integer value with bit(s) set corresponding to 1294 * the type of confusability found, as defined by 1295 * enum USpoofChecks. Zero is returned if the identifiers 1296 * are not confusable. 1297 * 1298 * @stable ICU 4.2 1299 */ 1300 U_STABLE int32_t U_EXPORT2 1301 uspoof_areConfusable(const USpoofChecker *sc, 1302 const UChar *id1, int32_t length1, 1303 const UChar *id2, int32_t length2, 1304 UErrorCode *status); 1305 1306 1307 1308 /** 1309 * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format. 1310 * 1311 * @param sc The USpoofChecker 1312 * @param id1 The first of the two identifiers to be compared for 1313 * confusability. The strings are in UTF-8 format. 1314 * @param length1 the length of the first identifier, in bytes, or -1 1315 * if the string is nul terminated. 1316 * @param id2 The second of the two identifiers to be compared for 1317 * confusability. The strings are in UTF-8 format. 1318 * @param length2 The length of the second string in bytes, or -1 1319 * if the string is nul terminated. 1320 * @param status The error code, set if an error occurred while attempting to 1321 * perform the check.
1322 * Confusability of the strings is not reported here, 1323 * but through this function's return value. 1324 * @return An integer value with bit(s) set corresponding to 1325 * the type of confusability found, as defined by 1326 * enum USpoofChecks. Zero is returned if the strings 1327 * are not confusable. 1328 * 1329 * @stable ICU 4.2 1330 * 1331 * @see uspoof_areConfusable 1332 */ 1333 U_STABLE int32_t U_EXPORT2 1334 uspoof_areConfusableUTF8(const USpoofChecker *sc, 1335 const char *id1, int32_t length1, 1336 const char *id2, int32_t length2, 1337 UErrorCode *status); 1338 1339 1340 1341 1342 #if U_SHOW_CPLUSPLUS_API 1343 /** 1344 * A version of {@link uspoof_areConfusable} accepting UnicodeStrings. 1345 * 1346 * @param sc The USpoofChecker 1347 * @param s1 The first of the two identifiers to be compared for 1348 * confusability, as a UnicodeString. 1349 * @param s2 The second of the two identifiers to be compared for 1350 * confusability, as a UnicodeString. 1351 * @param status The error code, set if an error occurred while attempting to 1352 * perform the check. 1353 * Confusability of the identifiers is not reported here, 1354 * but through this function's return value. 1355 * @return An integer value with bit(s) set corresponding to 1356 * the type of confusability found, as defined by 1357 * enum USpoofChecks. Zero is returned if the identifiers 1358 * are not confusable. 1359 * 1360 * @stable ICU 4.2 1361 * 1362 * @see uspoof_areConfusable 1363 */ 1364 U_STABLE int32_t U_EXPORT2 1365 uspoof_areConfusableUnicodeString(const USpoofChecker *sc, 1366 const icu::UnicodeString &s1, 1367 const icu::UnicodeString &s2, 1368 UErrorCode *status); 1369 #endif 1370 1371 1372 /** 1373 * Get the "skeleton" for an identifier. 1374 * Skeletons are a transformation of the input identifier; 1375 * two identifiers are confusable if their skeletons are identical. 1376 * See Unicode UTS #39 for additional information.
1377 * 1378 * Using skeletons directly makes it possible to quickly check 1379 * whether an identifier is confusable with any of some large 1380 * set of existing identifiers, by creating an efficiently 1381 * searchable collection of the skeletons. 1382 * 1383 * @param sc The USpoofChecker 1384 * @param type Deprecated in ICU 58. You may pass any number. 1385 * Originally, controlled which of the Unicode confusable data 1386 * tables to use. 1387 * @param id The input identifier whose skeleton will be computed. 1388 * @param length The length of the input identifier, expressed in 16 bit 1389 * UTF-16 code units, or -1 if the string is zero terminated. 1390 * @param dest The output buffer, to receive the skeleton string. 1391 * @param destCapacity The length of the output buffer, in 16 bit units. 1392 * The destCapacity may be zero, in which case the function will 1393 * return the actual length of the skeleton. 1394 * @param status The error code, set if an error occurred while attempting to 1395 * perform the check. 1396 * @return The length of the skeleton string. The returned length 1397 * is always that of the complete skeleton, even when the 1398 * supplied buffer is too small (or of zero length). 1399 * 1400 * @stable ICU 4.2 1401 * @see uspoof_areConfusable 1402 */ 1403 U_STABLE int32_t U_EXPORT2 1404 uspoof_getSkeleton(const USpoofChecker *sc, 1405 uint32_t type, 1406 const UChar *id, int32_t length, 1407 UChar *dest, int32_t destCapacity, 1408 UErrorCode *status); 1409 1410 /** 1411 * Get the "skeleton" for an identifier. 1412 * Skeletons are a transformation of the input identifier; 1413 * two identifiers are confusable if their skeletons are identical. 1414 * See Unicode UTS #39 for additional information. 1415 * 1416 * Using skeletons directly makes it possible to quickly check 1417 * whether an identifier is confusable with any of some large 1418 * set of existing identifiers, by creating an efficiently 1419 * searchable collection of the skeletons.
1420 * 1421 * @param sc The USpoofChecker 1422 * @param type Deprecated in ICU 58. You may pass any number. 1423 * Originally, controlled which of the Unicode confusable data 1424 * tables to use. 1425 * @param id The UTF-8 format identifier whose skeleton will be computed. 1426 * @param length The length of the input string, in bytes, 1427 * or -1 if the string is zero terminated. 1428 * @param dest The output buffer, to receive the skeleton string. 1429 * @param destCapacity The length of the output buffer, in bytes. 1430 * The destCapacity may be zero, in which case the function will 1431 * return the actual length of the skeleton. 1432 * @param status The error code, set if an error occurred while attempting to 1433 * perform the check. Possible errors include U_INVALID_CHAR_FOUND 1434 * for invalid UTF-8 sequences, and 1435 * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small 1436 * to hold the complete skeleton. 1437 * @return The length of the skeleton string, in bytes. The returned length 1438 * is always that of the complete skeleton, even when the 1439 * supplied buffer is too small (or of zero length). 1440 * 1441 * @stable ICU 4.2 1442 */ 1443 U_STABLE int32_t U_EXPORT2 1444 uspoof_getSkeletonUTF8(const USpoofChecker *sc, 1445 uint32_t type, 1446 const char *id, int32_t length, 1447 char *dest, int32_t destCapacity, 1448 UErrorCode *status); 1449 1450 #if U_SHOW_CPLUSPLUS_API 1451 /** 1452 * Get the "skeleton" for an identifier. 1453 * Skeletons are a transformation of the input identifier; 1454 * two identifiers are confusable if their skeletons are identical. 1455 * See Unicode UTS #39 for additional information. 1456 * 1457 * Using skeletons directly makes it possible to quickly check 1458 * whether an identifier is confusable with any of some large 1459 * set of existing identifiers, by creating an efficiently 1460 * searchable collection of the skeletons. 1461 * 1462 * @param sc The USpoofChecker. 1463 * @param type Deprecated in ICU 58.
You may pass any number. 1464 * Originally, controlled which of the Unicode confusable data 1465 * tables to use. 1466 * @param id The input identifier whose skeleton will be computed. 1467 * @param dest The output identifier, to receive the skeleton string. 1468 * @param status The error code, set if an error occurred while attempting to 1469 * perform the check. 1470 * @return A reference to the destination (skeleton) string. 1471 * 1472 * @stable ICU 4.2 1473 */ 1474 U_I18N_API icu::UnicodeString & U_EXPORT2 1475 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, 1476 uint32_t type, 1477 const icu::UnicodeString &id, 1478 icu::UnicodeString &dest, 1479 UErrorCode *status); 1480 #endif /* U_SHOW_CPLUSPLUS_API */ 1481 1482 /** 1483 * Get the set of Candidate Characters for Inclusion in Identifiers, as defined 1484 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1485 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1486 * 1487 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1488 * be deleted by the caller. 1489 * 1490 * @param status The error code, set if a problem occurs while creating the set. 1491 * 1492 * @stable ICU 51 1493 */ 1494 U_STABLE const USet * U_EXPORT2 1495 uspoof_getInclusionSet(UErrorCode *status); 1496 1497 /** 1498 * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined 1499 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1500 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1501 * 1502 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1503 * be deleted by the caller. 1504 * 1505 * @param status The error code, set if a problem occurs while creating the set. 
1506 * 1507 * @stable ICU 51 1508 */ 1509 U_STABLE const USet * U_EXPORT2 1510 uspoof_getRecommendedSet(UErrorCode *status); 1511 1512 #if U_SHOW_CPLUSPLUS_API 1513 1514 /** 1515 * Get the set of Candidate Characters for Inclusion in Identifiers, as defined 1516 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1517 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1518 * 1519 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1520 * be deleted by the caller. 1521 * 1522 * @param status The error code, set if a problem occurs while creating the set. 1523 * 1524 * @stable ICU 51 1525 */ 1526 U_STABLE const icu::UnicodeSet * U_EXPORT2 1527 uspoof_getInclusionUnicodeSet(UErrorCode *status); 1528 1529 /** 1530 * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined 1531 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1532 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1533 * 1534 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1535 * be deleted by the caller. 1536 * 1537 * @param status The error code, set if a problem occurs while creating the set. 1538 * 1539 * @stable ICU 51 1540 */ 1541 U_STABLE const icu::UnicodeSet * U_EXPORT2 1542 uspoof_getRecommendedUnicodeSet(UErrorCode *status); 1543 1544 #endif /* U_SHOW_CPLUSPLUS_API */ 1545 1546 /** 1547 * Serialize the data for a spoof detector into a chunk of memory. 1548 * The flattened spoof detection tables can later be used to efficiently 1549 * instantiate a new Spoof Detector. 1550 * 1551 * The serialized spoof checker includes only the data compiled from the 1552 * Unicode data tables by uspoof_openFromSource(); it does not include 1553 * any other state or configuration that may have been set. 1554 * 1555 * @param sc the Spoof Detector whose data is to be serialized.
1556 * @param data a pointer to 32-bit-aligned memory to be filled with the data, 1557 * can be NULL if capacity==0 1558 * @param capacity the number of bytes available at data, 1559 * or 0 for preflighting 1560 * @param status an in/out ICU UErrorCode; possible errors include: 1561 * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization 1562 * - U_ILLEGAL_ARGUMENT_ERROR if the data or capacity parameters are bad 1563 * @return the number of bytes written or needed for the spoof data 1564 * 1565 * @see uspoof_openFromSerialized() 1566 * @stable ICU 4.2 1567 */ 1568 U_STABLE int32_t U_EXPORT2 1569 uspoof_serialize(USpoofChecker *sc, 1570 void *data, int32_t capacity, 1571 UErrorCode *status); 1572 1573 1574 #endif 1575 1576 #endif /* USPOOF_H */ 1577