1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 *************************************************************************** 5 * Copyright (C) 2008-2016, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 *************************************************************************** 8 * file name: uspoof.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2008Feb13 14 * created by: Andy Heninger 15 * 16 * Unicode Spoof Detection 17 */ 18 19 #ifndef USPOOF_H 20 #define USPOOF_H 21 22 #include "unicode/utypes.h" 23 #include "unicode/uset.h" 24 #include "unicode/parseerr.h" 25 #include "unicode/localpointer.h" 26 27 #if !UCONFIG_NO_NORMALIZATION 28 29 30 #if U_SHOW_CPLUSPLUS_API 31 #include "unicode/unistr.h" 32 #include "unicode/uniset.h" 33 #endif 34 35 36 /** 37 * \file 38 * \brief Unicode Security and Spoofing Detection, C API. 39 * 40 * <p> 41 * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and 42 * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions: 43 * 44 * <ol> 45 * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and 46 * "&Eta;arvest", where the second string starts with the Greek capital letter Eta.</li> 47 * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof 48 * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li> 49 * </ol> 50 * 51 * <p> 52 * Although originally designed as a method for flagging suspicious identifier strings such as URLs, 53 * <code>USpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word 54 * content filters. 
55 * 56 * <p> 57 * The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++. 58 * 59 * <h2>Confusables</h2> 60 * 61 * <p> 62 * The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings: 63 * 64 * \code{.c} 65 * UErrorCode status = U_ZERO_ERROR; 66 * UChar* str1 = (UChar*) u"Harvest"; 67 * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA 68 * 69 * USpoofChecker* sc = uspoof_open(&status); 70 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 71 * 72 * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status); 73 * UBool result = bitmask != 0; 74 * // areConfusable: 1 (status: U_ZERO_ERROR) 75 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); 76 * uspoof_close(sc); 77 * \endcode 78 * 79 * <p> 80 * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks} 81 * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the 82 * confusability test; and the following line extracts the result out of the return value. For best performance, 83 * the instance should be created once (e.g., upon application startup), and the efficient 84 * {@link uspoof_areConfusable} method can be used at runtime. 85 * 86 * <p> 87 * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call 88 * {@link uspoof_close} when the object goes out of scope: 89 * 90 * \code{.cpp} 91 * UErrorCode status = U_ZERO_ERROR; 92 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 93 * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status); 94 * // ... 95 * \endcode 96 * 97 * <p> 98 * UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can 99 * be thought of as a "hash code". 
{@link uspoof_getSkeleton} computes the skeleton for a particular string, so 100 * the following snippet is equivalent to the example above: 101 * 102 * \code{.c} 103 * UErrorCode status = U_ZERO_ERROR; 104 * UChar* str1 = (UChar*) u"Harvest"; 105 * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA 106 * 107 * USpoofChecker* sc = uspoof_open(&status); 108 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 109 * 110 * // Get skeleton 1 111 * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status); 112 * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar)); 113 * status = U_ZERO_ERROR; 114 * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status); 115 * 116 * // Get skeleton 2 117 * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status); 118 * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar)); 119 * status = U_ZERO_ERROR; 120 * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status); 121 * 122 * // Are the skeletons the same? 
123 * UBool result = u_strcmp(skel1, skel2) == 0; 124 * // areConfusable: 1 (status: U_ZERO_ERROR) 125 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); 126 * uspoof_close(sc); 127 * free(skel1); 128 * free(skel2); 129 * \endcode 130 * 131 * <p> 132 * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling 133 * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below: 134 * 135 * \code{.c} 136 * UErrorCode status = U_ZERO_ERROR; 137 * #define DICTIONARY_LENGTH 2 138 * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" }; 139 * UChar* skeletons[DICTIONARY_LENGTH]; 140 * UChar* str = (UChar*) u"1orern"; 141 * 142 * // Setup: 143 * USpoofChecker* sc = uspoof_open(&status); 144 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 145 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 146 * UChar* word = dictionary[i]; 147 * int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status); 148 * skeletons[i] = (UChar*) malloc(++len * sizeof(UChar)); 149 * status = U_ZERO_ERROR; 150 * uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status); 151 * } 152 * 153 * // Live Check: 154 * { 155 * int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status); 156 * UChar* skel = (UChar*) malloc(++len * sizeof(UChar)); 157 * status = U_ZERO_ERROR; 158 * uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status); 159 * UBool result = FALSE; 160 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 161 * result = u_strcmp(skel, skeletons[i]) == 0; 162 * if (result == TRUE) { break; } 163 * } 164 * // Has confusable in dictionary: 1 (status: U_ZERO_ERROR) 165 * printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status)); 166 * free(skel); 167 * } 168 * 169 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 170 * free(skeletons[i]); 171 * } 172 * uspoof_close(sc); 173 * \endcode 174 * 175 * <p> 
176 * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em> 177 * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons 178 * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons. 179 * 180 * <h2>Spoof Detection</h2> 181 * 182 * <p> 183 * The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a 184 * string: 185 * 186 * \code{.c} 187 * UErrorCode status = U_ZERO_ERROR; 188 * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A 189 * 190 * // Get the default set of allowable characters: 191 * USet* allowed = uset_openEmpty(); 192 * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); 193 * uset_addAll(allowed, uspoof_getInclusionSet(&status)); 194 * 195 * USpoofChecker* sc = uspoof_open(&status); 196 * uspoof_setAllowedChars(sc, allowed, &status); 197 * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); 198 * 199 * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status); 200 * UBool result = bitmask != 0; 201 * // fails checks: 1 (status: U_ZERO_ERROR) 202 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); 203 * uspoof_close(sc); 204 * uset_close(allowed); 205 * \endcode 206 * 207 * <p> 208 * As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at 209 * startup, and call the cheaper {@link uspoof_check} online. We specify the set of 210 * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. 211 * 212 * <p> 213 * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings, 214 * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers. 
215 * 216 * <p> 217 * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks 218 * is available in the returned bitmask. For complete information, use the {@link uspoof_check2} class of functions 219 * with a {@link USpoofCheckResult} parameter: 220 * 221 * \code{.c} 222 * UErrorCode status = U_ZERO_ERROR; 223 * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A 224 * 225 * // Get the default set of allowable characters: 226 * USet* allowed = uset_openEmpty(); 227 * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); 228 * uset_addAll(allowed, uspoof_getInclusionSet(&status)); 229 * 230 * USpoofChecker* sc = uspoof_open(&status); 231 * uspoof_setAllowedChars(sc, allowed, &status); 232 * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); 233 * 234 * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status); 235 * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status); 236 * 237 * int32_t failures1 = bitmask; 238 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status); 239 * assert(failures1 == failures2); 240 * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) 241 * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); 242 * 243 * // Cleanup: 244 * uspoof_close(sc); 245 * uset_close(allowed); 246 * uspoof_closeCheckResult(checkResult); 247 * \endcode 248 * 249 * C++ users can take advantage of a few syntactical conveniences. 
The following snippet is functionally 250 * equivalent to the one above: 251 * 252 * \code{.cpp} 253 * UErrorCode status = U_ZERO_ERROR; 254 * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A 255 * 256 * // Get the default set of allowable characters: 257 * UnicodeSet allowed; 258 * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); 259 * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); 260 * 261 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 262 * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); 263 * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); 264 * 265 * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); 266 * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); 267 * 268 * int32_t failures1 = bitmask; 269 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status); 270 * assert(failures1 == failures2); 271 * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) 272 * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); 273 * 274 * // Explicit cleanup not necessary. 275 * \endcode 276 * 277 * <p> 278 * The return value is a bitmask of the checks that failed. In this case, there was one check that failed: 279 * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). 
The possible checks are: 280 * 281 * <ul> 282 * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the 283 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS 284 * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li> 285 * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character 286 * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li> 287 * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable 288 * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li> 289 * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li> 290 * </ul> 291 * 292 * <p> 293 * These checks can be enabled independently of each other. 
For example, if you were interested in checking for only the 294 * INVISIBLE and MIXED_NUMBERS conditions, you could do: 295 * 296 * \code{.c} 297 * UErrorCode status = U_ZERO_ERROR; 298 * UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR 299 * 300 * USpoofChecker* sc = uspoof_open(&status); 301 * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status); 302 * 303 * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status); 304 * UBool result = bitmask != 0; 305 * // fails checks: 1 (status: U_ZERO_ERROR) 306 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); 307 * uspoof_close(sc); 308 * \endcode 309 * 310 * <p> 311 * Here is an example in C++ showing how to compute the restriction level of a string: 312 * 313 * \code{.cpp} 314 * UErrorCode status = U_ZERO_ERROR; 315 * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A 316 * 317 * // Get the default set of allowable characters: 318 * UnicodeSet allowed; 319 * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); 320 * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); 321 * 322 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 323 * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); 324 * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); 325 * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status); 326 * 327 * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); 328 * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); 329 * 330 * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status); 331 * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask: 332 * assert((restrictionLevel & bitmask) == restrictionLevel); 333 * // Restriction level: 0x50000000 (status: 
U_ZERO_ERROR) 334 * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status)); 335 * \endcode 336 * 337 * <p> 338 * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since 339 * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check. 340 * 341 * <p> 342 * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in 343 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings 344 * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have 345 * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is 346 * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed 347 * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on 348 * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of 349 * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code 350 * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple 351 * scripts. 352 * 353 * <h2>Additional Information</h2> 354 * 355 * <p> 356 * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers. 357 * 358 * <p> 359 * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether 360 * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads, 361 * using the same USpoofChecker instance. 
362 * 363 * <p> 364 * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are 365 * thread safe. Those that take a non-const USpoofChecker are not thread safe. 366 * 367 * @stable ICU 4.6 368 */ 369 370 struct USpoofChecker; 371 typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */ 372 373 #ifndef U_HIDE_DRAFT_API 374 /** 375 * @see uspoof_openCheckResult 376 */ 377 struct USpoofCheckResult; 378 /** 379 * @see uspoof_openCheckResult 380 */ 381 typedef struct USpoofCheckResult USpoofCheckResult; 382 #endif /* U_HIDE_DRAFT_API */ 383 384 /** 385 * Enum for the kinds of checks that USpoofChecker can perform. 386 * These enum values are used both to select the set of checks that 387 * will be performed, and to report results from the check function. 388 * 389 * @stable ICU 4.2 390 */ 391 typedef enum USpoofChecks { 392 /** 393 * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates 394 * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section 395 * 4. 396 * 397 * @see uspoof_areConfusable 398 * @stable ICU 4.2 399 */ 400 USPOOF_SINGLE_SCRIPT_CONFUSABLE = 1, 401 402 /** 403 * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates 404 * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS 405 * 39 section 4. 406 * 407 * @see uspoof_areConfusable 408 * @stable ICU 4.2 409 */ 410 USPOOF_MIXED_SCRIPT_CONFUSABLE = 2, 411 412 /** 413 * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates 414 * that the two strings are visually confusable and that they are not from the same script but both of them are 415 * single-script strings, according to UTS 39 section 4. 
416 * 417 * @see uspoof_areConfusable 418 * @stable ICU 4.2 419 */ 420 USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4, 421 422 #ifndef U_HIDE_DRAFT_API 423 /** 424 * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables. You may set 425 * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to 426 * make {@link uspoof_areConfusable} return only those types of confusables. 427 * 428 * @see uspoof_areConfusable 429 * @see uspoof_getSkeleton 430 * @draft ICU 58 431 */ 432 USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, 433 #endif /* U_HIDE_DRAFT_API */ 434 435 #ifndef U_HIDE_DEPRECATED_API 436 /** 437 * This flag is deprecated and no longer affects the behavior of SpoofChecker. 438 * 439 * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was deprecated. 440 */ 441 USPOOF_ANY_CASE = 8, 442 #endif /* U_HIDE_DEPRECATED_API */ 443 444 /** 445 * Check that an identifier is no looser than the specified RestrictionLevel. 446 * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE. 447 * 448 * If USPOOF_AUX_INFO is enabled the actual restriction level of the 449 * identifier being tested will also be returned by uspoof_check(). 450 * 451 * @see URestrictionLevel 452 * @see uspoof_setRestrictionLevel 453 * @see USPOOF_AUX_INFO 454 * 455 * @stable ICU 51 456 */ 457 USPOOF_RESTRICTION_LEVEL = 16, 458 459 #ifndef U_HIDE_DEPRECATED_API 460 /** Check that an identifier contains only characters from a 461 * single script (plus chars from the common and inherited scripts.) 462 * Applies to checks of a single identifier check only. 463 * @deprecated ICU 51 Use RESTRICTION_LEVEL instead. 
464 */ 465 USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL, 466 #endif /* U_HIDE_DEPRECATED_API */ 467 468 /** Check an identifier for the presence of invisible characters, 469 * such as zero-width spaces, or character sequences that are 470 * likely not to display, such as multiple occurrences of the same 471 * non-spacing mark. This check does not test the input string as a whole 472 * for conformance to any particular syntax for identifiers. 473 */ 474 USPOOF_INVISIBLE = 32, 475 476 /** Check that an identifier contains only characters from a specified set 477 * of acceptable characters. See {@link uspoof_setAllowedChars} and 478 * {@link uspoof_setAllowedLocales}. Note that a string that fails this check 479 * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check. 480 */ 481 USPOOF_CHAR_LIMIT = 64, 482 483 /** 484 * Check that an identifier does not mix numbers from different numbering systems. 485 * For more information, see UTS 39 section 5.3. 486 * 487 * @stable ICU 51 488 */ 489 USPOOF_MIXED_NUMBERS = 128, 490 491 /** 492 * Enable all spoof checks. 493 * 494 * @stable ICU 4.6 495 */ 496 USPOOF_ALL_CHECKS = 0xFFFF, 497 498 /** 499 * Enable the return of auxiliary (non-error) information in the 500 * upper bits of the check results value. 501 * 502 * If this "check" is not enabled, the results of {@link uspoof_check} will be 503 * zero when an identifier passes all of the enabled checks. 504 * 505 * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will 506 * be zero when an identifier passes all checks. 507 * 508 * @stable ICU 51 509 */ 510 USPOOF_AUX_INFO = 0x40000000 511 512 } USpoofChecks; 513 514 515 /** 516 * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and 517 * for returned identifier restriction levels in check results. 
518 * 519 * @stable ICU 51 520 * 521 * @see uspoof_setRestrictionLevel 522 * @see uspoof_check 523 */ 524 typedef enum URestrictionLevel { 525 /** 526 * All characters in the string are in the identifier profile and all characters in the string are in the 527 * ASCII range. 528 * 529 * @stable ICU 51 530 */ 531 USPOOF_ASCII = 0x10000000, 532 /** 533 * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and 534 * the string is single-script, according to the definition in UTS 39 section 5.1. 535 * 536 * @stable ICU 53 537 */ 538 USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000, 539 /** 540 * The string classifies as Single Script, or all characters in the string are in the identifier profile and 541 * the string is covered by any of the following sets of scripts, according to the definition in UTS 39 542 * section 5.1: 543 * <ul> 544 * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li> 545 * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li> 546 * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li> 547 * </ul> 548 * This is the default restriction in ICU. 549 * 550 * @stable ICU 51 551 */ 552 USPOOF_HIGHLY_RESTRICTIVE = 0x30000000, 553 /** 554 * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile 555 * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic, 556 * Greek, and Cherokee. 557 * 558 * @stable ICU 51 559 */ 560 USPOOF_MODERATELY_RESTRICTIVE = 0x40000000, 561 /** 562 * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts. 563 * 564 * @stable ICU 51 565 */ 566 USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000, 567 /** 568 * Any valid identifiers, including characters outside of the Identifier Profile. 
569 * 570 * @stable ICU 51 571 */ 572 USPOOF_UNRESTRICTIVE = 0x60000000, 573 /** 574 * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}. 575 * 576 * @stable ICU 53 577 */ 578 USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000, 579 #ifndef U_HIDE_INTERNAL_API 580 /** 581 * An undefined restriction level. 582 * @internal 583 */ 584 USPOOF_UNDEFINED_RESTRICTIVE = -1 585 #endif /* U_HIDE_INTERNAL_API */ 586 } URestrictionLevel; 587 588 /** 589 * Create a Unicode Spoof Checker, configured to perform all 590 * checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT. 591 * Note that additional checks may be added in the future, 592 * resulting in the changes to the default checking behavior. 593 * 594 * @param status The error code, set if this function encounters a problem. 595 * @return the newly created Spoof Checker 596 * @stable ICU 4.2 597 */ 598 U_STABLE USpoofChecker * U_EXPORT2 599 uspoof_open(UErrorCode *status); 600 601 602 /** 603 * Open a Spoof checker from its serialized form, stored in 32-bit-aligned memory. 604 * Inverse of uspoof_serialize(). 605 * The memory containing the serialized data must remain valid and unchanged 606 * as long as the spoof checker, or any cloned copies of the spoof checker, 607 * are in use. Ownership of the memory remains with the caller. 608 * The spoof checker (and any clones) must be closed prior to deleting the 609 * serialized data. 610 * 611 * @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data 612 * @param length the number of bytes available at data; 613 * can be more than necessary 614 * @param pActualLength receives the actual number of bytes at data taken up by the data; 615 * can be NULL 616 * @param pErrorCode ICU error code 617 * @return the spoof checker. 
618 * 619 * @see uspoof_open 620 * @see uspoof_serialize 621 * @stable ICU 4.2 622 */ 623 U_STABLE USpoofChecker * U_EXPORT2 624 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength, 625 UErrorCode *pErrorCode); 626 627 /** 628 * Open a Spoof Checker from the source form of the spoof data. 629 * The input corresponds to the Unicode data file confusables.txt 630 * as described in Unicode UTS #39. The syntax of the source data 631 * is as described in UTS #39 for this file, and the content of 632 * this file is acceptable input. 633 * 634 * The character encoding of the (char *) input text is UTF-8. 635 * 636 * @param confusables a pointer to the confusable characters definitions, 637 * as found in file confusables.txt from unicode.org. 638 * @param confusablesLen The length of the confusables text, or -1 if the 639 * input string is zero terminated. 640 * @param confusablesWholeScript 641 * Deprecated in ICU 58. No longer used. 642 * @param confusablesWholeScriptLen 643 * Deprecated in ICU 58. No longer used. 644 * @param errType In the event of an error in the input, indicates 645 * which of the input files contains the error. 646 * The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or 647 * USPOOF_WHOLE_SCRIPT_CONFUSABLE, or 648 * zero if no errors are found. 649 * @param pe In the event of an error in the input, receives the position 650 * in the input text (line, offset) of the error. 651 * @param status an in/out ICU UErrorCode. Among the possible errors is 652 * U_PARSE_ERROR, which is used to report syntax errors 653 * in the input. 654 * @return A spoof checker that uses the rules from the input files. 
655 * @stable ICU 4.2 656 */ 657 U_STABLE USpoofChecker * U_EXPORT2 658 uspoof_openFromSource(const char *confusables, int32_t confusablesLen, 659 const char *confusablesWholeScript, int32_t confusablesWholeScriptLen, 660 int32_t *errType, UParseError *pe, UErrorCode *status); 661 662 663 /** 664 * Close a Spoof Checker, freeing any memory that was being held by 665 * its implementation. 666 * @stable ICU 4.2 667 */ 668 U_STABLE void U_EXPORT2 669 uspoof_close(USpoofChecker *sc); 670 671 #if U_SHOW_CPLUSPLUS_API 672 673 U_NAMESPACE_BEGIN 674 675 /** 676 * \class LocalUSpoofCheckerPointer 677 * "Smart pointer" class, closes a USpoofChecker via uspoof_close(). 678 * For most methods see the LocalPointerBase base class. 679 * 680 * @see LocalPointerBase 681 * @see LocalPointer 682 * @stable ICU 4.4 683 */ 684 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckerPointer, USpoofChecker, uspoof_close); 685 686 U_NAMESPACE_END 687 688 #endif 689 690 /** 691 * Clone a Spoof Checker. The clone will be set to perform the same checks 692 * as the original source. 693 * 694 * @param sc The source USpoofChecker 695 * @param status The error code, set if this function encounters a problem. 696 * @return The cloned Spoof Checker 697 * @stable ICU 4.2 698 */ 699 U_STABLE USpoofChecker * U_EXPORT2 700 uspoof_clone(const USpoofChecker *sc, UErrorCode *status); 701 702 703 /** 704 * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method 705 * overwrites any checks that may have already been enabled. By default, all checks are enabled. 706 * 707 * To enable specific checks and disable all others, the "whitelisted" checks should be ORed together. 
For 708 * example, to fail strings containing characters outside of the set specified by {@link uspoof_setAllowedChars} and 709 * also strings that contain digits from mixed numbering systems: 710 * 711 * <pre> 712 * {@code 713 * uspoof_setChecks(sc, USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS, &status); 714 * } 715 * </pre> 716 * 717 * To disable specific checks and enable all others, the "blacklisted" checks should be ANDed away from 718 * ALL_CHECKS. For example, if you are not planning to use the {@link uspoof_areConfusable} functionality, 719 * it is good practice to disable the CONFUSABLE check: 720 * 721 * <pre> 722 * {@code 723 * uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE, &status); 724 * } 725 * </pre> 726 * 727 * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and 728 * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they 729 * enable onto the existing bitmask specified by this method. For more details, see the documentation of those 730 * methods. 731 * 732 * @param sc The USpoofChecker 733 * @param checks The set of checks that this spoof checker will perform. 734 * The value is a bit set, obtained by OR-ing together 735 * values from enum USpoofChecks. 736 * @param status The error code, set if this function encounters a problem. 737 * @stable ICU 4.2 738 * 739 */ 740 U_STABLE void U_EXPORT2 741 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status); 742 743 /** 744 * Get the set of checks that this Spoof Checker has been configured to perform. 745 * 746 * @param sc The USpoofChecker 747 * @param status The error code, set if this function encounters a problem. 748 * @return The set of checks that this spoof checker will perform. 749 * The value is a bit set, obtained by OR-ing together 750 * values from enum USpoofChecks. 
751 * @stable ICU 4.2 752 * 753 */ 754 U_STABLE int32_t U_EXPORT2 755 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status); 756 757 /** 758 * Set the loosest restriction level allowed for strings. The default if this is not called is 759 * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and 760 * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are 761 * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}. 762 * 763 * @param sc The USpoofChecker 764 * @param restrictionLevel The loosest restriction level allowed. 765 * @see URestrictionLevel 766 * @stable ICU 51 767 */ 768 U_STABLE void U_EXPORT2 769 uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel); 770 771 772 /** 773 * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}. 774 * 775 * @return The restriction level 776 * @see URestrictionLevel 777 * @stable ICU 51 778 */ 779 U_STABLE URestrictionLevel U_EXPORT2 780 uspoof_getRestrictionLevel(const USpoofChecker *sc); 781 782 /** 783 * Limit characters that are acceptable in identifiers being checked to those 784 * normally used with the languages associated with the specified locales. 785 * Any previously specified list of locales is replaced by the new settings. 786 * 787 * A set of languages is determined from the locale(s), and 788 * from those a set of acceptable Unicode scripts is determined. 789 * Characters from this set of scripts, along with characters from 790 * the "common" and "inherited" Unicode Script categories 791 * will be permitted. 792 * 793 * Supplying an empty string removes all restrictions; 794 * characters from any script will be allowed. 795 * 796 * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this 797 * USpoofChecker when calling this function with a non-empty list 798 * of locales. 
799 * 800 * The Unicode Set of characters that will be allowed is accessible 801 * via the uspoof_getAllowedChars() function. uspoof_setAllowedLocales() 802 * will <i>replace</i> any previously applied set of allowed characters. 803 * 804 * Adjustments, such as additions or deletions of certain classes of characters, 805 * can be made to the result of uspoof_setAllowedLocales() by 806 * fetching the resulting set with uspoof_getAllowedChars(), 807 * manipulating it with the Unicode Set API, then resetting the 808 * spoof detector's limits with uspoof_setAllowedChars(). 809 * 810 * @param sc The USpoofChecker 811 * @param localesList A list of locales, from which the language 812 * and associated script are extracted. The locales 813 * are comma-separated if there is more than one. 814 * White space may not appear within an individual locale, 815 * but is ignored otherwise. 816 * The locales are syntactically like those from the 817 * HTTP Accept-Language header. 818 * If the localesList is empty, no restrictions will be placed on 819 * the allowed characters. 820 * 821 * @param status The error code, set if this function encounters a problem. 822 * @stable ICU 4.2 823 */ 824 U_STABLE void U_EXPORT2 825 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status); 826 827 /** 828 * Get a list of locales for the scripts that are acceptable in strings 829 * to be checked. If no limitations on scripts have been specified, 830 * an empty string will be returned. 831 * 832 * uspoof_setAllowedChars() will reset the list of allowed locales to be empty. 833 * 834 * The format of the returned list is the same as that supplied to 835 * uspoof_setAllowedLocales(), but the returned list may not be identical 836 * to the originally specified string; the string may be reformatted, 837 * and information other than languages from 838 * the originally specified locales may be omitted.
839 * 840 * @param sc The USpoofChecker 841 * @param status The error code, set if this function encounters a problem. 842 * @return A string containing a list of locales corresponding 843 * to the acceptable scripts, formatted like an 844 * HTTP Accept Language value. 845 * 846 * @stable ICU 4.2 847 */ 848 U_STABLE const char * U_EXPORT2 849 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status); 850 851 852 /** 853 * Limit the acceptable characters to those specified by a Unicode Set. 854 * Any previously specified character limit 855 * is replaced by the new settings. This includes limits on 856 * characters that were set with the uspoof_setAllowedLocales() function. 857 * 858 * The USPOOF_CHAR_LIMIT test is automatically enabled for this 859 * USpoofChecker by this function. 860 * 861 * @param sc The USpoofChecker 862 * @param chars A Unicode Set containing the list of 863 * characters that are permitted. Ownership of the set 864 * remains with the caller. The incoming set is cloned by 865 * this function, so there are no restrictions on modifying 866 * or deleting the USet after calling this function. 867 * @param status The error code, set if this function encounters a problem. 868 * @stable ICU 4.2 869 */ 870 U_STABLE void U_EXPORT2 871 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status); 872 873 874 /** 875 * Get a USet for the characters permitted in an identifier. 876 * This corresponds to the limits imposed by the Set Allowed Characters 877 * functions. Limitations imposed by other checks will not be 878 * reflected in the set returned by this function. 879 * 880 * The returned set will be frozen, meaning that it cannot be modified 881 * by the caller. 882 * 883 * Ownership of the returned set remains with the Spoof Detector. The 884 * returned set will become invalid if the spoof detector is closed, 885 * or if a new set of allowed characters is specified.
886 * 887 * 888 * @param sc The USpoofChecker 889 * @param status The error code, set if this function encounters a problem. 890 * @return A USet containing the characters that are permitted by 891 * the USPOOF_CHAR_LIMIT test. 892 * @stable ICU 4.2 893 */ 894 U_STABLE const USet * U_EXPORT2 895 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status); 896 897 898 #if U_SHOW_CPLUSPLUS_API 899 /** 900 * Limit the acceptable characters to those specified by a Unicode Set. 901 * Any previously specified character limit 902 * is replaced by the new settings. This includes limits on 903 * characters that were set with the uspoof_setAllowedLocales() function. 904 * 905 * The USPOOF_CHAR_LIMIT test is automatically enabled for this 906 * USpoofChecker by this function. 907 * 908 * @param sc The USpoofChecker 909 * @param chars A Unicode Set containing the list of 910 * characters that are permitted. Ownership of the set 911 * remains with the caller. The incoming set is cloned by 912 * this function, so there are no restrictions on modifying 913 * or deleting the UnicodeSet after calling this function. 914 * @param status The error code, set if this function encounters a problem. 915 * @stable ICU 4.2 916 */ 917 U_STABLE void U_EXPORT2 918 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const icu::UnicodeSet *chars, UErrorCode *status); 919 920 921 /** 922 * Get a UnicodeSet for the characters permitted in an identifier. 923 * This corresponds to the limits imposed by the Set Allowed Characters / 924 * UnicodeSet functions. Limitations imposed by other checks will not be 925 * reflected in the set returned by this function. 926 * 927 * The returned set will be frozen, meaning that it cannot be modified 928 * by the caller. 929 * 930 * Ownership of the returned set remains with the Spoof Detector. The 931 * returned set will become invalid if the spoof detector is closed, 932 * or if a new set of allowed characters is specified.
933 * 934 * 935 * @param sc The USpoofChecker 936 * @param status The error code, set if this function encounters a problem. 937 * @return A UnicodeSet containing the characters that are permitted by 938 * the USPOOF_CHAR_LIMIT test. 939 * @stable ICU 4.2 940 */ 941 U_STABLE const icu::UnicodeSet * U_EXPORT2 942 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status); 943 #endif /* U_SHOW_CPLUSPLUS_API */ 944 945 946 /** 947 * Check the specified string for possible security issues. 948 * The text to be checked will typically be an identifier of some sort. 949 * The set of checks to be performed is specified with uspoof_setChecks(). 950 * 951 * \note 952 * Consider using the newer API, {@link uspoof_check2}, instead. 953 * The newer API exposes additional information from the check procedure 954 * and is otherwise identical to this method. 955 * 956 * @param sc The USpoofChecker 957 * @param id The identifier to be checked for possible security issues, 958 * in UTF-16 format. 959 * @param length the length of the string to be checked, expressed in 960 * 16 bit UTF-16 code units, or -1 if the string is 961 * zero terminated. 962 * @param position Deprecated in ICU 51. Always returns zero. 963 * Originally, an out parameter for the index of the first 964 * string position that failed a check. 965 * This parameter may be NULL. 966 * @param status The error code, set if an error occurred while attempting to 967 * perform the check. 968 * Spoofing or security issues detected with the input string are 969 * not reported here, but through the function's return value. 970 * @return An integer value with bits set for any potential security 971 * or spoofing issues detected. The bits are defined by 972 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 973 * will be zero if the input string passes all of the 974 * enabled checks.
975 * @see uspoof_check2 976 * @stable ICU 4.2 977 */ 978 U_STABLE int32_t U_EXPORT2 979 uspoof_check(const USpoofChecker *sc, 980 const UChar *id, int32_t length, 981 int32_t *position, 982 UErrorCode *status); 983 984 985 /** 986 * Check the specified string for possible security issues. 987 * The text to be checked will typically be an identifier of some sort. 988 * The set of checks to be performed is specified with uspoof_setChecks(). 989 * 990 * \note 991 * Consider using the newer API, {@link uspoof_check2UTF8}, instead. 992 * The newer API exposes additional information from the check procedure 993 * and is otherwise identical to this method. 994 * 995 * @param sc The USpoofChecker 996 * @param id An identifier to be checked for possible security issues, in UTF-8 format. 997 * @param length the length of the string to be checked, in bytes, or -1 if the string is 998 * zero terminated. 999 * @param position Deprecated in ICU 51. Always returns zero. 1000 * Originally, an out parameter for the index of the first 1001 * string position that failed a check. 1002 * This parameter may be NULL. 1003 * @param status The error code, set if an error occurred while attempting to 1004 * perform the check. 1005 * Spoofing or security issues detected with the input string are 1006 * not reported here, but through the function's return value. 1007 * If the input contains invalid UTF-8 sequences, 1008 * a status of U_INVALID_CHAR_FOUND will be returned. 1009 * @return An integer value with bits set for any potential security 1010 * or spoofing issues detected. The bits are defined by 1011 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1012 * will be zero if the input string passes all of the 1013 * enabled checks.
1014 * @see uspoof_check2UTF8 1015 * @stable ICU 4.2 1016 */ 1017 U_STABLE int32_t U_EXPORT2 1018 uspoof_checkUTF8(const USpoofChecker *sc, 1019 const char *id, int32_t length, 1020 int32_t *position, 1021 UErrorCode *status); 1022 1023 1024 #if U_SHOW_CPLUSPLUS_API 1025 /** 1026 * Check the specified string for possible security issues. 1027 * The text to be checked will typically be an identifier of some sort. 1028 * The set of checks to be performed is specified with uspoof_setChecks(). 1029 * 1030 * \note 1031 * Consider using the newer API, {@link uspoof_check2UnicodeString}, instead. 1032 * The newer API exposes additional information from the check procedure 1033 * and is otherwise identical to this method. 1034 * 1035 * @param sc The USpoofChecker 1036 * @param id An identifier to be checked for possible security issues. 1037 * @param position Deprecated in ICU 51. Always returns zero. 1038 * Originally, an out parameter for the index of the first 1039 * string position that failed a check. 1040 * This parameter may be NULL. 1041 * @param status The error code, set if an error occurred while attempting to 1042 * perform the check. 1043 * Spoofing or security issues detected with the input string are 1044 * not reported here, but through the function's return value. 1045 * @return An integer value with bits set for any potential security 1046 * or spoofing issues detected. The bits are defined by 1047 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1048 * will be zero if the input string passes all of the 1049 * enabled checks. 1050 * @see uspoof_check2UnicodeString 1051 * @stable ICU 4.2 1052 */ 1053 U_STABLE int32_t U_EXPORT2 1054 uspoof_checkUnicodeString(const USpoofChecker *sc, 1055 const icu::UnicodeString &id, 1056 int32_t *position, 1057 UErrorCode *status); 1058 #endif /* U_SHOW_CPLUSPLUS_API */ 1059 1060 1061 #ifndef U_HIDE_DRAFT_API 1062 /** 1063 * Check the specified string for possible security issues.
1064 * The text to be checked will typically be an identifier of some sort. 1065 * The set of checks to be performed is specified with uspoof_setChecks(). 1066 * 1067 * @param sc The USpoofChecker 1068 * @param id The identifier to be checked for possible security issues, 1069 * in UTF-16 format. 1070 * @param length the length of the string to be checked, expressed in 16 bit 1071 * UTF-16 code units, or -1 if the string is zero terminated. 1072 * @param checkResult An instance of USpoofCheckResult to be filled with 1073 * details about the identifier. Can be NULL. 1074 * @param status The error code, set if an error occurred while attempting to 1075 * perform the check. 1076 * Spoofing or security issues detected with the input string are 1077 * not reported here, but through the function's return value. 1078 * @return An integer value with bits set for any potential security 1079 * or spoofing issues detected. The bits are defined by 1080 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1081 * will be zero if the input string passes all of the 1082 * enabled checks. Any information in this bitmask will be 1083 * consistent with the information saved in the optional 1084 * checkResult parameter. 1085 * @see uspoof_openCheckResult 1086 * @see uspoof_check2UTF8 1087 * @see uspoof_check2UnicodeString 1088 * @draft ICU 58 1089 */ 1090 U_DRAFT int32_t U_EXPORT2 1091 uspoof_check2(const USpoofChecker *sc, 1092 const UChar* id, int32_t length, 1093 USpoofCheckResult* checkResult, 1094 UErrorCode *status); 1095 1096 /** 1097 * Check the specified string for possible security issues. 1098 * The text to be checked will typically be an identifier of some sort. 1099 * The set of checks to be performed is specified with uspoof_setChecks(). 1100 * 1101 * This version of {@link uspoof_checkUTF8} accepts a USpoofCheckResult, which 1102 * returns additional information about the identifier. For more 1103 * information, see {@link uspoof_openCheckResult}.
1104 * 1105 * @param sc The USpoofChecker 1106 * @param id An identifier to be checked for possible security issues, in UTF-8 format. 1107 * @param length the length of the string to be checked, in bytes, or -1 if the string is 1108 * zero terminated. 1109 * @param checkResult An instance of USpoofCheckResult to be filled with 1110 * details about the identifier. Can be NULL. 1111 * @param status The error code, set if an error occurred while attempting to 1112 * perform the check. 1113 * Spoofing or security issues detected with the input string are 1114 * not reported here, but through the function's return value. 1115 * @return An integer value with bits set for any potential security 1116 * or spoofing issues detected. The bits are defined by 1117 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1118 * will be zero if the input string passes all of the 1119 * enabled checks. Any information in this bitmask will be 1120 * consistent with the information saved in the optional 1121 * checkResult parameter. 1122 * @see uspoof_openCheckResult 1123 * @see uspoof_check2 1124 * @see uspoof_check2UnicodeString 1125 * @draft ICU 58 1126 */ 1127 U_DRAFT int32_t U_EXPORT2 1128 uspoof_check2UTF8(const USpoofChecker *sc, 1129 const char *id, int32_t length, 1130 USpoofCheckResult* checkResult, 1131 UErrorCode *status); 1132 1133 #if U_SHOW_CPLUSPLUS_API 1134 /** 1135 * Check the specified string for possible security issues. 1136 * The text to be checked will typically be an identifier of some sort. 1137 * The set of checks to be performed is specified with uspoof_setChecks(). 1138 * 1139 * @param sc The USpoofChecker 1140 * @param id An identifier to be checked for possible security issues. 1141 * @param checkResult An instance of USpoofCheckResult to be filled with 1142 * details about the identifier. Can be NULL. 1143 * @param status The error code, set if an error occurred while attempting to 1144 * perform the check.
1145 * Spoofing or security issues detected with the input string are 1146 * not reported here, but through the function's return value. 1147 * @return An integer value with bits set for any potential security 1148 * or spoofing issues detected. The bits are defined by 1149 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1150 * will be zero if the input string passes all of the 1151 * enabled checks. Any information in this bitmask will be 1152 * consistent with the information saved in the optional 1153 * checkResult parameter. 1154 * @see uspoof_openCheckResult 1155 * @see uspoof_check2 1156 * @see uspoof_check2UTF8 1157 * @draft ICU 58 1158 */ 1159 U_DRAFT int32_t U_EXPORT2 1160 uspoof_check2UnicodeString(const USpoofChecker *sc, 1161 const icu::UnicodeString &id, 1162 USpoofCheckResult* checkResult, 1163 UErrorCode *status); 1164 #endif /* U_SHOW_CPLUSPLUS_API */ 1165 1166 /** 1167 * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return 1168 * information about the identifier. Information includes: 1169 * <ul> 1170 * <li>A bitmask of the checks that failed</li> 1171 * <li>The identifier's restriction level (UTS 39 section 5.2)</li> 1172 * <li>The set of numerics in the string (UTS 39 section 5.3)</li> 1173 * </ul> 1174 * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call 1175 * of {@link uspoof_check2}. 1176 * 1177 * @param status The error code, set if this function encounters a problem. 1178 * @return the newly created USpoofCheckResult; release it with {@link uspoof_closeCheckResult} 1179 * @see uspoof_check2 1180 * @see uspoof_check2UTF8 1181 * @see uspoof_check2UnicodeString 1182 * @draft ICU 58 1183 */ 1184 U_DRAFT USpoofCheckResult* U_EXPORT2 1185 uspoof_openCheckResult(UErrorCode *status); 1186 1187 /** 1188 * Close a USpoofCheckResult, freeing any memory that was being held by 1189 * its implementation.
1190 * 1191 * @param checkResult The instance of USpoofCheckResult to close 1192 * @draft ICU 58 1193 */ 1194 U_DRAFT void U_EXPORT2 1195 uspoof_closeCheckResult(USpoofCheckResult *checkResult); 1196 1197 #if U_SHOW_CPLUSPLUS_API 1198 1199 U_NAMESPACE_BEGIN 1200 1201 /** 1202 * \class LocalUSpoofCheckResultPointer 1203 * "Smart pointer" class, closes a USpoofCheckResult via {@link uspoof_closeCheckResult}. 1204 * For most methods see the LocalPointerBase base class. 1205 * 1206 * @see LocalPointerBase 1207 * @see LocalPointer 1208 * @draft ICU 58 1209 */ 1210 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult); 1211 1212 U_NAMESPACE_END 1213 1214 #endif /* U_SHOW_CPLUSPLUS_API */ 1215 1216 /** 1217 * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests 1218 * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on. 1219 * 1220 * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} 1221 * @param status The error code, set if an error occurred. 1222 * @return An integer value with bits set for any potential security 1223 * or spoofing issues detected. The bits are defined by 1224 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1225 * will be zero if the input string passes all of the 1226 * enabled checks. 1227 * @see uspoof_setChecks 1228 * @draft ICU 58 1229 */ 1230 U_DRAFT int32_t U_EXPORT2 1231 uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status); 1232 1233 /** 1234 * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check 1235 * was enabled; otherwise, undefined.
1239 * @return The restriction level contained in the USpoofCheckResult 1240 * @see uspoof_setRestrictionLevel 1241 * @draft ICU 58 1242 */ 1243 U_DRAFT URestrictionLevel U_EXPORT2 1244 uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status); 1245 1246 /** 1247 * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled; 1248 * otherwise, undefined. The set will contain the zero digit from each decimal number system found 1249 * in the input string. Ownership of the returned USet remains with the USpoofCheckResult. 1250 * The USet will be freed when {@link uspoof_closeCheckResult} is called. 1251 * 1252 * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} 1253 * @return The set of numerics contained in the USpoofCheckResult 1254 * @param status The error code, set if an error occurred. 1255 * @draft ICU 58 1256 */ 1257 U_DRAFT const USet* U_EXPORT2 1258 uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status); 1259 #endif /* U_HIDE_DRAFT_API */ 1260 1261 1262 /** 1263 * Check whether two specified strings are visually confusable. 1264 * 1265 * If the strings are confusable, the return value will be nonzero, as long as 1266 * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks(). 1267 * 1268 * The bits in the return value correspond to flags for each of the classes of 1269 * confusables applicable to the two input strings. According to UTS 39 1270 * section 4, the possible flags are: 1271 * 1272 * <ul> 1273 * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li> 1274 * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li> 1275 * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li> 1276 * </ul> 1277 * 1278 * If one or more of the above flags were not listed in uspoof_setChecks(), this 1279 * function will never report that class of confusable. The check 1280 * {@link USPOOF_CONFUSABLE} enables all three flags.
1281 * 1282 * 1283 * @param sc The USpoofChecker 1284 * @param id1 The first of the two identifiers to be compared for 1285 * confusability. The strings are in UTF-16 format. 1286 * @param length1 the length of the first identifier, expressed in 1287 * 16 bit UTF-16 code units, or -1 if the string is 1288 * nul terminated. 1289 * @param id2 The second of the two identifiers to be compared for 1290 * confusability. The identifiers are in UTF-16 format. 1291 * @param length2 The length of the second identifier, expressed in 1292 * 16 bit UTF-16 code units, or -1 if the string is 1293 * nul terminated. 1294 * @param status The error code, set if an error occurred while attempting to 1295 * perform the check. 1296 * Confusability of the identifiers is not reported here, 1297 * but through this function's return value. 1298 * @return An integer value with bit(s) set corresponding to 1299 * the type of confusability found, as defined by 1300 * enum USpoofChecks. Zero is returned if the identifiers 1301 * are not confusable. 1302 * 1303 * @stable ICU 4.2 1304 */ 1305 U_STABLE int32_t U_EXPORT2 1306 uspoof_areConfusable(const USpoofChecker *sc, 1307 const UChar *id1, int32_t length1, 1308 const UChar *id2, int32_t length2, 1309 UErrorCode *status); 1310 1311 1312 1313 /** 1314 * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format. 1315 * 1316 * @param sc The USpoofChecker 1317 * @param id1 The first of the two identifiers to be compared for 1318 * confusability. The strings are in UTF-8 format. 1319 * @param length1 the length of the first identifier, in bytes, or -1 1320 * if the string is nul terminated. 1321 * @param id2 The second of the two identifiers to be compared for 1322 * confusability. The strings are in UTF-8 format. 1323 * @param length2 The length of the second string in bytes, or -1 1324 * if the string is nul terminated. 1325 * @param status The error code, set if an error occurred while attempting to 1326 * perform the check.
1327 * Confusability of the strings is not reported here, 1328 * but through this function's return value. 1329 * @return An integer value with bit(s) set corresponding to 1330 * the type of confusability found, as defined by 1331 * enum USpoofChecks. Zero is returned if the strings 1332 * are not confusable. 1333 * 1334 * @stable ICU 4.2 1335 * 1336 * @see uspoof_areConfusable 1337 */ 1338 U_STABLE int32_t U_EXPORT2 1339 uspoof_areConfusableUTF8(const USpoofChecker *sc, 1340 const char *id1, int32_t length1, 1341 const char *id2, int32_t length2, 1342 UErrorCode *status); 1343 1344 1345 1346 1347 #if U_SHOW_CPLUSPLUS_API 1348 /** 1349 * A version of {@link uspoof_areConfusable} accepting UnicodeStrings. 1350 * 1351 * @param sc The USpoofChecker 1352 * @param s1 The first of the two identifiers to be compared for 1353 * confusability, passed as a UnicodeString. 1354 * @param s2 The second of the two identifiers to be compared for 1355 * confusability, passed as a UnicodeString. 1356 * @param status The error code, set if an error occurred while attempting to 1357 * perform the check. 1358 * Confusability of the identifiers is not reported here, 1359 * but through this function's return value. 1360 * @return An integer value with bit(s) set corresponding to 1361 * the type of confusability found, as defined by 1362 * enum USpoofChecks. Zero is returned if the identifiers 1363 * are not confusable. 1364 * 1365 * @stable ICU 4.2 1366 * 1367 * @see uspoof_areConfusable 1368 */ 1369 U_STABLE int32_t U_EXPORT2 1370 uspoof_areConfusableUnicodeString(const USpoofChecker *sc, 1371 const icu::UnicodeString &s1, 1372 const icu::UnicodeString &s2, 1373 UErrorCode *status); 1374 #endif 1375 1376 1377 /** 1378 * Get the "skeleton" for an identifier. 1379 * Skeletons are a transformation of the input identifier; 1380 * Two identifiers are confusable if their skeletons are identical. 1381 * See Unicode UTS #39 for additional information.
1382 * 1383 * Using skeletons directly makes it possible to quickly check 1384 * whether an identifier is confusable with any of some large 1385 * set of existing identifiers, by creating an efficiently 1386 * searchable collection of the skeletons. 1387 * 1388 * @param sc The USpoofChecker 1389 * @param type Deprecated in ICU 58. You may pass any number. 1390 * Originally, controlled which of the Unicode confusable data 1391 * tables to use. 1392 * @param id The input identifier whose skeleton will be computed. 1393 * @param length The length of the input identifier, expressed in 16 bit 1394 * UTF-16 code units, or -1 if the string is zero terminated. 1395 * @param dest The output buffer, to receive the skeleton string. 1396 * @param destCapacity The length of the output buffer, in 16 bit units. 1397 * The destCapacity may be zero, in which case the function will 1398 * return the actual length of the skeleton. 1399 * @param status The error code, set if an error occurred while attempting to 1400 * perform the check. 1401 * @return The length of the skeleton string. The returned length 1402 * is always that of the complete skeleton, even when the 1403 * supplied buffer is too small (or of zero length). 1404 * 1405 * @stable ICU 4.2 1406 * @see uspoof_areConfusable 1407 */ 1408 U_STABLE int32_t U_EXPORT2 1409 uspoof_getSkeleton(const USpoofChecker *sc, 1410 uint32_t type, 1411 const UChar *id, int32_t length, 1412 UChar *dest, int32_t destCapacity, 1413 UErrorCode *status); 1414 1415 /** 1416 * Get the "skeleton" for an identifier. 1417 * Skeletons are a transformation of the input identifier; 1418 * Two identifiers are confusable if their skeletons are identical. 1419 * See Unicode UTS #39 for additional information. 1420 * 1421 * Using skeletons directly makes it possible to quickly check 1422 * whether an identifier is confusable with any of some large 1423 * set of existing identifiers, by creating an efficiently 1424 * searchable collection of the skeletons.
1425 * 1426 * @param sc The USpoofChecker 1427 * @param type Deprecated in ICU 58. You may pass any number. 1428 * Originally, controlled which of the Unicode confusable data 1429 * tables to use. 1430 * @param id The UTF-8 format identifier whose skeleton will be computed. 1431 * @param length The length of the input string, in bytes, 1432 * or -1 if the string is zero terminated. 1433 * @param dest The output buffer, to receive the skeleton string. 1434 * @param destCapacity The length of the output buffer, in bytes. 1435 * The destCapacity may be zero, in which case the function will 1436 * return the actual length of the skeleton. 1437 * @param status The error code, set if an error occurred while attempting to 1438 * perform the check. Possible errors include U_INVALID_CHAR_FOUND 1439 * for invalid UTF-8 sequences, and 1440 * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small 1441 * to hold the complete skeleton. 1442 * @return The length of the skeleton string, in bytes. The returned length 1443 * is always that of the complete skeleton, even when the 1444 * supplied buffer is too small (or of zero length). 1445 * 1446 * @stable ICU 4.2 1447 */ 1448 U_STABLE int32_t U_EXPORT2 1449 uspoof_getSkeletonUTF8(const USpoofChecker *sc, 1450 uint32_t type, 1451 const char *id, int32_t length, 1452 char *dest, int32_t destCapacity, 1453 UErrorCode *status); 1454 1455 #if U_SHOW_CPLUSPLUS_API 1456 /** 1457 * Get the "skeleton" for an identifier. 1458 * Skeletons are a transformation of the input identifier; 1459 * Two identifiers are confusable if their skeletons are identical. 1460 * See Unicode UTS #39 for additional information. 1461 * 1462 * Using skeletons directly makes it possible to quickly check 1463 * whether an identifier is confusable with any of some large 1464 * set of existing identifiers, by creating an efficiently 1465 * searchable collection of the skeletons. 1466 * 1467 * @param sc The USpoofChecker. 1468 * @param type Deprecated in ICU 58.
You may pass any number. 1469 * Originally, controlled which of the Unicode confusable data 1470 * tables to use. 1471 * @param id The input identifier whose skeleton will be computed. 1472 * @param dest The output identifier, to receive the skeleton string. 1473 * @param status The error code, set if an error occurred while attempting to 1474 * perform the check. 1475 * @return A reference to the destination (skeleton) string. 1476 * 1477 * @stable ICU 4.2 1478 */ 1479 U_I18N_API icu::UnicodeString & U_EXPORT2 1480 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, 1481 uint32_t type, 1482 const icu::UnicodeString &id, 1483 icu::UnicodeString &dest, 1484 UErrorCode *status); 1485 #endif /* U_SHOW_CPLUSPLUS_API */ 1486 1487 /** 1488 * Get the set of Candidate Characters for Inclusion in Identifiers, as defined 1489 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1490 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1491 * 1492 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1493 * be deleted by the caller. 1494 * 1495 * @param status The error code, set if a problem occurs while creating the set. 1496 * @return The frozen set of candidate characters; ownership remains with the ICU library. 1497 * @stable ICU 51 1498 */ 1499 U_STABLE const USet * U_EXPORT2 1500 uspoof_getInclusionSet(UErrorCode *status); 1501 1502 /** 1503 * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined 1504 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1505 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1506 * 1507 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1508 * be deleted by the caller. 1509 * 1510 * @param status The error code, set if a problem occurs while creating the set.
1511 * 1512 * @stable ICU 51 1513 */ 1514 U_STABLE const USet * U_EXPORT2 1515 uspoof_getRecommendedSet(UErrorCode *status); 1516 1517 #if U_SHOW_CPLUSPLUS_API 1518 1519 /** 1520 * Get the set of Candidate Characters for Inclusion in Identifiers, as defined 1521 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1522 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1523 * 1524 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1525 * be deleted by the caller. 1526 * 1527 * @param status The error code, set if a problem occurs while creating the set. 1528 * 1529 * @stable ICU 51 1530 */ 1531 U_STABLE const icu::UnicodeSet * U_EXPORT2 1532 uspoof_getInclusionUnicodeSet(UErrorCode *status); 1533 1534 /** 1535 * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined 1536 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1537 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1538 * 1539 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1540 * be deleted by the caller. 1541 * 1542 * @param status The error code, set if a problem occurs while creating the set. 1543 * 1544 * @stable ICU 51 1545 */ 1546 U_STABLE const icu::UnicodeSet * U_EXPORT2 1547 uspoof_getRecommendedUnicodeSet(UErrorCode *status); 1548 1549 #endif /* U_SHOW_CPLUSPLUS_API */ 1550 1551 /** 1552 * Serialize the data for a spoof detector into a chunk of memory. 1553 * The flattened spoof detection tables can later be used to efficiently 1554 * instantiate a new Spoof Detector. 1555 * 1556 * The serialized spoof checker includes only the data compiled from the 1557 * Unicode data tables by uspoof_openFromSource(); it does not 1558 * include any other state or configuration that may have been set. 1559 * 1560 * @param sc the Spoof Detector whose data is to be serialized.
1561 * @param data a pointer to 32-bit-aligned memory to be filled with the data, 1562 * can be NULL if capacity==0 1563 * @param capacity the number of bytes available at data, 1564 * or 0 for preflighting 1565 * @param status an in/out ICU UErrorCode; possible errors include: 1566 * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization 1567 * - U_ILLEGAL_ARGUMENT_ERROR the data or capacity parameters are bad 1568 * @return the number of bytes written or needed for the spoof data 1569 * 1570 * @see utrie2_openFromSerialized() 1571 * @stable ICU 4.2 1572 */ 1573 U_STABLE int32_t U_EXPORT2 1574 uspoof_serialize(USpoofChecker *sc, 1575 void *data, int32_t capacity, 1576 UErrorCode *status); 1577 1578 1579 #endif /* !UCONFIG_NO_NORMALIZATION */ 1580 1581 #endif /* USPOOF_H */ 1582