Home | History | Annotate | Download | only in i18n
      1 /*
      2 ***************************************************************************
      3 * Copyright (C) 2008-2013, International Business Machines Corporation
      4 * and others. All Rights Reserved.
      5 ***************************************************************************
      6 *   file name:  uspoof.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2008Feb13
     12 *   created by: Andy Heninger
     13 *
     14 *   Unicode Spoof Detection
     15 */
     16 #include "unicode/utypes.h"
     17 #include "unicode/normalizer2.h"
     18 #include "unicode/uspoof.h"
     19 #include "unicode/ustring.h"
     20 #include "unicode/utf16.h"
     21 #include "cmemory.h"
     22 #include "cstring.h"
     23 #include "identifier_info.h"
     24 #include "mutex.h"
     25 #include "scriptset.h"
     26 #include "uassert.h"
     27 #include "ucln_in.h"
     28 #include "uspoof_impl.h"
     29 #include "umutex.h"
     30 
     31 
     32 #if !UCONFIG_NO_NORMALIZATION
     33 
     34 U_NAMESPACE_USE
     35 
     36 
     37 //
     38 // Static Objects used by the spoof impl, their thread safe initialization and their cleanup.
     39 //
// Set handed out by uspoof_getInclusionSet() / uspoof_getInclusionUnicodeSet().
// Created on demand by initializeStatics(), deleted by uspoof_cleanup().
static UnicodeSet *gInclusionSet = NULL;
// Set handed out by uspoof_getRecommendedSet() / uspoof_getRecommendedUnicodeSet().
// Created on demand by initializeStatics(), deleted by uspoof_cleanup().
static UnicodeSet *gRecommendedSet = NULL;
// NFD normalizer obtained from Normalizer2::getNFDInstance().
// uspoof_cleanup() only resets this pointer; it does not delete the normalizer.
static const Normalizer2 *gNfdNormalizer = NULL;
// Held by initializeStatics() while it creates the objects above.
static UMutex gInitMutex = U_MUTEX_INITIALIZER;
     44 
     45 static UBool U_CALLCONV
     46 uspoof_cleanup(void) {
     47     delete gInclusionSet;
     48     gInclusionSet = NULL;
     49     delete gRecommendedSet;
     50     gRecommendedSet = NULL;
     51     gNfdNormalizer = NULL;
     52     return TRUE;
     53 }
     54 
     55 static void initializeStatics() {
     56     Mutex m(&gInitMutex);
     57     UErrorCode status = U_ZERO_ERROR;
     58     if (gInclusionSet == NULL) {
     59         gInclusionSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\
     60             \\-.\\u00B7\\u05F3\\u05F4\\u0F0B\\u200C\\u200D\\u2019]"), status);
     61         gRecommendedSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\
     62             [0-z\\u00C0-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\
     63             \\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B\\u021E\
     64             \\u021F\\u0226-\\u0233\\u02BB\\u02BC\\u02EC\\u0300-\\u0304\
     65             \\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\
     66             \\u0328\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339\
     67             \\u0342-\\u0345\\u037B-\\u03CE\\u03FC-\\u045F\\u048A-\\u0525\
     68             \\u0531-\\u0586\\u05D0-\\u05F2\\u0621-\\u063F\\u0641-\\u0655\
     69             \\u0660-\\u0669\\u0670-\\u068D\\u068F-\\u06D5\\u06E5\\u06E6\
     70             \\u06EE-\\u06FF\\u0750-\\u07B1\\u0901-\\u0939\\u093C-\\u094D\
     71             \\u0950\\u0960-\\u0972\\u0979-\\u0A4D\\u0A5C-\\u0A74\\u0A81-\
     72             \\u0B43\\u0B47-\\u0B61\\u0B66-\\u0C56\\u0C60\\u0C61\\u0C66-\
     73             \\u0CD6\\u0CE0-\\u0CEF\\u0D02-\\u0D28\\u0D2A-\\u0D39\\u0D3D-\
     74             \\u0D43\\u0D46-\\u0D4D\\u0D57-\\u0D61\\u0D66-\\u0D8E\\u0D91-\
     75             \\u0DA5\\u0DA7-\\u0DDE\\u0DF2\\u0E01-\\u0ED9\\u0F00\\u0F20-\
     76             \\u0F8B\\u0F90-\\u109D\\u10D0-\\u10F0\\u10F7-\\u10FA\\u1200-\
     77             \\u135A\\u135F\\u1380-\\u138F\\u1401-\\u167F\\u1780-\\u17A2\
     78             \\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7-\
     79             \\u17DC\\u17E0-\\u17E9\\u1810-\\u18A8\\u18AA-\\u18F5\\u1E00-\
     80             \\u1E99\\u1F00-\\u1FFC\\u2D30-\\u2D65\\u2D80-\\u2DDE\\u3005-\
     81             \\u3007\\u3041-\\u31B7\\u3400-\\u9FCB\\uA000-\\uA48C\\uA67F\
     82             \\uA717-\\uA71F\\uA788\\uAA60-\\uAA7B\\uAC00-\\uD7A3\\uFA0E-\
     83             \\uFA29\\U00020000-\
     84             \\U0002B734]-[[:Cn:][:nfkcqc=n:][:XIDC=n:]]]"), status);
     85         gNfdNormalizer = Normalizer2::getNFDInstance(status);
     86     }
     87     ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup);
     88 
     89     return;
     90 }
     91 
     92 
     93 U_CAPI USpoofChecker * U_EXPORT2
     94 uspoof_open(UErrorCode *status) {
     95     if (U_FAILURE(*status)) {
     96         return NULL;
     97     }
     98     initializeStatics();
     99     SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
    100     if (U_FAILURE(*status)) {
    101         delete si;
    102         si = NULL;
    103     }
    104     return reinterpret_cast<USpoofChecker *>(si);
    105 }
    106 
    107 
    108 U_CAPI USpoofChecker * U_EXPORT2
    109 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
    110                           UErrorCode *status) {
    111     if (U_FAILURE(*status)) {
    112         return NULL;
    113     }
    114     initializeStatics();
    115     SpoofData *sd = new SpoofData(data, length, *status);
    116     SpoofImpl *si = new SpoofImpl(sd, *status);
    117     if (U_FAILURE(*status)) {
    118         delete sd;
    119         delete si;
    120         return NULL;
    121     }
    122     if (sd == NULL || si == NULL) {
    123         *status = U_MEMORY_ALLOCATION_ERROR;
    124         delete sd;
    125         delete si;
    126         return NULL;
    127     }
    128 
    129     if (pActualLength != NULL) {
    130         *pActualLength = sd->fRawData->fLength;
    131     }
    132     return reinterpret_cast<USpoofChecker *>(si);
    133 }
    134 
    135 
    136 U_CAPI USpoofChecker * U_EXPORT2
    137 uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
    138     const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
    139     if (src == NULL) {
    140         return NULL;
    141     }
    142     SpoofImpl *result = new SpoofImpl(*src, *status);   // copy constructor
    143     if (U_FAILURE(*status)) {
    144         delete result;
    145         result = NULL;
    146     }
    147     return reinterpret_cast<USpoofChecker *>(result);
    148 }
    149 
    150 
    151 U_CAPI void U_EXPORT2
    152 uspoof_close(USpoofChecker *sc) {
    153     UErrorCode status = U_ZERO_ERROR;
    154     SpoofImpl *This = SpoofImpl::validateThis(sc, status);
    155     delete This;
    156 }
    157 
    158 
    159 U_CAPI void U_EXPORT2
    160 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
    161     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    162     if (This == NULL) {
    163         return;
    164     }
    165 
    166     // Verify that the requested checks are all ones (bits) that
    167     //   are acceptable, known values.
    168     if (checks & ~USPOOF_ALL_CHECKS) {
    169         *status = U_ILLEGAL_ARGUMENT_ERROR;
    170         return;
    171     }
    172 
    173     This->fChecks = checks;
    174 }
    175 
    176 
    177 U_CAPI int32_t U_EXPORT2
    178 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
    179     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    180     if (This == NULL) {
    181         return 0;
    182     }
    183     return This->fChecks;
    184 }
    185 
    186 U_CAPI void U_EXPORT2
    187 uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel) {
    188     UErrorCode status = U_ZERO_ERROR;
    189     SpoofImpl *This = SpoofImpl::validateThis(sc, status);
    190     if (This != NULL) {
    191         This->fRestrictionLevel = restrictionLevel;
    192     }
    193 }
    194 
    195 U_CAPI URestrictionLevel U_EXPORT2
    196 uspoof_getRestrictionLevel(const USpoofChecker *sc) {
    197     UErrorCode status = U_ZERO_ERROR;
    198     const SpoofImpl *This = SpoofImpl::validateThis(sc, status);
    199     if (This == NULL) {
    200         return USPOOF_UNRESTRICTIVE;
    201     }
    202     return This->fRestrictionLevel;
    203 }
    204 
    205 U_CAPI void U_EXPORT2
    206 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) {
    207     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    208     if (This == NULL) {
    209         return;
    210     }
    211     This->setAllowedLocales(localesList, *status);
    212 }
    213 
    214 U_CAPI const char * U_EXPORT2
    215 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) {
    216     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    217     if (This == NULL) {
    218         return NULL;
    219     }
    220     return This->getAllowedLocales(*status);
    221 }
    222 
    223 
    224 U_CAPI const USet * U_EXPORT2
    225 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
    226     const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
    227     return result->toUSet();
    228 }
    229 
    230 U_CAPI const UnicodeSet * U_EXPORT2
    231 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
    232     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    233     if (This == NULL) {
    234         return NULL;
    235     }
    236     return This->fAllowedCharsSet;
    237 }
    238 
    239 
    240 U_CAPI void U_EXPORT2
    241 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
    242     const UnicodeSet *set = UnicodeSet::fromUSet(chars);
    243     uspoof_setAllowedUnicodeSet(sc, set, status);
    244 }
    245 
    246 
    247 U_CAPI void U_EXPORT2
    248 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
    249     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    250     if (This == NULL) {
    251         return;
    252     }
    253     if (chars->isBogus()) {
    254         *status = U_ILLEGAL_ARGUMENT_ERROR;
    255         return;
    256     }
    257     UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
    258     if (clonedSet == NULL || clonedSet->isBogus()) {
    259         *status = U_MEMORY_ALLOCATION_ERROR;
    260         return;
    261     }
    262     clonedSet->freeze();
    263     delete This->fAllowedCharsSet;
    264     This->fAllowedCharsSet = clonedSet;
    265     This->fChecks |= USPOOF_CHAR_LIMIT;
    266 }
    267 
    268 
    269 U_CAPI int32_t U_EXPORT2
    270 uspoof_check(const USpoofChecker *sc,
    271              const UChar *id, int32_t length,
    272              int32_t *position,
    273              UErrorCode *status) {
    274 
    275     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    276     if (This == NULL) {
    277         return 0;
    278     }
    279     if (length < -1) {
    280         *status = U_ILLEGAL_ARGUMENT_ERROR;
    281         return 0;
    282     }
    283     UnicodeString idStr((length == -1), id, length);  // Aliasing constructor.
    284     int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
    285     return result;
    286 }
    287 
    288 
    289 U_CAPI int32_t U_EXPORT2
    290 uspoof_checkUTF8(const USpoofChecker *sc,
    291                  const char *id, int32_t length,
    292                  int32_t *position,
    293                  UErrorCode *status) {
    294 
    295     if (U_FAILURE(*status)) {
    296         return 0;
    297     }
    298     UnicodeString idStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
    299     int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
    300     return result;
    301 }
    302 
    303 
    304 U_CAPI int32_t U_EXPORT2
    305 uspoof_areConfusable(const USpoofChecker *sc,
    306                      const UChar *id1, int32_t length1,
    307                      const UChar *id2, int32_t length2,
    308                      UErrorCode *status) {
    309     SpoofImpl::validateThis(sc, *status);
    310     if (U_FAILURE(*status)) {
    311         return 0;
    312     }
    313     if (length1 < -1 || length2 < -1) {
    314         *status = U_ILLEGAL_ARGUMENT_ERROR;
    315         return 0;
    316     }
    317 
    318     UnicodeString id1Str((length1==-1), id1, length1);  // Aliasing constructor
    319     UnicodeString id2Str((length2==-1), id2, length2);  // Aliasing constructor
    320     return uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status);
    321 }
    322 
    323 
    324 U_CAPI int32_t U_EXPORT2
    325 uspoof_areConfusableUTF8(const USpoofChecker *sc,
    326                          const char *id1, int32_t length1,
    327                          const char *id2, int32_t length2,
    328                          UErrorCode *status) {
    329     SpoofImpl::validateThis(sc, *status);
    330     if (U_FAILURE(*status)) {
    331         return 0;
    332     }
    333     if (length1 < -1 || length2 < -1) {
    334         *status = U_ILLEGAL_ARGUMENT_ERROR;
    335         return 0;
    336     }
    337     UnicodeString id1Str = UnicodeString::fromUTF8(StringPiece(id1, length1>=0? length1 : uprv_strlen(id1)));
    338     UnicodeString id2Str = UnicodeString::fromUTF8(StringPiece(id2, length2>=0? length2 : uprv_strlen(id2)));
    339     int32_t results = uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status);
    340     return results;
    341 }
    342 
    343 
    344 U_CAPI int32_t U_EXPORT2
    345 uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
    346                                   const icu::UnicodeString &id1,
    347                                   const icu::UnicodeString &id2,
    348                                   UErrorCode *status) {
    349     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    350     if (U_FAILURE(*status)) {
    351         return 0;
    352     }
    353     //
    354     // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
    355     //   and for definitions of the types (single, whole, mixed-script) of confusables.
    356 
    357     // We only care about a few of the check flags.  Ignore the others.
    358     // If no tests relavant to this function have been specified, return an error.
    359     // TODO:  is this really the right thing to do?  It's probably an error on the caller's part,
    360     //        but logically we would just return 0 (no error).
    361     if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
    362                           USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
    363         *status = U_INVALID_STATE_ERROR;
    364         return 0;
    365     }
    366     int32_t  flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
    367 
    368     int32_t  result = 0;
    369     IdentifierInfo *identifierInfo = This->getIdentifierInfo(*status);
    370     if (U_FAILURE(*status)) {
    371         return 0;
    372     }
    373     identifierInfo->setIdentifier(id1, *status);
    374     int32_t id1ScriptCount = identifierInfo->getScriptCount();
    375     identifierInfo->setIdentifier(id2, *status);
    376     int32_t id2ScriptCount = identifierInfo->getScriptCount();
    377     This->releaseIdentifierInfo(identifierInfo);
    378     identifierInfo = NULL;
    379 
    380     if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
    381         UnicodeString   id1Skeleton;
    382         UnicodeString   id2Skeleton;
    383         if (id1ScriptCount <= 1 && id2ScriptCount <= 1) {
    384             flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    385             uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
    386             uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
    387             if (id1Skeleton == id2Skeleton) {
    388                 result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    389             }
    390         }
    391     }
    392 
    393     if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
    394          // If the two inputs are single script confusable they cannot also be
    395          // mixed or whole script confusable, according to the UAX39 definitions.
    396          // So we can skip those tests.
    397          return result;
    398     }
    399 
    400     // Two identifiers are whole script confusable if each is of a single script
    401     // and they are mixed script confusable.
    402     UBool possiblyWholeScriptConfusables =
    403         id1ScriptCount <= 1 && id2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
    404 
    405     //
    406     // Mixed Script Check
    407     //
    408     if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
    409         // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
    410         // the mixed script table skeleton, which is what we want.
    411         // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
    412         UnicodeString id1Skeleton;
    413         UnicodeString id2Skeleton;
    414         flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    415         uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
    416         uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
    417         if (id1Skeleton == id2Skeleton) {
    418             result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
    419             if (possiblyWholeScriptConfusables) {
    420                 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
    421             }
    422         }
    423     }
    424 
    425     return result;
    426 }
    427 
    428 
    429 
    430 
    431 U_CAPI int32_t U_EXPORT2
    432 uspoof_checkUnicodeString(const USpoofChecker *sc,
    433                           const icu::UnicodeString &id,
    434                           int32_t *position,
    435                           UErrorCode *status) {
    436     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    437     if (This == NULL) {
    438         return 0;
    439     }
    440     int32_t result = 0;
    441 
    442     IdentifierInfo *identifierInfo = NULL;
    443     if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
    444         identifierInfo = This->getIdentifierInfo(*status);
    445         if (U_FAILURE(*status)) {
    446             goto cleanupAndReturn;
    447         }
    448         identifierInfo->setIdentifier(id, *status);
    449         identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
    450     }
    451 
    452 
    453     if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
    454         URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
    455         if (idRestrictionLevel > This->fRestrictionLevel) {
    456             result |= USPOOF_RESTRICTION_LEVEL;
    457         }
    458         if (This->fChecks & USPOOF_AUX_INFO) {
    459             result |= idRestrictionLevel;
    460         }
    461     }
    462 
    463     if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
    464         const UnicodeSet *numerics = identifierInfo->getNumerics();
    465         if (numerics->size() > 1) {
    466             result |= USPOOF_MIXED_NUMBERS;
    467         }
    468 
    469         // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
    470         //       We have no easy way to do the same in C.
    471         // if (checkResult != null) {
    472         //     checkResult.numerics = numerics;
    473         // }
    474     }
    475 
    476 
    477     if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
    478         int32_t i;
    479         UChar32 c;
    480         int32_t length = id.length();
    481         for (i=0; i<length ;) {
    482             c = id.char32At(i);
    483             i += U16_LENGTH(c);
    484             if (!This->fAllowedCharsSet->contains(c)) {
    485                 result |= USPOOF_CHAR_LIMIT;
    486                 break;
    487             }
    488         }
    489     }
    490 
    491     if (This->fChecks &
    492         (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
    493         // These are the checks that need to be done on NFD input
    494         UnicodeString nfdText;
    495         gNfdNormalizer->normalize(id, nfdText, *status);
    496         int32_t nfdLength = nfdText.length();
    497 
    498         if (This->fChecks & USPOOF_INVISIBLE) {
    499 
    500             // scan for more than one occurence of the same non-spacing mark
    501             // in a sequence of non-spacing marks.
    502             int32_t     i;
    503             UChar32     c;
    504             UChar32     firstNonspacingMark = 0;
    505             UBool       haveMultipleMarks = FALSE;
    506             UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
    507 
    508             for (i=0; i<nfdLength ;) {
    509                 c = nfdText.char32At(i);
    510                 i += U16_LENGTH(c);
    511                 if (u_charType(c) != U_NON_SPACING_MARK) {
    512                     firstNonspacingMark = 0;
    513                     if (haveMultipleMarks) {
    514                         marksSeenSoFar.clear();
    515                         haveMultipleMarks = FALSE;
    516                     }
    517                     continue;
    518                 }
    519                 if (firstNonspacingMark == 0) {
    520                     firstNonspacingMark = c;
    521                     continue;
    522                 }
    523                 if (!haveMultipleMarks) {
    524                     marksSeenSoFar.add(firstNonspacingMark);
    525                     haveMultipleMarks = TRUE;
    526                 }
    527                 if (marksSeenSoFar.contains(c)) {
    528                     // report the error, and stop scanning.
    529                     // No need to find more than the first failure.
    530                     result |= USPOOF_INVISIBLE;
    531                     break;
    532                 }
    533                 marksSeenSoFar.add(c);
    534             }
    535         }
    536 
    537 
    538         if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
    539             // The basic test is the same for both whole and mixed script confusables.
    540             // Compute the set of scripts that every input character has a confusable in.
    541             // For this computation an input character is always considered to be
    542             // confusable with itself in its own script.
    543             //
    544             // If the number of such scripts is two or more, and the input consisted of
    545             // characters all from a single script, we have a whole script confusable.
    546             // (The two scripts will be the original script and the one that is confusable)
    547             //
    548             // If the number of such scripts >= one, and the original input contained characters from
    549             // more than one script, we have a mixed script confusable.  (We can transform
    550             // some of the characters, and end up with a visually similar string all in
    551             // one script.)
    552 
    553             if (identifierInfo == NULL) {
    554                 identifierInfo = This->getIdentifierInfo(*status);
    555                 if (U_FAILURE(*status)) {
    556                     goto cleanupAndReturn;
    557                 }
    558                 identifierInfo->setIdentifier(id, *status);
    559             }
    560 
    561             int32_t scriptCount = identifierInfo->getScriptCount();
    562 
    563             ScriptSet scripts;
    564             This->wholeScriptCheck(nfdText, &scripts, *status);
    565             int32_t confusableScriptCount = scripts.countMembers();
    566             //printf("confusableScriptCount = %d\n", confusableScriptCount);
    567 
    568             if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
    569                 confusableScriptCount >= 2 &&
    570                 scriptCount == 1) {
    571                 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
    572             }
    573 
    574             if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
    575                 confusableScriptCount >= 1 &&
    576                 scriptCount > 1) {
    577                 result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
    578             }
    579         }
    580     }
    581 
    582 cleanupAndReturn:
    583     This->releaseIdentifierInfo(identifierInfo);
    584     if (position != NULL) {
    585         *position = 0;
    586     }
    587     return result;
    588 }
    589 
    590 
    591 U_CAPI int32_t U_EXPORT2
    592 uspoof_getSkeleton(const USpoofChecker *sc,
    593                    uint32_t type,
    594                    const UChar *id,  int32_t length,
    595                    UChar *dest, int32_t destCapacity,
    596                    UErrorCode *status) {
    597 
    598     SpoofImpl::validateThis(sc, *status);
    599     if (U_FAILURE(*status)) {
    600         return 0;
    601     }
    602     if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) {
    603         *status = U_ILLEGAL_ARGUMENT_ERROR;
    604         return 0;
    605     }
    606 
    607     UnicodeString idStr((length==-1), id, length);  // Aliasing constructor
    608     UnicodeString destStr;
    609     uspoof_getSkeletonUnicodeString(sc, type, idStr, destStr, status);
    610     destStr.extract(dest, destCapacity, *status);
    611     return destStr.length();
    612 }
    613 
    614 
    615 
    616 U_I18N_API UnicodeString &  U_EXPORT2
    617 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
    618                                 uint32_t type,
    619                                 const UnicodeString &id,
    620                                 UnicodeString &dest,
    621                                 UErrorCode *status) {
    622     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    623     if (U_FAILURE(*status)) {
    624         return dest;
    625     }
    626 
    627    int32_t tableMask = 0;
    628    switch (type) {
    629       case 0:
    630         tableMask = USPOOF_ML_TABLE_FLAG;
    631         break;
    632       case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
    633         tableMask = USPOOF_SL_TABLE_FLAG;
    634         break;
    635       case USPOOF_ANY_CASE:
    636         tableMask = USPOOF_MA_TABLE_FLAG;
    637         break;
    638       case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
    639         tableMask = USPOOF_SA_TABLE_FLAG;
    640         break;
    641       default:
    642         *status = U_ILLEGAL_ARGUMENT_ERROR;
    643         return dest;
    644     }
    645 
    646     UnicodeString nfdId;
    647     gNfdNormalizer->normalize(id, nfdId, *status);
    648 
    649     // Apply the skeleton mapping to the NFD normalized input string
    650     // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
    651     int32_t inputIndex = 0;
    652     UnicodeString skelStr;
    653     int32_t normalizedLen = nfdId.length();
    654     for (inputIndex=0; inputIndex < normalizedLen; ) {
    655         UChar32 c = nfdId.char32At(inputIndex);
    656         inputIndex += U16_LENGTH(c);
    657         This->confusableLookup(c, tableMask, skelStr);
    658     }
    659 
    660     gNfdNormalizer->normalize(skelStr, dest, *status);
    661     return dest;
    662 }
    663 
    664 
    665 U_CAPI int32_t U_EXPORT2
    666 uspoof_getSkeletonUTF8(const USpoofChecker *sc,
    667                        uint32_t type,
    668                        const char *id,  int32_t length,
    669                        char *dest, int32_t destCapacity,
    670                        UErrorCode *status) {
    671     SpoofImpl::validateThis(sc, *status);
    672     if (U_FAILURE(*status)) {
    673         return 0;
    674     }
    675     if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) {
    676         *status = U_ILLEGAL_ARGUMENT_ERROR;
    677         return 0;
    678     }
    679 
    680     UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
    681     UnicodeString destStr;
    682     uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status);
    683     if (U_FAILURE(*status)) {
    684         return 0;
    685     }
    686 
    687     int32_t lengthInUTF8 = 0;
    688     u_strToUTF8(dest, destCapacity, &lengthInUTF8,
    689                 destStr.getBuffer(), destStr.length(), status);
    690     return lengthInUTF8;
    691 }
    692 
    693 
    694 U_CAPI int32_t U_EXPORT2
    695 uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
    696     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    697     if (This == NULL) {
    698         U_ASSERT(U_FAILURE(*status));
    699         return 0;
    700     }
    701     int32_t dataSize = This->fSpoofData->fRawData->fLength;
    702     if (capacity < dataSize) {
    703         *status = U_BUFFER_OVERFLOW_ERROR;
    704         return dataSize;
    705     }
    706     uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
    707     return dataSize;
    708 }
    709 
    710 U_CAPI const USet * U_EXPORT2
    711 uspoof_getInclusionSet(UErrorCode *) {
    712     initializeStatics();
    713     return gInclusionSet->toUSet();
    714 }
    715 
    716 U_CAPI const USet * U_EXPORT2
    717 uspoof_getRecommendedSet(UErrorCode *) {
    718     initializeStatics();
    719     return gRecommendedSet->toUSet();
    720 }
    721 
    722 U_I18N_API const UnicodeSet * U_EXPORT2
    723 uspoof_getInclusionUnicodeSet(UErrorCode *) {
    724     initializeStatics();
    725     return gInclusionSet;
    726 }
    727 
    728 U_I18N_API const UnicodeSet * U_EXPORT2
    729 uspoof_getRecommendedUnicodeSet(UErrorCode *) {
    730     initializeStatics();
    731     return gRecommendedSet;
    732 }
    733 
    734 
    735 
    736 #endif // !UCONFIG_NO_NORMALIZATION
    737