Home | History | Annotate | Download | only in i18n
      1 /*
      2 ***************************************************************************
      3 * Copyright (C) 2008-2011, International Business Machines Corporation
      4 * and others. All Rights Reserved.
      5 ***************************************************************************
      6 *   file name:  uspoof.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2008Feb13
     12 *   created by: Andy Heninger
     13 *
     14 *   Unicode Spoof Detection
     15 */
     16 #include "unicode/utypes.h"
     17 #include "unicode/uspoof.h"
     18 #include "unicode/unorm.h"
     19 #include "unicode/ustring.h"
     20 #include "cmemory.h"
     21 #include "uspoof_impl.h"
     22 #include "uassert.h"
     23 
     24 
     25 #if !UCONFIG_NO_NORMALIZATION
     26 
     27 U_NAMESPACE_USE
     28 
     29 
     30 U_CAPI USpoofChecker * U_EXPORT2
     31 uspoof_open(UErrorCode *status) {
     32     if (U_FAILURE(*status)) {
     33         return NULL;
     34     }
     35     SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
     36     if (U_FAILURE(*status)) {
     37         delete si;
     38         si = NULL;
     39     }
     40     return (USpoofChecker *)si;
     41 }
     42 
     43 
     44 U_CAPI USpoofChecker * U_EXPORT2
     45 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
     46                           UErrorCode *status) {
     47     if (U_FAILURE(*status)) {
     48         return NULL;
     49     }
     50     SpoofData *sd = new SpoofData(data, length, *status);
     51     SpoofImpl *si = new SpoofImpl(sd, *status);
     52     if (U_FAILURE(*status)) {
     53         delete sd;
     54         delete si;
     55         return NULL;
     56     }
     57     if (sd == NULL || si == NULL) {
     58         *status = U_MEMORY_ALLOCATION_ERROR;
     59         delete sd;
     60         delete si;
     61         return NULL;
     62     }
     63 
     64     if (pActualLength != NULL) {
     65         *pActualLength = sd->fRawData->fLength;
     66     }
     67     return reinterpret_cast<USpoofChecker *>(si);
     68 }
     69 
     70 
     71 U_CAPI USpoofChecker * U_EXPORT2
     72 uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
     73     const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
     74     if (src == NULL) {
     75         return NULL;
     76     }
     77     SpoofImpl *result = new SpoofImpl(*src, *status);   // copy constructor
     78     if (U_FAILURE(*status)) {
     79         delete result;
     80         result = NULL;
     81     }
     82     return (USpoofChecker *)result;
     83 }
     84 
     85 
     86 U_CAPI void U_EXPORT2
     87 uspoof_close(USpoofChecker *sc) {
     88     UErrorCode status = U_ZERO_ERROR;
     89     SpoofImpl *This = SpoofImpl::validateThis(sc, status);
     90     delete This;
     91 }
     92 
     93 
     94 U_CAPI void U_EXPORT2
     95 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
     96     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
     97     if (This == NULL) {
     98         return;
     99     }
    100 
    101     // Verify that the requested checks are all ones (bits) that
    102     //   are acceptable, known values.
    103     if (checks & ~USPOOF_ALL_CHECKS) {
    104         *status = U_ILLEGAL_ARGUMENT_ERROR;
    105         return;
    106     }
    107 
    108     This->fChecks = checks;
    109 }
    110 
    111 
    112 U_CAPI int32_t U_EXPORT2
    113 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
    114     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    115     if (This == NULL) {
    116         return 0;
    117     }
    118     return This->fChecks;
    119 }
    120 
    121 U_CAPI void U_EXPORT2
    122 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) {
    123     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    124     if (This == NULL) {
    125         return;
    126     }
    127     This->setAllowedLocales(localesList, *status);
    128 }
    129 
    130 U_CAPI const char * U_EXPORT2
    131 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) {
    132     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    133     if (This == NULL) {
    134         return NULL;
    135     }
    136     return This->getAllowedLocales(*status);
    137 }
    138 
    139 
    140 U_CAPI const USet * U_EXPORT2
    141 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
    142     const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
    143     return reinterpret_cast<const USet *>(result);
    144 }
    145 
    146 U_CAPI const UnicodeSet * U_EXPORT2
    147 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
    148     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    149     if (This == NULL) {
    150         return NULL;
    151     }
    152     return This->fAllowedCharsSet;
    153 }
    154 
    155 
    156 U_CAPI void U_EXPORT2
    157 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
    158     const UnicodeSet *set = reinterpret_cast<const UnicodeSet *>(chars);
    159     uspoof_setAllowedUnicodeSet(sc, set, status);
    160 }
    161 
    162 
    163 U_CAPI void U_EXPORT2
    164 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
    165     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    166     if (This == NULL) {
    167         return;
    168     }
    169     if (chars->isBogus()) {
    170         *status = U_ILLEGAL_ARGUMENT_ERROR;
    171         return;
    172     }
    173     UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
    174     if (clonedSet == NULL || clonedSet->isBogus()) {
    175         *status = U_MEMORY_ALLOCATION_ERROR;
    176         return;
    177     }
    178     clonedSet->freeze();
    179     delete This->fAllowedCharsSet;
    180     This->fAllowedCharsSet = clonedSet;
    181     This->fChecks |= USPOOF_CHAR_LIMIT;
    182 }
    183 
    184 
    185 U_CAPI int32_t U_EXPORT2
    186 uspoof_check(const USpoofChecker *sc,
    187              const UChar *text, int32_t length,
    188              int32_t *position,
    189              UErrorCode *status) {
    190 
    191     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    192     if (This == NULL) {
    193         return 0;
    194     }
    195     if (length < -1) {
    196         *status = U_ILLEGAL_ARGUMENT_ERROR;
    197         return 0;
    198     }
    199     if (length == -1) {
    200         // It's not worth the bother to handle nul terminated strings everywhere.
    201         //   Just get the length and be done with it.
    202         length = u_strlen(text);
    203     }
    204 
    205     int32_t result = 0;
    206     int32_t failPos = 0x7fffffff;   // TODO: do we have a #define for max int32?
    207 
    208     // A count of the number of non-Common or inherited scripts.
    209     // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests.
    210     // Share the computation when possible.  scriptCount == -1 means that we haven't
    211     // done it yet.
    212     int32_t scriptCount = -1;
    213 
    214     if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
    215         scriptCount = This->scriptScan(text, length, failPos, *status);
    216         // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
    217         if ( scriptCount >= 2) {
    218             // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
    219             result |= USPOOF_SINGLE_SCRIPT;
    220         }
    221     }
    222 
    223     if (This->fChecks & USPOOF_CHAR_LIMIT) {
    224         int32_t i;
    225         UChar32 c;
    226         for (i=0; i<length ;) {
    227             U16_NEXT(text, i, length, c);
    228             if (!This->fAllowedCharsSet->contains(c)) {
    229                 result |= USPOOF_CHAR_LIMIT;
    230                 if (i < failPos) {
    231                     failPos = i;
    232                 }
    233                 break;
    234             }
    235         }
    236     }
    237 
    238     if (This->fChecks &
    239         (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
    240         // These are the checks that need to be done on NFD input
    241         NFDBuffer   normalizedInput(text, length, *status);
    242         const UChar  *nfdText = normalizedInput.getBuffer();
    243         int32_t      nfdLength = normalizedInput.getLength();
    244 
    245         if (This->fChecks & USPOOF_INVISIBLE) {
    246 
    247             // scan for more than one occurence of the same non-spacing mark
    248             // in a sequence of non-spacing marks.
    249             int32_t     i;
    250             UChar32     c;
    251             UChar32     firstNonspacingMark = 0;
    252             UBool       haveMultipleMarks = FALSE;
    253             UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
    254 
    255             for (i=0; i<nfdLength ;) {
    256                 U16_NEXT(nfdText, i, nfdLength, c);
    257                 if (u_charType(c) != U_NON_SPACING_MARK) {
    258                     firstNonspacingMark = 0;
    259                     if (haveMultipleMarks) {
    260                         marksSeenSoFar.clear();
    261                         haveMultipleMarks = FALSE;
    262                     }
    263                     continue;
    264                 }
    265                 if (firstNonspacingMark == 0) {
    266                     firstNonspacingMark = c;
    267                     continue;
    268                 }
    269                 if (!haveMultipleMarks) {
    270                     marksSeenSoFar.add(firstNonspacingMark);
    271                     haveMultipleMarks = TRUE;
    272                 }
    273                 if (marksSeenSoFar.contains(c)) {
    274                     // report the error, and stop scanning.
    275                     // No need to find more than the first failure.
    276                     result |= USPOOF_INVISIBLE;
    277                     failPos = i;
    278                     // TODO: Bug 8655: failPos is the position in the NFD buffer, but what we want
    279                     //       to give back to our caller is a position in the original input string.
    280                     if (failPos > length) {
    281                         failPos = length;
    282                     }
    283                     break;
    284                 }
    285                 marksSeenSoFar.add(c);
    286             }
    287         }
    288 
    289 
    290         if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
    291             // The basic test is the same for both whole and mixed script confusables.
    292             // Compute the set of scripts that every input character has a confusable in.
    293             // For this computation an input character is always considered to be
    294             //    confusable with itself in its own script.
    295             // If the number of such scripts is two or more, and the input consisted of
    296             //   characters all from a single script, we have a whole script confusable.
    297             //   (The two scripts will be the original script and the one that is confusable)
    298             // If the number of such scripts >= one, and the original input contained characters from
    299             //   more than one script, we have a mixed script confusable.  (We can transform
    300             //   some of the characters, and end up with a visually similar string all in
    301             //   one script.)
    302 
    303             if (scriptCount == -1) {
    304                 int32_t t;
    305                 scriptCount = This->scriptScan(text, length, t, *status);
    306             }
    307 
    308             ScriptSet scripts;
    309             This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
    310             int32_t confusableScriptCount = scripts.countMembers();
    311             //printf("confusableScriptCount = %d\n", confusableScriptCount);
    312 
    313             if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
    314                 confusableScriptCount >= 2 &&
    315                 scriptCount == 1) {
    316                 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
    317             }
    318 
    319             if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
    320                 confusableScriptCount >= 1 &&
    321                 scriptCount > 1) {
    322                 result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
    323             }
    324         }
    325     }
    326     if (position != NULL && failPos != 0x7fffffff) {
    327         *position = failPos;
    328     }
    329     return result;
    330 }
    331 
    332 
    333 U_CAPI int32_t U_EXPORT2
    334 uspoof_checkUTF8(const USpoofChecker *sc,
    335                  const char *text, int32_t length,
    336                  int32_t *position,
    337                  UErrorCode *status) {
    338 
    339     if (U_FAILURE(*status)) {
    340         return 0;
    341     }
    342     UChar stackBuf[USPOOF_STACK_BUFFER_SIZE];
    343     UChar* text16 = stackBuf;
    344     int32_t len16;
    345 
    346     u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status);
    347     if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
    348         return 0;
    349     }
    350     if (*status == U_BUFFER_OVERFLOW_ERROR) {
    351         text16 = static_cast<UChar *>(uprv_malloc(len16 * sizeof(UChar) + 2));
    352         if (text16 == NULL) {
    353             *status = U_MEMORY_ALLOCATION_ERROR;
    354             return 0;
    355         }
    356         *status = U_ZERO_ERROR;
    357         u_strFromUTF8(text16, len16+1, NULL, text, length, status);
    358     }
    359 
    360     int32_t position16 = -1;
    361     int32_t result = uspoof_check(sc, text16, len16, &position16, status);
    362     if (U_FAILURE(*status)) {
    363         return 0;
    364     }
    365 
    366     if (position16 > 0) {
    367         // Translate a UTF-16 based error position back to a UTF-8 offset.
    368         // u_strToUTF8() in preflight mode is an easy way to do it.
    369         U_ASSERT(position16 <= len16);
    370         u_strToUTF8(NULL, 0, position, text16, position16, status);
    371         if (position > 0) {
    372             // position is the required buffer length from u_strToUTF8, which includes
    373             // space for a terminating NULL, which we don't want, hence the -1.
    374             *position -= 1;
    375         }
    376         *status = U_ZERO_ERROR;   // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR.
    377     }
    378 
    379     if (text16 != stackBuf) {
    380         uprv_free(text16);
    381     }
    382     return result;
    383 
    384 }
    385 
    386 /*  A convenience wrapper around the public uspoof_getSkeleton that handles
    387  *  allocating a larger buffer than provided if the original is too small.
    388  */
    389 static UChar *getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *s, int32_t inputLength,
    390                          UChar *dest, int32_t destCapacity, int32_t *outputLength, UErrorCode *status) {
    391     int32_t requiredCapacity = 0;
    392     UChar *buf = dest;
    393 
    394     if (U_FAILURE(*status)) {
    395         return NULL;
    396     }
    397     requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCapacity, status);
    398     if (*status == U_BUFFER_OVERFLOW_ERROR) {
    399         buf = static_cast<UChar *>(uprv_malloc(requiredCapacity * sizeof(UChar)));
    400         if (buf == NULL) {
    401             *status = U_MEMORY_ALLOCATION_ERROR;
    402             return NULL;
    403         }
    404         *status = U_ZERO_ERROR;
    405         uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, status);
    406     }
    407     *outputLength = requiredCapacity;
    408     return buf;
    409 }
    410 
    411 
    412 U_CAPI int32_t U_EXPORT2
    413 uspoof_areConfusable(const USpoofChecker *sc,
    414                      const UChar *s1, int32_t length1,
    415                      const UChar *s2, int32_t length2,
    416                      UErrorCode *status) {
    417     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    418     if (U_FAILURE(*status)) {
    419         return 0;
    420     }
    421     //
    422     // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
    423     //   and for definitions of the types (single, whole, mixed-script) of confusables.
    424 
    425     // We only care about a few of the check flags.  Ignore the others.
    426     // If no tests relavant to this function have been specified, return an error.
    427     // TODO:  is this really the right thing to do?  It's probably an error on the caller's part,
    428     //        but logically we would just return 0 (no error).
    429     if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
    430                           USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
    431         *status = U_INVALID_STATE_ERROR;
    432         return 0;
    433     }
    434     int32_t  flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
    435     UChar    s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
    436     UChar   *s1Skeleton;
    437     int32_t  s1SkeletonLength = 0;
    438 
    439     UChar    s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
    440     UChar   *s2Skeleton;
    441     int32_t  s2SkeletonLength = 0;
    442 
    443     int32_t  result = 0;
    444     int32_t  t;
    445     int32_t  s1ScriptCount = This->scriptScan(s1, length1, t, *status);
    446     int32_t  s2ScriptCount = This->scriptScan(s2, length2, t, *status);
    447 
    448     if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
    449         // Do the Single Script compare.
    450         if (s1ScriptCount <= 1 && s2ScriptCount <= 1) {
    451             flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    452             s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
    453                                      sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
    454             s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
    455                                      sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
    456             if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
    457                 result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    458             }
    459             if (s1Skeleton != s1SkeletonBuf) {
    460                 uprv_free(s1Skeleton);
    461             }
    462             if (s2Skeleton != s2SkeletonBuf) {
    463                 uprv_free(s2Skeleton);
    464             }
    465         }
    466     }
    467 
    468     if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
    469          // If the two inputs are single script confusable they cannot also be
    470          // mixed or whole script confusable, according to the UAX39 definitions.
    471          // So we can skip those tests.
    472          return result;
    473     }
    474 
    475     // Optimization for whole script confusables test:  two identifiers are whole script confusable if
    476     // each is of a single script and they are mixed script confusable.
    477     UBool possiblyWholeScriptConfusables =
    478         s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
    479 
    480     //
    481     // Mixed Script Check
    482     //
    483     if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
    484         // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
    485         // the mixed script table skeleton, which is what we want.
    486         // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
    487         flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    488         s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
    489                                  sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
    490         s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
    491                                  sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
    492         if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
    493             result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
    494             if (possiblyWholeScriptConfusables) {
    495                 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
    496             }
    497         }
    498         if (s1Skeleton != s1SkeletonBuf) {
    499             uprv_free(s1Skeleton);
    500         }
    501         if (s2Skeleton != s2SkeletonBuf) {
    502             uprv_free(s2Skeleton);
    503         }
    504     }
    505 
    506     return result;
    507 }
    508 
    509 
    510 // Convenience function for converting a UTF-8 input to a UChar * string, including
    511 //          reallocating a buffer when required.  Parameters and their interpretation mostly
    512 //          match u_strFromUTF8.
    513 
    514 static UChar * convertFromUTF8(UChar *outBuf, int32_t outBufCapacity, int32_t *outputLength,
    515                                const char *in, int32_t inLength, UErrorCode *status) {
    516     if (U_FAILURE(*status)) {
    517         return NULL;
    518     }
    519     UChar *dest = outBuf;
    520     u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status);
    521     if (*status == U_BUFFER_OVERFLOW_ERROR) {
    522         dest = static_cast<UChar *>(uprv_malloc(*outputLength * sizeof(UChar)));
    523         if (dest == NULL) {
    524             *status = U_MEMORY_ALLOCATION_ERROR;
    525             return NULL;
    526         }
    527         *status = U_ZERO_ERROR;
    528         u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status);
    529     }
    530     return dest;
    531 }
    532 
    533 
    534 
    535 U_CAPI int32_t U_EXPORT2
    536 uspoof_areConfusableUTF8(const USpoofChecker *sc,
    537                          const char *s1, int32_t length1,
    538                          const char *s2, int32_t length2,
    539                          UErrorCode *status) {
    540 
    541     SpoofImpl::validateThis(sc, *status);
    542     if (U_FAILURE(*status)) {
    543         return 0;
    544     }
    545 
    546     UChar    s1Buf[USPOOF_STACK_BUFFER_SIZE];
    547     int32_t  lengthS1U;
    548     UChar   *s1U = convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status);
    549 
    550     UChar    s2Buf[USPOOF_STACK_BUFFER_SIZE];
    551     int32_t  lengthS2U;
    552     UChar   *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status);
    553 
    554     int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, status);
    555 
    556     if (s1U != s1Buf) {
    557         uprv_free(s1U);
    558     }
    559     if (s2U != s2Buf) {
    560         uprv_free(s2U);
    561     }
    562     return results;
    563 }
    564 
    565 
    566 U_CAPI int32_t U_EXPORT2
    567 uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
    568                                   const U_NAMESPACE_QUALIFIER UnicodeString &s1,
    569                                   const U_NAMESPACE_QUALIFIER UnicodeString &s2,
    570                                   UErrorCode *status) {
    571 
    572     const UChar *u1  = s1.getBuffer();
    573     int32_t  length1 = s1.length();
    574     const UChar *u2  = s2.getBuffer();
    575     int32_t  length2 = s2.length();
    576 
    577     int32_t results  = uspoof_areConfusable(sc, u1, length1, u2, length2, status);
    578     return results;
    579 }
    580 
    581 
    582 
    583 
    584 U_CAPI int32_t U_EXPORT2
    585 uspoof_checkUnicodeString(const USpoofChecker *sc,
    586                           const U_NAMESPACE_QUALIFIER UnicodeString &text,
    587                           int32_t *position,
    588                           UErrorCode *status) {
    589     int32_t result = uspoof_check(sc, text.getBuffer(), text.length(), position, status);
    590     return result;
    591 }
    592 
    593 
    594 U_CAPI int32_t U_EXPORT2
    595 uspoof_getSkeleton(const USpoofChecker *sc,
    596                    uint32_t type,
    597                    const UChar *s,  int32_t length,
    598                    UChar *dest, int32_t destCapacity,
    599                    UErrorCode *status) {
    600 
    601     // TODO:  this function could be sped up a bit
    602     //        Skip the input normalization when not needed, work from callers data.
    603     //        Put the initial skeleton straight into the caller's destination buffer.
    604     //        It probably won't need normalization.
    605     //        But these would make the structure more complicated.
    606 
    607     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    608     if (U_FAILURE(*status)) {
    609         return 0;
    610     }
    611     if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
    612         (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
    613         *status = U_ILLEGAL_ARGUMENT_ERROR;
    614         return 0;
    615     }
    616 
    617    int32_t tableMask = 0;
    618    switch (type) {
    619       case 0:
    620         tableMask = USPOOF_ML_TABLE_FLAG;
    621         break;
    622       case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
    623         tableMask = USPOOF_SL_TABLE_FLAG;
    624         break;
    625       case USPOOF_ANY_CASE:
    626         tableMask = USPOOF_MA_TABLE_FLAG;
    627         break;
    628       case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
    629         tableMask = USPOOF_SA_TABLE_FLAG;
    630         break;
    631       default:
    632         *status = U_ILLEGAL_ARGUMENT_ERROR;
    633         return 0;
    634     }
    635 
    636     // NFD transform of the user supplied input
    637 
    638     UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
    639     UChar *nfdInput = nfdStackBuf;
    640     int32_t normalizedLen = unorm_normalize(
    641         s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
    642     if (*status == U_BUFFER_OVERFLOW_ERROR) {
    643         nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
    644         if (nfdInput == NULL) {
    645             *status = U_MEMORY_ALLOCATION_ERROR;
    646             return 0;
    647         }
    648         *status = U_ZERO_ERROR;
    649         normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
    650                                         nfdInput, normalizedLen+1, status);
    651     }
    652     if (U_FAILURE(*status)) {
    653         if (nfdInput != nfdStackBuf) {
    654             uprv_free(nfdInput);
    655         }
    656         return 0;
    657     }
    658 
    659     // buffer to hold the Unicode defined skeleton mappings for a single code point
    660     UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
    661 
    662     // Apply the skeleton mapping to the NFD normalized input string
    663     // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
    664     int32_t inputIndex = 0;
    665     UnicodeString skelStr;
    666     while (inputIndex < normalizedLen) {
    667         UChar32 c;
    668         U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
    669         int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
    670         skelStr.append(buf, replaceLen);
    671     }
    672 
    673     if (nfdInput != nfdStackBuf) {
    674         uprv_free(nfdInput);
    675     }
    676 
    677     const UChar *result = skelStr.getBuffer();
    678     int32_t  resultLen  = skelStr.length();
    679     UChar   *normedResult = NULL;
    680 
    681     // Check the skeleton for NFD, normalize it if needed.
    682     // Unnormalized results should be very rare.
    683     if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
    684         normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
    685         normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
    686         if (normedResult == NULL) {
    687             *status = U_MEMORY_ALLOCATION_ERROR;
    688             return 0;
    689         }
    690         *status = U_ZERO_ERROR;
    691         unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
    692         result = normedResult;
    693         resultLen = normalizedLen;
    694     }
    695 
    696     // Copy the skeleton to the caller's buffer
    697     if (U_SUCCESS(*status)) {
    698         if (destCapacity == 0 || resultLen > destCapacity) {
    699             *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
    700         } else {
    701             u_memcpy(dest, result, resultLen);
    702             if (destCapacity > resultLen) {
    703                 dest[resultLen] = 0;
    704             } else {
    705                 *status = U_STRING_NOT_TERMINATED_WARNING;
    706             }
    707         }
    708      }
    709      uprv_free(normedResult);
    710      return resultLen;
    711 }
    712 
    713 
    714 
    715 U_CAPI UnicodeString &  U_EXPORT2
    716 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
    717                                 uint32_t type,
    718                                 const UnicodeString &s,
    719                                 UnicodeString &dest,
    720                                 UErrorCode *status) {
    721     if (U_FAILURE(*status)) {
    722         return dest;
    723     }
    724     dest.remove();
    725 
    726     const UChar *str = s.getBuffer();
    727     int32_t      strLen = s.length();
    728     UChar        smallBuf[USPOOF_STACK_BUFFER_SIZE];
    729     UChar       *buf = smallBuf;
    730     int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USPOOF_STACK_BUFFER_SIZE, status);
    731     if (*status == U_BUFFER_OVERFLOW_ERROR) {
    732         buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar)));
    733         if (buf == NULL) {
    734             *status = U_MEMORY_ALLOCATION_ERROR;
    735             return dest;
    736         }
    737         *status = U_ZERO_ERROR;
    738         uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status);
    739     }
    740     if (U_SUCCESS(*status)) {
    741         dest.setTo(buf, outputSize);
    742     }
    743 
    744     if (buf != smallBuf) {
    745         uprv_free(buf);
    746     }
    747     return dest;
    748 }
    749 
    750 
    751 U_CAPI int32_t U_EXPORT2
    752 uspoof_getSkeletonUTF8(const USpoofChecker *sc,
    753                        uint32_t type,
    754                        const char *s,  int32_t length,
    755                        char *dest, int32_t destCapacity,
    756                        UErrorCode *status) {
    757     // Lacking a UTF-8 normalization API, just converting the input to
    758     // UTF-16 seems as good an approach as any.  In typical use, input will
    759     // be an identifier, which is to say not too long for stack buffers.
    760     if (U_FAILURE(*status)) {
    761         return 0;
    762     }
    763     // Buffers for the UChar form of the input and skeleton strings.
    764     UChar    smallInBuf[USPOOF_STACK_BUFFER_SIZE];
    765     UChar   *inBuf = smallInBuf;
    766     UChar    smallOutBuf[USPOOF_STACK_BUFFER_SIZE];
    767     UChar   *outBuf = smallOutBuf;
    768 
    769     int32_t  lengthInUChars = 0;
    770     int32_t  skelLengthInUChars = 0;
    771     int32_t  skelLengthInUTF8 = 0;
    772 
    773     u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars,
    774                   s, length, status);
    775     if (*status == U_BUFFER_OVERFLOW_ERROR) {
    776         inBuf = static_cast<UChar *>(uprv_malloc((lengthInUChars+1)*sizeof(UChar)));
    777         if (inBuf == NULL) {
    778             *status = U_MEMORY_ALLOCATION_ERROR;
    779             goto cleanup;
    780         }
    781         *status = U_ZERO_ERROR;
    782         u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars,
    783                       s, length, status);
    784     }
    785 
    786     skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
    787                                          outBuf, USPOOF_STACK_BUFFER_SIZE, status);
    788     if (*status == U_BUFFER_OVERFLOW_ERROR) {
    789         outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar)));
    790         if (outBuf == NULL) {
    791             *status = U_MEMORY_ALLOCATION_ERROR;
    792             goto cleanup;
    793         }
    794         *status = U_ZERO_ERROR;
    795         skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
    796                                          outBuf, skelLengthInUChars+1, status);
    797     }
    798 
    799     u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,
    800                 outBuf, skelLengthInUChars, status);
    801 
    802   cleanup:
    803     if (inBuf != smallInBuf) {
    804         uprv_free(inBuf);
    805     }
    806     if (outBuf != smallOutBuf) {
    807         uprv_free(outBuf);
    808     }
    809     return skelLengthInUTF8;
    810 }
    811 
    812 
    813 U_CAPI int32_t U_EXPORT2
    814 uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
    815     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    816     if (This == NULL) {
    817         U_ASSERT(U_FAILURE(*status));
    818         return 0;
    819     }
    820     int32_t dataSize = This->fSpoofData->fRawData->fLength;
    821     if (capacity < dataSize) {
    822         *status = U_BUFFER_OVERFLOW_ERROR;
    823         return dataSize;
    824     }
    825     uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
    826     return dataSize;
    827 }
    828 
    829 #endif
    830