Home | History | Annotate | Download | only in i18n
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2008-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 #include "unicode/uspoof.h"
     12 #include "unicode/uchar.h"
     13 #include "unicode/uniset.h"
     14 #include "unicode/utf16.h"
     15 #include "utrie2.h"
     16 #include "cmemory.h"
     17 #include "cstring.h"
     18 #include "scriptset.h"
     19 #include "umutex.h"
     20 #include "udataswp.h"
     21 #include "uassert.h"
     22 #include "ucln_in.h"
     23 #include "uspoof_impl.h"
     24 
     25 #if !UCONFIG_NO_NORMALIZATION
     26 
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
     31 
     32 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
     33     construct(status);
     34     fSpoofData = data;
     35 }
     36 
     37 SpoofImpl::SpoofImpl(UErrorCode& status) {
     38     construct(status);
     39 
     40     // TODO: Call this method where it is actually needed, instead of in the
     41     // constructor, to allow for lazy data loading.  See #12696.
     42     fSpoofData = SpoofData::getDefault(status);
     43 }
     44 
     45 SpoofImpl::SpoofImpl() {
     46     UErrorCode status = U_ZERO_ERROR;
     47     construct(status);
     48 
     49     // TODO: Call this method where it is actually needed, instead of in the
     50     // constructor, to allow for lazy data loading.  See #12696.
     51     fSpoofData = SpoofData::getDefault(status);
     52 }
     53 
     54 void SpoofImpl::construct(UErrorCode& status) {
     55     fMagic = USPOOF_MAGIC;
     56     fChecks = USPOOF_ALL_CHECKS;
     57     fSpoofData = NULL;
     58     fAllowedCharsSet = NULL;
     59     fAllowedLocales = NULL;
     60     fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
     61 
     62     if (U_FAILURE(status)) { return; }
     63 
     64     UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
     65     fAllowedCharsSet = allowedCharsSet;
     66     fAllowedLocales  = uprv_strdup("");
     67     if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
     68         status = U_MEMORY_ALLOCATION_ERROR;
     69         return;
     70     }
     71     allowedCharsSet->freeze();
     72 }
     73 
     74 
     75 // Copy Constructor, used by the user level clone() function.
     76 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
     77         fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
     78         fAllowedLocales(NULL) {
     79     if (U_FAILURE(status)) {
     80         return;
     81     }
     82     fMagic = src.fMagic;
     83     fChecks = src.fChecks;
     84     if (src.fSpoofData != NULL) {
     85         fSpoofData = src.fSpoofData->addReference();
     86     }
     87     fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
     88     fAllowedLocales = uprv_strdup(src.fAllowedLocales);
     89     if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
     90         status = U_MEMORY_ALLOCATION_ERROR;
     91     }
     92     fRestrictionLevel = src.fRestrictionLevel;
     93 }
     94 
     95 SpoofImpl::~SpoofImpl() {
     96     fMagic = 0;                // head off application errors by preventing use of
     97                                //    of deleted objects.
     98     if (fSpoofData != NULL) {
     99         fSpoofData->removeReference();   // Will delete if refCount goes to zero.
    100     }
    101     delete fAllowedCharsSet;
    102     uprv_free((void *)fAllowedLocales);
    103 }
    104 
    105 //  Cast this instance as a USpoofChecker for the C API.
    106 USpoofChecker *SpoofImpl::asUSpoofChecker() {
    107     return reinterpret_cast<USpoofChecker*>(this);
    108 }
    109 
    110 //
    111 //  Incoming parameter check on Status and the SpoofChecker object
    112 //    received from the C API.
    113 //
    114 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
    115     if (U_FAILURE(status)) {
    116         return NULL;
    117     }
    118     if (sc == NULL) {
    119         status = U_ILLEGAL_ARGUMENT_ERROR;
    120         return NULL;
    121     }
    122     SpoofImpl *This = (SpoofImpl *)sc;
    123     if (This->fMagic != USPOOF_MAGIC) {
    124         status = U_INVALID_FORMAT_ERROR;
    125         return NULL;
    126     }
    127     if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
    128         return NULL;
    129     }
    130     return This;
    131 }
    132 
    133 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
    134     return const_cast<SpoofImpl *>
    135         (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
    136 }
    137 
    138 
    139 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
    140     UnicodeSet    allowedChars;
    141     UnicodeSet    *tmpSet = NULL;
    142     const char    *locStart = localesList;
    143     const char    *locEnd = NULL;
    144     const char    *localesListEnd = localesList + uprv_strlen(localesList);
    145     int32_t        localeListCount = 0;   // Number of locales provided by caller.
    146 
    147     // Loop runs once per locale from the localesList, a comma separated list of locales.
    148     do {
    149         locEnd = uprv_strchr(locStart, ',');
    150         if (locEnd == NULL) {
    151             locEnd = localesListEnd;
    152         }
    153         while (*locStart == ' ') {
    154             locStart++;
    155         }
    156         const char *trimmedEnd = locEnd-1;
    157         while (trimmedEnd > locStart && *trimmedEnd == ' ') {
    158             trimmedEnd--;
    159         }
    160         if (trimmedEnd <= locStart) {
    161             break;
    162         }
    163         const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
    164         localeListCount++;
    165 
    166         // We have one locale from the locales list.
    167         // Add the script chars for this locale to the accumulating set of allowed chars.
    168         // If the locale is no good, we will be notified back via status.
    169         addScriptChars(locale, &allowedChars, status);
    170         uprv_free((void *)locale);
    171         if (U_FAILURE(status)) {
    172             break;
    173         }
    174         locStart = locEnd + 1;
    175     } while (locStart < localesListEnd);
    176 
    177     // If our caller provided an empty list of locales, we disable the allowed characters checking
    178     if (localeListCount == 0) {
    179         uprv_free((void *)fAllowedLocales);
    180         fAllowedLocales = uprv_strdup("");
    181         tmpSet = new UnicodeSet(0, 0x10ffff);
    182         if (fAllowedLocales == NULL || tmpSet == NULL) {
    183             status = U_MEMORY_ALLOCATION_ERROR;
    184             return;
    185         }
    186         tmpSet->freeze();
    187         delete fAllowedCharsSet;
    188         fAllowedCharsSet = tmpSet;
    189         fChecks &= ~USPOOF_CHAR_LIMIT;
    190         return;
    191     }
    192 
    193 
    194     // Add all common and inherited characters to the set of allowed chars.
    195     UnicodeSet tempSet;
    196     tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
    197     allowedChars.addAll(tempSet);
    198     tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
    199     allowedChars.addAll(tempSet);
    200 
    201     // If anything went wrong, we bail out without changing
    202     // the state of the spoof checker.
    203     if (U_FAILURE(status)) {
    204         return;
    205     }
    206 
    207     // Store the updated spoof checker state.
    208     tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
    209     const char *tmpLocalesList = uprv_strdup(localesList);
    210     if (tmpSet == NULL || tmpLocalesList == NULL) {
    211         status = U_MEMORY_ALLOCATION_ERROR;
    212         return;
    213     }
    214     uprv_free((void *)fAllowedLocales);
    215     fAllowedLocales = tmpLocalesList;
    216     tmpSet->freeze();
    217     delete fAllowedCharsSet;
    218     fAllowedCharsSet = tmpSet;
    219     fChecks |= USPOOF_CHAR_LIMIT;
    220 }
    221 
    222 
    223 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
    224     return fAllowedLocales;
    225 }
    226 
    227 
    228 // Given a locale (a language), add all the characters from all of the scripts used with that language
    229 // to the allowedChars UnicodeSet
    230 
    231 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
    232     UScriptCode scripts[30];
    233 
    234     int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
    235     if (U_FAILURE(status)) {
    236         return;
    237     }
    238     if (status == U_USING_DEFAULT_WARNING) {
    239         status = U_ILLEGAL_ARGUMENT_ERROR;
    240         return;
    241     }
    242     UnicodeSet tmpSet;
    243     int32_t    i;
    244     for (i=0; i<numScripts; i++) {
    245         tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
    246         allowedChars->addAll(tmpSet);
    247     }
    248 }
    249 
    250 // Computes the augmented script set for a code point, according to UTS 39 section 5.1.
    251 void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
    252     result.resetAll();
    253     result.setScriptExtensions(codePoint, status);
    254     if (U_FAILURE(status)) { return; }
    255 
    256     // Section 5.1 step 1
    257     if (result.test(USCRIPT_HAN, status)) {
    258         result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
    259         result.set(USCRIPT_JAPANESE, status);
    260         result.set(USCRIPT_KOREAN, status);
    261     }
    262     if (result.test(USCRIPT_HIRAGANA, status)) {
    263         result.set(USCRIPT_JAPANESE, status);
    264     }
    265     if (result.test(USCRIPT_KATAKANA, status)) {
    266         result.set(USCRIPT_JAPANESE, status);
    267     }
    268     if (result.test(USCRIPT_HANGUL, status)) {
    269         result.set(USCRIPT_KOREAN, status);
    270     }
    271     if (result.test(USCRIPT_BOPOMOFO, status)) {
    272         result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
    273     }
    274 
    275     // Section 5.1 step 2
    276     if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
    277         result.setAll();
    278     }
    279 }
    280 
    281 // Computes the resolved script set for a string, according to UTS 39 section 5.1.
    282 void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
    283     getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
    284 }
    285 
    286 // Computes the resolved script set for a string, omitting characters having the specified script.
    287 // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
    288 void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
    289     result.setAll();
    290 
    291     ScriptSet temp;
    292     UChar32 codePoint;
    293     for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
    294         codePoint = input.char32At(i);
    295 
    296         // Compute the augmented script set for the character
    297         getAugmentedScriptSet(codePoint, temp, status);
    298         if (U_FAILURE(status)) { return; }
    299 
    300         // Intersect the augmented script set with the resolved script set, but only if the character doesn't
    301         // have the script specified in the function call
    302         if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
    303             result.intersect(temp);
    304         }
    305     }
    306 }
    307 
    308 // Computes the set of numerics for a string, according to UTS 39 section 5.3.
    309 void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
    310     result.clear();
    311 
    312     UChar32 codePoint;
    313     for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
    314         codePoint = input.char32At(i);
    315 
    316         // Store a representative character for each kind of decimal digit
    317         if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
    318             // Store the zero character as a representative for comparison.
    319             // Unicode guarantees it is codePoint - value
    320             result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
    321         }
    322     }
    323 }
    324 
    325 // Computes the restriction level of a string, according to UTS 39 section 5.2.
    326 URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
    327     // Section 5.2 step 1:
    328     if (!fAllowedCharsSet->containsAll(input)) {
    329         return USPOOF_UNRESTRICTIVE;
    330     }
    331 
    332     // Section 5.2 step 2
    333     // Java use a static UnicodeSet for this test.  In C++, avoid the static variable
    334     // and just do a simple for loop.
    335     UBool allASCII = TRUE;
    336     for (int32_t i=0, length=input.length(); i<length; i++) {
    337         if (input.charAt(i) > 0x7f) {
    338             allASCII = FALSE;
    339             break;
    340         }
    341     }
    342     if (allASCII) {
    343         return USPOOF_ASCII;
    344     }
    345 
    346     // Section 5.2 steps 3:
    347     ScriptSet resolvedScriptSet;
    348     getResolvedScriptSet(input, resolvedScriptSet, status);
    349     if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
    350 
    351     // Section 5.2 step 4:
    352     if (!resolvedScriptSet.isEmpty()) {
    353         return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
    354     }
    355 
    356     // Section 5.2 step 5:
    357     ScriptSet resolvedNoLatn;
    358     getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
    359     if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
    360 
    361     // Section 5.2 step 6:
    362     if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
    363             || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
    364             || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
    365         return USPOOF_HIGHLY_RESTRICTIVE;
    366     }
    367 
    368     // Section 5.2 step 7:
    369     if (!resolvedNoLatn.isEmpty()
    370             && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
    371             && !resolvedNoLatn.test(USCRIPT_GREEK, status)
    372             && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
    373         return USPOOF_MODERATELY_RESTRICTIVE;
    374     }
    375 
    376     // Section 5.2 step 8:
    377     return USPOOF_MINIMALLY_RESTRICTIVE;
    378 }
    379 
    380 
    381 
    382 // Convert a text format hex number.  Utility function used by builder code.  Static.
    383 // Input: UChar *string text.  Output: a UChar32
    384 // Input has been pre-checked, and will have no non-hex chars.
    385 // The number must fall in the code point range of 0..0x10ffff
    386 // Static Function.
    387 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
    388     if (U_FAILURE(status)) {
    389         return 0;
    390     }
    391     U_ASSERT(limit-start > 0);
    392     uint32_t val = 0;
    393     int i;
    394     for (i=start; i<limit; i++) {
    395         int digitVal = s[i] - 0x30;
    396         if (digitVal>9) {
    397             digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
    398         }
    399         if (digitVal>15) {
    400             digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
    401         }
    402         U_ASSERT(digitVal <= 0xf);
    403         val <<= 4;
    404         val += digitVal;
    405     }
    406     if (val > 0x10ffff) {
    407         status = U_PARSE_ERROR;
    408         val = 0;
    409     }
    410     return (UChar32)val;
    411 }
    412 
    413 
    414 //-----------------------------------------
    415 //
    416 //   class CheckResult Implementation
    417 //
    418 //-----------------------------------------
    419 
    420 CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {
    421     clear();
    422 }
    423 
    424 USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
    425     return reinterpret_cast<USpoofCheckResult*>(this);
    426 }
    427 
    428 //
    429 //  Incoming parameter check on Status and the CheckResult object
    430 //    received from the C API.
    431 //
    432 const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
    433     if (U_FAILURE(status)) { return NULL; }
    434     if (ptr == NULL) {
    435         status = U_ILLEGAL_ARGUMENT_ERROR;
    436         return NULL;
    437     }
    438     CheckResult *This = (CheckResult*) ptr;
    439     if (This->fMagic != USPOOF_CHECK_MAGIC) {
    440         status = U_INVALID_FORMAT_ERROR;
    441         return NULL;
    442     }
    443     return This;
    444 }
    445 
    446 CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
    447     return const_cast<CheckResult *>
    448         (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status));
    449 }
    450 
    451 void CheckResult::clear() {
    452     fChecks = 0;
    453     fNumerics.clear();
    454     fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
    455 }
    456 
    457 int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
    458     if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
    459         return fChecks | fRestrictionLevel;
    460     } else {
    461         return fChecks;
    462     }
    463 }
    464 
    465 CheckResult::~CheckResult() {
    466 }
    467 
    468 //----------------------------------------------------------------------------------------------
    469 //
    470 //   class SpoofData Implementation
    471 //
    472 //----------------------------------------------------------------------------------------------
    473 
    474 
    475 UBool SpoofData::validateDataVersion(UErrorCode &status) const {
    476     if (U_FAILURE(status) ||
    477         fRawData == NULL ||
    478         fRawData->fMagic != USPOOF_MAGIC ||
    479         fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
    480         fRawData->fFormatVersion[1] != 0 ||
    481         fRawData->fFormatVersion[2] != 0 ||
    482         fRawData->fFormatVersion[3] != 0) {
    483             status = U_INVALID_FORMAT_ERROR;
    484             return FALSE;
    485     }
    486     return TRUE;
    487 }
    488 
    489 static UBool U_CALLCONV
    490 spoofDataIsAcceptable(void *context,
    491                         const char * /* type */, const char * /*name*/,
    492                         const UDataInfo *pInfo) {
    493     if(
    494         pInfo->size >= 20 &&
    495         pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
    496         pInfo->charsetFamily == U_CHARSET_FAMILY &&
    497         pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
    498         pInfo->dataFormat[1] == 0x66 &&
    499         pInfo->dataFormat[2] == 0x75 &&
    500         pInfo->dataFormat[3] == 0x20 &&
    501         pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
    502     ) {
    503         UVersionInfo *version = static_cast<UVersionInfo *>(context);
    504         if(version != NULL) {
    505             uprv_memcpy(version, pInfo->dataVersion, 4);
    506         }
    507         return TRUE;
    508     } else {
    509         return FALSE;
    510     }
    511 }
    512 
    513 //  Methods for the loading of the default confusables data file.  The confusable
    514 //  data is loaded only when it is needed.
    515 //
    516 //  SpoofData::getDefault() - Return the default confusables data, and call the
    517 //                            initOnce() if it is not available.  Adds a reference
    518 //                            to the SpoofData that the caller is responsible for
    519 //                            decrementing when they are done with the data.
    520 //
    521 //  uspoof_loadDefaultData - Called once, from initOnce().  The resulting SpoofData
    522 //                           is shared by all spoof checkers using the default data.
    523 //
    524 //  uspoof_cleanupDefaultData - Called during cleanup.
    525 //
    526 
    527 static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
    528 static SpoofData* gDefaultSpoofData;
    529 
    530 static UBool U_CALLCONV
    531 uspoof_cleanupDefaultData(void) {
    532     if (gDefaultSpoofData) {
    533         // Will delete, assuming all user-level spoof checkers were closed.
    534         gDefaultSpoofData->removeReference();
    535         gDefaultSpoofData = NULL;
    536         gSpoofInitDefaultOnce.reset();
    537     }
    538     return TRUE;
    539 }
    540 
    541 static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
    542     UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
    543                                         spoofDataIsAcceptable,
    544                                         NULL,       // context, would receive dataVersion if supplied.
    545                                         &status);
    546     if (U_FAILURE(status)) { return; }
    547     gDefaultSpoofData = new SpoofData(udm, status);
    548     if (U_FAILURE(status)) {
    549         delete gDefaultSpoofData;
    550         return;
    551     }
    552     if (gDefaultSpoofData == NULL) {
    553         status = U_MEMORY_ALLOCATION_ERROR;
    554         return;
    555     }
    556     ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
    557 }
    558 
    559 SpoofData* SpoofData::getDefault(UErrorCode& status) {
    560     umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
    561     if (U_FAILURE(status)) { return NULL; }
    562     gDefaultSpoofData->addReference();
    563     return gDefaultSpoofData;
    564 }
    565 
    566 
    567 
    568 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
    569 {
    570     reset();
    571     if (U_FAILURE(status)) {
    572         return;
    573     }
    574     fUDM = udm;
    575     // fRawData is non-const because it may be constructed by the data builder.
    576     fRawData = reinterpret_cast<SpoofDataHeader *>(
    577             const_cast<void *>(udata_getMemory(udm)));
    578     validateDataVersion(status);
    579     initPtrs(status);
    580 }
    581 
    582 
    583 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
    584 {
    585     reset();
    586     if (U_FAILURE(status)) {
    587         return;
    588     }
    589     if ((size_t)length < sizeof(SpoofDataHeader)) {
    590         status = U_INVALID_FORMAT_ERROR;
    591         return;
    592     }
    593     void *ncData = const_cast<void *>(data);
    594     fRawData = static_cast<SpoofDataHeader *>(ncData);
    595     if (length < fRawData->fLength) {
    596         status = U_INVALID_FORMAT_ERROR;
    597         return;
    598     }
    599     validateDataVersion(status);
    600     initPtrs(status);
    601 }
    602 
    603 
    604 // Spoof Data constructor for use from data builder.
    605 //   Initializes a new, empty data area that will be populated later.
    606 SpoofData::SpoofData(UErrorCode &status) {
    607     reset();
    608     if (U_FAILURE(status)) {
    609         return;
    610     }
    611     fDataOwned = true;
    612 
    613     // The spoof header should already be sized to be a multiple of 16 bytes.
    614     // Just in case it's not, round it up.
    615     uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
    616     U_ASSERT(initialSize == sizeof(SpoofDataHeader));
    617 
    618     fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
    619     fMemLimit = initialSize;
    620     if (fRawData == NULL) {
    621         status = U_MEMORY_ALLOCATION_ERROR;
    622         return;
    623     }
    624     uprv_memset(fRawData, 0, initialSize);
    625 
    626     fRawData->fMagic = USPOOF_MAGIC;
    627     fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
    628     fRawData->fFormatVersion[1] = 0;
    629     fRawData->fFormatVersion[2] = 0;
    630     fRawData->fFormatVersion[3] = 0;
    631     initPtrs(status);
    632 }
    633 
    634 // reset() - initialize all fields.
    635 //           Should be updated if any new fields are added.
    636 //           Called by constructors to put things in a known initial state.
    637 void SpoofData::reset() {
    638    fRawData = NULL;
    639    fDataOwned = FALSE;
    640    fUDM      = NULL;
    641    fMemLimit = 0;
    642    fRefCount = 1;
    643    fCFUKeys = NULL;
    644    fCFUValues = NULL;
    645    fCFUStrings = NULL;
    646 }
    647 
    648 
    649 //  SpoofData::initPtrs()
    650 //            Initialize the pointers to the various sections of the raw data.
    651 //
    652 //            This function is used both during the Trie building process (multiple
    653 //            times, as the individual data sections are added), and
    654 //            during the opening of a Spoof Checker from prebuilt data.
    655 //
    656 //            The pointers for non-existent data sections (identified by an offset of 0)
    657 //            are set to NULL.
    658 //
    659 //            Note:  During building the data, adding each new data section
    660 //            reallocs the raw data area, which likely relocates it, which
    661 //            in turn requires reinitializing all of the pointers into it, hence
    662 //            multiple calls to this function during building.
    663 //
    664 void SpoofData::initPtrs(UErrorCode &status) {
    665     fCFUKeys = NULL;
    666     fCFUValues = NULL;
    667     fCFUStrings = NULL;
    668     if (U_FAILURE(status)) {
    669         return;
    670     }
    671     if (fRawData->fCFUKeys != 0) {
    672         fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
    673     }
    674     if (fRawData->fCFUStringIndex != 0) {
    675         fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
    676     }
    677     if (fRawData->fCFUStringTable != 0) {
    678         fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
    679     }
    680 }
    681 
    682 
    683 SpoofData::~SpoofData() {
    684     if (fDataOwned) {
    685         uprv_free(fRawData);
    686     }
    687     fRawData = NULL;
    688     if (fUDM != NULL) {
    689         udata_close(fUDM);
    690     }
    691     fUDM = NULL;
    692 }
    693 
    694 
    695 void SpoofData::removeReference() {
    696     if (umtx_atomic_dec(&fRefCount) == 0) {
    697         delete this;
    698     }
    699 }
    700 
    701 
    702 SpoofData *SpoofData::addReference() {
    703     umtx_atomic_inc(&fRefCount);
    704     return this;
    705 }
    706 
    707 
    708 void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
    709     if (U_FAILURE(status)) {
    710         return NULL;
    711     }
    712     if (!fDataOwned) {
    713         U_ASSERT(FALSE);
    714         status = U_INTERNAL_PROGRAM_ERROR;
    715         return NULL;
    716     }
    717 
    718     numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
    719     uint32_t returnOffset = fMemLimit;
    720     fMemLimit += numBytes;
    721     fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
    722     fRawData->fLength = fMemLimit;
    723     uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
    724     initPtrs(status);
    725     return (char *)fRawData + returnOffset;
    726 }
    727 
    728 int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
    729     int32_t dataSize = fRawData->fLength;
    730     if (capacity < dataSize) {
    731         status = U_BUFFER_OVERFLOW_ERROR;
    732         return dataSize;
    733     }
    734     uprv_memcpy(buf, fRawData, dataSize);
    735     return dataSize;
    736 }
    737 
    738 int32_t SpoofData::size() const {
    739     return fRawData->fLength;
    740 }
    741 
    742 //-------------------------------
    743 //
    744 // Front-end APIs for SpoofData
    745 //
    746 //-------------------------------
    747 
    748 int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
    749     // Perform a binary search.
    750     // [lo, hi), i.e lo is inclusive, hi is exclusive.
    751     // The result after the loop will be in lo.
    752     int32_t lo = 0;
    753     int32_t hi = length();
    754     do {
    755         int32_t mid = (lo + hi) / 2;
    756         if (codePointAt(mid) > inChar) {
    757             hi = mid;
    758         } else if (codePointAt(mid) < inChar) {
    759             lo = mid;
    760         } else {
    761             // Found result.  Break early.
    762             lo = mid;
    763             break;
    764         }
    765     } while (hi - lo > 1);
    766 
    767     // Did we find an entry?  If not, the char maps to itself.
    768     if (codePointAt(lo) != inChar) {
    769         dest.append(inChar);
    770         return 1;
    771     }
    772 
    773     // Add the element to the string builder and return.
    774     return appendValueTo(lo, dest);
    775 }
    776 
    777 int32_t SpoofData::length() const {
    778     return fRawData->fCFUKeysSize;
    779 }
    780 
    781 UChar32 SpoofData::codePointAt(int32_t index) const {
    782     return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
    783 }
    784 
    785 int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
    786     int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
    787 
    788     // Value is either a char (for strings of length 1) or
    789     // an index into the string table (for longer strings)
    790     uint16_t value = fCFUValues[index];
    791     if (stringLength == 1) {
    792         dest.append((UChar)value);
    793     } else {
    794         dest.append(fCFUStrings + value, stringLength);
    795     }
    796 
    797     return stringLength;
    798 }
    799 
    800 
    801 U_NAMESPACE_END
    802 
    803 U_NAMESPACE_USE
    804 
    805 //-----------------------------------------------------------------------------
    806 //
    807 //  uspoof_swap   -  byte swap and char encoding swap of spoof data
    808 //
    809 //-----------------------------------------------------------------------------
    810 U_CAPI int32_t U_EXPORT2
    811 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
    812            UErrorCode *status) {
    813 
    814     if (status == NULL || U_FAILURE(*status)) {
    815         return 0;
    816     }
    817     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
    818         *status=U_ILLEGAL_ARGUMENT_ERROR;
    819         return 0;
    820     }
    821 
    822     //
    823     //  Check that the data header is for spoof data.
    824     //    (Header contents are defined in gencfu.cpp)
    825     //
    826     const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
    827     if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
    828            pInfo->dataFormat[1]==0x66 &&
    829            pInfo->dataFormat[2]==0x75 &&
    830            pInfo->dataFormat[3]==0x20 &&
    831            pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
    832            pInfo->formatVersion[1]==0 &&
    833            pInfo->formatVersion[2]==0 &&
    834            pInfo->formatVersion[3]==0  )) {
    835         udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
    836                              "(format version %02x %02x %02x %02x) is not recognized\n",
    837                          pInfo->dataFormat[0], pInfo->dataFormat[1],
    838                          pInfo->dataFormat[2], pInfo->dataFormat[3],
    839                          pInfo->formatVersion[0], pInfo->formatVersion[1],
    840                          pInfo->formatVersion[2], pInfo->formatVersion[3]);
    841         *status=U_UNSUPPORTED_ERROR;
    842         return 0;
    843     }
    844 
    845     //
    846     // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
    847     //                         header).  This swap also conveniently gets us
    848     //                         the size of the ICU d.h., which lets us locate the start
    849     //                         of the uspoof specific data.
    850     //
    851     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
    852 
    853 
    854     //
    855     // Get the Spoof Data Header, and check that it appears to be OK.
    856     //
    857     //
    858     const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
    859     SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
    860     if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
    861         ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader))
    862     {
    863         udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
    864         *status=U_UNSUPPORTED_ERROR;
    865         return 0;
    866     }
    867 
    868     //
    869     // Prefight operation?  Just return the size
    870     //
    871     int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
    872     int32_t totalSize = headerSize + spoofDataLength;
    873     if (length < 0) {
    874         return totalSize;
    875     }
    876 
    877     //
    878     // Check that length passed in is consistent with length from Spoof data header.
    879     //
    880     if (length < totalSize) {
    881         udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
    882                             spoofDataLength);
    883         *status=U_INDEX_OUTOFBOUNDS_ERROR;
    884         return 0;
    885         }
    886 
    887 
    888     //
    889     // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
    890     //                 we need to reference the header to locate the data, and an
    891     //                 inplace swap of the header leaves it unusable.
    892     //
    893     uint8_t          *outBytes = (uint8_t *)outData + headerSize;
    894     SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;
    895 
    896     int32_t   sectionStart;
    897     int32_t   sectionLength;
    898 
    899     //
    900     // If not swapping in place, zero out the output buffer before starting.
    901     //    Gaps may exist between the individual sections, and these must be zeroed in
    902     //    the output buffer.  The simplest way to do that is to just zero the whole thing.
    903     //
    904     if (inBytes != outBytes) {
    905         uprv_memset(outBytes, 0, spoofDataLength);
    906     }
    907 
    908     // Confusables Keys Section   (fCFUKeys)
    909     sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
    910     sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
    911     ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
    912 
    913     // String Index Section
    914     sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
    915     sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
    916     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
    917 
    918     // String Table Section
    919     sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
    920     sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
    921     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
    922 
    923     // And, last, swap the header itself.
    924     //   int32_t   fMagic             // swap this
    925     //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
    926     //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
    927     //
    928     uint32_t magic = ds->readUInt32(spoofDH->fMagic);
    929     ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
    930 
    931     if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
    932         uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
    933     }
    934     // swap starting at fLength
    935     ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
    936 
    937     return totalSize;
    938 }
    939 
    940 #endif
    941 
    942 
    943