1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2008-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 #include "unicode/uspoof.h" 12 #include "unicode/uchar.h" 13 #include "unicode/uniset.h" 14 #include "unicode/utf16.h" 15 #include "utrie2.h" 16 #include "cmemory.h" 17 #include "cstring.h" 18 #include "scriptset.h" 19 #include "umutex.h" 20 #include "udataswp.h" 21 #include "uassert.h" 22 #include "ucln_in.h" 23 #include "uspoof_impl.h" 24 25 #if !UCONFIG_NO_NORMALIZATION 26 27 28 U_NAMESPACE_BEGIN 29 30 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) 31 32 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) { 33 construct(status); 34 fSpoofData = data; 35 } 36 37 SpoofImpl::SpoofImpl(UErrorCode& status) { 38 construct(status); 39 40 // TODO: Call this method where it is actually needed, instead of in the 41 // constructor, to allow for lazy data loading. See #12696. 42 fSpoofData = SpoofData::getDefault(status); 43 } 44 45 SpoofImpl::SpoofImpl() { 46 UErrorCode status = U_ZERO_ERROR; 47 construct(status); 48 49 // TODO: Call this method where it is actually needed, instead of in the 50 // constructor, to allow for lazy data loading. See #12696. 51 fSpoofData = SpoofData::getDefault(status); 52 } 53 54 void SpoofImpl::construct(UErrorCode& status) { 55 fMagic = USPOOF_MAGIC; 56 fChecks = USPOOF_ALL_CHECKS; 57 fSpoofData = NULL; 58 fAllowedCharsSet = NULL; 59 fAllowedLocales = NULL; 60 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; 61 62 if (U_FAILURE(status)) { return; } 63 64 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); 65 fAllowedCharsSet = allowedCharsSet; 66 fAllowedLocales = uprv_strdup(""); 67 if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { 68 status = U_MEMORY_ALLOCATION_ERROR; 69 return; 70 } 71 allowedCharsSet->freeze(); 72 } 73 74 75 // Copy Constructor, used by the user level clone() function. 76 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : 77 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 78 fAllowedLocales(NULL) { 79 if (U_FAILURE(status)) { 80 return; 81 } 82 fMagic = src.fMagic; 83 fChecks = src.fChecks; 84 if (src.fSpoofData != NULL) { 85 fSpoofData = src.fSpoofData->addReference(); 86 } 87 fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone()); 88 fAllowedLocales = uprv_strdup(src.fAllowedLocales); 89 if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { 90 status = U_MEMORY_ALLOCATION_ERROR; 91 } 92 fRestrictionLevel = src.fRestrictionLevel; 93 } 94 95 SpoofImpl::~SpoofImpl() { 96 fMagic = 0; // head off application errors by preventing use of 97 // of deleted objects. 98 if (fSpoofData != NULL) { 99 fSpoofData->removeReference(); // Will delete if refCount goes to zero. 100 } 101 delete fAllowedCharsSet; 102 uprv_free((void *)fAllowedLocales); 103 } 104 105 // Cast this instance as a USpoofChecker for the C API. 106 USpoofChecker *SpoofImpl::asUSpoofChecker() { 107 return reinterpret_cast<USpoofChecker*>(this); 108 } 109 110 // 111 // Incoming parameter check on Status and the SpoofChecker object 112 // received from the C API. 113 // 114 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) { 115 if (U_FAILURE(status)) { 116 return NULL; 117 } 118 if (sc == NULL) { 119 status = U_ILLEGAL_ARGUMENT_ERROR; 120 return NULL; 121 } 122 SpoofImpl *This = (SpoofImpl *)sc; 123 if (This->fMagic != USPOOF_MAGIC) { 124 status = U_INVALID_FORMAT_ERROR; 125 return NULL; 126 } 127 if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) { 128 return NULL; 129 } 130 return This; 131 } 132 133 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { 134 return const_cast<SpoofImpl *> 135 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status)); 136 } 137 138 139 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { 140 UnicodeSet allowedChars; 141 UnicodeSet *tmpSet = NULL; 142 const char *locStart = localesList; 143 const char *locEnd = NULL; 144 const char *localesListEnd = localesList + uprv_strlen(localesList); 145 int32_t localeListCount = 0; // Number of locales provided by caller. 146 147 // Loop runs once per locale from the localesList, a comma separated list of locales. 148 do { 149 locEnd = uprv_strchr(locStart, ','); 150 if (locEnd == NULL) { 151 locEnd = localesListEnd; 152 } 153 while (*locStart == ' ') { 154 locStart++; 155 } 156 const char *trimmedEnd = locEnd-1; 157 while (trimmedEnd > locStart && *trimmedEnd == ' ') { 158 trimmedEnd--; 159 } 160 if (trimmedEnd <= locStart) { 161 break; 162 } 163 const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart)); 164 localeListCount++; 165 166 // We have one locale from the locales list. 167 // Add the script chars for this locale to the accumulating set of allowed chars. 168 // If the locale is no good, we will be notified back via status. 169 addScriptChars(locale, &allowedChars, status); 170 uprv_free((void *)locale); 171 if (U_FAILURE(status)) { 172 break; 173 } 174 locStart = locEnd + 1; 175 } while (locStart < localesListEnd); 176 177 // If our caller provided an empty list of locales, we disable the allowed characters checking 178 if (localeListCount == 0) { 179 uprv_free((void *)fAllowedLocales); 180 fAllowedLocales = uprv_strdup(""); 181 tmpSet = new UnicodeSet(0, 0x10ffff); 182 if (fAllowedLocales == NULL || tmpSet == NULL) { 183 status = U_MEMORY_ALLOCATION_ERROR; 184 return; 185 } 186 tmpSet->freeze(); 187 delete fAllowedCharsSet; 188 fAllowedCharsSet = tmpSet; 189 fChecks &= ~USPOOF_CHAR_LIMIT; 190 return; 191 } 192 193 194 // Add all common and inherited characters to the set of allowed chars. 195 UnicodeSet tempSet; 196 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); 197 allowedChars.addAll(tempSet); 198 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); 199 allowedChars.addAll(tempSet); 200 201 // If anything went wrong, we bail out without changing 202 // the state of the spoof checker. 203 if (U_FAILURE(status)) { 204 return; 205 } 206 207 // Store the updated spoof checker state. 208 tmpSet = static_cast<UnicodeSet *>(allowedChars.clone()); 209 const char *tmpLocalesList = uprv_strdup(localesList); 210 if (tmpSet == NULL || tmpLocalesList == NULL) { 211 status = U_MEMORY_ALLOCATION_ERROR; 212 return; 213 } 214 uprv_free((void *)fAllowedLocales); 215 fAllowedLocales = tmpLocalesList; 216 tmpSet->freeze(); 217 delete fAllowedCharsSet; 218 fAllowedCharsSet = tmpSet; 219 fChecks |= USPOOF_CHAR_LIMIT; 220 } 221 222 223 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { 224 return fAllowedLocales; 225 } 226 227 228 // Given a locale (a language), add all the characters from all of the scripts used with that language 229 // to the allowedChars UnicodeSet 230 231 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { 232 UScriptCode scripts[30]; 233 234 int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status); 235 if (U_FAILURE(status)) { 236 return; 237 } 238 if (status == U_USING_DEFAULT_WARNING) { 239 status = U_ILLEGAL_ARGUMENT_ERROR; 240 return; 241 } 242 UnicodeSet tmpSet; 243 int32_t i; 244 for (i=0; i<numScripts; i++) { 245 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status); 246 allowedChars->addAll(tmpSet); 247 } 248 } 249 250 // Computes the augmented script set for a code point, according to UTS 39 section 5.1. 251 void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) { 252 result.resetAll(); 253 result.setScriptExtensions(codePoint, status); 254 if (U_FAILURE(status)) { return; } 255 256 // Section 5.1 step 1 257 if (result.test(USCRIPT_HAN, status)) { 258 result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); 259 result.set(USCRIPT_JAPANESE, status); 260 result.set(USCRIPT_KOREAN, status); 261 } 262 if (result.test(USCRIPT_HIRAGANA, status)) { 263 result.set(USCRIPT_JAPANESE, status); 264 } 265 if (result.test(USCRIPT_KATAKANA, status)) { 266 result.set(USCRIPT_JAPANESE, status); 267 } 268 if (result.test(USCRIPT_HANGUL, status)) { 269 result.set(USCRIPT_KOREAN, status); 270 } 271 if (result.test(USCRIPT_BOPOMOFO, status)) { 272 result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); 273 } 274 275 // Section 5.1 step 2 276 if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) { 277 result.setAll(); 278 } 279 } 280 281 // Computes the resolved script set for a string, according to UTS 39 section 5.1. 282 void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const { 283 getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status); 284 } 285 286 // Computes the resolved script set for a string, omitting characters having the specified script. 287 // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included. 288 void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const { 289 result.setAll(); 290 291 ScriptSet temp; 292 UChar32 codePoint; 293 for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { 294 codePoint = input.char32At(i); 295 296 // Compute the augmented script set for the character 297 getAugmentedScriptSet(codePoint, temp, status); 298 if (U_FAILURE(status)) { return; } 299 300 // Intersect the augmented script set with the resolved script set, but only if the character doesn't 301 // have the script specified in the function call 302 if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) { 303 result.intersect(temp); 304 } 305 } 306 } 307 308 // Computes the set of numerics for a string, according to UTS 39 section 5.3. 309 void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const { 310 result.clear(); 311 312 UChar32 codePoint; 313 for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { 314 codePoint = input.char32At(i); 315 316 // Store a representative character for each kind of decimal digit 317 if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) { 318 // Store the zero character as a representative for comparison. 319 // Unicode guarantees it is codePoint - value 320 result.add(codePoint - (UChar32)u_getNumericValue(codePoint)); 321 } 322 } 323 } 324 325 // Computes the restriction level of a string, according to UTS 39 section 5.2. 326 URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const { 327 // Section 5.2 step 1: 328 if (!fAllowedCharsSet->containsAll(input)) { 329 return USPOOF_UNRESTRICTIVE; 330 } 331 332 // Section 5.2 step 2 333 // Java use a static UnicodeSet for this test. In C++, avoid the static variable 334 // and just do a simple for loop. 335 UBool allASCII = TRUE; 336 for (int32_t i=0, length=input.length(); i<length; i++) { 337 if (input.charAt(i) > 0x7f) { 338 allASCII = FALSE; 339 break; 340 } 341 } 342 if (allASCII) { 343 return USPOOF_ASCII; 344 } 345 346 // Section 5.2 steps 3: 347 ScriptSet resolvedScriptSet; 348 getResolvedScriptSet(input, resolvedScriptSet, status); 349 if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } 350 351 // Section 5.2 step 4: 352 if (!resolvedScriptSet.isEmpty()) { 353 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; 354 } 355 356 // Section 5.2 step 5: 357 ScriptSet resolvedNoLatn; 358 getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status); 359 if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } 360 361 // Section 5.2 step 6: 362 if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status) 363 || resolvedNoLatn.test(USCRIPT_JAPANESE, status) 364 || resolvedNoLatn.test(USCRIPT_KOREAN, status)) { 365 return USPOOF_HIGHLY_RESTRICTIVE; 366 } 367 368 // Section 5.2 step 7: 369 if (!resolvedNoLatn.isEmpty() 370 && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status) 371 && !resolvedNoLatn.test(USCRIPT_GREEK, status) 372 && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) { 373 return USPOOF_MODERATELY_RESTRICTIVE; 374 } 375 376 // Section 5.2 step 8: 377 return USPOOF_MINIMALLY_RESTRICTIVE; 378 } 379 380 381 382 // Convert a text format hex number. Utility function used by builder code. Static. 383 // Input: UChar *string text. Output: a UChar32 384 // Input has been pre-checked, and will have no non-hex chars. 385 // The number must fall in the code point range of 0..0x10ffff 386 // Static Function. 387 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) { 388 if (U_FAILURE(status)) { 389 return 0; 390 } 391 U_ASSERT(limit-start > 0); 392 uint32_t val = 0; 393 int i; 394 for (i=start; i<limit; i++) { 395 int digitVal = s[i] - 0x30; 396 if (digitVal>9) { 397 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A' 398 } 399 if (digitVal>15) { 400 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a' 401 } 402 U_ASSERT(digitVal <= 0xf); 403 val <<= 4; 404 val += digitVal; 405 } 406 if (val > 0x10ffff) { 407 status = U_PARSE_ERROR; 408 val = 0; 409 } 410 return (UChar32)val; 411 } 412 413 414 //----------------------------------------- 415 // 416 // class CheckResult Implementation 417 // 418 //----------------------------------------- 419 420 CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) { 421 clear(); 422 } 423 424 USpoofCheckResult* CheckResult::asUSpoofCheckResult() { 425 return reinterpret_cast<USpoofCheckResult*>(this); 426 } 427 428 // 429 // Incoming parameter check on Status and the CheckResult object 430 // received from the C API. 431 // 432 const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) { 433 if (U_FAILURE(status)) { return NULL; } 434 if (ptr == NULL) { 435 status = U_ILLEGAL_ARGUMENT_ERROR; 436 return NULL; 437 } 438 CheckResult *This = (CheckResult*) ptr; 439 if (This->fMagic != USPOOF_CHECK_MAGIC) { 440 status = U_INVALID_FORMAT_ERROR; 441 return NULL; 442 } 443 return This; 444 } 445 446 CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) { 447 return const_cast<CheckResult *> 448 (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status)); 449 } 450 451 void CheckResult::clear() { 452 fChecks = 0; 453 fNumerics.clear(); 454 fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE; 455 } 456 457 int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) { 458 if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) { 459 return fChecks | fRestrictionLevel; 460 } else { 461 return fChecks; 462 } 463 } 464 465 CheckResult::~CheckResult() { 466 } 467 468 //---------------------------------------------------------------------------------------------- 469 // 470 // class SpoofData Implementation 471 // 472 //---------------------------------------------------------------------------------------------- 473 474 475 UBool SpoofData::validateDataVersion(UErrorCode &status) const { 476 if (U_FAILURE(status) || 477 fRawData == NULL || 478 fRawData->fMagic != USPOOF_MAGIC || 479 fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION || 480 fRawData->fFormatVersion[1] != 0 || 481 fRawData->fFormatVersion[2] != 0 || 482 fRawData->fFormatVersion[3] != 0) { 483 status = U_INVALID_FORMAT_ERROR; 484 return FALSE; 485 } 486 return TRUE; 487 } 488 489 static UBool U_CALLCONV 490 spoofDataIsAcceptable(void *context, 491 const char * /* type */, const char * /*name*/, 492 const UDataInfo *pInfo) { 493 if( 494 pInfo->size >= 20 && 495 pInfo->isBigEndian == U_IS_BIG_ENDIAN && 496 pInfo->charsetFamily == U_CHARSET_FAMILY && 497 pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu " 498 pInfo->dataFormat[1] == 0x66 && 499 pInfo->dataFormat[2] == 0x75 && 500 pInfo->dataFormat[3] == 0x20 && 501 pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION 502 ) { 503 UVersionInfo *version = static_cast<UVersionInfo *>(context); 504 if(version != NULL) { 505 uprv_memcpy(version, pInfo->dataVersion, 4); 506 } 507 return TRUE; 508 } else { 509 return FALSE; 510 } 511 } 512 513 // Methods for the loading of the default confusables data file. The confusable 514 // data is loaded only when it is needed. 515 // 516 // SpoofData::getDefault() - Return the default confusables data, and call the 517 // initOnce() if it is not available. Adds a reference 518 // to the SpoofData that the caller is responsible for 519 // decrementing when they are done with the data. 520 // 521 // uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData 522 // is shared by all spoof checkers using the default data. 523 // 524 // uspoof_cleanupDefaultData - Called during cleanup. 525 // 526 527 static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER; 528 static SpoofData* gDefaultSpoofData; 529 530 static UBool U_CALLCONV 531 uspoof_cleanupDefaultData(void) { 532 if (gDefaultSpoofData) { 533 // Will delete, assuming all user-level spoof checkers were closed. 534 gDefaultSpoofData->removeReference(); 535 gDefaultSpoofData = NULL; 536 gSpoofInitDefaultOnce.reset(); 537 } 538 return TRUE; 539 } 540 541 static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) { 542 UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables", 543 spoofDataIsAcceptable, 544 NULL, // context, would receive dataVersion if supplied. 545 &status); 546 if (U_FAILURE(status)) { return; } 547 gDefaultSpoofData = new SpoofData(udm, status); 548 if (U_FAILURE(status)) { 549 delete gDefaultSpoofData; 550 return; 551 } 552 if (gDefaultSpoofData == NULL) { 553 status = U_MEMORY_ALLOCATION_ERROR; 554 return; 555 } 556 ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData); 557 } 558 559 SpoofData* SpoofData::getDefault(UErrorCode& status) { 560 umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status); 561 if (U_FAILURE(status)) { return NULL; } 562 gDefaultSpoofData->addReference(); 563 return gDefaultSpoofData; 564 } 565 566 567 568 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) 569 { 570 reset(); 571 if (U_FAILURE(status)) { 572 return; 573 } 574 fUDM = udm; 575 // fRawData is non-const because it may be constructed by the data builder. 576 fRawData = reinterpret_cast<SpoofDataHeader *>( 577 const_cast<void *>(udata_getMemory(udm))); 578 validateDataVersion(status); 579 initPtrs(status); 580 } 581 582 583 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) 584 { 585 reset(); 586 if (U_FAILURE(status)) { 587 return; 588 } 589 if ((size_t)length < sizeof(SpoofDataHeader)) { 590 status = U_INVALID_FORMAT_ERROR; 591 return; 592 } 593 void *ncData = const_cast<void *>(data); 594 fRawData = static_cast<SpoofDataHeader *>(ncData); 595 if (length < fRawData->fLength) { 596 status = U_INVALID_FORMAT_ERROR; 597 return; 598 } 599 validateDataVersion(status); 600 initPtrs(status); 601 } 602 603 604 // Spoof Data constructor for use from data builder. 605 // Initializes a new, empty data area that will be populated later. 606 SpoofData::SpoofData(UErrorCode &status) { 607 reset(); 608 if (U_FAILURE(status)) { 609 return; 610 } 611 fDataOwned = true; 612 613 // The spoof header should already be sized to be a multiple of 16 bytes. 614 // Just in case it's not, round it up. 615 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15; 616 U_ASSERT(initialSize == sizeof(SpoofDataHeader)); 617 618 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize)); 619 fMemLimit = initialSize; 620 if (fRawData == NULL) { 621 status = U_MEMORY_ALLOCATION_ERROR; 622 return; 623 } 624 uprv_memset(fRawData, 0, initialSize); 625 626 fRawData->fMagic = USPOOF_MAGIC; 627 fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION; 628 fRawData->fFormatVersion[1] = 0; 629 fRawData->fFormatVersion[2] = 0; 630 fRawData->fFormatVersion[3] = 0; 631 initPtrs(status); 632 } 633 634 // reset() - initialize all fields. 635 // Should be updated if any new fields are added. 636 // Called by constructors to put things in a known initial state. 637 void SpoofData::reset() { 638 fRawData = NULL; 639 fDataOwned = FALSE; 640 fUDM = NULL; 641 fMemLimit = 0; 642 fRefCount = 1; 643 fCFUKeys = NULL; 644 fCFUValues = NULL; 645 fCFUStrings = NULL; 646 } 647 648 649 // SpoofData::initPtrs() 650 // Initialize the pointers to the various sections of the raw data. 651 // 652 // This function is used both during the Trie building process (multiple 653 // times, as the individual data sections are added), and 654 // during the opening of a Spoof Checker from prebuilt data. 655 // 656 // The pointers for non-existent data sections (identified by an offset of 0) 657 // are set to NULL. 658 // 659 // Note: During building the data, adding each new data section 660 // reallocs the raw data area, which likely relocates it, which 661 // in turn requires reinitializing all of the pointers into it, hence 662 // multiple calls to this function during building. 663 // 664 void SpoofData::initPtrs(UErrorCode &status) { 665 fCFUKeys = NULL; 666 fCFUValues = NULL; 667 fCFUStrings = NULL; 668 if (U_FAILURE(status)) { 669 return; 670 } 671 if (fRawData->fCFUKeys != 0) { 672 fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys); 673 } 674 if (fRawData->fCFUStringIndex != 0) { 675 fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); 676 } 677 if (fRawData->fCFUStringTable != 0) { 678 fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); 679 } 680 } 681 682 683 SpoofData::~SpoofData() { 684 if (fDataOwned) { 685 uprv_free(fRawData); 686 } 687 fRawData = NULL; 688 if (fUDM != NULL) { 689 udata_close(fUDM); 690 } 691 fUDM = NULL; 692 } 693 694 695 void SpoofData::removeReference() { 696 if (umtx_atomic_dec(&fRefCount) == 0) { 697 delete this; 698 } 699 } 700 701 702 SpoofData *SpoofData::addReference() { 703 umtx_atomic_inc(&fRefCount); 704 return this; 705 } 706 707 708 void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { 709 if (U_FAILURE(status)) { 710 return NULL; 711 } 712 if (!fDataOwned) { 713 U_ASSERT(FALSE); 714 status = U_INTERNAL_PROGRAM_ERROR; 715 return NULL; 716 } 717 718 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16 719 uint32_t returnOffset = fMemLimit; 720 fMemLimit += numBytes; 721 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit)); 722 fRawData->fLength = fMemLimit; 723 uprv_memset((char *)fRawData + returnOffset, 0, numBytes); 724 initPtrs(status); 725 return (char *)fRawData + returnOffset; 726 } 727 728 int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const { 729 int32_t dataSize = fRawData->fLength; 730 if (capacity < dataSize) { 731 status = U_BUFFER_OVERFLOW_ERROR; 732 return dataSize; 733 } 734 uprv_memcpy(buf, fRawData, dataSize); 735 return dataSize; 736 } 737 738 int32_t SpoofData::size() const { 739 return fRawData->fLength; 740 } 741 742 //------------------------------- 743 // 744 // Front-end APIs for SpoofData 745 // 746 //------------------------------- 747 748 int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const { 749 // Perform a binary search. 750 // [lo, hi), i.e lo is inclusive, hi is exclusive. 751 // The result after the loop will be in lo. 752 int32_t lo = 0; 753 int32_t hi = length(); 754 do { 755 int32_t mid = (lo + hi) / 2; 756 if (codePointAt(mid) > inChar) { 757 hi = mid; 758 } else if (codePointAt(mid) < inChar) { 759 lo = mid; 760 } else { 761 // Found result. Break early. 762 lo = mid; 763 break; 764 } 765 } while (hi - lo > 1); 766 767 // Did we find an entry? If not, the char maps to itself. 768 if (codePointAt(lo) != inChar) { 769 dest.append(inChar); 770 return 1; 771 } 772 773 // Add the element to the string builder and return. 774 return appendValueTo(lo, dest); 775 } 776 777 int32_t SpoofData::length() const { 778 return fRawData->fCFUKeysSize; 779 } 780 781 UChar32 SpoofData::codePointAt(int32_t index) const { 782 return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]); 783 } 784 785 int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const { 786 int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]); 787 788 // Value is either a char (for strings of length 1) or 789 // an index into the string table (for longer strings) 790 uint16_t value = fCFUValues[index]; 791 if (stringLength == 1) { 792 dest.append((UChar)value); 793 } else { 794 dest.append(fCFUStrings + value, stringLength); 795 } 796 797 return stringLength; 798 } 799 800 801 U_NAMESPACE_END 802 803 U_NAMESPACE_USE 804 805 //----------------------------------------------------------------------------- 806 // 807 // uspoof_swap - byte swap and char encoding swap of spoof data 808 // 809 //----------------------------------------------------------------------------- 810 U_CAPI int32_t U_EXPORT2 811 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 812 UErrorCode *status) { 813 814 if (status == NULL || U_FAILURE(*status)) { 815 return 0; 816 } 817 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { 818 *status=U_ILLEGAL_ARGUMENT_ERROR; 819 return 0; 820 } 821 822 // 823 // Check that the data header is for spoof data. 824 // (Header contents are defined in gencfu.cpp) 825 // 826 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); 827 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */ 828 pInfo->dataFormat[1]==0x66 && 829 pInfo->dataFormat[2]==0x75 && 830 pInfo->dataFormat[3]==0x20 && 831 pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION && 832 pInfo->formatVersion[1]==0 && 833 pInfo->formatVersion[2]==0 && 834 pInfo->formatVersion[3]==0 )) { 835 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " 836 "(format version %02x %02x %02x %02x) is not recognized\n", 837 pInfo->dataFormat[0], pInfo->dataFormat[1], 838 pInfo->dataFormat[2], pInfo->dataFormat[3], 839 pInfo->formatVersion[0], pInfo->formatVersion[1], 840 pInfo->formatVersion[2], pInfo->formatVersion[3]); 841 *status=U_UNSUPPORTED_ERROR; 842 return 0; 843 } 844 845 // 846 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific 847 // header). This swap also conveniently gets us 848 // the size of the ICU d.h., which lets us locate the start 849 // of the uspoof specific data. 850 // 851 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 852 853 854 // 855 // Get the Spoof Data Header, and check that it appears to be OK. 856 // 857 // 858 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 859 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes; 860 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC || 861 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader)) 862 { 863 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n"); 864 *status=U_UNSUPPORTED_ERROR; 865 return 0; 866 } 867 868 // 869 // Prefight operation? Just return the size 870 // 871 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength); 872 int32_t totalSize = headerSize + spoofDataLength; 873 if (length < 0) { 874 return totalSize; 875 } 876 877 // 878 // Check that length passed in is consistent with length from Spoof data header. 879 // 880 if (length < totalSize) { 881 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n", 882 spoofDataLength); 883 *status=U_INDEX_OUTOFBOUNDS_ERROR; 884 return 0; 885 } 886 887 888 // 889 // Swap the Data. Do the data itself first, then the Spoof Data Header, because 890 // we need to reference the header to locate the data, and an 891 // inplace swap of the header leaves it unusable. 892 // 893 uint8_t *outBytes = (uint8_t *)outData + headerSize; 894 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes; 895 896 int32_t sectionStart; 897 int32_t sectionLength; 898 899 // 900 // If not swapping in place, zero out the output buffer before starting. 901 // Gaps may exist between the individual sections, and these must be zeroed in 902 // the output buffer. The simplest way to do that is to just zero the whole thing. 903 // 904 if (inBytes != outBytes) { 905 uprv_memset(outBytes, 0, spoofDataLength); 906 } 907 908 // Confusables Keys Section (fCFUKeys) 909 sectionStart = ds->readUInt32(spoofDH->fCFUKeys); 910 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4; 911 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 912 913 // String Index Section 914 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex); 915 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2; 916 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 917 918 // String Table Section 919 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable); 920 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; 921 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 922 923 // And, last, swap the header itself. 924 // int32_t fMagic // swap this 925 // uint8_t fFormatVersion[4] // Do not swap this, just copy 926 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff. 927 // 928 uint32_t magic = ds->readUInt32(spoofDH->fMagic); 929 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic); 930 931 if (outputDH->fFormatVersion != spoofDH->fFormatVersion) { 932 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion)); 933 } 934 // swap starting at fLength 935 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status); 936 937 return totalSize; 938 } 939 940 #endif 941 942 943