1 /* 2 ********************************************************************** 3 * Copyright (C) 2008-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 #include "unicode/uspoof.h" 10 #include "unicode/uchar.h" 11 #include "unicode/uniset.h" 12 #include "unicode/utf16.h" 13 #include "utrie2.h" 14 #include "cmemory.h" 15 #include "cstring.h" 16 #include "identifier_info.h" 17 #include "scriptset.h" 18 #include "umutex.h" 19 #include "udataswp.h" 20 #include "uassert.h" 21 #include "uspoof_impl.h" 22 23 #if !UCONFIG_NO_NORMALIZATION 24 25 26 U_NAMESPACE_BEGIN 27 28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) 29 30 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) : 31 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) , 32 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { 33 if (U_FAILURE(status)) { 34 return; 35 } 36 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; 37 38 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); 39 allowedCharsSet->freeze(); 40 fAllowedCharsSet = allowedCharsSet; 41 fAllowedLocales = uprv_strdup(""); 42 if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { 43 status = U_MEMORY_ALLOCATION_ERROR; 44 return; 45 } 46 fMagic = USPOOF_MAGIC; 47 } 48 49 50 SpoofImpl::SpoofImpl() : 51 fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 52 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { 53 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); 54 allowedCharsSet->freeze(); 55 fAllowedCharsSet = allowedCharsSet; 56 fAllowedLocales = uprv_strdup(""); 57 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; 58 } 59 60 61 // Copy Constructor, used by the user level clone() function. 62 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : 63 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 64 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { 65 if (U_FAILURE(status)) { 66 return; 67 } 68 fMagic = src.fMagic; 69 fChecks = src.fChecks; 70 if (src.fSpoofData != NULL) { 71 fSpoofData = src.fSpoofData->addReference(); 72 } 73 fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone()); 74 if (fAllowedCharsSet == NULL) { 75 status = U_MEMORY_ALLOCATION_ERROR; 76 } 77 fAllowedLocales = uprv_strdup(src.fAllowedLocales); 78 fRestrictionLevel = src.fRestrictionLevel; 79 } 80 81 SpoofImpl::~SpoofImpl() { 82 fMagic = 0; // head off application errors by preventing use of 83 // of deleted objects. 84 if (fSpoofData != NULL) { 85 fSpoofData->removeReference(); // Will delete if refCount goes to zero. 86 } 87 delete fAllowedCharsSet; 88 uprv_free((void *)fAllowedLocales); 89 delete fCachedIdentifierInfo; 90 } 91 92 // 93 // Incoming parameter check on Status and the SpoofChecker object 94 // received from the C API. 95 // 96 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) { 97 if (U_FAILURE(status)) { 98 return NULL; 99 } 100 if (sc == NULL) { 101 status = U_ILLEGAL_ARGUMENT_ERROR; 102 return NULL; 103 } 104 SpoofImpl *This = (SpoofImpl *)sc; 105 if (This->fMagic != USPOOF_MAGIC || 106 This->fSpoofData == NULL) { 107 status = U_INVALID_FORMAT_ERROR; 108 return NULL; 109 } 110 if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) { 111 return NULL; 112 } 113 return This; 114 } 115 116 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { 117 return const_cast<SpoofImpl *> 118 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status)); 119 } 120 121 122 123 //-------------------------------------------------------------------------------------- 124 // 125 // confusableLookup() This is the heart of the confusable skeleton generation 126 // implementation. 127 // 128 // Given a source character, produce the corresponding 129 // replacement character(s), appending them to the dest string. 130 // 131 //--------------------------------------------------------------------------------------- 132 int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const { 133 134 // Binary search the spoof data key table for the inChar 135 int32_t *low = fSpoofData->fCFUKeys; 136 int32_t *mid = NULL; 137 int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize; 138 UChar32 midc; 139 do { 140 int32_t delta = ((int32_t)(limit-low))/2; 141 mid = low + delta; 142 midc = *mid & 0x1fffff; 143 if (inChar == midc) { 144 goto foundChar; 145 } else if (inChar < midc) { 146 limit = mid; 147 } else { 148 low = mid; 149 } 150 } while (low < limit-1); 151 mid = low; 152 midc = *mid & 0x1fffff; 153 if (inChar != midc) { 154 // Char not found. It maps to itself. 155 int i = 0; 156 dest.append(inChar); 157 return i; 158 } 159 foundChar: 160 int32_t keyFlags = *mid & 0xff000000; 161 if ((keyFlags & tableMask) == 0) { 162 // We found the right key char, but the entry doesn't pertain to the 163 // table we need. See if there is an adjacent key that does 164 if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) { 165 int32_t *altMid; 166 for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) { 167 keyFlags = *altMid & 0xff000000; 168 if (keyFlags & tableMask) { 169 mid = altMid; 170 goto foundKey; 171 } 172 } 173 for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) { 174 keyFlags = *altMid & 0xff000000; 175 if (keyFlags & tableMask) { 176 mid = altMid; 177 goto foundKey; 178 } 179 } 180 } 181 // No key entry for this char & table. 182 // The input char maps to itself. 183 int i = 0; 184 dest.append(inChar); 185 return i; 186 } 187 188 foundKey: 189 int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1; 190 int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys); 191 192 // Value is either a UChar (for strings of length 1) or 193 // an index into the string table (for longer strings) 194 uint16_t value = fSpoofData->fCFUValues[keyTableIndex]; 195 if (stringLen == 1) { 196 dest.append((UChar)value); 197 return 1; 198 } 199 200 // String length of 4 from the above lookup is used for all strings of length >= 4. 201 // For these, get the real length from the string lengths table, 202 // which maps string table indexes to lengths. 203 // All strings of the same length are stored contiguously in the string table. 204 // 'value' from the lookup above is the starting index for the desired string. 205 206 int32_t ix; 207 if (stringLen == 4) { 208 int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize; 209 for (ix = 0; ix < stringLengthsLimit; ix++) { 210 if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) { 211 stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength; 212 break; 213 } 214 } 215 U_ASSERT(ix < stringLengthsLimit); 216 } 217 218 U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen); 219 UChar *src = &fSpoofData->fCFUStrings[value]; 220 dest.append(src, stringLen); 221 return stringLen; 222 } 223 224 225 //--------------------------------------------------------------------------------------- 226 // 227 // wholeScriptCheck() 228 // 229 // Input text is already normalized to NFD 230 // Return the set of scripts, each of which can represent something that is 231 // confusable with the input text. The script of the input text 232 // is included; input consisting of characters from a single script will 233 // always produce a result consisting of a set containing that script. 234 // 235 //--------------------------------------------------------------------------------------- 236 void SpoofImpl::wholeScriptCheck( 237 const UnicodeString &text, ScriptSet *result, UErrorCode &status) const { 238 239 UTrie2 *table = 240 (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie; 241 result->setAll(); 242 int32_t length = text.length(); 243 for (int32_t inputIdx=0; inputIdx < length;) { 244 UChar32 c = text.char32At(inputIdx); 245 inputIdx += U16_LENGTH(c); 246 uint32_t index = utrie2_get32(table, c); 247 if (index == 0) { 248 // No confusables in another script for this char. 249 // TODO: we should change the data to have sets with just the single script 250 // bit for the script of this char. Gets rid of this special case. 251 // Until then, grab the script from the char and intersect it with the set. 252 UScriptCode cpScript = uscript_getScript(c, &status); 253 U_ASSERT(cpScript > USCRIPT_INHERITED); 254 result->intersect(cpScript, status); 255 } else if (index == 1) { 256 // Script == Common or Inherited. Nothing to do. 257 } else { 258 result->intersect(fSpoofData->fScriptSets[index]); 259 } 260 } 261 } 262 263 264 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { 265 UnicodeSet allowedChars; 266 UnicodeSet *tmpSet = NULL; 267 const char *locStart = localesList; 268 const char *locEnd = NULL; 269 const char *localesListEnd = localesList + uprv_strlen(localesList); 270 int32_t localeListCount = 0; // Number of locales provided by caller. 271 272 // Loop runs once per locale from the localesList, a comma separated list of locales. 273 do { 274 locEnd = uprv_strchr(locStart, ','); 275 if (locEnd == NULL) { 276 locEnd = localesListEnd; 277 } 278 while (*locStart == ' ') { 279 locStart++; 280 } 281 const char *trimmedEnd = locEnd-1; 282 while (trimmedEnd > locStart && *trimmedEnd == ' ') { 283 trimmedEnd--; 284 } 285 if (trimmedEnd <= locStart) { 286 break; 287 } 288 const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart)); 289 localeListCount++; 290 291 // We have one locale from the locales list. 292 // Add the script chars for this locale to the accumulating set of allowed chars. 293 // If the locale is no good, we will be notified back via status. 294 addScriptChars(locale, &allowedChars, status); 295 uprv_free((void *)locale); 296 if (U_FAILURE(status)) { 297 break; 298 } 299 locStart = locEnd + 1; 300 } while (locStart < localesListEnd); 301 302 // If our caller provided an empty list of locales, we disable the allowed characters checking 303 if (localeListCount == 0) { 304 uprv_free((void *)fAllowedLocales); 305 fAllowedLocales = uprv_strdup(""); 306 tmpSet = new UnicodeSet(0, 0x10ffff); 307 if (fAllowedLocales == NULL || tmpSet == NULL) { 308 status = U_MEMORY_ALLOCATION_ERROR; 309 return; 310 } 311 tmpSet->freeze(); 312 delete fAllowedCharsSet; 313 fAllowedCharsSet = tmpSet; 314 fChecks &= ~USPOOF_CHAR_LIMIT; 315 return; 316 } 317 318 319 // Add all common and inherited characters to the set of allowed chars. 320 UnicodeSet tempSet; 321 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); 322 allowedChars.addAll(tempSet); 323 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); 324 allowedChars.addAll(tempSet); 325 326 // If anything went wrong, we bail out without changing 327 // the state of the spoof checker. 328 if (U_FAILURE(status)) { 329 return; 330 } 331 332 // Store the updated spoof checker state. 333 tmpSet = static_cast<UnicodeSet *>(allowedChars.clone()); 334 const char *tmpLocalesList = uprv_strdup(localesList); 335 if (tmpSet == NULL || tmpLocalesList == NULL) { 336 status = U_MEMORY_ALLOCATION_ERROR; 337 return; 338 } 339 uprv_free((void *)fAllowedLocales); 340 fAllowedLocales = tmpLocalesList; 341 tmpSet->freeze(); 342 delete fAllowedCharsSet; 343 fAllowedCharsSet = tmpSet; 344 fChecks |= USPOOF_CHAR_LIMIT; 345 } 346 347 348 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { 349 return fAllowedLocales; 350 } 351 352 353 // Given a locale (a language), add all the characters from all of the scripts used with that language 354 // to the allowedChars UnicodeSet 355 356 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { 357 UScriptCode scripts[30]; 358 359 int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status); 360 if (U_FAILURE(status)) { 361 return; 362 } 363 if (status == U_USING_DEFAULT_WARNING) { 364 status = U_ILLEGAL_ARGUMENT_ERROR; 365 return; 366 } 367 UnicodeSet tmpSet; 368 int32_t i; 369 for (i=0; i<numScripts; i++) { 370 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status); 371 allowedChars->addAll(tmpSet); 372 } 373 } 374 375 376 // Convert a text format hex number. Utility function used by builder code. Static. 377 // Input: UChar *string text. Output: a UChar32 378 // Input has been pre-checked, and will have no non-hex chars. 379 // The number must fall in the code point range of 0..0x10ffff 380 // Static Function. 381 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) { 382 if (U_FAILURE(status)) { 383 return 0; 384 } 385 U_ASSERT(limit-start > 0); 386 uint32_t val = 0; 387 int i; 388 for (i=start; i<limit; i++) { 389 int digitVal = s[i] - 0x30; 390 if (digitVal>9) { 391 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A' 392 } 393 if (digitVal>15) { 394 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a' 395 } 396 U_ASSERT(digitVal <= 0xf); 397 val <<= 4; 398 val += digitVal; 399 } 400 if (val > 0x10ffff) { 401 status = U_PARSE_ERROR; 402 val = 0; 403 } 404 return (UChar32)val; 405 } 406 407 // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create. 408 // Maintain a one-element cache, which is sufficient to avoid repeatedly 409 // creating new ones unless we get multi-thread concurrency in spoof 410 // check operations, which should be statistically uncommon. 411 412 // These functions are used in place of new & delete of an IdentifierInfo. 413 // They will recycle the IdentifierInfo when possible. 414 // They are logically const, and used within const functions that must be thread safe. 415 IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const { 416 IdentifierInfo *returnIdInfo = NULL; 417 if (U_FAILURE(status)) { 418 return returnIdInfo; 419 } 420 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this); 421 { 422 Mutex m; 423 returnIdInfo = nonConstThis->fCachedIdentifierInfo; 424 nonConstThis->fCachedIdentifierInfo = NULL; 425 } 426 if (returnIdInfo == NULL) { 427 returnIdInfo = new IdentifierInfo(status); 428 if (U_SUCCESS(status) && returnIdInfo == NULL) { 429 status = U_MEMORY_ALLOCATION_ERROR; 430 } 431 if (U_FAILURE(status) && returnIdInfo != NULL) { 432 delete returnIdInfo; 433 returnIdInfo = NULL; 434 } 435 } 436 return returnIdInfo; 437 } 438 439 440 void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const { 441 if (idInfo != NULL) { 442 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this); 443 { 444 Mutex m; 445 if (nonConstThis->fCachedIdentifierInfo == NULL) { 446 nonConstThis->fCachedIdentifierInfo = idInfo; 447 idInfo = NULL; 448 } 449 } 450 delete idInfo; 451 } 452 } 453 454 455 456 457 //---------------------------------------------------------------------------------------------- 458 // 459 // class SpoofData Implementation 460 // 461 //---------------------------------------------------------------------------------------------- 462 463 464 UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) { 465 if (U_FAILURE(status) || 466 rawData == NULL || 467 rawData->fMagic != USPOOF_MAGIC || 468 rawData->fFormatVersion[0] > 1 || 469 rawData->fFormatVersion[1] > 0) { 470 status = U_INVALID_FORMAT_ERROR; 471 return FALSE; 472 } 473 return TRUE; 474 } 475 476 static UBool U_CALLCONV 477 spoofDataIsAcceptable(void *context, 478 const char * /* type */, const char * /*name*/, 479 const UDataInfo *pInfo) { 480 if( 481 pInfo->size >= 20 && 482 pInfo->isBigEndian == U_IS_BIG_ENDIAN && 483 pInfo->charsetFamily == U_CHARSET_FAMILY && 484 pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu " 485 pInfo->dataFormat[1] == 0x66 && 486 pInfo->dataFormat[2] == 0x75 && 487 pInfo->dataFormat[3] == 0x20 && 488 pInfo->formatVersion[0] == 1 489 ) { 490 UVersionInfo *version = static_cast<UVersionInfo *>(context); 491 if(version != NULL) { 492 uprv_memcpy(version, pInfo->dataVersion, 4); 493 } 494 return TRUE; 495 } else { 496 return FALSE; 497 } 498 } 499 500 // 501 // SpoofData::getDefault() - return a wrapper around the spoof data that is 502 // baked into the default ICU data. 503 // 504 // Called once, from the initOnce() function in uspoof_impl.cpp; the resulting 505 // SpoofData is shared by all spoof checkers using the default data. 506 // 507 SpoofData *SpoofData::getDefault(UErrorCode &status) { 508 UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables", 509 spoofDataIsAcceptable, 510 NULL, // context, would receive dataVersion if supplied. 511 &status); 512 if (U_FAILURE(status)) { 513 return NULL; 514 } 515 SpoofData *This = new SpoofData(udm, status); 516 if (U_FAILURE(status)) { 517 delete This; 518 return NULL; 519 } 520 if (This == NULL) { 521 status = U_MEMORY_ALLOCATION_ERROR; 522 } 523 return This; 524 } 525 526 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) 527 { 528 reset(); 529 if (U_FAILURE(status)) { 530 return; 531 } 532 fUDM = udm; 533 // fRawData is non-const because it may be constructed by the data builder. 534 fRawData = reinterpret_cast<SpoofDataHeader *>( 535 const_cast<void *>(udata_getMemory(udm))); 536 validateDataVersion(fRawData, status); 537 initPtrs(status); 538 } 539 540 541 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) 542 { 543 reset(); 544 if (U_FAILURE(status)) { 545 return; 546 } 547 if ((size_t)length < sizeof(SpoofDataHeader)) { 548 status = U_INVALID_FORMAT_ERROR; 549 return; 550 } 551 void *ncData = const_cast<void *>(data); 552 fRawData = static_cast<SpoofDataHeader *>(ncData); 553 if (length < fRawData->fLength) { 554 status = U_INVALID_FORMAT_ERROR; 555 return; 556 } 557 validateDataVersion(fRawData, status); 558 initPtrs(status); 559 } 560 561 562 // Spoof Data constructor for use from data builder. 563 // Initializes a new, empty data area that will be populated later. 564 SpoofData::SpoofData(UErrorCode &status) { 565 reset(); 566 if (U_FAILURE(status)) { 567 return; 568 } 569 fDataOwned = true; 570 571 // The spoof header should already be sized to be a multiple of 16 bytes. 572 // Just in case it's not, round it up. 573 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15; 574 U_ASSERT(initialSize == sizeof(SpoofDataHeader)); 575 576 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize)); 577 fMemLimit = initialSize; 578 if (fRawData == NULL) { 579 status = U_MEMORY_ALLOCATION_ERROR; 580 return; 581 } 582 uprv_memset(fRawData, 0, initialSize); 583 584 fRawData->fMagic = USPOOF_MAGIC; 585 fRawData->fFormatVersion[0] = 1; 586 fRawData->fFormatVersion[1] = 0; 587 fRawData->fFormatVersion[2] = 0; 588 fRawData->fFormatVersion[3] = 0; 589 initPtrs(status); 590 } 591 592 // reset() - initialize all fields. 593 // Should be updated if any new fields are added. 594 // Called by constructors to put things in a known initial state. 595 void SpoofData::reset() { 596 fRawData = NULL; 597 fDataOwned = FALSE; 598 fUDM = NULL; 599 fMemLimit = 0; 600 fRefCount = 1; 601 fCFUKeys = NULL; 602 fCFUValues = NULL; 603 fCFUStringLengths = NULL; 604 fCFUStrings = NULL; 605 fAnyCaseTrie = NULL; 606 fLowerCaseTrie = NULL; 607 fScriptSets = NULL; 608 } 609 610 611 // SpoofData::initPtrs() 612 // Initialize the pointers to the various sections of the raw data. 613 // 614 // This function is used both during the Trie building process (multiple 615 // times, as the individual data sections are added), and 616 // during the opening of a Spoof Checker from prebuilt data. 617 // 618 // The pointers for non-existent data sections (identified by an offset of 0) 619 // are set to NULL. 620 // 621 // Note: During building the data, adding each new data section 622 // reallocs the raw data area, which likely relocates it, which 623 // in turn requires reinitializing all of the pointers into it, hence 624 // multiple calls to this function during building. 625 // 626 void SpoofData::initPtrs(UErrorCode &status) { 627 fCFUKeys = NULL; 628 fCFUValues = NULL; 629 fCFUStringLengths = NULL; 630 fCFUStrings = NULL; 631 if (U_FAILURE(status)) { 632 return; 633 } 634 if (fRawData->fCFUKeys != 0) { 635 fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys); 636 } 637 if (fRawData->fCFUStringIndex != 0) { 638 fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); 639 } 640 if (fRawData->fCFUStringLengths != 0) { 641 fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths); 642 } 643 if (fRawData->fCFUStringTable != 0) { 644 fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); 645 } 646 647 if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) { 648 fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 649 (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status); 650 } 651 if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) { 652 fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 653 (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status); 654 } 655 656 if (fRawData->fScriptSets != 0) { 657 fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets); 658 } 659 } 660 661 662 SpoofData::~SpoofData() { 663 utrie2_close(fAnyCaseTrie); 664 fAnyCaseTrie = NULL; 665 utrie2_close(fLowerCaseTrie); 666 fLowerCaseTrie = NULL; 667 if (fDataOwned) { 668 uprv_free(fRawData); 669 } 670 fRawData = NULL; 671 if (fUDM != NULL) { 672 udata_close(fUDM); 673 } 674 fUDM = NULL; 675 } 676 677 678 void SpoofData::removeReference() { 679 if (umtx_atomic_dec(&fRefCount) == 0) { 680 delete this; 681 } 682 } 683 684 685 SpoofData *SpoofData::addReference() { 686 umtx_atomic_inc(&fRefCount); 687 return this; 688 } 689 690 691 void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { 692 if (U_FAILURE(status)) { 693 return NULL; 694 } 695 if (!fDataOwned) { 696 U_ASSERT(FALSE); 697 status = U_INTERNAL_PROGRAM_ERROR; 698 return NULL; 699 } 700 701 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16 702 uint32_t returnOffset = fMemLimit; 703 fMemLimit += numBytes; 704 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit)); 705 fRawData->fLength = fMemLimit; 706 uprv_memset((char *)fRawData + returnOffset, 0, numBytes); 707 initPtrs(status); 708 return (char *)fRawData + returnOffset; 709 } 710 711 712 U_NAMESPACE_END 713 714 U_NAMESPACE_USE 715 716 //----------------------------------------------------------------------------- 717 // 718 // uspoof_swap - byte swap and char encoding swap of spoof data 719 // 720 //----------------------------------------------------------------------------- 721 U_CAPI int32_t U_EXPORT2 722 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 723 UErrorCode *status) { 724 725 if (status == NULL || U_FAILURE(*status)) { 726 return 0; 727 } 728 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { 729 *status=U_ILLEGAL_ARGUMENT_ERROR; 730 return 0; 731 } 732 733 // 734 // Check that the data header is for spoof data. 735 // (Header contents are defined in gencfu.cpp) 736 // 737 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); 738 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */ 739 pInfo->dataFormat[1]==0x66 && 740 pInfo->dataFormat[2]==0x75 && 741 pInfo->dataFormat[3]==0x20 && 742 pInfo->formatVersion[0]==1 )) { 743 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " 744 "(format version %02x %02x %02x %02x) is not recognized\n", 745 pInfo->dataFormat[0], pInfo->dataFormat[1], 746 pInfo->dataFormat[2], pInfo->dataFormat[3], 747 pInfo->formatVersion[0], pInfo->formatVersion[1], 748 pInfo->formatVersion[2], pInfo->formatVersion[3]); 749 *status=U_UNSUPPORTED_ERROR; 750 return 0; 751 } 752 753 // 754 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific 755 // header). This swap also conveniently gets us 756 // the size of the ICU d.h., which lets us locate the start 757 // of the uspoof specific data. 758 // 759 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 760 761 762 // 763 // Get the Spoof Data Header, and check that it appears to be OK. 764 // 765 // 766 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 767 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes; 768 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC || 769 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader)) 770 { 771 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n"); 772 *status=U_UNSUPPORTED_ERROR; 773 return 0; 774 } 775 776 // 777 // Prefight operation? Just return the size 778 // 779 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength); 780 int32_t totalSize = headerSize + spoofDataLength; 781 if (length < 0) { 782 return totalSize; 783 } 784 785 // 786 // Check that length passed in is consistent with length from Spoof data header. 787 // 788 if (length < totalSize) { 789 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n", 790 spoofDataLength); 791 *status=U_INDEX_OUTOFBOUNDS_ERROR; 792 return 0; 793 } 794 795 796 // 797 // Swap the Data. Do the data itself first, then the Spoof Data Header, because 798 // we need to reference the header to locate the data, and an 799 // inplace swap of the header leaves it unusable. 800 // 801 uint8_t *outBytes = (uint8_t *)outData + headerSize; 802 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes; 803 804 int32_t sectionStart; 805 int32_t sectionLength; 806 807 // 808 // If not swapping in place, zero out the output buffer before starting. 809 // Gaps may exist between the individual sections, and these must be zeroed in 810 // the output buffer. The simplest way to do that is to just zero the whole thing. 811 // 812 if (inBytes != outBytes) { 813 uprv_memset(outBytes, 0, spoofDataLength); 814 } 815 816 // Confusables Keys Section (fCFUKeys) 817 sectionStart = ds->readUInt32(spoofDH->fCFUKeys); 818 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4; 819 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 820 821 // String Index Section 822 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex); 823 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2; 824 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 825 826 // String Table Section 827 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable); 828 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; 829 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 830 831 // String Lengths Section 832 sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths); 833 sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4; 834 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 835 836 // Any Case Trie 837 sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie); 838 sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength); 839 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 840 841 // Lower Case Trie 842 sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie); 843 sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength); 844 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 845 846 // Script Sets. The data is an array of int32_t 847 sectionStart = ds->readUInt32(spoofDH->fScriptSets); 848 sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet); 849 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 850 851 // And, last, swap the header itself. 852 // int32_t fMagic // swap this 853 // uint8_t fFormatVersion[4] // Do not swap this, just copy 854 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff. 855 // 856 uint32_t magic = ds->readUInt32(spoofDH->fMagic); 857 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic); 858 859 if (outputDH->fFormatVersion != spoofDH->fFormatVersion) { 860 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion)); 861 } 862 // swap starting at fLength 863 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status); 864 865 return totalSize; 866 } 867 868 #endif 869 870 871