1 /* 2 ********************************************************************** 3 * Copyright (C) 2008-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 #include "unicode/uspoof.h" 10 #include "unicode/uchar.h" 11 #include "unicode/uniset.h" 12 #include "unicode/utf16.h" 13 #include "utrie2.h" 14 #include "cmemory.h" 15 #include "cstring.h" 16 #include "identifier_info.h" 17 #include "scriptset.h" 18 #include "umutex.h" 19 #include "udataswp.h" 20 #include "uassert.h" 21 #include "uspoof_impl.h" 22 23 #if !UCONFIG_NO_NORMALIZATION 24 25 26 U_NAMESPACE_BEGIN 27 28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) 29 30 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) : 31 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 32 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { 33 if (U_FAILURE(status)) { 34 return; 35 } 36 fSpoofData = data; 37 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; 38 39 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); 40 allowedCharsSet->freeze(); 41 fAllowedCharsSet = allowedCharsSet; 42 fAllowedLocales = uprv_strdup(""); 43 if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { 44 status = U_MEMORY_ALLOCATION_ERROR; 45 return; 46 } 47 fMagic = USPOOF_MAGIC; 48 } 49 50 51 SpoofImpl::SpoofImpl() : 52 fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 53 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { 54 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); 55 allowedCharsSet->freeze(); 56 fAllowedCharsSet = allowedCharsSet; 57 fAllowedLocales = uprv_strdup(""); 58 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; 59 } 60 61 62 // Copy Constructor, used by the user level clone() function. 63 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : 64 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 65 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) { 66 if (U_FAILURE(status)) { 67 return; 68 } 69 fMagic = src.fMagic; 70 fChecks = src.fChecks; 71 if (src.fSpoofData != NULL) { 72 fSpoofData = src.fSpoofData->addReference(); 73 } 74 fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone()); 75 if (fAllowedCharsSet == NULL) { 76 status = U_MEMORY_ALLOCATION_ERROR; 77 } 78 fAllowedLocales = uprv_strdup(src.fAllowedLocales); 79 fRestrictionLevel = src.fRestrictionLevel; 80 } 81 82 SpoofImpl::~SpoofImpl() { 83 fMagic = 0; // head off application errors by preventing use of 84 // of deleted objects. 85 if (fSpoofData != NULL) { 86 fSpoofData->removeReference(); // Will delete if refCount goes to zero. 87 } 88 delete fAllowedCharsSet; 89 uprv_free((void *)fAllowedLocales); 90 delete fCachedIdentifierInfo; 91 } 92 93 // 94 // Incoming parameter check on Status and the SpoofChecker object 95 // received from the C API. 96 // 97 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) { 98 if (U_FAILURE(status)) { 99 return NULL; 100 } 101 if (sc == NULL) { 102 status = U_ILLEGAL_ARGUMENT_ERROR; 103 return NULL; 104 } 105 SpoofImpl *This = (SpoofImpl *)sc; 106 if (This->fMagic != USPOOF_MAGIC || 107 This->fSpoofData == NULL) { 108 status = U_INVALID_FORMAT_ERROR; 109 return NULL; 110 } 111 if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) { 112 return NULL; 113 } 114 return This; 115 } 116 117 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { 118 return const_cast<SpoofImpl *> 119 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status)); 120 } 121 122 123 124 //-------------------------------------------------------------------------------------- 125 // 126 // confusableLookup() This is the heart of the confusable skeleton generation 127 // implementation. 128 // 129 // Given a source character, produce the corresponding 130 // replacement character(s), appending them to the dest string. 131 // 132 //--------------------------------------------------------------------------------------- 133 int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const { 134 135 // Binary search the spoof data key table for the inChar 136 int32_t *low = fSpoofData->fCFUKeys; 137 int32_t *mid = NULL; 138 int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize; 139 UChar32 midc; 140 do { 141 int32_t delta = ((int32_t)(limit-low))/2; 142 mid = low + delta; 143 midc = *mid & 0x1fffff; 144 if (inChar == midc) { 145 goto foundChar; 146 } else if (inChar < midc) { 147 limit = mid; 148 } else { 149 low = mid; 150 } 151 } while (low < limit-1); 152 mid = low; 153 midc = *mid & 0x1fffff; 154 if (inChar != midc) { 155 // Char not found. It maps to itself. 156 int i = 0; 157 dest.append(inChar); 158 return i; 159 } 160 foundChar: 161 int32_t keyFlags = *mid & 0xff000000; 162 if ((keyFlags & tableMask) == 0) { 163 // We found the right key char, but the entry doesn't pertain to the 164 // table we need. See if there is an adjacent key that does 165 if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) { 166 int32_t *altMid; 167 for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) { 168 keyFlags = *altMid & 0xff000000; 169 if (keyFlags & tableMask) { 170 mid = altMid; 171 goto foundKey; 172 } 173 } 174 for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) { 175 keyFlags = *altMid & 0xff000000; 176 if (keyFlags & tableMask) { 177 mid = altMid; 178 goto foundKey; 179 } 180 } 181 } 182 // No key entry for this char & table. 183 // The input char maps to itself. 184 int i = 0; 185 dest.append(inChar); 186 return i; 187 } 188 189 foundKey: 190 int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1; 191 int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys); 192 193 // Value is either a UChar (for strings of length 1) or 194 // an index into the string table (for longer strings) 195 uint16_t value = fSpoofData->fCFUValues[keyTableIndex]; 196 if (stringLen == 1) { 197 dest.append((UChar)value); 198 return 1; 199 } 200 201 // String length of 4 from the above lookup is used for all strings of length >= 4. 202 // For these, get the real length from the string lengths table, 203 // which maps string table indexes to lengths. 204 // All strings of the same length are stored contiguously in the string table. 205 // 'value' from the lookup above is the starting index for the desired string. 206 207 int32_t ix; 208 if (stringLen == 4) { 209 int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize; 210 for (ix = 0; ix < stringLengthsLimit; ix++) { 211 if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) { 212 stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength; 213 break; 214 } 215 } 216 U_ASSERT(ix < stringLengthsLimit); 217 } 218 219 U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen); 220 UChar *src = &fSpoofData->fCFUStrings[value]; 221 dest.append(src, stringLen); 222 return stringLen; 223 } 224 225 226 //--------------------------------------------------------------------------------------- 227 // 228 // wholeScriptCheck() 229 // 230 // Input text is already normalized to NFD 231 // Return the set of scripts, each of which can represent something that is 232 // confusable with the input text. The script of the input text 233 // is included; input consisting of characters from a single script will 234 // always produce a result consisting of a set containing that script. 235 // 236 //--------------------------------------------------------------------------------------- 237 void SpoofImpl::wholeScriptCheck( 238 const UnicodeString &text, ScriptSet *result, UErrorCode &status) const { 239 240 UTrie2 *table = 241 (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie; 242 result->setAll(); 243 int32_t length = text.length(); 244 for (int32_t inputIdx=0; inputIdx < length;) { 245 UChar32 c = text.char32At(inputIdx); 246 inputIdx += U16_LENGTH(c); 247 uint32_t index = utrie2_get32(table, c); 248 if (index == 0) { 249 // No confusables in another script for this char. 250 // TODO: we should change the data to have sets with just the single script 251 // bit for the script of this char. Gets rid of this special case. 252 // Until then, grab the script from the char and intersect it with the set. 253 UScriptCode cpScript = uscript_getScript(c, &status); 254 U_ASSERT(cpScript > USCRIPT_INHERITED); 255 result->intersect(cpScript, status); 256 } else if (index == 1) { 257 // Script == Common or Inherited. Nothing to do. 258 } else { 259 result->intersect(fSpoofData->fScriptSets[index]); 260 } 261 } 262 } 263 264 265 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { 266 UnicodeSet allowedChars; 267 UnicodeSet *tmpSet = NULL; 268 const char *locStart = localesList; 269 const char *locEnd = NULL; 270 const char *localesListEnd = localesList + uprv_strlen(localesList); 271 int32_t localeListCount = 0; // Number of locales provided by caller. 272 273 // Loop runs once per locale from the localesList, a comma separated list of locales. 274 do { 275 locEnd = uprv_strchr(locStart, ','); 276 if (locEnd == NULL) { 277 locEnd = localesListEnd; 278 } 279 while (*locStart == ' ') { 280 locStart++; 281 } 282 const char *trimmedEnd = locEnd-1; 283 while (trimmedEnd > locStart && *trimmedEnd == ' ') { 284 trimmedEnd--; 285 } 286 if (trimmedEnd <= locStart) { 287 break; 288 } 289 const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart)); 290 localeListCount++; 291 292 // We have one locale from the locales list. 293 // Add the script chars for this locale to the accumulating set of allowed chars. 294 // If the locale is no good, we will be notified back via status. 295 addScriptChars(locale, &allowedChars, status); 296 uprv_free((void *)locale); 297 if (U_FAILURE(status)) { 298 break; 299 } 300 locStart = locEnd + 1; 301 } while (locStart < localesListEnd); 302 303 // If our caller provided an empty list of locales, we disable the allowed characters checking 304 if (localeListCount == 0) { 305 uprv_free((void *)fAllowedLocales); 306 fAllowedLocales = uprv_strdup(""); 307 tmpSet = new UnicodeSet(0, 0x10ffff); 308 if (fAllowedLocales == NULL || tmpSet == NULL) { 309 status = U_MEMORY_ALLOCATION_ERROR; 310 return; 311 } 312 tmpSet->freeze(); 313 delete fAllowedCharsSet; 314 fAllowedCharsSet = tmpSet; 315 fChecks &= ~USPOOF_CHAR_LIMIT; 316 return; 317 } 318 319 320 // Add all common and inherited characters to the set of allowed chars. 321 UnicodeSet tempSet; 322 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); 323 allowedChars.addAll(tempSet); 324 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); 325 allowedChars.addAll(tempSet); 326 327 // If anything went wrong, we bail out without changing 328 // the state of the spoof checker. 329 if (U_FAILURE(status)) { 330 return; 331 } 332 333 // Store the updated spoof checker state. 334 tmpSet = static_cast<UnicodeSet *>(allowedChars.clone()); 335 const char *tmpLocalesList = uprv_strdup(localesList); 336 if (tmpSet == NULL || tmpLocalesList == NULL) { 337 status = U_MEMORY_ALLOCATION_ERROR; 338 return; 339 } 340 uprv_free((void *)fAllowedLocales); 341 fAllowedLocales = tmpLocalesList; 342 tmpSet->freeze(); 343 delete fAllowedCharsSet; 344 fAllowedCharsSet = tmpSet; 345 fChecks |= USPOOF_CHAR_LIMIT; 346 } 347 348 349 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { 350 return fAllowedLocales; 351 } 352 353 354 // Given a locale (a language), add all the characters from all of the scripts used with that language 355 // to the allowedChars UnicodeSet 356 357 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { 358 UScriptCode scripts[30]; 359 360 int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status); 361 if (U_FAILURE(status)) { 362 return; 363 } 364 if (status == U_USING_DEFAULT_WARNING) { 365 status = U_ILLEGAL_ARGUMENT_ERROR; 366 return; 367 } 368 UnicodeSet tmpSet; 369 int32_t i; 370 for (i=0; i<numScripts; i++) { 371 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status); 372 allowedChars->addAll(tmpSet); 373 } 374 } 375 376 377 // Convert a text format hex number. Utility function used by builder code. Static. 378 // Input: UChar *string text. Output: a UChar32 379 // Input has been pre-checked, and will have no non-hex chars. 380 // The number must fall in the code point range of 0..0x10ffff 381 // Static Function. 382 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) { 383 if (U_FAILURE(status)) { 384 return 0; 385 } 386 U_ASSERT(limit-start > 0); 387 uint32_t val = 0; 388 int i; 389 for (i=start; i<limit; i++) { 390 int digitVal = s[i] - 0x30; 391 if (digitVal>9) { 392 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A' 393 } 394 if (digitVal>15) { 395 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a' 396 } 397 U_ASSERT(digitVal <= 0xf); 398 val <<= 4; 399 val += digitVal; 400 } 401 if (val > 0x10ffff) { 402 status = U_PARSE_ERROR; 403 val = 0; 404 } 405 return (UChar32)val; 406 } 407 408 // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create. 409 // Maintain a one-element cache, which is sufficient to avoid repeatedly 410 // creating new ones unless we get multi-thread concurrency in spoof 411 // check operations, which should be statistically uncommon. 412 413 // These functions are used in place of new & delete of an IdentifierInfo. 414 // They will recycle the IdentifierInfo when possible. 415 // They are logically const, and used within const functions that must be thread safe. 416 IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const { 417 IdentifierInfo *returnIdInfo = NULL; 418 if (U_FAILURE(status)) { 419 return returnIdInfo; 420 } 421 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this); 422 { 423 Mutex m; 424 returnIdInfo = nonConstThis->fCachedIdentifierInfo; 425 nonConstThis->fCachedIdentifierInfo = NULL; 426 } 427 if (returnIdInfo == NULL) { 428 returnIdInfo = new IdentifierInfo(status); 429 if (U_SUCCESS(status) && returnIdInfo == NULL) { 430 status = U_MEMORY_ALLOCATION_ERROR; 431 } 432 if (U_FAILURE(status) && returnIdInfo != NULL) { 433 delete returnIdInfo; 434 returnIdInfo = NULL; 435 } 436 } 437 return returnIdInfo; 438 } 439 440 441 void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const { 442 if (idInfo != NULL) { 443 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this); 444 { 445 Mutex m; 446 if (nonConstThis->fCachedIdentifierInfo == NULL) { 447 nonConstThis->fCachedIdentifierInfo = idInfo; 448 idInfo = NULL; 449 } 450 } 451 delete idInfo; 452 } 453 } 454 455 456 457 458 //---------------------------------------------------------------------------------------------- 459 // 460 // class SpoofData Implementation 461 // 462 //---------------------------------------------------------------------------------------------- 463 464 465 UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) { 466 if (U_FAILURE(status) || 467 rawData == NULL || 468 rawData->fMagic != USPOOF_MAGIC || 469 rawData->fFormatVersion[0] > 1 || 470 rawData->fFormatVersion[1] > 0) { 471 status = U_INVALID_FORMAT_ERROR; 472 return FALSE; 473 } 474 return TRUE; 475 } 476 477 static UBool U_CALLCONV 478 spoofDataIsAcceptable(void *context, 479 const char * /* type */, const char * /*name*/, 480 const UDataInfo *pInfo) { 481 if( 482 pInfo->size >= 20 && 483 pInfo->isBigEndian == U_IS_BIG_ENDIAN && 484 pInfo->charsetFamily == U_CHARSET_FAMILY && 485 pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu " 486 pInfo->dataFormat[1] == 0x66 && 487 pInfo->dataFormat[2] == 0x75 && 488 pInfo->dataFormat[3] == 0x20 && 489 pInfo->formatVersion[0] == 1 490 ) { 491 UVersionInfo *version = static_cast<UVersionInfo *>(context); 492 if(version != NULL) { 493 uprv_memcpy(version, pInfo->dataVersion, 4); 494 } 495 return TRUE; 496 } else { 497 return FALSE; 498 } 499 } 500 501 // 502 // SpoofData::getDefault() - return a wrapper around the spoof data that is 503 // baked into the default ICU data. 504 // 505 SpoofData *SpoofData::getDefault(UErrorCode &status) { 506 // TODO: Cache it. Lazy create, keep until cleanup. 507 508 UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables", 509 spoofDataIsAcceptable, 510 NULL, // context, would receive dataVersion if supplied. 511 &status); 512 if (U_FAILURE(status)) { 513 return NULL; 514 } 515 SpoofData *This = new SpoofData(udm, status); 516 if (U_FAILURE(status)) { 517 delete This; 518 return NULL; 519 } 520 if (This == NULL) { 521 status = U_MEMORY_ALLOCATION_ERROR; 522 } 523 return This; 524 } 525 526 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) 527 { 528 reset(); 529 if (U_FAILURE(status)) { 530 return; 531 } 532 fUDM = udm; 533 // fRawData is non-const because it may be constructed by the data builder. 534 fRawData = reinterpret_cast<SpoofDataHeader *>( 535 const_cast<void *>(udata_getMemory(udm))); 536 validateDataVersion(fRawData, status); 537 initPtrs(status); 538 } 539 540 541 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) 542 { 543 reset(); 544 if (U_FAILURE(status)) { 545 return; 546 } 547 if ((size_t)length < sizeof(SpoofDataHeader)) { 548 status = U_INVALID_FORMAT_ERROR; 549 return; 550 } 551 void *ncData = const_cast<void *>(data); 552 fRawData = static_cast<SpoofDataHeader *>(ncData); 553 if (length < fRawData->fLength) { 554 status = U_INVALID_FORMAT_ERROR; 555 return; 556 } 557 validateDataVersion(fRawData, status); 558 initPtrs(status); 559 } 560 561 562 // Spoof Data constructor for use from data builder. 563 // Initializes a new, empty data area that will be populated later. 564 SpoofData::SpoofData(UErrorCode &status) { 565 reset(); 566 if (U_FAILURE(status)) { 567 return; 568 } 569 fDataOwned = true; 570 fRefCount = 1; 571 572 // The spoof header should already be sized to be a multiple of 16 bytes. 573 // Just in case it's not, round it up. 574 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15; 575 U_ASSERT(initialSize == sizeof(SpoofDataHeader)); 576 577 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize)); 578 fMemLimit = initialSize; 579 if (fRawData == NULL) { 580 status = U_MEMORY_ALLOCATION_ERROR; 581 return; 582 } 583 uprv_memset(fRawData, 0, initialSize); 584 585 fRawData->fMagic = USPOOF_MAGIC; 586 fRawData->fFormatVersion[0] = 1; 587 fRawData->fFormatVersion[1] = 0; 588 fRawData->fFormatVersion[2] = 0; 589 fRawData->fFormatVersion[3] = 0; 590 initPtrs(status); 591 } 592 593 // reset() - initialize all fields. 594 // Should be updated if any new fields are added. 595 // Called by constructors to put things in a known initial state. 596 void SpoofData::reset() { 597 fRawData = NULL; 598 fDataOwned = FALSE; 599 fUDM = NULL; 600 fMemLimit = 0; 601 fRefCount = 1; 602 fCFUKeys = NULL; 603 fCFUValues = NULL; 604 fCFUStringLengths = NULL; 605 fCFUStrings = NULL; 606 fAnyCaseTrie = NULL; 607 fLowerCaseTrie = NULL; 608 fScriptSets = NULL; 609 } 610 611 612 // SpoofData::initPtrs() 613 // Initialize the pointers to the various sections of the raw data. 614 // 615 // This function is used both during the Trie building process (multiple 616 // times, as the individual data sections are added), and 617 // during the opening of a Spoof Checker from prebuilt data. 618 // 619 // The pointers for non-existent data sections (identified by an offset of 0) 620 // are set to NULL. 621 // 622 // Note: During building the data, adding each new data section 623 // reallocs the raw data area, which likely relocates it, which 624 // in turn requires reinitializing all of the pointers into it, hence 625 // multiple calls to this function during building. 626 // 627 void SpoofData::initPtrs(UErrorCode &status) { 628 fCFUKeys = NULL; 629 fCFUValues = NULL; 630 fCFUStringLengths = NULL; 631 fCFUStrings = NULL; 632 if (U_FAILURE(status)) { 633 return; 634 } 635 if (fRawData->fCFUKeys != 0) { 636 fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys); 637 } 638 if (fRawData->fCFUStringIndex != 0) { 639 fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); 640 } 641 if (fRawData->fCFUStringLengths != 0) { 642 fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths); 643 } 644 if (fRawData->fCFUStringTable != 0) { 645 fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); 646 } 647 648 if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) { 649 fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 650 (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status); 651 } 652 if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) { 653 fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 654 (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status); 655 } 656 657 if (fRawData->fScriptSets != 0) { 658 fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets); 659 } 660 } 661 662 663 SpoofData::~SpoofData() { 664 utrie2_close(fAnyCaseTrie); 665 fAnyCaseTrie = NULL; 666 utrie2_close(fLowerCaseTrie); 667 fLowerCaseTrie = NULL; 668 if (fDataOwned) { 669 uprv_free(fRawData); 670 } 671 fRawData = NULL; 672 if (fUDM != NULL) { 673 udata_close(fUDM); 674 } 675 fUDM = NULL; 676 } 677 678 679 void SpoofData::removeReference() { 680 if (umtx_atomic_dec(&fRefCount) == 0) { 681 delete this; 682 } 683 } 684 685 686 SpoofData *SpoofData::addReference() { 687 umtx_atomic_inc(&fRefCount); 688 return this; 689 } 690 691 692 void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { 693 if (U_FAILURE(status)) { 694 return NULL; 695 } 696 if (!fDataOwned) { 697 U_ASSERT(FALSE); 698 status = U_INTERNAL_PROGRAM_ERROR; 699 return NULL; 700 } 701 702 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16 703 uint32_t returnOffset = fMemLimit; 704 fMemLimit += numBytes; 705 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit)); 706 fRawData->fLength = fMemLimit; 707 uprv_memset((char *)fRawData + returnOffset, 0, numBytes); 708 initPtrs(status); 709 return (char *)fRawData + returnOffset; 710 } 711 712 713 U_NAMESPACE_END 714 715 U_NAMESPACE_USE 716 717 //----------------------------------------------------------------------------- 718 // 719 // uspoof_swap - byte swap and char encoding swap of spoof data 720 // 721 //----------------------------------------------------------------------------- 722 U_CAPI int32_t U_EXPORT2 723 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 724 UErrorCode *status) { 725 726 if (status == NULL || U_FAILURE(*status)) { 727 return 0; 728 } 729 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { 730 *status=U_ILLEGAL_ARGUMENT_ERROR; 731 return 0; 732 } 733 734 // 735 // Check that the data header is for spoof data. 736 // (Header contents are defined in gencfu.cpp) 737 // 738 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); 739 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */ 740 pInfo->dataFormat[1]==0x66 && 741 pInfo->dataFormat[2]==0x75 && 742 pInfo->dataFormat[3]==0x20 && 743 pInfo->formatVersion[0]==1 )) { 744 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " 745 "(format version %02x %02x %02x %02x) is not recognized\n", 746 pInfo->dataFormat[0], pInfo->dataFormat[1], 747 pInfo->dataFormat[2], pInfo->dataFormat[3], 748 pInfo->formatVersion[0], pInfo->formatVersion[1], 749 pInfo->formatVersion[2], pInfo->formatVersion[3]); 750 *status=U_UNSUPPORTED_ERROR; 751 return 0; 752 } 753 754 // 755 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific 756 // header). This swap also conveniently gets us 757 // the size of the ICU d.h., which lets us locate the start 758 // of the uspoof specific data. 759 // 760 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 761 762 763 // 764 // Get the Spoof Data Header, and check that it appears to be OK. 765 // 766 // 767 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 768 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes; 769 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC || 770 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader)) 771 { 772 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n"); 773 *status=U_UNSUPPORTED_ERROR; 774 return 0; 775 } 776 777 // 778 // Prefight operation? Just return the size 779 // 780 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength); 781 int32_t totalSize = headerSize + spoofDataLength; 782 if (length < 0) { 783 return totalSize; 784 } 785 786 // 787 // Check that length passed in is consistent with length from Spoof data header. 788 // 789 if (length < totalSize) { 790 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n", 791 spoofDataLength); 792 *status=U_INDEX_OUTOFBOUNDS_ERROR; 793 return 0; 794 } 795 796 797 // 798 // Swap the Data. Do the data itself first, then the Spoof Data Header, because 799 // we need to reference the header to locate the data, and an 800 // inplace swap of the header leaves it unusable. 801 // 802 uint8_t *outBytes = (uint8_t *)outData + headerSize; 803 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes; 804 805 int32_t sectionStart; 806 int32_t sectionLength; 807 808 // 809 // If not swapping in place, zero out the output buffer before starting. 810 // Gaps may exist between the individual sections, and these must be zeroed in 811 // the output buffer. The simplest way to do that is to just zero the whole thing. 812 // 813 if (inBytes != outBytes) { 814 uprv_memset(outBytes, 0, spoofDataLength); 815 } 816 817 // Confusables Keys Section (fCFUKeys) 818 sectionStart = ds->readUInt32(spoofDH->fCFUKeys); 819 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4; 820 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 821 822 // String Index Section 823 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex); 824 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2; 825 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 826 827 // String Table Section 828 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable); 829 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; 830 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 831 832 // String Lengths Section 833 sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths); 834 sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4; 835 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 836 837 // Any Case Trie 838 sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie); 839 sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength); 840 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 841 842 // Lower Case Trie 843 sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie); 844 sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength); 845 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 846 847 // Script Sets. The data is an array of int32_t 848 sectionStart = ds->readUInt32(spoofDH->fScriptSets); 849 sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet); 850 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 851 852 // And, last, swap the header itself. 853 // int32_t fMagic // swap this 854 // uint8_t fFormatVersion[4] // Do not swap this, just copy 855 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff. 856 // 857 uint32_t magic = ds->readUInt32(spoofDH->fMagic); 858 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic); 859 860 if (outputDH->fFormatVersion != spoofDH->fFormatVersion) { 861 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion)); 862 } 863 // swap starting at fLength 864 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status); 865 866 return totalSize; 867 } 868 869 #endif 870 871 872