1 /* 2 ********************************************************************** 3 * Copyright (C) 2008-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 #include "unicode/uspoof.h" 10 #include "unicode/unorm.h" 11 #include "unicode/uchar.h" 12 #include "unicode/uniset.h" 13 #include "unicode/utf16.h" 14 #include "utrie2.h" 15 #include "cmemory.h" 16 #include "cstring.h" 17 #include "udatamem.h" 18 #include "umutex.h" 19 #include "udataswp.h" 20 #include "uassert.h" 21 #include "uspoof_impl.h" 22 23 #if !UCONFIG_NO_NORMALIZATION 24 25 26 U_NAMESPACE_BEGIN 27 28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) 29 30 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) : 31 fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) , fAllowedLocales(uprv_strdup("")) { 32 if (U_FAILURE(status)) { 33 return; 34 } 35 fMagic = USPOOF_MAGIC; 36 fSpoofData = data; 37 fChecks = USPOOF_ALL_CHECKS; 38 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); 39 if (allowedCharsSet == NULL || fAllowedLocales == NULL) { 40 status = U_MEMORY_ALLOCATION_ERROR; 41 return; 42 } 43 allowedCharsSet->freeze(); 44 fAllowedCharsSet = allowedCharsSet; 45 } 46 47 48 SpoofImpl::SpoofImpl() { 49 fMagic = USPOOF_MAGIC; 50 fSpoofData = NULL; 51 fChecks = USPOOF_ALL_CHECKS; 52 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); 53 allowedCharsSet->freeze(); 54 fAllowedCharsSet = allowedCharsSet; 55 fAllowedLocales = uprv_strdup(""); 56 } 57 58 59 // Copy Constructor, used by the user level clone() function. 60 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : 61 fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) { 62 if (U_FAILURE(status)) { 63 return; 64 } 65 fMagic = src.fMagic; 66 fChecks = src.fChecks; 67 if (src.fSpoofData != NULL) { 68 fSpoofData = src.fSpoofData->addReference(); 69 } 70 fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone()); 71 if (fAllowedCharsSet == NULL) { 72 status = U_MEMORY_ALLOCATION_ERROR; 73 } 74 fAllowedLocales = uprv_strdup(src.fAllowedLocales); 75 } 76 77 SpoofImpl::~SpoofImpl() { 78 fMagic = 0; // head off application errors by preventing use of 79 // of deleted objects. 80 if (fSpoofData != NULL) { 81 fSpoofData->removeReference(); // Will delete if refCount goes to zero. 82 } 83 delete fAllowedCharsSet; 84 uprv_free((void *)fAllowedLocales); 85 } 86 87 // 88 // Incoming parameter check on Status and the SpoofChecker object 89 // received from the C API. 90 // 91 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) { 92 if (U_FAILURE(status)) { 93 return NULL; 94 } 95 if (sc == NULL) { 96 status = U_ILLEGAL_ARGUMENT_ERROR; 97 return NULL; 98 }; 99 SpoofImpl *This = (SpoofImpl *)sc; 100 if (This->fMagic != USPOOF_MAGIC || 101 This->fSpoofData == NULL) { 102 status = U_INVALID_FORMAT_ERROR; 103 return NULL; 104 } 105 if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) { 106 return NULL; 107 } 108 return This; 109 } 110 111 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { 112 return const_cast<SpoofImpl *> 113 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status)); 114 } 115 116 117 118 //-------------------------------------------------------------------------------------- 119 // 120 // confusableLookup() This is the heart of the confusable skeleton generation 121 // implementation. 122 // 123 // Given a source character, produce the corresponding 124 // replacement character(s) 125 // 126 //--------------------------------------------------------------------------------------- 127 int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const { 128 129 // Binary search the spoof data key table for the inChar 130 int32_t *low = fSpoofData->fCFUKeys; 131 int32_t *mid = NULL; 132 int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize; 133 UChar32 midc; 134 do { 135 int32_t delta = ((int32_t)(limit-low))/2; 136 mid = low + delta; 137 midc = *mid & 0x1fffff; 138 if (inChar == midc) { 139 goto foundChar; 140 } else if (inChar < midc) { 141 limit = mid; 142 } else { 143 low = mid; 144 } 145 } while (low < limit-1); 146 mid = low; 147 midc = *mid & 0x1fffff; 148 if (inChar != midc) { 149 // Char not found. It maps to itself. 150 int i = 0; 151 U16_APPEND_UNSAFE(destBuf, i, inChar) 152 return i; 153 } 154 foundChar: 155 int32_t keyFlags = *mid & 0xff000000; 156 if ((keyFlags & tableMask) == 0) { 157 // We found the right key char, but the entry doesn't pertain to the 158 // table we need. See if there is an adjacent key that does 159 if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) { 160 int32_t *altMid; 161 for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) { 162 keyFlags = *altMid & 0xff000000; 163 if (keyFlags & tableMask) { 164 mid = altMid; 165 goto foundKey; 166 } 167 } 168 for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) { 169 keyFlags = *altMid & 0xff000000; 170 if (keyFlags & tableMask) { 171 mid = altMid; 172 goto foundKey; 173 } 174 } 175 } 176 // No key entry for this char & table. 177 // The input char maps to itself. 178 int i = 0; 179 U16_APPEND_UNSAFE(destBuf, i, inChar) 180 return i; 181 } 182 183 foundKey: 184 int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1; 185 int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys); 186 187 // Value is either a UChar (for strings of length 1) or 188 // an index into the string table (for longer strings) 189 uint16_t value = fSpoofData->fCFUValues[keyTableIndex]; 190 if (stringLen == 1) { 191 destBuf[0] = value; 192 return 1; 193 } 194 195 // String length of 4 from the above lookup is used for all strings of length >= 4. 196 // For these, get the real length from the string lengths table, 197 // which maps string table indexes to lengths. 198 // All strings of the same length are stored contiguously in the string table. 199 // 'value' from the lookup above is the starting index for the desired string. 200 201 int32_t ix; 202 if (stringLen == 4) { 203 int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize; 204 for (ix = 0; ix < stringLengthsLimit; ix++) { 205 if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) { 206 stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength; 207 break; 208 } 209 } 210 U_ASSERT(ix < stringLengthsLimit); 211 } 212 213 U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen); 214 UChar *src = &fSpoofData->fCFUStrings[value]; 215 for (ix=0; ix<stringLen; ix++) { 216 destBuf[ix] = src[ix]; 217 } 218 return stringLen; 219 } 220 221 222 //--------------------------------------------------------------------------------------- 223 // 224 // wholeScriptCheck() 225 // 226 // Input text is already normalized to NFD 227 // Return the set of scripts, each of which can represent something that is 228 // confusable with the input text. The script of the input text 229 // is included; input consisting of characters from a single script will 230 // always produce a result consisting of a set containing that script. 231 // 232 //--------------------------------------------------------------------------------------- 233 void SpoofImpl::wholeScriptCheck( 234 const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const { 235 236 int32_t inputIdx = 0; 237 UChar32 c; 238 239 UTrie2 *table = 240 (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie; 241 result->setAll(); 242 while (inputIdx < length) { 243 U16_NEXT(text, inputIdx, length, c); 244 uint32_t index = utrie2_get32(table, c); 245 if (index == 0) { 246 // No confusables in another script for this char. 247 // TODO: we should change the data to have sets with just the single script 248 // bit for the script of this char. Gets rid of this special case. 249 // Until then, grab the script from the char and intersect it with the set. 250 UScriptCode cpScript = uscript_getScript(c, &status); 251 U_ASSERT(cpScript > USCRIPT_INHERITED); 252 result->intersect(cpScript); 253 } else if (index == 1) { 254 // Script == Common or Inherited. Nothing to do. 255 } else { 256 result->intersect(fSpoofData->fScriptSets[index]); 257 } 258 } 259 } 260 261 262 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { 263 UnicodeSet allowedChars; 264 UnicodeSet *tmpSet = NULL; 265 const char *locStart = localesList; 266 const char *locEnd = NULL; 267 const char *localesListEnd = localesList + uprv_strlen(localesList); 268 int32_t localeListCount = 0; // Number of locales provided by caller. 269 270 // Loop runs once per locale from the localesList, a comma separated list of locales. 271 do { 272 locEnd = uprv_strchr(locStart, ','); 273 if (locEnd == NULL) { 274 locEnd = localesListEnd; 275 } 276 while (*locStart == ' ') { 277 locStart++; 278 } 279 const char *trimmedEnd = locEnd-1; 280 while (trimmedEnd > locStart && *trimmedEnd == ' ') { 281 trimmedEnd--; 282 } 283 if (trimmedEnd <= locStart) { 284 break; 285 } 286 const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart)); 287 localeListCount++; 288 289 // We have one locale from the locales list. 290 // Add the script chars for this locale to the accumulating set of allowed chars. 291 // If the locale is no good, we will be notified back via status. 292 addScriptChars(locale, &allowedChars, status); 293 uprv_free((void *)locale); 294 if (U_FAILURE(status)) { 295 break; 296 } 297 locStart = locEnd + 1; 298 } while (locStart < localesListEnd); 299 300 // If our caller provided an empty list of locales, we disable the allowed characters checking 301 if (localeListCount == 0) { 302 uprv_free((void *)fAllowedLocales); 303 fAllowedLocales = uprv_strdup(""); 304 tmpSet = new UnicodeSet(0, 0x10ffff); 305 if (fAllowedLocales == NULL || tmpSet == NULL) { 306 status = U_MEMORY_ALLOCATION_ERROR; 307 return; 308 } 309 tmpSet->freeze(); 310 delete fAllowedCharsSet; 311 fAllowedCharsSet = tmpSet; 312 fChecks &= ~USPOOF_CHAR_LIMIT; 313 return; 314 } 315 316 317 // Add all common and inherited characters to the set of allowed chars. 318 UnicodeSet tempSet; 319 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); 320 allowedChars.addAll(tempSet); 321 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); 322 allowedChars.addAll(tempSet); 323 324 // If anything went wrong, we bail out without changing 325 // the state of the spoof checker. 326 if (U_FAILURE(status)) { 327 return; 328 } 329 330 // Store the updated spoof checker state. 331 tmpSet = static_cast<UnicodeSet *>(allowedChars.clone()); 332 const char *tmpLocalesList = uprv_strdup(localesList); 333 if (tmpSet == NULL || tmpLocalesList == NULL) { 334 status = U_MEMORY_ALLOCATION_ERROR; 335 return; 336 } 337 uprv_free((void *)fAllowedLocales); 338 fAllowedLocales = tmpLocalesList; 339 tmpSet->freeze(); 340 delete fAllowedCharsSet; 341 fAllowedCharsSet = tmpSet; 342 fChecks |= USPOOF_CHAR_LIMIT; 343 } 344 345 346 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { 347 return fAllowedLocales; 348 } 349 350 351 // Given a locale (a language), add all the characters from all of the scripts used with that language 352 // to the allowedChars UnicodeSet 353 354 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { 355 UScriptCode scripts[30]; 356 357 int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status); 358 if (U_FAILURE(status)) { 359 return; 360 } 361 if (status == U_USING_DEFAULT_WARNING) { 362 status = U_ILLEGAL_ARGUMENT_ERROR; 363 return; 364 } 365 UnicodeSet tmpSet; 366 int32_t i; 367 for (i=0; i<numScripts; i++) { 368 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status); 369 allowedChars->addAll(tmpSet); 370 } 371 } 372 373 374 int32_t SpoofImpl::scriptScan 375 (const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const { 376 if (U_FAILURE(status)) { 377 return 0; 378 } 379 int32_t inputIdx = 0; 380 UChar32 c; 381 int32_t scriptCount = 0; 382 UScriptCode lastScript = USCRIPT_INVALID_CODE; 383 UScriptCode sc = USCRIPT_INVALID_CODE; 384 while ((inputIdx < length || length == -1) && scriptCount < 2) { 385 U16_NEXT(text, inputIdx, length, c); 386 if (c == 0 && length == -1) { 387 break; 388 } 389 sc = uscript_getScript(c, &status); 390 if (sc == USCRIPT_COMMON || sc == USCRIPT_INHERITED || sc == USCRIPT_UNKNOWN) { 391 continue; 392 } 393 394 // Temporary fix: fold Japanese Hiragana and Katakana into Han. 395 // Names are allowed to mix these scripts. 396 // A more general solution will follow later for characters that are 397 // used with multiple scripts. 398 399 if (sc == USCRIPT_HIRAGANA || sc == USCRIPT_KATAKANA || sc == USCRIPT_HANGUL) { 400 sc = USCRIPT_HAN; 401 } 402 403 if (sc != lastScript) { 404 scriptCount++; 405 lastScript = sc; 406 } 407 } 408 if (scriptCount == 2) { 409 pos = inputIdx; 410 } 411 return scriptCount; 412 } 413 414 415 // Convert a text format hex number. Utility function used by builder code. Static. 416 // Input: UChar *string text. Output: a UChar32 417 // Input has been pre-checked, and will have no non-hex chars. 418 // The number must fall in the code point range of 0..0x10ffff 419 // Static Function. 420 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) { 421 if (U_FAILURE(status)) { 422 return 0; 423 } 424 U_ASSERT(limit-start > 0); 425 uint32_t val = 0; 426 int i; 427 for (i=start; i<limit; i++) { 428 int digitVal = s[i] - 0x30; 429 if (digitVal>9) { 430 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A' 431 } 432 if (digitVal>15) { 433 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a' 434 } 435 U_ASSERT(digitVal <= 0xf); 436 val <<= 4; 437 val += digitVal; 438 } 439 if (val > 0x10ffff) { 440 status = U_PARSE_ERROR; 441 val = 0; 442 } 443 return (UChar32)val; 444 } 445 446 447 448 //---------------------------------------------------------------------------------------------- 449 // 450 // class SpoofData Implementation 451 // 452 //---------------------------------------------------------------------------------------------- 453 454 455 UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) { 456 if (U_FAILURE(status) || 457 rawData == NULL || 458 rawData->fMagic != USPOOF_MAGIC || 459 rawData->fFormatVersion[0] > 1 || 460 rawData->fFormatVersion[1] > 0) { 461 status = U_INVALID_FORMAT_ERROR; 462 return FALSE; 463 } 464 return TRUE; 465 } 466 467 // 468 // SpoofData::getDefault() - return a wrapper around the spoof data that is 469 // baked into the default ICU data. 470 // 471 SpoofData *SpoofData::getDefault(UErrorCode &status) { 472 // TODO: Cache it. Lazy create, keep until cleanup. 473 474 UDataMemory *udm = udata_open(NULL, "cfu", "confusables", &status); 475 if (U_FAILURE(status)) { 476 return NULL; 477 } 478 SpoofData *This = new SpoofData(udm, status); 479 if (U_FAILURE(status)) { 480 delete This; 481 return NULL; 482 } 483 if (This == NULL) { 484 status = U_MEMORY_ALLOCATION_ERROR; 485 } 486 return This; 487 } 488 489 490 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) 491 { 492 reset(); 493 if (U_FAILURE(status)) { 494 return; 495 } 496 fRawData = reinterpret_cast<SpoofDataHeader *> 497 ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize); 498 fUDM = udm; 499 validateDataVersion(fRawData, status); 500 initPtrs(status); 501 } 502 503 504 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) 505 { 506 reset(); 507 if (U_FAILURE(status)) { 508 return; 509 } 510 if ((size_t)length < sizeof(SpoofDataHeader)) { 511 status = U_INVALID_FORMAT_ERROR; 512 return; 513 } 514 void *ncData = const_cast<void *>(data); 515 fRawData = static_cast<SpoofDataHeader *>(ncData); 516 if (length < fRawData->fLength) { 517 status = U_INVALID_FORMAT_ERROR; 518 return; 519 } 520 validateDataVersion(fRawData, status); 521 initPtrs(status); 522 } 523 524 525 // Spoof Data constructor for use from data builder. 526 // Initializes a new, empty data area that will be populated later. 527 SpoofData::SpoofData(UErrorCode &status) { 528 reset(); 529 if (U_FAILURE(status)) { 530 return; 531 } 532 fDataOwned = true; 533 fRefCount = 1; 534 535 // The spoof header should already be sized to be a multiple of 16 bytes. 536 // Just in case it's not, round it up. 537 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15; 538 U_ASSERT(initialSize == sizeof(SpoofDataHeader)); 539 540 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize)); 541 fMemLimit = initialSize; 542 if (fRawData == NULL) { 543 status = U_MEMORY_ALLOCATION_ERROR; 544 return; 545 } 546 uprv_memset(fRawData, 0, initialSize); 547 548 fRawData->fMagic = USPOOF_MAGIC; 549 fRawData->fFormatVersion[0] = 1; 550 fRawData->fFormatVersion[1] = 0; 551 fRawData->fFormatVersion[2] = 0; 552 fRawData->fFormatVersion[3] = 0; 553 initPtrs(status); 554 } 555 556 // reset() - initialize all fields. 557 // Should be updated if any new fields are added. 558 // Called by constructors to put things in a known initial state. 559 void SpoofData::reset() { 560 fRawData = NULL; 561 fDataOwned = FALSE; 562 fUDM = NULL; 563 fMemLimit = 0; 564 fRefCount = 1; 565 fCFUKeys = NULL; 566 fCFUValues = NULL; 567 fCFUStringLengths = NULL; 568 fCFUStrings = NULL; 569 fAnyCaseTrie = NULL; 570 fLowerCaseTrie = NULL; 571 fScriptSets = NULL; 572 } 573 574 575 // SpoofData::initPtrs() 576 // Initialize the pointers to the various sections of the raw data. 577 // 578 // This function is used both during the Trie building process (multiple 579 // times, as the individual data sections are added), and 580 // during the opening of a Spoof Checker from prebuilt data. 581 // 582 // The pointers for non-existent data sections (identified by an offset of 0) 583 // are set to NULL. 584 // 585 // Note: During building the data, adding each new data section 586 // reallocs the raw data area, which likely relocates it, which 587 // in turn requires reinitializing all of the pointers into it, hence 588 // multiple calls to this function during building. 589 // 590 void SpoofData::initPtrs(UErrorCode &status) { 591 fCFUKeys = NULL; 592 fCFUValues = NULL; 593 fCFUStringLengths = NULL; 594 fCFUStrings = NULL; 595 if (U_FAILURE(status)) { 596 return; 597 } 598 if (fRawData->fCFUKeys != 0) { 599 fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys); 600 } 601 if (fRawData->fCFUStringIndex != 0) { 602 fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); 603 } 604 if (fRawData->fCFUStringLengths != 0) { 605 fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths); 606 } 607 if (fRawData->fCFUStringTable != 0) { 608 fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); 609 } 610 611 if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) { 612 fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 613 (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status); 614 } 615 if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) { 616 fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 617 (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status); 618 } 619 620 if (fRawData->fScriptSets != 0) { 621 fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets); 622 } 623 } 624 625 626 SpoofData::~SpoofData() { 627 utrie2_close(fAnyCaseTrie); 628 fAnyCaseTrie = NULL; 629 utrie2_close(fLowerCaseTrie); 630 fLowerCaseTrie = NULL; 631 if (fDataOwned) { 632 uprv_free(fRawData); 633 } 634 fRawData = NULL; 635 if (fUDM != NULL) { 636 udata_close(fUDM); 637 } 638 fUDM = NULL; 639 } 640 641 642 void SpoofData::removeReference() { 643 if (umtx_atomic_dec(&fRefCount) == 0) { 644 delete this; 645 } 646 } 647 648 649 SpoofData *SpoofData::addReference() { 650 umtx_atomic_inc(&fRefCount); 651 return this; 652 } 653 654 655 void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { 656 if (U_FAILURE(status)) { 657 return NULL; 658 } 659 if (!fDataOwned) { 660 U_ASSERT(FALSE); 661 status = U_INTERNAL_PROGRAM_ERROR; 662 return NULL; 663 } 664 665 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16 666 uint32_t returnOffset = fMemLimit; 667 fMemLimit += numBytes; 668 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit)); 669 fRawData->fLength = fMemLimit; 670 uprv_memset((char *)fRawData + returnOffset, 0, numBytes); 671 initPtrs(status); 672 return (char *)fRawData + returnOffset; 673 } 674 675 676 //---------------------------------------------------------------------------- 677 // 678 // ScriptSet implementation 679 // 680 //---------------------------------------------------------------------------- 681 ScriptSet::ScriptSet() { 682 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { 683 bits[i] = 0; 684 } 685 } 686 687 ScriptSet::~ScriptSet() { 688 } 689 690 UBool ScriptSet::operator == (const ScriptSet &other) { 691 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { 692 if (bits[i] != other.bits[i]) { 693 return FALSE; 694 } 695 } 696 return TRUE; 697 } 698 699 void ScriptSet::Union(UScriptCode script) { 700 uint32_t index = script / 32; 701 uint32_t bit = 1 << (script & 31); 702 U_ASSERT(index < sizeof(bits)*4); 703 bits[index] |= bit; 704 } 705 706 707 void ScriptSet::Union(const ScriptSet &other) { 708 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { 709 bits[i] |= other.bits[i]; 710 } 711 } 712 713 void ScriptSet::intersect(const ScriptSet &other) { 714 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { 715 bits[i] &= other.bits[i]; 716 } 717 } 718 719 void ScriptSet::intersect(UScriptCode script) { 720 uint32_t index = script / 32; 721 uint32_t bit = 1 << (script & 31); 722 U_ASSERT(index < sizeof(bits)*4); 723 uint32_t i; 724 for (i=0; i<index; i++) { 725 bits[i] = 0; 726 } 727 bits[index] &= bit; 728 for (i=index+1; i<sizeof(bits)/sizeof(uint32_t); i++) { 729 bits[i] = 0; 730 } 731 } 732 733 734 ScriptSet & ScriptSet::operator =(const ScriptSet &other) { 735 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { 736 bits[i] = other.bits[i]; 737 } 738 return *this; 739 } 740 741 742 void ScriptSet::setAll() { 743 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { 744 bits[i] = 0xffffffffu; 745 } 746 } 747 748 749 void ScriptSet::resetAll() { 750 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { 751 bits[i] = 0; 752 } 753 } 754 755 int32_t ScriptSet::countMembers() { 756 // This bit counter is good for sparse numbers of '1's, which is 757 // very much the case that we will usually have. 758 int32_t count = 0; 759 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { 760 uint32_t x = bits[i]; 761 while (x > 0) { 762 count++; 763 x &= (x - 1); // and off the least significant one bit. 764 } 765 } 766 return count; 767 } 768 769 770 771 //----------------------------------------------------------------------------- 772 // 773 // NFDBuffer Implementation. 774 // 775 //----------------------------------------------------------------------------- 776 777 NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) { 778 fNormalizedText = NULL; 779 fNormalizedTextLength = 0; 780 fOriginalText = text; 781 if (U_FAILURE(status)) { 782 return; 783 } 784 fNormalizedText = fSmallBuf; 785 fNormalizedTextLength = unorm_normalize( 786 text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status); 787 if (status == U_BUFFER_OVERFLOW_ERROR) { 788 status = U_ZERO_ERROR; 789 fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar)); 790 if (fNormalizedText == NULL) { 791 status = U_MEMORY_ALLOCATION_ERROR; 792 } else { 793 fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0, 794 fNormalizedText, fNormalizedTextLength+1, &status); 795 } 796 } 797 } 798 799 800 NFDBuffer::~NFDBuffer() { 801 if (fNormalizedText != fSmallBuf) { 802 uprv_free(fNormalizedText); 803 } 804 fNormalizedText = 0; 805 } 806 807 const UChar *NFDBuffer::getBuffer() { 808 return fNormalizedText; 809 } 810 811 int32_t NFDBuffer::getLength() { 812 return fNormalizedTextLength; 813 } 814 815 816 817 818 819 U_NAMESPACE_END 820 821 U_NAMESPACE_USE 822 823 //----------------------------------------------------------------------------- 824 // 825 // uspoof_swap - byte swap and char encoding swap of spoof data 826 // 827 //----------------------------------------------------------------------------- 828 U_CAPI int32_t U_EXPORT2 829 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 830 UErrorCode *status) { 831 832 if (status == NULL || U_FAILURE(*status)) { 833 return 0; 834 } 835 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { 836 *status=U_ILLEGAL_ARGUMENT_ERROR; 837 return 0; 838 } 839 840 // 841 // Check that the data header is for spoof data. 842 // (Header contents are defined in gencfu.cpp) 843 // 844 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); 845 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */ 846 pInfo->dataFormat[1]==0x66 && 847 pInfo->dataFormat[2]==0x75 && 848 pInfo->dataFormat[3]==0x20 && 849 pInfo->formatVersion[0]==1 )) { 850 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " 851 "(format version %02x %02x %02x %02x) is not recognized\n", 852 pInfo->dataFormat[0], pInfo->dataFormat[1], 853 pInfo->dataFormat[2], pInfo->dataFormat[3], 854 pInfo->formatVersion[0], pInfo->formatVersion[1], 855 pInfo->formatVersion[2], pInfo->formatVersion[3]); 856 *status=U_UNSUPPORTED_ERROR; 857 return 0; 858 } 859 860 // 861 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific 862 // header). This swap also conveniently gets us 863 // the size of the ICU d.h., which lets us locate the start 864 // of the uspoof specific data. 865 // 866 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 867 868 869 // 870 // Get the Spoof Data Header, and check that it appears to be OK. 871 // 872 // 873 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 874 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes; 875 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC || 876 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader)) 877 { 878 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n"); 879 *status=U_UNSUPPORTED_ERROR; 880 return 0; 881 } 882 883 // 884 // Prefight operation? Just return the size 885 // 886 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength); 887 int32_t totalSize = headerSize + spoofDataLength; 888 if (length < 0) { 889 return totalSize; 890 } 891 892 // 893 // Check that length passed in is consistent with length from Spoof data header. 894 // 895 if (length < totalSize) { 896 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n", 897 spoofDataLength); 898 *status=U_INDEX_OUTOFBOUNDS_ERROR; 899 return 0; 900 } 901 902 903 // 904 // Swap the Data. Do the data itself first, then the Spoof Data Header, because 905 // we need to reference the header to locate the data, and an 906 // inplace swap of the header leaves it unusable. 907 // 908 uint8_t *outBytes = (uint8_t *)outData + headerSize; 909 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes; 910 911 int32_t sectionStart; 912 int32_t sectionLength; 913 914 // 915 // If not swapping in place, zero out the output buffer before starting. 916 // Gaps may exist between the individual sections, and these must be zeroed in 917 // the output buffer. The simplest way to do that is to just zero the whole thing. 918 // 919 if (inBytes != outBytes) { 920 uprv_memset(outBytes, 0, spoofDataLength); 921 } 922 923 // Confusables Keys Section (fCFUKeys) 924 sectionStart = ds->readUInt32(spoofDH->fCFUKeys); 925 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4; 926 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 927 928 // String Index Section 929 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex); 930 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2; 931 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 932 933 // String Table Section 934 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable); 935 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; 936 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 937 938 // String Lengths Section 939 sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths); 940 sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4; 941 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 942 943 // Any Case Trie 944 sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie); 945 sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength); 946 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 947 948 // Lower Case Trie 949 sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie); 950 sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength); 951 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 952 953 // Script Sets. The data is an array of int32_t 954 sectionStart = ds->readUInt32(spoofDH->fScriptSets); 955 sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet); 956 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 957 958 // And, last, swap the header itself. 959 // int32_t fMagic // swap this 960 // uint8_t fFormatVersion[4] // Do not swap this, just copy 961 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff. 962 // 963 uint32_t magic = ds->readUInt32(spoofDH->fMagic); 964 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic); 965 966 if (outputDH->fFormatVersion != spoofDH->fFormatVersion) { 967 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion)); 968 } 969 // swap starting at fLength 970 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status); 971 972 return totalSize; 973 } 974 975 #endif 976 977 978