1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2008-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: uspoof_buildconf.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009Jan05 (refactoring earlier files) 14 * created by: Andy Heninger 15 * 16 * Internal classes for compililing confusable data into its binary (runtime) form. 17 */ 18 19 #include "unicode/utypes.h" 20 #include "unicode/uspoof.h" 21 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 22 #if !UCONFIG_NO_NORMALIZATION 23 24 #include "unicode/unorm.h" 25 #include "unicode/uregex.h" 26 #include "unicode/ustring.h" 27 #include "cmemory.h" 28 #include "uspoof_impl.h" 29 #include "uhash.h" 30 #include "uvector.h" 31 #include "uassert.h" 32 #include "uarrsort.h" 33 #include "uspoof_buildconf.h" 34 35 U_NAMESPACE_USE 36 37 38 //--------------------------------------------------------------------- 39 // 40 // buildConfusableData Compile the source confusable data, as defined by 41 // the Unicode data file confusables.txt, into the binary 42 // structures used by the confusable detector. 43 // 44 // The binary structures are described in uspoof_impl.h 45 // 46 // 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA 47 // tables. Each maps from a UChar32 to a String. 48 // 49 // 2. Sort all of the strings encountered by length, since they will need to 50 // be stored in that order in the final string table. 51 // 52 // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the 53 // list because that will be the ordering of our runtime table. 54 // 55 // 4. Generate the run time string table. This is generated before the key & value 56 // tables because we need the string indexes when building those tables. 57 // 58 // 5. Build the run-time key and value tables. These are parallel tables, and are built 59 // at the same time 60 // 61 62 SPUString::SPUString(UnicodeString *s) { 63 fStr = s; 64 fStrTableIndex = 0; 65 } 66 67 68 SPUString::~SPUString() { 69 delete fStr; 70 } 71 72 73 SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) { 74 fVec = new UVector(status); 75 fHash = uhash_open(uhash_hashUnicodeString, // key hash function 76 uhash_compareUnicodeString, // Key Comparator 77 NULL, // Value Comparator 78 &status); 79 } 80 81 82 SPUStringPool::~SPUStringPool() { 83 int i; 84 for (i=fVec->size()-1; i>=0; i--) { 85 SPUString *s = static_cast<SPUString *>(fVec->elementAt(i)); 86 delete s; 87 } 88 delete fVec; 89 uhash_close(fHash); 90 } 91 92 93 int32_t SPUStringPool::size() { 94 return fVec->size(); 95 } 96 97 SPUString *SPUStringPool::getByIndex(int32_t index) { 98 SPUString *retString = (SPUString *)fVec->elementAt(index); 99 return retString; 100 } 101 102 103 // Comparison function for ordering strings in the string pool. 104 // Compare by length first, then, within a group of the same length, 105 // by code point order. 106 // Conforms to the type signature for a USortComparator in uvector.h 107 108 static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) { 109 const SPUString *sL = static_cast<const SPUString *>(left.pointer); 110 const SPUString *sR = static_cast<const SPUString *>(right.pointer); 111 int32_t lenL = sL->fStr->length(); 112 int32_t lenR = sR->fStr->length(); 113 if (lenL < lenR) { 114 return -1; 115 } else if (lenL > lenR) { 116 return 1; 117 } else { 118 return sL->fStr->compare(*(sR->fStr)); 119 } 120 } 121 122 void SPUStringPool::sort(UErrorCode &status) { 123 fVec->sort(SPUStringCompare, status); 124 } 125 126 127 SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) { 128 SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src)); 129 if (hashedString != NULL) { 130 delete src; 131 } else { 132 hashedString = new SPUString(src); 133 uhash_put(fHash, src, hashedString, &status); 134 fVec->addElement(hashedString, status); 135 } 136 return hashedString; 137 } 138 139 140 141 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) : 142 fSpoofImpl(spImpl), 143 fInput(NULL), 144 fSLTable(NULL), 145 fSATable(NULL), 146 fMLTable(NULL), 147 fMATable(NULL), 148 fKeySet(NULL), 149 fKeyVec(NULL), 150 fValueVec(NULL), 151 fStringTable(NULL), 152 fStringLengthsTable(NULL), 153 stringPool(NULL), 154 fParseLine(NULL), 155 fParseHexNum(NULL), 156 fLineNum(0) 157 { 158 if (U_FAILURE(status)) { 159 return; 160 } 161 fSLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); 162 fSATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); 163 fMLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); 164 fMATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); 165 fKeySet = new UnicodeSet(); 166 fKeyVec = new UVector(status); 167 fValueVec = new UVector(status); 168 stringPool = new SPUStringPool(status); 169 } 170 171 172 ConfusabledataBuilder::~ConfusabledataBuilder() { 173 uprv_free(fInput); 174 uregex_close(fParseLine); 175 uregex_close(fParseHexNum); 176 uhash_close(fSLTable); 177 uhash_close(fSATable); 178 uhash_close(fMLTable); 179 uhash_close(fMATable); 180 delete fKeySet; 181 delete fKeyVec; 182 delete fStringTable; 183 delete fStringLengthsTable; 184 delete fValueVec; 185 delete stringPool; 186 } 187 188 189 void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables, 190 int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) { 191 192 if (U_FAILURE(status)) { 193 return; 194 } 195 ConfusabledataBuilder builder(spImpl, status); 196 builder.build(confusables, confusablesLen, status); 197 if (U_FAILURE(status) && errorType != NULL) { 198 *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 199 pe->line = builder.fLineNum; 200 } 201 } 202 203 204 void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen, 205 UErrorCode &status) { 206 207 // Convert the user input data from UTF-8 to UChar (UTF-16) 208 int32_t inputLen = 0; 209 if (U_FAILURE(status)) { 210 return; 211 } 212 u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status); 213 if (status != U_BUFFER_OVERFLOW_ERROR) { 214 return; 215 } 216 status = U_ZERO_ERROR; 217 fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); 218 if (fInput == NULL) { 219 status = U_MEMORY_ALLOCATION_ERROR; 220 } 221 u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status); 222 223 224 // Regular Expression to parse a line from Confusables.txt. The expression will match 225 // any line. What was matched is determined by examining which capture groups have a match. 226 // Capture Group 1: the source char 227 // Capture Group 2: the replacement chars 228 // Capture Group 3-6 the table type, SL, SA, ML, or MA 229 // Capture Group 7: A blank or comment only line. 230 // Capture Group 8: A syntactically invalid line. Anything that didn't match before. 231 // Example Line from the confusables.txt source file: 232 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... " 233 fParseLine = uregex_openC( 234 "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char 235 "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s) 236 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued) 237 "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type 238 "[ \\t]*(?:#.*?)?$" // Match any trailing #comment 239 "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment 240 "|^(.*?)$", // OR match any line, which catches illegal lines. 241 0, NULL, &status); 242 243 // Regular expression for parsing a hex number out of a space-separated list of them. 244 // Capture group 1 gets the number, with spaces removed. 245 fParseHexNum = uregex_openC("\\s*([0-9A-F]+)", 0, NULL, &status); 246 247 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign 248 // given the syntax of the input. 249 if (*fInput == 0xfeff) { 250 *fInput = 0x20; 251 } 252 253 // Parse the input, one line per iteration of this loop. 254 uregex_setText(fParseLine, fInput, inputLen, &status); 255 while (uregex_findNext(fParseLine, &status)) { 256 fLineNum++; 257 if (uregex_start(fParseLine, 7, &status) >= 0) { 258 // this was a blank or comment line. 259 continue; 260 } 261 if (uregex_start(fParseLine, 8, &status) >= 0) { 262 // input file syntax error. 263 status = U_PARSE_ERROR; 264 return; 265 } 266 267 // We have a good input line. Extract the key character and mapping string, and 268 // put them into the appropriate mapping table. 269 UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status), 270 uregex_end(fParseLine, 1, &status), status); 271 272 int32_t mapStringStart = uregex_start(fParseLine, 2, &status); 273 int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart; 274 uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status); 275 276 UnicodeString *mapString = new UnicodeString(); 277 if (mapString == NULL) { 278 status = U_MEMORY_ALLOCATION_ERROR; 279 return; 280 } 281 while (uregex_findNext(fParseHexNum, &status)) { 282 UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status), 283 uregex_end(fParseHexNum, 1, &status), status); 284 mapString->append(c); 285 } 286 U_ASSERT(mapString->length() >= 1); 287 288 // Put the map (value) string into the string pool 289 // This a little like a Java intern() - any duplicates will be eliminated. 290 SPUString *smapString = stringPool->addString(mapString, status); 291 292 // Add the UChar -> string mapping to the appropriate table. 293 UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable : 294 uregex_start(fParseLine, 4, &status) >= 0 ? fSATable : 295 uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable : 296 uregex_start(fParseLine, 6, &status) >= 0 ? fMATable : 297 NULL; 298 U_ASSERT(table != NULL); 299 uhash_iput(table, keyChar, smapString, &status); 300 fKeySet->add(keyChar); 301 if (U_FAILURE(status)) { 302 return; 303 } 304 } 305 306 // Input data is now all parsed and collected. 307 // Now create the run-time binary form of the data. 308 // 309 // This is done in two steps. First the data is assembled into vectors and strings, 310 // for ease of construction, then the contents of these collections are dumped 311 // into the actual raw-bytes data storage. 312 313 // Build up the string array, and record the index of each string therein 314 // in the (build time only) string pool. 315 // Strings of length one are not entered into the strings array. 316 // At the same time, build up the string lengths table, which records the 317 // position in the string table of the first string of each length >= 4. 318 // (Strings in the table are sorted by length) 319 stringPool->sort(status); 320 fStringTable = new UnicodeString(); 321 fStringLengthsTable = new UVector(status); 322 int32_t previousStringLength = 0; 323 int32_t previousStringIndex = 0; 324 int32_t poolSize = stringPool->size(); 325 int32_t i; 326 for (i=0; i<poolSize; i++) { 327 SPUString *s = stringPool->getByIndex(i); 328 int32_t strLen = s->fStr->length(); 329 int32_t strIndex = fStringTable->length(); 330 U_ASSERT(strLen >= previousStringLength); 331 if (strLen == 1) { 332 // strings of length one do not get an entry in the string table. 333 // Keep the single string character itself here, which is the same 334 // convention that is used in the final run-time string table index. 335 s->fStrTableIndex = s->fStr->charAt(0); 336 } else { 337 if ((strLen > previousStringLength) && (previousStringLength >= 4)) { 338 fStringLengthsTable->addElement(previousStringIndex, status); 339 fStringLengthsTable->addElement(previousStringLength, status); 340 } 341 s->fStrTableIndex = strIndex; 342 fStringTable->append(*(s->fStr)); 343 } 344 previousStringLength = strLen; 345 previousStringIndex = strIndex; 346 } 347 // Make the final entry to the string lengths table. 348 // (it holds an entry for the _last_ string of each length, so adding the 349 // final one doesn't happen in the main loop because no longer string was encountered.) 350 if (previousStringLength >= 4) { 351 fStringLengthsTable->addElement(previousStringIndex, status); 352 fStringLengthsTable->addElement(previousStringLength, status); 353 } 354 355 // Construct the compile-time Key and Value tables 356 // 357 // For each key code point, check which mapping tables it applies to, 358 // and create the final data for the key & value structures. 359 // 360 // The four logical mapping tables are conflated into one combined table. 361 // If multiple logical tables have the same mapping for some key, they 362 // share a single entry in the combined table. 363 // If more than one mapping exists for the same key code point, multiple 364 // entries will be created in the table 365 366 for (int32_t range=0; range<fKeySet->getRangeCount(); range++) { 367 // It is an oddity of the UnicodeSet API that simply enumerating the contained 368 // code points requires a nested loop. 369 for (UChar32 keyChar=fKeySet->getRangeStart(range); 370 keyChar <= fKeySet->getRangeEnd(range); keyChar++) { 371 addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status); 372 addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status); 373 addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status); 374 addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status); 375 } 376 } 377 378 // Put the assembled data into the flat runtime array 379 outputData(status); 380 381 // All of the intermediate allocated data belongs to the ConfusabledataBuilder 382 // object (this), and is deleted in the destructor. 383 return; 384 } 385 386 // 387 // outputData The confusable data has been compiled and stored in intermediate 388 // collections and strings. Copy it from there to the final flat 389 // binary array. 390 // 391 // Note that as each section is added to the output data, the 392 // expand (reserveSpace() function will likely relocate it in memory. 393 // Be careful with pointers. 394 // 395 void ConfusabledataBuilder::outputData(UErrorCode &status) { 396 397 U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE); 398 399 // The Key Table 400 // While copying the keys to the runtime array, 401 // also sanity check that they are sorted. 402 403 int32_t numKeys = fKeyVec->size(); 404 int32_t *keys = 405 static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status)); 406 if (U_FAILURE(status)) { 407 return; 408 } 409 int i; 410 int32_t previousKey = 0; 411 for (i=0; i<numKeys; i++) { 412 int32_t key = fKeyVec->elementAti(i); 413 U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff)); 414 U_ASSERT((key & 0xff000000) != 0); 415 keys[i] = key; 416 previousKey = key; 417 } 418 SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData; 419 rawData->fCFUKeys = (char *)keys - (char *)rawData; 420 rawData->fCFUKeysSize = numKeys; 421 fSpoofImpl->fSpoofData->fCFUKeys = keys; 422 423 424 // The Value Table, parallels the key table 425 int32_t numValues = fValueVec->size(); 426 U_ASSERT(numKeys == numValues); 427 uint16_t *values = 428 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status)); 429 if (U_FAILURE(status)) { 430 return; 431 } 432 for (i=0; i<numValues; i++) { 433 uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i)); 434 U_ASSERT(value < 0xffff); 435 values[i] = static_cast<uint16_t>(value); 436 } 437 rawData = fSpoofImpl->fSpoofData->fRawData; 438 rawData->fCFUStringIndex = (char *)values - (char *)rawData; 439 rawData->fCFUStringIndexSize = numValues; 440 fSpoofImpl->fSpoofData->fCFUValues = values; 441 442 // The Strings Table. 443 444 uint32_t stringsLength = fStringTable->length(); 445 // Reserve an extra space so the string will be nul-terminated. This is 446 // only a convenience, for when debugging; it is not needed otherwise. 447 UChar *strings = 448 static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status)); 449 if (U_FAILURE(status)) { 450 return; 451 } 452 fStringTable->extract(strings, stringsLength+1, status); 453 rawData = fSpoofImpl->fSpoofData->fRawData; 454 U_ASSERT(rawData->fCFUStringTable == 0); 455 rawData->fCFUStringTable = (char *)strings - (char *)rawData; 456 rawData->fCFUStringTableLen = stringsLength; 457 fSpoofImpl->fSpoofData->fCFUStrings = strings; 458 459 // The String Lengths Table 460 // While copying into the runtime array do some sanity checks on the values 461 // Each complete entry contains two fields, an index and an offset. 462 // Lengths should increase with each entry. 463 // Offsets should be less than the size of the string table. 464 int32_t lengthTableLength = fStringLengthsTable->size(); 465 uint16_t *stringLengths = 466 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status)); 467 if (U_FAILURE(status)) { 468 return; 469 } 470 int32_t destIndex = 0; 471 uint32_t previousLength = 0; 472 for (i=0; i<lengthTableLength; i+=2) { 473 uint32_t offset = static_cast<uint32_t>(fStringLengthsTable->elementAti(i)); 474 uint32_t length = static_cast<uint32_t>(fStringLengthsTable->elementAti(i+1)); 475 U_ASSERT(offset < stringsLength); 476 U_ASSERT(length < 40); 477 U_ASSERT(length > previousLength); 478 stringLengths[destIndex++] = static_cast<uint16_t>(offset); 479 stringLengths[destIndex++] = static_cast<uint16_t>(length); 480 previousLength = length; 481 } 482 rawData = fSpoofImpl->fSpoofData->fRawData; 483 rawData->fCFUStringLengths = (char *)stringLengths - (char *)rawData; 484 // Note: StringLengthsSize in the raw data is the number of complete entries, 485 // each consisting of a pair of 16 bit values, hence the divide by 2. 486 rawData->fCFUStringLengthsSize = lengthTableLength / 2; 487 fSpoofImpl->fSpoofData->fCFUStringLengths = 488 reinterpret_cast<SpoofStringLengthsElement *>(stringLengths); 489 } 490 491 492 493 // addKeyEntry Construction of the confusable Key and Mapping Values tables. 494 // This is an intermediate point in the building process. 495 // We already have the mappings in the hash tables fSLTable, etc. 496 // This function builds corresponding run-time style table entries into 497 // fKeyVec and fValueVec 498 499 void ConfusabledataBuilder::addKeyEntry( 500 UChar32 keyChar, // The key character 501 UHashtable *table, // The table, one of SATable, MATable, etc. 502 int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc. 503 UErrorCode &status) { 504 505 SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar)); 506 if (targetMapping == NULL) { 507 // No mapping for this key character. 508 // (This function is called for all four tables for each key char that 509 // is seen anywhere, so this no entry cases are very much expected.) 510 return; 511 } 512 513 // Check whether there is already an entry with the correct mapping. 514 // If so, simply set the flag in the keyTable saying that the existing entry 515 // applies to the table that we're doing now. 516 517 UBool keyHasMultipleValues = FALSE; 518 int32_t i; 519 for (i=fKeyVec->size()-1; i>=0 ; i--) { 520 int32_t key = fKeyVec->elementAti(i); 521 if ((key & 0x0ffffff) != keyChar) { 522 // We have now checked all existing key entries for this key char (if any) 523 // without finding one with the same mapping. 524 break; 525 } 526 UnicodeString mapping = getMapping(i); 527 if (mapping == *(targetMapping->fStr)) { 528 // The run time entry we are currently testing has the correct mapping. 529 // Set the flag in it indicating that it applies to the new table also. 530 key |= tableFlag; 531 fKeyVec->setElementAt(key, i); 532 return; 533 } 534 keyHasMultipleValues = TRUE; 535 } 536 537 // Need to add a new entry to the binary data being built for this mapping. 538 // Includes adding entries to both the key table and the parallel values table. 539 540 int32_t newKey = keyChar | tableFlag; 541 if (keyHasMultipleValues) { 542 newKey |= USPOOF_KEY_MULTIPLE_VALUES; 543 } 544 int32_t adjustedMappingLength = targetMapping->fStr->length() - 1; 545 if (adjustedMappingLength>3) { 546 adjustedMappingLength = 3; 547 } 548 newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT; 549 550 int32_t newData = targetMapping->fStrTableIndex; 551 552 fKeyVec->addElement(newKey, status); 553 fValueVec->addElement(newData, status); 554 555 // If the preceding key entry is for the same key character (but with a different mapping) 556 // set the multiple-values flag on it. 557 if (keyHasMultipleValues) { 558 int32_t previousKeyIndex = fKeyVec->size() - 2; 559 int32_t previousKey = fKeyVec->elementAti(previousKeyIndex); 560 previousKey |= USPOOF_KEY_MULTIPLE_VALUES; 561 fKeyVec->setElementAt(previousKey, previousKeyIndex); 562 } 563 } 564 565 566 567 UnicodeString ConfusabledataBuilder::getMapping(int32_t index) { 568 int32_t key = fKeyVec->elementAti(index); 569 int32_t value = fValueVec->elementAti(index); 570 int32_t length = USPOOF_KEY_LENGTH_FIELD(key); 571 int32_t lastIndexWithLen; 572 switch (length) { 573 case 0: 574 return UnicodeString(static_cast<UChar>(value)); 575 case 1: 576 case 2: 577 return UnicodeString(*fStringTable, value, length+1); 578 case 3: 579 length = 0; 580 int32_t i; 581 for (i=0; i<fStringLengthsTable->size(); i+=2) { 582 lastIndexWithLen = fStringLengthsTable->elementAti(i); 583 if (value <= lastIndexWithLen) { 584 length = fStringLengthsTable->elementAti(i+1); 585 break; 586 } 587 } 588 U_ASSERT(length>=3); 589 return UnicodeString(*fStringTable, value, length); 590 default: 591 U_ASSERT(FALSE); 592 } 593 return UnicodeString(); 594 } 595 596 #endif 597 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 598 599