1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2008-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: uspoof_wsconf.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2009Jan05 (refactoring earlier files) 14 * created by: Andy Heninger 15 * 16 * Internal functions for compililing Whole Script confusable source data 17 * into its binary (runtime) form. The binary data format is described 18 * in uspoof_impl.h 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/uspoof.h" 23 24 #if !UCONFIG_NO_NORMALIZATION 25 26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 27 28 #include "unicode/unorm.h" 29 #include "unicode/uregex.h" 30 #include "unicode/ustring.h" 31 #include "cmemory.h" 32 #include "uspoof_impl.h" 33 #include "uhash.h" 34 #include "uvector.h" 35 #include "uassert.h" 36 #include "uspoof_wsconf.h" 37 38 U_NAMESPACE_USE 39 40 41 // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt 42 // Example Lines: 43 // 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O 44 // 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I 45 // | | | | 46 // | | | |---- Which table, Any Case or Lower Case (A or L) 47 // | | |----------Target script. We need this. 48 // | |----------------Src script. Should match the script of the source 49 // | code points. Beyond checking that, we don't keep it. 50 // |--------------------------------Source code points or range. 51 // 52 // The expression will match _all_ lines, including erroneous lines. 53 // The result of the parse is returned via the contents of the (match) groups. 54 static const char *parseExp = 55 "(?m)" // Multi-line mode 56 "^([ \\t]*(?:#.*?)?)$" // A blank or comment line. Matches Group 1. 57 "|^(?:" // OR 58 "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range. Groups 2 and 3. 59 "\\s*([A-Za-z]+)\\s*;" // The source script. Group 4. 60 "\\s*([A-Za-z]+)\\s*;" // The target script. Group 5. 61 "\\s*(?:(A)|(L))" // The table A or L. Group 6 or 7 62 "[ \\t]*(?:#.*?)?" // Trailing commment 63 ")$|" // OR 64 "^(.*?)$"; // An error line. Group 8. 65 // Any line not matching the preceding 66 // parts of the expression.will match 67 // this, and thus be flagged as an error 68 69 70 // Extract a regular expression match group into a char * string. 71 // The group must contain only invariant characters. 72 // Used for script names 73 // 74 static void extractGroup( 75 URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { 76 77 UChar ubuf[50]; 78 ubuf[0] = 0; 79 destBuf[0] = 0; 80 int32_t len = uregex_group(e, group, ubuf, 50, &status); 81 if (U_FAILURE(status) || len == -1 || len >= destCapacity) { 82 return; 83 } 84 UnicodeString s(FALSE, ubuf, len); // Aliasing constructor 85 s.extract(0, len, destBuf, destCapacity, US_INV); 86 } 87 88 89 90 U_NAMESPACE_BEGIN 91 92 // Build the Whole Script Confusable data 93 // 94 // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, 95 // because everything is local to this one build function anyhow, 96 // OR 97 // break this function into more reasonably sized pieces, with 98 // state in WSConfusableDataBuilder. 99 // 100 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, 101 int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 102 { 103 if (U_FAILURE(status)) { 104 return; 105 } 106 URegularExpression *parseRegexp = NULL; 107 int32_t inputLen = 0; 108 UChar *input = NULL; 109 int32_t lineNum = 0; 110 111 UVector *scriptSets = NULL; 112 uint32_t rtScriptSetsCount = 2; 113 114 UTrie2 *anyCaseTrie = NULL; 115 UTrie2 *lowerCaseTrie = NULL; 116 117 anyCaseTrie = utrie2_open(0, 0, &status); 118 lowerCaseTrie = utrie2_open(0, 0, &status); 119 120 UnicodeString pattern(parseExp, -1, US_INV); 121 122 // The scriptSets vector provides a mapping from TRIE values to the set of scripts. 123 // 124 // Reserved TRIE values: 125 // 0: Code point has no whole script confusables. 126 // 1: Code point is of script Common or Inherited. 127 // These code points do not participate in whole script confusable detection. 128 // (This is logically equivalent to saying that they contain confusables in 129 // all scripts) 130 // 131 // Because Trie values are indexes into the ScriptSets vector, pre-fill 132 // vector positions 0 and 1 to avoid conflicts with the reserved values. 133 134 scriptSets = new UVector(status); 135 if (scriptSets == NULL) { 136 status = U_MEMORY_ALLOCATION_ERROR; 137 goto cleanup; 138 } 139 scriptSets->addElement((void *)NULL, status); 140 scriptSets->addElement((void *)NULL, status); 141 142 // Convert the user input data from UTF-8 to UChar (UTF-16) 143 u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); 144 if (status != U_BUFFER_OVERFLOW_ERROR) { 145 goto cleanup; 146 } 147 status = U_ZERO_ERROR; 148 input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); 149 if (input == NULL) { 150 status = U_MEMORY_ALLOCATION_ERROR; 151 goto cleanup; 152 } 153 u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); 154 155 parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); 156 157 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign 158 // given the syntax of the input. 159 if (*input == 0xfeff) { 160 *input = 0x20; 161 } 162 163 // Parse the input, one line per iteration of this loop. 164 uregex_setText(parseRegexp, input, inputLen, &status); 165 while (uregex_findNext(parseRegexp, &status)) { 166 lineNum++; 167 if (uregex_start(parseRegexp, 1, &status) >= 0) { 168 // this was a blank or comment line. 169 continue; 170 } 171 if (uregex_start(parseRegexp, 8, &status) >= 0) { 172 // input file syntax error. 173 status = U_PARSE_ERROR; 174 goto cleanup; 175 } 176 if (U_FAILURE(status)) { 177 goto cleanup; 178 } 179 180 // Pick up the start and optional range end code points from the parsed line. 181 UChar32 startCodePoint = SpoofImpl::ScanHex( 182 input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); 183 UChar32 endCodePoint = startCodePoint; 184 if (uregex_start(parseRegexp, 3, &status) >=0) { 185 endCodePoint = SpoofImpl::ScanHex( 186 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); 187 } 188 189 // Extract the two script names from the source line. We need these in an 8 bit 190 // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on 191 // to the ICU u_getPropertyValueEnum() function. Ugh. 192 char srcScriptName[20]; 193 char targScriptName[20]; 194 extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); 195 extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); 196 UScriptCode srcScript = 197 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); 198 UScriptCode targScript = 199 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); 200 if (U_FAILURE(status)) { 201 goto cleanup; 202 } 203 if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { 204 status = U_INVALID_FORMAT_ERROR; 205 goto cleanup; 206 } 207 208 // select the table - (A) any case or (L) lower case only 209 UTrie2 *table = anyCaseTrie; 210 if (uregex_start(parseRegexp, 7, &status) >= 0) { 211 table = lowerCaseTrie; 212 } 213 214 // Build the set of scripts containing confusable characters for 215 // the code point(s) specified in this input line. 216 // Sanity check that the script of the source code point is the same 217 // as the source script indicated in the input file. Failure of this check is 218 // an error in the input file. 219 // Include the source script in the set (needed for Mixed Script Confusable detection). 220 // 221 UChar32 cp; 222 for (cp=startCodePoint; cp<=endCodePoint; cp++) { 223 int32_t setIndex = utrie2_get32(table, cp); 224 BuilderScriptSet *bsset = NULL; 225 if (setIndex > 0) { 226 U_ASSERT(setIndex < scriptSets->size()); 227 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); 228 } else { 229 bsset = new BuilderScriptSet(); 230 if (bsset == NULL) { 231 status = U_MEMORY_ALLOCATION_ERROR; 232 goto cleanup; 233 } 234 bsset->codePoint = cp; 235 bsset->trie = table; 236 bsset->sset = new ScriptSet(); 237 setIndex = scriptSets->size(); 238 bsset->index = setIndex; 239 bsset->rindex = 0; 240 if (bsset->sset == NULL) { 241 status = U_MEMORY_ALLOCATION_ERROR; 242 goto cleanup; 243 } 244 scriptSets->addElement(bsset, status); 245 utrie2_set32(table, cp, setIndex, &status); 246 } 247 bsset->sset->Union(targScript); 248 bsset->sset->Union(srcScript); 249 250 if (U_FAILURE(status)) { 251 goto cleanup; 252 } 253 UScriptCode cpScript = uscript_getScript(cp, &status); 254 if (cpScript != srcScript) { 255 status = U_INVALID_FORMAT_ERROR; 256 goto cleanup; 257 } 258 } 259 } 260 261 // Eliminate duplicate script sets. At this point we have a separate 262 // script set for every code point that had data in the input file. 263 // 264 // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them 265 // 266 // printf("Number of scriptSets: %d\n", scriptSets->size()); 267 { 268 int32_t duplicateCount = 0; 269 rtScriptSetsCount = 2; 270 for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { 271 BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); 272 if (outerSet->index != static_cast<uint32_t>(outeri)) { 273 // This set was already identified as a duplicate. 274 // It will not be allocated a position in the runtime array of ScriptSets. 275 continue; 276 } 277 outerSet->rindex = rtScriptSetsCount++; 278 for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { 279 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); 280 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { 281 delete innerSet->sset; 282 innerSet->scriptSetOwned = FALSE; 283 innerSet->sset = outerSet->sset; 284 innerSet->index = outeri; 285 innerSet->rindex = outerSet->rindex; 286 duplicateCount++; 287 } 288 // But this doesn't get all. We need to fix the TRIE. 289 } 290 } 291 // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); 292 } 293 294 295 296 // Update the Trie values to be reflect the run time script indexes (after duplicate merging). 297 // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets 298 // are unused, which is why the loop index starts at 2.) 299 { 300 for (int32_t i=2; i<scriptSets->size(); i++) { 301 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 302 if (bSet->rindex != (uint32_t)i) { 303 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); 304 } 305 } 306 } 307 308 // For code points with script==Common or script==Inherited, 309 // Set the reserved value of 1 into both Tries. These characters do not participate 310 // in Whole Script Confusable detection; this reserved value is the means 311 // by which they are detected. 312 { 313 UnicodeSet ignoreSet; 314 ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); 315 UnicodeSet inheritedSet; 316 inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); 317 ignoreSet.addAll(inheritedSet); 318 for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { 319 UChar32 rangeStart = ignoreSet.getRangeStart(rn); 320 UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); 321 utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); 322 utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); 323 } 324 } 325 326 // Serialize the data to the Spoof Detector 327 { 328 utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); 329 int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); 330 // printf("Any case Trie size: %d\n", size); 331 if (status != U_BUFFER_OVERFLOW_ERROR) { 332 goto cleanup; 333 } 334 status = U_ZERO_ERROR; 335 spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; 336 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; 337 spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; 338 void *where = spImpl->fSpoofData->reserveSpace(size, status); 339 utrie2_serialize(anyCaseTrie, where, size, &status); 340 341 utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); 342 size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); 343 // printf("Lower case Trie size: %d\n", size); 344 if (status != U_BUFFER_OVERFLOW_ERROR) { 345 goto cleanup; 346 } 347 status = U_ZERO_ERROR; 348 spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; 349 spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; 350 spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; 351 where = spImpl->fSpoofData->reserveSpace(size, status); 352 utrie2_serialize(lowerCaseTrie, where, size, &status); 353 354 spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; 355 spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; 356 ScriptSet *rtScriptSets = static_cast<ScriptSet *> 357 (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); 358 uint32_t rindex = 2; 359 for (int32_t i=2; i<scriptSets->size(); i++) { 360 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 361 if (bSet->rindex < rindex) { 362 // We have already copied this script set to the serialized data. 363 continue; 364 } 365 U_ASSERT(rindex == bSet->rindex); 366 rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. 367 rindex++; 368 } 369 } 370 371 // Open new utrie2s from the serialized data. We don't want to keep the ones 372 // we just built because we would then have two copies of the data, one internal to 373 // the utries that we have already constructed, and one in the serialized data area. 374 // An alternative would be to not pre-serialize the Trie data, but that makes the 375 // spoof detector data different, depending on how the detector was constructed. 376 // It's simpler to keep the data always the same. 377 378 spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( 379 UTRIE2_16_VALUE_BITS, 380 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, 381 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, 382 NULL, 383 &status); 384 385 spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( 386 UTRIE2_16_VALUE_BITS, 387 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, 388 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, 389 NULL, 390 &status); 391 392 393 394 cleanup: 395 if (U_FAILURE(status)) { 396 pe->line = lineNum; 397 } 398 uregex_close(parseRegexp); 399 uprv_free(input); 400 401 int32_t i; 402 if (scriptSets != NULL) { 403 for (i=0; i<scriptSets->size(); i++) { 404 BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 405 delete bsset; 406 } 407 delete scriptSets; 408 } 409 utrie2_close(anyCaseTrie); 410 utrie2_close(lowerCaseTrie); 411 return; 412 } 413 414 U_NAMESPACE_END 415 416 417 418 BuilderScriptSet::BuilderScriptSet() { 419 codePoint = -1; 420 trie = NULL; 421 sset = NULL; 422 index = 0; 423 rindex = 0; 424 scriptSetOwned = TRUE; 425 } 426 427 BuilderScriptSet::~BuilderScriptSet() { 428 if (scriptSetOwned) { 429 delete sset; 430 } 431 } 432 433 #endif 434 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 435 436