Home | History | Annotate | Download | only in i18n
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2008-2015, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  uspoof_conf.cpp
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009Jan05  (refactoring earlier files)
     16 *   created by: Andy Heninger
     17 *
     18 *   Internal classes for compililing confusable data into its binary (runtime) form.
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/uspoof.h"
     23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     24 #if !UCONFIG_NO_NORMALIZATION
     25 
     26 #include "unicode/unorm.h"
     27 #include "unicode/uregex.h"
     28 #include "unicode/ustring.h"
     29 #include "cmemory.h"
     30 #include "uspoof_impl.h"
     31 #include "uhash.h"
     32 #include "uvector.h"
     33 #include "uassert.h"
     34 #include "uarrsort.h"
     35 #include "uspoof_conf.h"
     36 
     37 U_NAMESPACE_USE
     38 
     39 
     40 //---------------------------------------------------------------------
     41 //
     42 //  buildConfusableData   Compile the source confusable data, as defined by
     43 //                        the Unicode data file confusables.txt, into the binary
     44 //                        structures used by the confusable detector.
     45 //
     46 //                        The binary structures are described in uspoof_impl.h
     47 //
     48 //     1.  Parse the data, making a hash table mapping from a UChar32 to a String.
     49 //
     50 //     2.  Sort all of the strings encountered by length, since they will need to
     51 //         be stored in that order in the final string table.
     52 //         TODO: Sorting these strings by length is no longer needed since the removal of
     53 //         the string lengths table.  This logic can be removed to save processing time
     54 //         when building confusables data.
     55 //
     56 //     3.  Build a list of keys (UChar32s) from the four mapping tables.  Sort the
     57 //         list because that will be the ordering of our runtime table.
     58 //
     59 //     4.  Generate the run time string table.  This is generated before the key & value
     60 //         tables because we need the string indexes when building those tables.
     61 //
     62 //     5.  Build the run-time key and value tables.  These are parallel tables, and are built
     63 //         at the same time
     64 //
     65 
     66 SPUString::SPUString(UnicodeString *s) {
     67     fStr = s;
     68     fCharOrStrTableIndex = 0;
     69 }
     70 
     71 
     72 SPUString::~SPUString() {
     73     delete fStr;
     74 }
     75 
     76 
     77 SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) {
     78     fVec = new UVector(status);
     79     fHash = uhash_open(uhash_hashUnicodeString,           // key hash function
     80                        uhash_compareUnicodeString,        // Key Comparator
     81                        NULL,                              // Value Comparator
     82                        &status);
     83 }
     84 
     85 
     86 SPUStringPool::~SPUStringPool() {
     87     int i;
     88     for (i=fVec->size()-1; i>=0; i--) {
     89         SPUString *s = static_cast<SPUString *>(fVec->elementAt(i));
     90         delete s;
     91     }
     92     delete fVec;
     93     uhash_close(fHash);
     94 }
     95 
     96 
     97 int32_t SPUStringPool::size() {
     98     return fVec->size();
     99 }
    100 
    101 SPUString *SPUStringPool::getByIndex(int32_t index) {
    102     SPUString *retString = (SPUString *)fVec->elementAt(index);
    103     return retString;
    104 }
    105 
    106 
    107 // Comparison function for ordering strings in the string pool.
    108 // Compare by length first, then, within a group of the same length,
    109 // by code point order.
    110 // Conforms to the type signature for a USortComparator in uvector.h
    111 
    112 static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
    113 	const SPUString *sL = const_cast<const SPUString *>(
    114         static_cast<SPUString *>(left.pointer));
    115  	const SPUString *sR = const_cast<const SPUString *>(
    116  	    static_cast<SPUString *>(right.pointer));
    117     int32_t lenL = sL->fStr->length();
    118     int32_t lenR = sR->fStr->length();
    119     if (lenL < lenR) {
    120         return -1;
    121     } else if (lenL > lenR) {
    122         return 1;
    123     } else {
    124         return sL->fStr->compare(*(sR->fStr));
    125     }
    126 }
    127 
    128 void SPUStringPool::sort(UErrorCode &status) {
    129     fVec->sort(SPUStringCompare, status);
    130 }
    131 
    132 
    133 SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
    134     SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src));
    135     if (hashedString != NULL) {
    136         delete src;
    137     } else {
    138         hashedString = new SPUString(src);
    139         uhash_put(fHash, src, hashedString, &status);
    140         fVec->addElement(hashedString, status);
    141     }
    142     return hashedString;
    143 }
    144 
    145 
    146 
    147 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
    148     fSpoofImpl(spImpl),
    149     fInput(NULL),
    150     fTable(NULL),
    151     fKeySet(NULL),
    152     fKeyVec(NULL),
    153     fValueVec(NULL),
    154     fStringTable(NULL),
    155     stringPool(NULL),
    156     fParseLine(NULL),
    157     fParseHexNum(NULL),
    158     fLineNum(0)
    159 {
    160     if (U_FAILURE(status)) {
    161         return;
    162     }
    163     fTable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
    164     fKeySet     = new UnicodeSet();
    165     fKeyVec     = new UVector(status);
    166     fValueVec   = new UVector(status);
    167     stringPool = new SPUStringPool(status);
    168 }
    169 
    170 
    171 ConfusabledataBuilder::~ConfusabledataBuilder() {
    172     uprv_free(fInput);
    173     uregex_close(fParseLine);
    174     uregex_close(fParseHexNum);
    175     uhash_close(fTable);
    176     delete fKeySet;
    177     delete fKeyVec;
    178     delete fStringTable;
    179     delete fValueVec;
    180     delete stringPool;
    181 }
    182 
    183 
    184 void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
    185     int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {
    186 
    187     if (U_FAILURE(status)) {
    188         return;
    189     }
    190     ConfusabledataBuilder builder(spImpl, status);
    191     builder.build(confusables, confusablesLen, status);
    192     if (U_FAILURE(status) && errorType != NULL) {
    193         *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    194         pe->line = builder.fLineNum;
    195     }
    196 }
    197 
    198 
    199 void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
    200                UErrorCode &status) {
    201 
    202     // Convert the user input data from UTF-8 to UChar (UTF-16)
    203     int32_t inputLen = 0;
    204     if (U_FAILURE(status)) {
    205         return;
    206     }
    207     u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status);
    208     if (status != U_BUFFER_OVERFLOW_ERROR) {
    209         return;
    210     }
    211     status = U_ZERO_ERROR;
    212     fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    213     if (fInput == NULL) {
    214         status = U_MEMORY_ALLOCATION_ERROR;
    215         return;
    216     }
    217     u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status);
    218 
    219 
    220     // Regular Expression to parse a line from Confusables.txt.  The expression will match
    221     // any line.  What was matched is determined by examining which capture groups have a match.
    222     //   Capture Group 1:  the source char
    223     //   Capture Group 2:  the replacement chars
    224     //   Capture Group 3-6  the table type, SL, SA, ML, or MA (deprecated)
    225     //   Capture Group 7:  A blank or comment only line.
    226     //   Capture Group 8:  A syntactically invalid line.  Anything that didn't match before.
    227     // Example Line from the confusables.txt source file:
    228     //   "1D702 ;	006E 0329 ;	SL	# MATHEMATICAL ITALIC SMALL ETA ... "
    229     UnicodeString pattern(
    230         "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;"      // Match the source char
    231         "[ \\t]*([0-9A-Fa-f]+"                    // Match the replacement char(s)
    232            "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;"    //     (continued)
    233         "\\s*(?:(SL)|(SA)|(ML)|(MA))"             // Match the table type
    234         "[ \\t]*(?:#.*?)?$"                       // Match any trailing #comment
    235         "|^([ \\t]*(?:#.*?)?)$"       // OR match empty lines or lines with only a #comment
    236         "|^(.*?)$", -1, US_INV);      // OR match any line, which catches illegal lines.
    237     // TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
    238     fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
    239 
    240     // Regular expression for parsing a hex number out of a space-separated list of them.
    241     //   Capture group 1 gets the number, with spaces removed.
    242     pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
    243     fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
    244 
    245     // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    246     //   given the syntax of the input.
    247     if (*fInput == 0xfeff) {
    248         *fInput = 0x20;
    249     }
    250 
    251     // Parse the input, one line per iteration of this loop.
    252     uregex_setText(fParseLine, fInput, inputLen, &status);
    253     while (uregex_findNext(fParseLine, &status)) {
    254         fLineNum++;
    255         if (uregex_start(fParseLine, 7, &status) >= 0) {
    256             // this was a blank or comment line.
    257             continue;
    258         }
    259         if (uregex_start(fParseLine, 8, &status) >= 0) {
    260             // input file syntax error.
    261             status = U_PARSE_ERROR;
    262             return;
    263         }
    264 
    265         // We have a good input line.  Extract the key character and mapping string, and
    266         //    put them into the appropriate mapping table.
    267         UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
    268                           uregex_end(fParseLine, 1, &status), status);
    269 
    270         int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
    271         int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
    272         uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
    273 
    274         UnicodeString  *mapString = new UnicodeString();
    275         if (mapString == NULL) {
    276             status = U_MEMORY_ALLOCATION_ERROR;
    277             return;
    278         }
    279         while (uregex_findNext(fParseHexNum, &status)) {
    280             UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
    281                                  uregex_end(fParseHexNum, 1, &status), status);
    282             mapString->append(c);
    283         }
    284         U_ASSERT(mapString->length() >= 1);
    285 
    286         // Put the map (value) string into the string pool
    287         // This a little like a Java intern() - any duplicates will be eliminated.
    288         SPUString *smapString = stringPool->addString(mapString, status);
    289 
    290         // Add the UChar32 -> string mapping to the table.
    291         // For Unicode 8, the SL, SA and ML tables have been discontinued.
    292         //                All input data from confusables.txt is tagged MA.
    293         uhash_iput(fTable, keyChar, smapString, &status);
    294         if (U_FAILURE(status)) { return; }
    295         fKeySet->add(keyChar);
    296     }
    297 
    298     // Input data is now all parsed and collected.
    299     // Now create the run-time binary form of the data.
    300     //
    301     // This is done in two steps.  First the data is assembled into vectors and strings,
    302     //   for ease of construction, then the contents of these collections are dumped
    303     //   into the actual raw-bytes data storage.
    304 
    305     // Build up the string array, and record the index of each string therein
    306     //  in the (build time only) string pool.
    307     // Strings of length one are not entered into the strings array.
    308     // (Strings in the table are sorted by length)
    309     stringPool->sort(status);
    310     fStringTable = new UnicodeString();
    311     int32_t poolSize = stringPool->size();
    312     int32_t i;
    313     for (i=0; i<poolSize; i++) {
    314         SPUString *s = stringPool->getByIndex(i);
    315         int32_t strLen = s->fStr->length();
    316         int32_t strIndex = fStringTable->length();
    317         if (strLen == 1) {
    318             // strings of length one do not get an entry in the string table.
    319             // Keep the single string character itself here, which is the same
    320             //  convention that is used in the final run-time string table index.
    321             s->fCharOrStrTableIndex = s->fStr->charAt(0);
    322         } else {
    323             s->fCharOrStrTableIndex = strIndex;
    324             fStringTable->append(*(s->fStr));
    325         }
    326     }
    327 
    328     // Construct the compile-time Key and Value tables
    329     //
    330     // For each key code point, check which mapping tables it applies to,
    331     //   and create the final data for the key & value structures.
    332     //
    333     //   The four logical mapping tables are conflated into one combined table.
    334     //   If multiple logical tables have the same mapping for some key, they
    335     //     share a single entry in the combined table.
    336     //   If more than one mapping exists for the same key code point, multiple
    337     //     entries will be created in the table
    338 
    339     for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
    340         // It is an oddity of the UnicodeSet API that simply enumerating the contained
    341         //   code points requires a nested loop.
    342         for (UChar32 keyChar=fKeySet->getRangeStart(range);
    343                 keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
    344             SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar));
    345             U_ASSERT(targetMapping != NULL);
    346 
    347             // Set an error code if trying to consume a long string.  Otherwise,
    348             // codePointAndLengthToKey will abort on a U_ASSERT.
    349             if (targetMapping->fStr->length() > 256) {
    350                 status = U_ILLEGAL_ARGUMENT_ERROR;
    351                 return;
    352             }
    353 
    354             int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
    355                 targetMapping->fStr->length());
    356             int32_t value = targetMapping->fCharOrStrTableIndex;
    357 
    358             fKeyVec->addElement(key, status);
    359             fValueVec->addElement(value, status);
    360         }
    361     }
    362 
    363     // Put the assembled data into the flat runtime array
    364     outputData(status);
    365 
    366     // All of the intermediate allocated data belongs to the ConfusabledataBuilder
    367     //  object  (this), and is deleted in the destructor.
    368     return;
    369 }
    370 
    371 //
    372 // outputData     The confusable data has been compiled and stored in intermediate
    373 //                collections and strings.  Copy it from there to the final flat
    374 //                binary array.
    375 //
    376 //                Note that as each section is added to the output data, the
    377 //                expand (reserveSpace() function will likely relocate it in memory.
    378 //                Be careful with pointers.
    379 //
    380 void ConfusabledataBuilder::outputData(UErrorCode &status) {
    381 
    382     U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE);
    383 
    384     //  The Key Table
    385     //     While copying the keys to the runtime array,
    386     //       also sanity check that they are sorted.
    387 
    388     int32_t numKeys = fKeyVec->size();
    389     int32_t *keys =
    390         static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
    391     if (U_FAILURE(status)) {
    392         return;
    393     }
    394     int i;
    395     UChar32 previousCodePoint = 0;
    396     for (i=0; i<numKeys; i++) {
    397         int32_t key =  fKeyVec->elementAti(i);
    398         UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key);
    399         // strictly greater because there can be only one entry per code point
    400         U_ASSERT(codePoint > previousCodePoint);
    401         keys[i] = key;
    402         previousCodePoint = codePoint;
    403     }
    404     SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
    405     rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
    406     rawData->fCFUKeysSize = numKeys;
    407     fSpoofImpl->fSpoofData->fCFUKeys = keys;
    408 
    409 
    410     // The Value Table, parallels the key table
    411     int32_t numValues = fValueVec->size();
    412     U_ASSERT(numKeys == numValues);
    413     uint16_t *values =
    414         static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
    415     if (U_FAILURE(status)) {
    416         return;
    417     }
    418     for (i=0; i<numValues; i++) {
    419         uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
    420         U_ASSERT(value < 0xffff);
    421         values[i] = static_cast<uint16_t>(value);
    422     }
    423     rawData = fSpoofImpl->fSpoofData->fRawData;
    424     rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData);
    425     rawData->fCFUStringIndexSize = numValues;
    426     fSpoofImpl->fSpoofData->fCFUValues = values;
    427 
    428     // The Strings Table.
    429 
    430     uint32_t stringsLength = fStringTable->length();
    431     // Reserve an extra space so the string will be nul-terminated.  This is
    432     // only a convenience, for when debugging; it is not needed otherwise.
    433     UChar *strings =
    434         static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status));
    435     if (U_FAILURE(status)) {
    436         return;
    437     }
    438     fStringTable->extract(strings, stringsLength+1, status);
    439     rawData = fSpoofImpl->fSpoofData->fRawData;
    440     U_ASSERT(rawData->fCFUStringTable == 0);
    441     rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
    442     rawData->fCFUStringTableLen = stringsLength;
    443     fSpoofImpl->fSpoofData->fCFUStrings = strings;
    444 }
    445 
    446 #endif
    447 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
    448 
    449