Home | History | Annotate | Download | only in i18n
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2008-2013, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  uspoof_wsconf.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009Jan05  (refactoring earlier files)
     14 *   created by: Andy Heninger
     15 *
     16 *   Internal functions for compiling Whole Script confusable source data
     17 *   into its binary (runtime) form.  The binary data format is described
     18 *   in uspoof_impl.h
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/uspoof.h"
     23 
     24 #if !UCONFIG_NO_NORMALIZATION
     25 
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     27 
     28 #include "unicode/unorm.h"
     29 #include "unicode/uregex.h"
     30 #include "unicode/ustring.h"
     31 #include "cmemory.h"
     32 #include "scriptset.h"
     33 #include "uspoof_impl.h"
     34 #include "uhash.h"
     35 #include "uvector.h"
     36 #include "uassert.h"
     37 #include "uspoof_wsconf.h"
     38 
     39 U_NAMESPACE_USE
     40 
     41 
     42 // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
     43 // Example Lines:
     44 //   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
     45 //   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
     46 //    |               |     |    |
     47 //    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
     48 //    |               |     |----------Target script.   We need this.
     49 //    |               |----------------Src script.  Should match the script of the source
     50 //    |                                code points.  Beyond checking that, we don't keep it.
     51 //    |--------------------------------Source code points or range.
     52 //
     53 // The expression will match _all_ lines, including erroneous lines.
     54 // The result of the parse is returned via the contents of the (match) groups.
     55 static const char *parseExp =
     56         "(?m)"                                         // Multi-line mode
     57         "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
     58         "|^(?:"                                        //   OR
     59         "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
     60         "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
     61         "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
     62         "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
     63         "[ \\t]*(?:#.*?)?"                             // Trailing commment
     64         ")$|"                                          //   OR
     65         "^(.*?)$";                                     // An error line.      Group 8.
     66                                                        //    Any line not matching the preceding
     67                                                        //    parts of the expression.will match
     68                                                        //    this, and thus be flagged as an error
     69 
     70 
     71 // Extract a regular expression match group into a char * string.
     72 //    The group must contain only invariant characters.
     73 //    Used for script names
     74 //
     75 static void extractGroup(
     76     URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
     77 
     78     UChar ubuf[50];
     79     ubuf[0] = 0;
     80     destBuf[0] = 0;
     81     int32_t len = uregex_group(e, group, ubuf, 50, &status);
     82     if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
     83         return;
     84     }
     85     UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
     86     s.extract(0, len, destBuf, destCapacity, US_INV);
     87 }
     88 
     89 
     90 
     91 U_NAMESPACE_BEGIN
     92 
     93 //  Build the Whole Script Confusable data
     94 //
     95 //     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
     96 //                         because everything is local to this one build function anyhow,
     97 //                           OR
     98 //                         break this function into more reasonably sized pieces, with
     99 //                         state in WSConfusableDataBuilder.
    100 //
    101 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
    102           int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
    103 {
    104     if (U_FAILURE(status)) {
    105         return;
    106     }
    107     URegularExpression *parseRegexp = NULL;
    108     int32_t             inputLen    = 0;
    109     UChar              *input       = NULL;
    110     int32_t             lineNum     = 0;
    111 
    112     UVector            *scriptSets        = NULL;
    113     uint32_t            rtScriptSetsCount = 2;
    114 
    115     UTrie2             *anyCaseTrie   = NULL;
    116     UTrie2             *lowerCaseTrie = NULL;
    117 
    118     anyCaseTrie = utrie2_open(0, 0, &status);
    119     lowerCaseTrie = utrie2_open(0, 0, &status);
    120 
    121     UnicodeString pattern(parseExp, -1, US_INV);
    122 
    123     // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
    124     //
    125     // Reserved TRIE values:
    126     //   0:  Code point has no whole script confusables.
    127     //   1:  Code point is of script Common or Inherited.
    128     //       These code points do not participate in whole script confusable detection.
    129     //       (This is logically equivalent to saying that they contain confusables in
    130     //        all scripts)
    131     //
    132     // Because Trie values are indexes into the ScriptSets vector, pre-fill
    133     // vector positions 0 and 1 to avoid conflicts with the reserved values.
    134 
    135     scriptSets = new UVector(status);
    136     if (scriptSets == NULL) {
    137         status = U_MEMORY_ALLOCATION_ERROR;
    138         goto cleanup;
    139     }
    140     scriptSets->addElement((void *)NULL, status);
    141     scriptSets->addElement((void *)NULL, status);
    142 
    143     // Convert the user input data from UTF-8 to UChar (UTF-16)
    144     u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
    145     if (status != U_BUFFER_OVERFLOW_ERROR) {
    146         goto cleanup;
    147     }
    148     status = U_ZERO_ERROR;
    149     input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    150     if (input == NULL) {
    151         status = U_MEMORY_ALLOCATION_ERROR;
    152         goto cleanup;
    153     }
    154     u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
    155 
    156     parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
    157 
    158     // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    159     //   given the syntax of the input.
    160     if (*input == 0xfeff) {
    161         *input = 0x20;
    162     }
    163 
    164     // Parse the input, one line per iteration of this loop.
    165     uregex_setText(parseRegexp, input, inputLen, &status);
    166     while (uregex_findNext(parseRegexp, &status)) {
    167         lineNum++;
    168         if (uregex_start(parseRegexp, 1, &status) >= 0) {
    169             // this was a blank or comment line.
    170             continue;
    171         }
    172         if (uregex_start(parseRegexp, 8, &status) >= 0) {
    173             // input file syntax error.
    174             status = U_PARSE_ERROR;
    175             goto cleanup;
    176         }
    177         if (U_FAILURE(status)) {
    178             goto cleanup;
    179         }
    180 
    181         // Pick up the start and optional range end code points from the parsed line.
    182         UChar32  startCodePoint = SpoofImpl::ScanHex(
    183             input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
    184         UChar32  endCodePoint = startCodePoint;
    185         if (uregex_start(parseRegexp, 3, &status) >=0) {
    186             endCodePoint = SpoofImpl::ScanHex(
    187                 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
    188         }
    189 
    190         // Extract the two script names from the source line.  We need these in an 8 bit
    191         //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
    192         //   to the ICU u_getPropertyValueEnum() function.  Ugh.
    193         char  srcScriptName[20];
    194         char  targScriptName[20];
    195         extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
    196         extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
    197         UScriptCode srcScript  =
    198             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
    199         UScriptCode targScript =
    200             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
    201         if (U_FAILURE(status)) {
    202             goto cleanup;
    203         }
    204         if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
    205             status = U_INVALID_FORMAT_ERROR;
    206             goto cleanup;
    207         }
    208 
    209         // select the table - (A) any case or (L) lower case only
    210         UTrie2 *table = anyCaseTrie;
    211         if (uregex_start(parseRegexp, 7, &status) >= 0) {
    212             table = lowerCaseTrie;
    213         }
    214 
    215         // Build the set of scripts containing confusable characters for
    216         //   the code point(s) specified in this input line.
    217         // Sanity check that the script of the source code point is the same
    218         //   as the source script indicated in the input file.  Failure of this check is
    219         //   an error in the input file.
    220         // Include the source script in the set (needed for Mixed Script Confusable detection).
    221         //
    222         UChar32 cp;
    223         for (cp=startCodePoint; cp<=endCodePoint; cp++) {
    224             int32_t setIndex = utrie2_get32(table, cp);
    225             BuilderScriptSet *bsset = NULL;
    226             if (setIndex > 0) {
    227                 U_ASSERT(setIndex < scriptSets->size());
    228                 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
    229             } else {
    230                 bsset = new BuilderScriptSet();
    231                 if (bsset == NULL) {
    232                     status = U_MEMORY_ALLOCATION_ERROR;
    233                     goto cleanup;
    234                 }
    235                 bsset->codePoint = cp;
    236                 bsset->trie = table;
    237                 bsset->sset = new ScriptSet();
    238                 setIndex = scriptSets->size();
    239                 bsset->index = setIndex;
    240                 bsset->rindex = 0;
    241                 if (bsset->sset == NULL) {
    242                     status = U_MEMORY_ALLOCATION_ERROR;
    243                     goto cleanup;
    244                 }
    245                 scriptSets->addElement(bsset, status);
    246                 utrie2_set32(table, cp, setIndex, &status);
    247             }
    248             bsset->sset->set(targScript, status);
    249             bsset->sset->set(srcScript, status);
    250 
    251             if (U_FAILURE(status)) {
    252                 goto cleanup;
    253             }
    254             UScriptCode cpScript = uscript_getScript(cp, &status);
    255             if (cpScript != srcScript) {
    256                 status = U_INVALID_FORMAT_ERROR;
    257                 goto cleanup;
    258             }
    259         }
    260     }
    261 
    262     // Eliminate duplicate script sets.  At this point we have a separate
    263     // script set for every code point that had data in the input file.
    264     //
    265     // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
    266     //
    267     // printf("Number of scriptSets: %d\n", scriptSets->size());
    268     {
    269         int32_t duplicateCount = 0;
    270         rtScriptSetsCount = 2;
    271         for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
    272             BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
    273             if (outerSet->index != static_cast<uint32_t>(outeri)) {
    274                 // This set was already identified as a duplicate.
    275                 //   It will not be allocated a position in the runtime array of ScriptSets.
    276                 continue;
    277             }
    278             outerSet->rindex = rtScriptSetsCount++;
    279             for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
    280                 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
    281                 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
    282                     delete innerSet->sset;
    283                     innerSet->scriptSetOwned = FALSE;
    284                     innerSet->sset = outerSet->sset;
    285                     innerSet->index = outeri;
    286                     innerSet->rindex = outerSet->rindex;
    287                     duplicateCount++;
    288                 }
    289                 // But this doesn't get all.  We need to fix the TRIE.
    290             }
    291         }
    292         // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
    293     }
    294 
    295 
    296 
    297     // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
    298     //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
    299     //     are unused, which is why the loop index starts at 2.)
    300     {
    301         for (int32_t i=2; i<scriptSets->size(); i++) {
    302             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
    303             if (bSet->rindex != (uint32_t)i) {
    304                 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
    305             }
    306         }
    307     }
    308 
    309     // For code points with script==Common or script==Inherited,
    310     //   Set the reserved value of 1 into both Tries.  These characters do not participate
    311     //   in Whole Script Confusable detection; this reserved value is the means
    312     //   by which they are detected.
    313     {
    314         UnicodeSet ignoreSet;
    315         ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
    316         UnicodeSet inheritedSet;
    317         inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
    318         ignoreSet.addAll(inheritedSet);
    319         for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
    320             UChar32 rangeStart = ignoreSet.getRangeStart(rn);
    321             UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
    322             utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
    323             utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
    324         }
    325     }
    326 
    327     // Serialize the data to the Spoof Detector
    328     {
    329         utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
    330         int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
    331         // printf("Any case Trie size: %d\n", size);
    332         if (status != U_BUFFER_OVERFLOW_ERROR) {
    333             goto cleanup;
    334         }
    335         status = U_ZERO_ERROR;
    336         spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
    337         spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
    338         spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
    339         void *where = spImpl->fSpoofData->reserveSpace(size, status);
    340         utrie2_serialize(anyCaseTrie, where, size, &status);
    341 
    342         utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
    343         size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
    344         // printf("Lower case Trie size: %d\n", size);
    345         if (status != U_BUFFER_OVERFLOW_ERROR) {
    346             goto cleanup;
    347         }
    348         status = U_ZERO_ERROR;
    349         spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
    350         spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
    351         spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
    352         where = spImpl->fSpoofData->reserveSpace(size, status);
    353         utrie2_serialize(lowerCaseTrie, where, size, &status);
    354 
    355         spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
    356         spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
    357         ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
    358             (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
    359         uint32_t rindex = 2;
    360         for (int32_t i=2; i<scriptSets->size(); i++) {
    361             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
    362             if (bSet->rindex < rindex) {
    363                 // We have already copied this script set to the serialized data.
    364                 continue;
    365             }
    366             U_ASSERT(rindex == bSet->rindex);
    367             rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
    368             rindex++;
    369         }
    370     }
    371 
    372     // Open new utrie2s from the serialized data.  We don't want to keep the ones
    373     //   we just built because we would then have two copies of the data, one internal to
    374     //   the utries that we have already constructed, and one in the serialized data area.
    375     //   An alternative would be to not pre-serialize the Trie data, but that makes the
    376     //   spoof detector data different, depending on how the detector was constructed.
    377     //   It's simpler to keep the data always the same.
    378 
    379     spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
    380             UTRIE2_16_VALUE_BITS,
    381             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
    382             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
    383             NULL,
    384             &status);
    385 
    386     spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
    387             UTRIE2_16_VALUE_BITS,
    388             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
    389             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
    390             NULL,
    391             &status);
    392 
    393 
    394 
    395 cleanup:
    396     if (U_FAILURE(status)) {
    397         pe->line = lineNum;
    398     }
    399     uregex_close(parseRegexp);
    400     uprv_free(input);
    401 
    402     int32_t i;
    403     if (scriptSets != NULL) {
    404         for (i=0; i<scriptSets->size(); i++) {
    405             BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
    406             delete bsset;
    407         }
    408         delete scriptSets;
    409     }
    410     utrie2_close(anyCaseTrie);
    411     utrie2_close(lowerCaseTrie);
    412     return;
    413 }
    414 
    415 U_NAMESPACE_END
    416 
    417 
    418 
    419 BuilderScriptSet::BuilderScriptSet() {
    420     codePoint = -1;
    421     trie = NULL;
    422     sset = NULL;
    423     index = 0;
    424     rindex = 0;
    425     scriptSetOwned = TRUE;
    426 }
    427 
    428 BuilderScriptSet::~BuilderScriptSet() {
    429     if (scriptSetOwned) {
    430         delete sset;
    431     }
    432 }
    433 
    434 #endif
    435 #endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
    436 
    437