Home | History | Annotate | Download | only in i18n
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2008-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  uspoof_wsconf.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009Jan05  (refactoring earlier files)
     14 *   created by: Andy Heninger
     15 *
     16 *   Internal functions for compiling Whole Script confusable source data
     17 *   into its binary (runtime) form.  The binary data format is described
     18 *   in uspoof_impl.h
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/uspoof.h"
     23 
     24 #if !UCONFIG_NO_NORMALIZATION
     25 
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     27 
     28 #include "unicode/unorm.h"
     29 #include "unicode/uregex.h"
     30 #include "unicode/ustring.h"
     31 #include "cmemory.h"
     32 #include "uspoof_impl.h"
     33 #include "uhash.h"
     34 #include "uvector.h"
     35 #include "uassert.h"
     36 #include "uspoof_wsconf.h"
     37 
     38 U_NAMESPACE_USE
     39 
     40 
     41 // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
     42 // Example Lines:
     43 //   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
     44 //   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
     45 //    |               |     |    |
     46 //    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
     47 //    |               |     |----------Target script.   We need this.
     48 //    |               |----------------Src script.  Should match the script of the source
     49 //    |                                code points.  Beyond checking that, we don't keep it.
     50 //    |--------------------------------Source code points or range.
     51 //
     52 // The expression will match _all_ lines, including erroneous lines.
     53 // The result of the parse is returned via the contents of the (match) groups.
     54 static const char *parseExp =
     55 
     56         "(?m)"                                         // Multi-line mode
     57         "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
     58         "|^(?:"                                        //   OR
     59         "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
     60         "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
     61         "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
     62         "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
     63         "[ \\t]*(?:#.*?)?"                             // Trailing commment
     64         ")$|"                                          //   OR
     65         "^(.*?)$";                                     // An error line.      Group 8.
     66                                                        //    Any line not matching the preceding
     67                                                        //    parts of the expression.will match
     68                                                        //    this, and thus be flagged as an error
     69 
     70 
     71 // Extract a regular expression match group into a char * string.
     72 //    The group must contain only invariant characters.
     73 //    Used for script names
     74 //
     75 static void extractGroup(
     76     URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
     77 
     78     UChar ubuf[50];
     79     ubuf[0] = 0;
     80     destBuf[0] = 0;
     81     int32_t len = uregex_group(e, group, ubuf, 50, &status);
     82     if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
     83         return;
     84     }
     85     UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
     86     s.extract(0, len, destBuf, destCapacity, US_INV);
     87 }
     88 
     89 
     90 
     91 //  Build the Whole Script Confusable data
     92 //
     93 //     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
     94 //                         because everything is local to this one build function anyhow,
     95 //                           OR
     96 //                         break this function into more reasonably sized pieces, with
     97 //                         state in WSConfusableDataBuilder.
     98 //
     99 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
    100           int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
    101 {
    102     if (U_FAILURE(status)) {
    103         return;
    104     }
    105     URegularExpression *parseRegexp = NULL;
    106     int32_t             inputLen    = 0;
    107     UChar              *input       = NULL;
    108     int32_t             lineNum     = 0;
    109 
    110     UVector            *scriptSets        = NULL;
    111     uint32_t            rtScriptSetsCount = 2;
    112 
    113     UTrie2             *anyCaseTrie   = NULL;
    114     UTrie2             *lowerCaseTrie = NULL;
    115 
    116     anyCaseTrie = utrie2_open(0, 0, &status);
    117     lowerCaseTrie = utrie2_open(0, 0, &status);
    118 
    119 
    120     // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
    121     //
    122     // Reserved TRIE values:
    123     //   0:  Code point has no whole script confusables.
    124     //   1:  Code point is of script Common or Inherited.
    125     //       These code points do not participate in whole script confusable detection.
    126     //       (This is logically equivalent to saying that they contain confusables in
    127     //        all scripts)
    128     //
    129     // Because Trie values are indexes into the ScriptSets vector, pre-fill
    130     // vector positions 0 and 1 to avoid conflicts with the reserved values.
    131 
    132     scriptSets = new UVector(status);
    133     if (scriptSets == NULL) {
    134         status = U_MEMORY_ALLOCATION_ERROR;
    135         goto cleanup;
    136     }
    137     scriptSets->addElement((void *)NULL, status);
    138     scriptSets->addElement((void *)NULL, status);
    139 
    140     // Convert the user input data from UTF-8 to UChar (UTF-16)
    141     u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
    142     if (status != U_BUFFER_OVERFLOW_ERROR) {
    143         goto cleanup;
    144     }
    145     status = U_ZERO_ERROR;
    146     input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    147     if (input == NULL) {
    148         status = U_MEMORY_ALLOCATION_ERROR;
    149         goto cleanup;
    150     }
    151     u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
    152 
    153 
    154 
    155     parseRegexp = uregex_openC(parseExp, 0, NULL, &status);
    156 
    157     // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    158     //   given the syntax of the input.
    159     if (*input == 0xfeff) {
    160         *input = 0x20;
    161     }
    162 
    163     // Parse the input, one line per iteration of this loop.
    164     uregex_setText(parseRegexp, input, inputLen, &status);
    165     while (uregex_findNext(parseRegexp, &status)) {
    166         lineNum++;
    167         UChar  line[200];
    168         uregex_group(parseRegexp, 0, line, 200, &status);
    169         if (uregex_start(parseRegexp, 1, &status) >= 0) {
    170             // this was a blank or comment line.
    171             continue;
    172         }
    173         if (uregex_start(parseRegexp, 8, &status) >= 0) {
    174             // input file syntax error.
    175             status = U_PARSE_ERROR;
    176             goto cleanup;
    177         }
    178         if (U_FAILURE(status)) {
    179             goto cleanup;
    180         }
    181 
    182         // Pick up the start and optional range end code points from the parsed line.
    183         UChar32  startCodePoint = SpoofImpl::ScanHex(
    184             input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
    185         UChar32  endCodePoint = startCodePoint;
    186         if (uregex_start(parseRegexp, 3, &status) >=0) {
    187             endCodePoint = SpoofImpl::ScanHex(
    188                 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
    189         }
    190 
    191         // Extract the two script names from the source line.  We need these in an 8 bit
    192         //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
    193         //   to the ICU u_getPropertyValueEnum() function.  Ugh.
    194         char  srcScriptName[20];
    195         char  targScriptName[20];
    196         extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
    197         extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
    198         UScriptCode srcScript  =
    199             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
    200         UScriptCode targScript =
    201             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
    202         if (U_FAILURE(status)) {
    203             goto cleanup;
    204         }
    205         if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
    206             status = U_INVALID_FORMAT_ERROR;
    207             goto cleanup;
    208         }
    209 
    210         // select the table - (A) any case or (L) lower case only
    211         UTrie2 *table = anyCaseTrie;
    212         if (uregex_start(parseRegexp, 7, &status) >= 0) {
    213             table = lowerCaseTrie;
    214         }
    215 
    216         // Build the set of scripts containing confusable characters for
    217         //   the code point(s) specified in this input line.
    218         // Sanity check that the script of the source code point is the same
    219         //   as the source script indicated in the input file.  Failure of this check is
    220         //   an error in the input file.
    221         // Include the source script in the set (needed for Mixed Script Confusable detection).
    222         //
    223         UChar32 cp;
    224         for (cp=startCodePoint; cp<=endCodePoint; cp++) {
    225             int32_t setIndex = utrie2_get32(table, cp);
    226             BuilderScriptSet *bsset = NULL;
    227             if (setIndex > 0) {
    228                 U_ASSERT(setIndex < scriptSets->size());
    229                 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
    230             } else {
    231                 bsset = new BuilderScriptSet();
    232                 if (bsset == NULL) {
    233                     status = U_MEMORY_ALLOCATION_ERROR;
    234                     goto cleanup;
    235                 }
    236                 bsset->codePoint = cp;
    237                 bsset->trie = table;
    238                 bsset->sset = new ScriptSet();
    239                 setIndex = scriptSets->size();
    240                 bsset->index = setIndex;
    241                 bsset->rindex = 0;
    242                 if (bsset->sset == NULL) {
    243                     status = U_MEMORY_ALLOCATION_ERROR;
    244                     goto cleanup;
    245                 }
    246                 scriptSets->addElement(bsset, status);
    247                 utrie2_set32(table, cp, setIndex, &status);
    248             }
    249             bsset->sset->Union(targScript);
    250             bsset->sset->Union(srcScript);
    251 
    252             if (U_FAILURE(status)) {
    253                 goto cleanup;
    254             }
    255             UScriptCode cpScript = uscript_getScript(cp, &status);
    256             if (cpScript != srcScript) {
    257                 status = U_INVALID_FORMAT_ERROR;
    258                 goto cleanup;
    259             }
    260         }
    261     }
    262 
    263     // Eliminate duplicate script sets.  At this point we have a separate
    264     // script set for every code point that had data in the input file.
    265     //
    266     // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
    267     //
    268     // printf("Number of scriptSets: %d\n", scriptSets->size());
    269     {
    270         int32_t duplicateCount = 0;
    271         rtScriptSetsCount = 2;
    272         for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
    273             BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
    274             if (outerSet->index != static_cast<uint32_t>(outeri)) {
    275                 // This set was already identified as a duplicate.
    276                 //   It will not be allocated a position in the runtime array of ScriptSets.
    277                 continue;
    278             }
    279             outerSet->rindex = rtScriptSetsCount++;
    280             for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
    281                 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
    282                 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
    283                     delete innerSet->sset;
    284                     innerSet->scriptSetOwned = FALSE;
    285                     innerSet->sset = outerSet->sset;
    286                     innerSet->index = outeri;
    287                     innerSet->rindex = outerSet->rindex;
    288                     duplicateCount++;
    289                 }
    290                 // But this doesn't get all.  We need to fix the TRIE.
    291             }
    292         }
    293         // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
    294     }
    295 
    296 
    297 
    298     // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
    299     //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
    300     //     are unused, which is why the loop index starts at 2.)
    301     {
    302         for (int32_t i=2; i<scriptSets->size(); i++) {
    303             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
    304             if (bSet->rindex != (uint32_t)i) {
    305                 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
    306             }
    307         }
    308     }
    309 
    310     // For code points with script==Common or script==Inherited,
    311     //   Set the reserved value of 1 into both Tries.  These characters do not participate
    312     //   in Whole Script Confusable detection; this reserved value is the means
    313     //   by which they are detected.
    314     {
    315         UnicodeSet ignoreSet;
    316         ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
    317         UnicodeSet inheritedSet;
    318         inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
    319         ignoreSet.addAll(inheritedSet);
    320         for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
    321             UChar32 rangeStart = ignoreSet.getRangeStart(rn);
    322             UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
    323             utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
    324             utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
    325         }
    326     }
    327 
    328     // Serialize the data to the Spoof Detector
    329     {
    330         utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
    331         int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
    332         // printf("Any case Trie size: %d\n", size);
    333         if (status != U_BUFFER_OVERFLOW_ERROR) {
    334             goto cleanup;
    335         }
    336         status = U_ZERO_ERROR;
    337         spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
    338         spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
    339         spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
    340         void *where = spImpl->fSpoofData->reserveSpace(size, status);
    341         utrie2_serialize(anyCaseTrie, where, size, &status);
    342 
    343         utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
    344         size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
    345         // printf("Lower case Trie size: %d\n", size);
    346         if (status != U_BUFFER_OVERFLOW_ERROR) {
    347             goto cleanup;
    348         }
    349         status = U_ZERO_ERROR;
    350         spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
    351         spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
    352         spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
    353         where = spImpl->fSpoofData->reserveSpace(size, status);
    354         utrie2_serialize(lowerCaseTrie, where, size, &status);
    355 
    356         spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
    357         spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
    358         ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
    359             (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
    360         uint32_t rindex = 2;
    361         for (int32_t i=2; i<scriptSets->size(); i++) {
    362             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
    363             if (bSet->rindex < rindex) {
    364                 // We have already copied this script set to the serialized data.
    365                 continue;
    366             }
    367             U_ASSERT(rindex == bSet->rindex);
    368             rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
    369             rindex++;
    370         }
    371     }
    372 
    373     // Open new utrie2s from the serialized data.  We don't want to keep the ones
    374     //   we just built because we would then have two copies of the data, one internal to
    375     //   the utries that we have already constructed, and one in the serialized data area.
    376     //   An alternative would be to not pre-serialize the Trie data, but that makes the
    377     //   spoof detector data different, depending on how the detector was constructed.
    378     //   It's simpler to keep the data always the same.
    379 
    380     spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
    381             UTRIE2_16_VALUE_BITS,
    382             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
    383             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
    384             NULL,
    385             &status);
    386 
    387     spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
    388             UTRIE2_16_VALUE_BITS,
    389             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
    390             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
    391             NULL,
    392             &status);
    393 
    394 
    395 
    396 cleanup:
    397     if (U_FAILURE(status)) {
    398         pe->line = lineNum;
    399     }
    400     uregex_close(parseRegexp);
    401     uprv_free(input);
    402 
    403     int32_t i;
    404     for (i=0; i<scriptSets->size(); i++) {
    405         BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
    406         delete bsset;
    407     }
    408     delete scriptSets;
    409     utrie2_close(anyCaseTrie);
    410     utrie2_close(lowerCaseTrie);
    411     return;
    412 }
    413 
    414 
    415 
    416 
    417 
    418 BuilderScriptSet::BuilderScriptSet() {
    419     codePoint = -1;
    420     trie = NULL;
    421     sset = NULL;
    422     index = 0;
    423     rindex = 0;
    424     scriptSetOwned = TRUE;
    425 }
    426 
    427 BuilderScriptSet::~BuilderScriptSet() {
    428     if (scriptSetOwned) {
    429         delete sset;
    430     }
    431 }
    432 
    433 #endif
    434 #endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
    435 
    436