Home | History | Annotate | Download | only in i18n
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2008-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  uspoof_wsconf.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009Jan05  (refactoring earlier files)
     14 *   created by: Andy Heninger
     15 *
     16 *   Internal functions for compiling Whole Script confusable source data
     17 *   into its binary (runtime) form.  The binary data format is described
     18 *   in uspoof_impl.h
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/uspoof.h"
     23 
     24 #if !UCONFIG_NO_NORMALIZATION
     25 
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     27 
     28 #include "unicode/unorm.h"
     29 #include "unicode/uregex.h"
     30 #include "unicode/ustring.h"
     31 #include "cmemory.h"
     32 #include "uspoof_impl.h"
     33 #include "uhash.h"
     34 #include "uvector.h"
     35 #include "uassert.h"
     36 #include "uspoof_wsconf.h"
     37 
     38 U_NAMESPACE_USE
     39 
     40 
     41 // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
     42 // Example Lines:
     43 //   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
     44 //   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
     45 //    |               |     |    |
     46 //    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
     47 //    |               |     |----------Target script.   We need this.
     48 //    |               |----------------Src script.  Should match the script of the source
     49 //    |                                code points.  Beyond checking that, we don't keep it.
     50 //    |--------------------------------Source code points or range.
     51 //
     52 // The expression will match _all_ lines, including erroneous lines.
     53 // The result of the parse is returned via the contents of the (match) groups.
     54 static const char *parseExp =
     55         "(?m)"                                         // Multi-line mode
     56         "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
     57         "|^(?:"                                        //   OR
     58         "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
     59         "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
     60         "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
     61         "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
     62         "[ \\t]*(?:#.*?)?"                             // Trailing commment
     63         ")$|"                                          //   OR
     64         "^(.*?)$";                                     // An error line.      Group 8.
     65                                                        //    Any line not matching the preceding
     66                                                        //    parts of the expression.will match
     67                                                        //    this, and thus be flagged as an error
     68 
     69 
     70 // Extract a regular expression match group into a char * string.
     71 //    The group must contain only invariant characters.
     72 //    Used for script names
     73 //
     74 static void extractGroup(
     75     URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
     76 
     77     UChar ubuf[50];
     78     ubuf[0] = 0;
     79     destBuf[0] = 0;
     80     int32_t len = uregex_group(e, group, ubuf, 50, &status);
     81     if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
     82         return;
     83     }
     84     UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
     85     s.extract(0, len, destBuf, destCapacity, US_INV);
     86 }
     87 
     88 
     89 
     90 U_NAMESPACE_BEGIN
     91 
     92 //  Build the Whole Script Confusable data
     93 //
     94 //     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
     95 //                         because everything is local to this one build function anyhow,
     96 //                           OR
     97 //                         break this function into more reasonably sized pieces, with
     98 //                         state in WSConfusableDataBuilder.
     99 //
    100 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
    101           int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
    102 {
    103     if (U_FAILURE(status)) {
    104         return;
    105     }
    106     URegularExpression *parseRegexp = NULL;
    107     int32_t             inputLen    = 0;
    108     UChar              *input       = NULL;
    109     int32_t             lineNum     = 0;
    110 
    111     UVector            *scriptSets        = NULL;
    112     uint32_t            rtScriptSetsCount = 2;
    113 
    114     UTrie2             *anyCaseTrie   = NULL;
    115     UTrie2             *lowerCaseTrie = NULL;
    116 
    117     anyCaseTrie = utrie2_open(0, 0, &status);
    118     lowerCaseTrie = utrie2_open(0, 0, &status);
    119 
    120     UnicodeString pattern(parseExp, -1, US_INV);
    121 
    122     // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
    123     //
    124     // Reserved TRIE values:
    125     //   0:  Code point has no whole script confusables.
    126     //   1:  Code point is of script Common or Inherited.
    127     //       These code points do not participate in whole script confusable detection.
    128     //       (This is logically equivalent to saying that they contain confusables in
    129     //        all scripts)
    130     //
    131     // Because Trie values are indexes into the ScriptSets vector, pre-fill
    132     // vector positions 0 and 1 to avoid conflicts with the reserved values.
    133 
    134     scriptSets = new UVector(status);
    135     if (scriptSets == NULL) {
    136         status = U_MEMORY_ALLOCATION_ERROR;
    137         goto cleanup;
    138     }
    139     scriptSets->addElement((void *)NULL, status);
    140     scriptSets->addElement((void *)NULL, status);
    141 
    142     // Convert the user input data from UTF-8 to UChar (UTF-16)
    143     u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
    144     if (status != U_BUFFER_OVERFLOW_ERROR) {
    145         goto cleanup;
    146     }
    147     status = U_ZERO_ERROR;
    148     input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    149     if (input == NULL) {
    150         status = U_MEMORY_ALLOCATION_ERROR;
    151         goto cleanup;
    152     }
    153     u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
    154 
    155     parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
    156 
    157     // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    158     //   given the syntax of the input.
    159     if (*input == 0xfeff) {
    160         *input = 0x20;
    161     }
    162 
    163     // Parse the input, one line per iteration of this loop.
    164     uregex_setText(parseRegexp, input, inputLen, &status);
    165     while (uregex_findNext(parseRegexp, &status)) {
    166         lineNum++;
    167         if (uregex_start(parseRegexp, 1, &status) >= 0) {
    168             // this was a blank or comment line.
    169             continue;
    170         }
    171         if (uregex_start(parseRegexp, 8, &status) >= 0) {
    172             // input file syntax error.
    173             status = U_PARSE_ERROR;
    174             goto cleanup;
    175         }
    176         if (U_FAILURE(status)) {
    177             goto cleanup;
    178         }
    179 
    180         // Pick up the start and optional range end code points from the parsed line.
    181         UChar32  startCodePoint = SpoofImpl::ScanHex(
    182             input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
    183         UChar32  endCodePoint = startCodePoint;
    184         if (uregex_start(parseRegexp, 3, &status) >=0) {
    185             endCodePoint = SpoofImpl::ScanHex(
    186                 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
    187         }
    188 
    189         // Extract the two script names from the source line.  We need these in an 8 bit
    190         //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
    191         //   to the ICU u_getPropertyValueEnum() function.  Ugh.
    192         char  srcScriptName[20];
    193         char  targScriptName[20];
    194         extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
    195         extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
    196         UScriptCode srcScript  =
    197             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
    198         UScriptCode targScript =
    199             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
    200         if (U_FAILURE(status)) {
    201             goto cleanup;
    202         }
    203         if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
    204             status = U_INVALID_FORMAT_ERROR;
    205             goto cleanup;
    206         }
    207 
    208         // select the table - (A) any case or (L) lower case only
    209         UTrie2 *table = anyCaseTrie;
    210         if (uregex_start(parseRegexp, 7, &status) >= 0) {
    211             table = lowerCaseTrie;
    212         }
    213 
    214         // Build the set of scripts containing confusable characters for
    215         //   the code point(s) specified in this input line.
    216         // Sanity check that the script of the source code point is the same
    217         //   as the source script indicated in the input file.  Failure of this check is
    218         //   an error in the input file.
    219         // Include the source script in the set (needed for Mixed Script Confusable detection).
    220         //
    221         UChar32 cp;
    222         for (cp=startCodePoint; cp<=endCodePoint; cp++) {
    223             int32_t setIndex = utrie2_get32(table, cp);
    224             BuilderScriptSet *bsset = NULL;
    225             if (setIndex > 0) {
    226                 U_ASSERT(setIndex < scriptSets->size());
    227                 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
    228             } else {
    229                 bsset = new BuilderScriptSet();
    230                 if (bsset == NULL) {
    231                     status = U_MEMORY_ALLOCATION_ERROR;
    232                     goto cleanup;
    233                 }
    234                 bsset->codePoint = cp;
    235                 bsset->trie = table;
    236                 bsset->sset = new ScriptSet();
    237                 setIndex = scriptSets->size();
    238                 bsset->index = setIndex;
    239                 bsset->rindex = 0;
    240                 if (bsset->sset == NULL) {
    241                     status = U_MEMORY_ALLOCATION_ERROR;
    242                     goto cleanup;
    243                 }
    244                 scriptSets->addElement(bsset, status);
    245                 utrie2_set32(table, cp, setIndex, &status);
    246             }
    247             bsset->sset->Union(targScript);
    248             bsset->sset->Union(srcScript);
    249 
    250             if (U_FAILURE(status)) {
    251                 goto cleanup;
    252             }
    253             UScriptCode cpScript = uscript_getScript(cp, &status);
    254             if (cpScript != srcScript) {
    255                 status = U_INVALID_FORMAT_ERROR;
    256                 goto cleanup;
    257             }
    258         }
    259     }
    260 
    261     // Eliminate duplicate script sets.  At this point we have a separate
    262     // script set for every code point that had data in the input file.
    263     //
    264     // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
    265     //
    266     // printf("Number of scriptSets: %d\n", scriptSets->size());
    267     {
    268         int32_t duplicateCount = 0;
    269         rtScriptSetsCount = 2;
    270         for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
    271             BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
    272             if (outerSet->index != static_cast<uint32_t>(outeri)) {
    273                 // This set was already identified as a duplicate.
    274                 //   It will not be allocated a position in the runtime array of ScriptSets.
    275                 continue;
    276             }
    277             outerSet->rindex = rtScriptSetsCount++;
    278             for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
    279                 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
    280                 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
    281                     delete innerSet->sset;
    282                     innerSet->scriptSetOwned = FALSE;
    283                     innerSet->sset = outerSet->sset;
    284                     innerSet->index = outeri;
    285                     innerSet->rindex = outerSet->rindex;
    286                     duplicateCount++;
    287                 }
    288                 // But this doesn't get all.  We need to fix the TRIE.
    289             }
    290         }
    291         // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
    292     }
    293 
    294 
    295 
    296     // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
    297     //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
    298     //     are unused, which is why the loop index starts at 2.)
    299     {
    300         for (int32_t i=2; i<scriptSets->size(); i++) {
    301             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
    302             if (bSet->rindex != (uint32_t)i) {
    303                 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
    304             }
    305         }
    306     }
    307 
    308     // For code points with script==Common or script==Inherited,
    309     //   Set the reserved value of 1 into both Tries.  These characters do not participate
    310     //   in Whole Script Confusable detection; this reserved value is the means
    311     //   by which they are detected.
    312     {
    313         UnicodeSet ignoreSet;
    314         ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
    315         UnicodeSet inheritedSet;
    316         inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
    317         ignoreSet.addAll(inheritedSet);
    318         for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
    319             UChar32 rangeStart = ignoreSet.getRangeStart(rn);
    320             UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
    321             utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
    322             utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
    323         }
    324     }
    325 
    326     // Serialize the data to the Spoof Detector
    327     {
    328         utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
    329         int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
    330         // printf("Any case Trie size: %d\n", size);
    331         if (status != U_BUFFER_OVERFLOW_ERROR) {
    332             goto cleanup;
    333         }
    334         status = U_ZERO_ERROR;
    335         spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
    336         spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
    337         spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
    338         void *where = spImpl->fSpoofData->reserveSpace(size, status);
    339         utrie2_serialize(anyCaseTrie, where, size, &status);
    340 
    341         utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
    342         size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
    343         // printf("Lower case Trie size: %d\n", size);
    344         if (status != U_BUFFER_OVERFLOW_ERROR) {
    345             goto cleanup;
    346         }
    347         status = U_ZERO_ERROR;
    348         spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
    349         spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
    350         spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
    351         where = spImpl->fSpoofData->reserveSpace(size, status);
    352         utrie2_serialize(lowerCaseTrie, where, size, &status);
    353 
    354         spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
    355         spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
    356         ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
    357             (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
    358         uint32_t rindex = 2;
    359         for (int32_t i=2; i<scriptSets->size(); i++) {
    360             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
    361             if (bSet->rindex < rindex) {
    362                 // We have already copied this script set to the serialized data.
    363                 continue;
    364             }
    365             U_ASSERT(rindex == bSet->rindex);
    366             rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
    367             rindex++;
    368         }
    369     }
    370 
    371     // Open new utrie2s from the serialized data.  We don't want to keep the ones
    372     //   we just built because we would then have two copies of the data, one internal to
    373     //   the utries that we have already constructed, and one in the serialized data area.
    374     //   An alternative would be to not pre-serialize the Trie data, but that makes the
    375     //   spoof detector data different, depending on how the detector was constructed.
    376     //   It's simpler to keep the data always the same.
    377 
    378     spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
    379             UTRIE2_16_VALUE_BITS,
    380             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
    381             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
    382             NULL,
    383             &status);
    384 
    385     spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
    386             UTRIE2_16_VALUE_BITS,
    387             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
    388             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
    389             NULL,
    390             &status);
    391 
    392 
    393 
    394 cleanup:
    395     if (U_FAILURE(status)) {
    396         pe->line = lineNum;
    397     }
    398     uregex_close(parseRegexp);
    399     uprv_free(input);
    400 
    401     int32_t i;
    402     if (scriptSets != NULL) {
    403         for (i=0; i<scriptSets->size(); i++) {
    404             BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
    405             delete bsset;
    406         }
    407         delete scriptSets;
    408     }
    409     utrie2_close(anyCaseTrie);
    410     utrie2_close(lowerCaseTrie);
    411     return;
    412 }
    413 
    414 U_NAMESPACE_END
    415 
    416 
    417 
    418 BuilderScriptSet::BuilderScriptSet() {
    419     codePoint = -1;
    420     trie = NULL;
    421     sset = NULL;
    422     index = 0;
    423     rindex = 0;
    424     scriptSetOwned = TRUE;
    425 }
    426 
    427 BuilderScriptSet::~BuilderScriptSet() {
    428     if (scriptSetOwned) {
    429         delete sset;
    430     }
    431 }
    432 
    433 #endif
    434 #endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
    435 
    436