Home | History | Annotate | Download | only in gencfu
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2009-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *
      9 * File gencfu.c
     10 */
     11 
     12 //--------------------------------------------------------------------
     13 //
     14 //   Tool for generating Unicode Confusable data files (.cfu files).
//   .cfu files contain the compiled form of the confusable data
     16 //   derived from the Unicode Consortium data described in
     17 //   Unicode UAX 39.
     18 //
     19 //   Usage:  gencfu [options] -r confusables-file.txt -o output-file.cfu
     20 //
     21 //       options:   -v         verbose
     22 //                  -? or -h   help
     23 //
//   The input rule file is a plain text file containing confusable character
//    definitions in the input format defined by Unicode UAX39 for the file
//    confusables.txt.  This source (.txt) format
//    is also accepted directly by ICU spoof detectors.  The
//    file must be encoded in utf-8 format, with or without a BOM.
     29 //
//   This tool formerly also compiled confusablesWholeScript.txt into the CFU file,
//    until the Unicode Consortium deprecated that data.  The corresponding
//    -w / --wsrules option is still parsed, but its value is not used.
     32 //
     33 //--------------------------------------------------------------------
     34 
     35 #include "unicode/utypes.h"
     36 #include "unicode/unistr.h"
     37 #include "unicode/uclean.h"
     38 #include "unicode/udata.h"
     39 #include "unicode/putil.h"
     40 
     41 #include "uoptions.h"
     42 #include "unewdata.h"
     43 #include "ucmndata.h"
     44 #include "uspoof_impl.h"
     45 #include "cmemory.h"
     46 
     47 #include <stdio.h>
     48 #include <stdlib.h>
     49 #include <string.h>
     50 
     51 U_NAMESPACE_USE
     52 
// Name the tool was invoked with (argv[0]); assigned in main() and printed
// by usageAndDie().
static char *progName;

// Command line options accepted by the tool.
// main() refers to the entries by their position in this array (options[0],
// options[3], ...), so the order here must stay in sync with the index
// numbers in the trailing comments and with the uses in main().
static UOption options[]={
    UOPTION_HELP_H,             /* 0 */
    UOPTION_HELP_QUESTION_MARK, /* 1 */
    UOPTION_VERBOSE,            /* 2 */
    { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */  // confusables source file (required by main)
    { "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0},  /* 4 */  // deprecated; parsed but never read by main
    { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 5 */  // output file name (required by main)
    UOPTION_ICUDATADIR,         /* 6 */
    UOPTION_DESTDIR,            /* 7 */
    UOPTION_COPYRIGHT,          /* 8 */
    UOPTION_QUIET,              /* 9 */
};
     66 
     67 void usageAndDie(int retCode) {
     68         printf("Usage: %s [-v] [-options] -r confusablesRules.txt -o output-file\n", progName);
     69         printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
     70             "options:\n"
     71             "\t-h or -? or --help  this usage text\n"
     72             "\t-V or --version     show a version message\n"
     73             "\t-c or --copyright   include a copyright notice\n"
     74             "\t-v or --verbose     turn on verbose output\n"
     75             "\t-q or --quiet       do not display warnings and progress\n"
     76             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
     77             "\t                    followed by path, defaults to %s\n"
     78             "\t-d or --destdir     destination directory, followed by the path\n",
     79             u_getDataDirectory());
     80         exit (retCode);
     81 }
     82 
     83 
// Exactly one of the two data headers below is compiled in.  The condition is
// the same one that main() uses to choose between writing a dummy output file
// and building real confusable data.
#if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO

/* dummy UDataInfo cf. udata.h */
/* Passed to udata_create() by main() when only a placeholder file is written. */
static UDataInfo dummyDataInfo = {
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    { 0, 0, 0, 0 },                 /* dummy dataFormat */
    { 0, 0, 0, 0 },                 /* dummy formatVersion */
    { 0, 0, 0, 0 }                  /* dummy dataVersion */
};

#else

//
//  Set up the ICU data header, defined in ucmndata.h
//
//  main() overwrites dh.info.formatVersion with the version found in the
//  serialized spoof data, then passes &dh.info to udata_create().
//
DataHeader dh ={
    {sizeof(DataHeader),           // Struct MappedData
        0xda,                      //   NOTE(review): presumably the ICU data magic bytes -
        0x27},                     //   confirm against ucmndata.h

    {                               // struct UDataInfo
        sizeof(UDataInfo),          //     size
        0,                          //     reserved
        U_IS_BIG_ENDIAN,
        U_CHARSET_FAMILY,
        U_SIZEOF_UCHAR,
        0,                          //     reserved

    { 0x43, 0x66, 0x75, 0x20 },     //     dataFormat="Cfu "
    { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
                                    //      from the  builder.  The  values declared
                                    //      here should never appear in any real data.
        { 5, 1, 0, 0 }              //   dataVersion (Unicode version)
    }};

#endif
    127 
// Forward declaration for function for reading source files.
//   Returns the file contents in a buffer allocated with new[] (main() releases
//   it with delete[]), or NULL if the file could not be read.  On success *len
//   receives the number of bytes in the file.
static const char *readFile(const char *fileName, int32_t *len);
    130 
    131 //----------------------------------------------------------------------------
    132 //
    133 //  main      for gencfu
    134 //
    135 //----------------------------------------------------------------------------
    136 int  main(int argc, char **argv) {
    137     UErrorCode  status = U_ZERO_ERROR;
    138     const char *confFileName;
    139     const char *outFileName;
    140     const char *outDir = NULL;
    141     const char *copyright = NULL;
    142 
    143     //
    144     // Pick up and check the command line arguments,
    145     //    using the standard ICU tool utils option handling.
    146     //
    147     U_MAIN_INIT_ARGS(argc, argv);
    148     progName = argv[0];
    149     argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
    150     if(argc<0) {
    151         // Unrecognized option
    152         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
    153         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    154     }
    155 
    156     if(options[0].doesOccur || options[1].doesOccur) {
    157         //  -? or -h for help.
    158         usageAndDie(0);
    159     }
    160 
    161     if (!(options[3].doesOccur && options[5].doesOccur)) {
    162         fprintf(stderr, "confusables file and output file must all be specified.\n");
    163         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    164     }
    165     confFileName   = options[3].value;
    166     outFileName    = options[5].value;
    167 
    168     if (options[6].doesOccur) {
    169         u_setDataDirectory(options[6].value);
    170     }
    171 
    172     status = U_ZERO_ERROR;
    173 
    174     /* Combine the directory with the file name */
    175     if(options[7].doesOccur) {
    176         outDir = options[7].value;
    177     }
    178     if (options[8].doesOccur) {
    179         copyright = U_COPYRIGHT_STRING;
    180     }
    181 
    182     UBool quiet = FALSE;
    183     if (options[9].doesOccur) {
    184       quiet = TRUE;
    185     }
    186 
    187 #if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
    188     // spoof detection data file parsing is dependent on regular expressions.
    189     // TODO: have the tool return an error status.  Requires fixing the ICU data build
    190     //       so that it doesn't abort entirely on that error.
    191 
    192     UNewDataMemory *pData;
    193     char msg[1024];
    194 
    195     /* write message with just the name */
    196     sprintf(msg, "gencfu writes dummy %s because of UCONFIG_NO_REGULAR_EXPRESSIONS and/or UCONFIG_NO_NORMALIZATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    197     fprintf(stderr, "%s\n", msg);
    198 
    199     /* write the dummy data file */
    200     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
    201     udata_writeBlock(pData, msg, strlen(msg));
    202     udata_finish(pData, &status);
    203     return (int)status;
    204 
    205 #else
    206     /* Initialize ICU */
    207     u_init(&status);
    208     if (U_FAILURE(status)) {
    209         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
    210             argv[0], u_errorName(status));
    211         exit(1);
    212     }
    213     status = U_ZERO_ERROR;
    214 
    215     //  Read in the confusables source file
    216 
    217     int32_t      confusablesLen = 0;
    218     const char  *confusables = readFile(confFileName, &confusablesLen);
    219     if (confusables == NULL) {
    220         printf("gencfu: error reading file  \"%s\"\n", confFileName);
    221         exit(-1);
    222     }
    223 
    224     //
    225     //  Create the Spoof Detector from the source confusables files.
    226     //     This will compile the data.
    227     //
    228     UParseError parseError;
    229     parseError.line = 0;
    230     parseError.offset = 0;
    231     int32_t errType;
    232     USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
    233                                               NULL, 0,
    234                                               &errType, &parseError, &status);
    235     if (U_FAILURE(status)) {
    236         fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\"  at file %s, line %d, column %d\n",
    237                 u_errorName(status), confFileName, (int)parseError.line, (int)parseError.offset);
    238         exit(status);
    239     };
    240 
    241 
    242     //
    243     //  Get the compiled rule data from the USpoofChecker.
    244     //
    245     uint32_t        outDataSize;
    246     uint8_t        *outData;
    247     outDataSize = uspoof_serialize(sc, NULL, 0, &status);
    248     if (status != U_BUFFER_OVERFLOW_ERROR) {
    249         fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status));
    250         exit(status);
    251     }
    252     status = U_ZERO_ERROR;
    253     outData = new uint8_t[outDataSize];
    254     uspoof_serialize(sc, outData, outDataSize, &status);
    255 
    256     // Copy the data format version numbers from the spoof data header into the UDataMemory header.
    257 
    258     uprv_memcpy(dh.info.formatVersion,
    259                 reinterpret_cast<SpoofDataHeader *>(outData)->fFormatVersion,
    260                 sizeof(dh.info.formatVersion));
    261 
    262     //
    263     //  Create the output file
    264     //
    265     size_t bytesWritten;
    266     UNewDataMemory *pData;
    267     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
    268     if(U_FAILURE(status)) {
    269         fprintf(stderr, "gencfu: Could not open output file \"%s\", \"%s\"\n",
    270                          outFileName, u_errorName(status));
    271         exit(status);
    272     }
    273 
    274 
    275     //  Write the data itself.
    276     udata_writeBlock(pData, outData, outDataSize);
    277     // finish up
    278     bytesWritten = udata_finish(pData, &status);
    279     if(U_FAILURE(status)) {
    280         fprintf(stderr, "gencfu: Error %d writing the output file\n", status);
    281         exit(status);
    282     }
    283 
    284     if (bytesWritten != outDataSize) {
    285         fprintf(stderr, "gencfu: Error writing to output file \"%s\"\n", outFileName);
    286         exit(-1);
    287     }
    288 
    289     uspoof_close(sc);
    290     delete [] outData;
    291     delete [] confusables;
    292     u_cleanup();
    293     if (!quiet) {
    294         printf("gencfu: tool completed successfully.\n");
    295     }
    296     return 0;
    297 #endif   // UCONFIG_NO_REGULAR_EXPRESSIONS
    298 }
    299 
    300 
    301  //
    302  //  Read in a confusables source file
    303  //
    304  static const char *readFile(const char *fileName, int32_t *len) {
    305     char       *result;
    306     long        fileSize;
    307     FILE        *file;
    308 
    309     file = fopen(fileName, "rb");
    310     if( file == 0 ) {
    311         return NULL;
    312     }
    313     fseek(file, 0, SEEK_END);
    314     fileSize = ftell(file);
    315     fseek(file, 0, SEEK_SET);
    316     result = new char[fileSize+10];
    317     if (result==NULL) {
    318         fclose(file);
    319         return NULL;
    320     }
    321 
    322     long t = fread(result, 1, fileSize, file);
    323     if (t != fileSize)  {
    324         delete [] result;
    325         fclose(file);
    326         return NULL;
    327     }
    328     result[fileSize]=0;
    329     *len = static_cast<int32_t>(fileSize);
    330     fclose(file);
    331     return result;
    332  }
    333