Home | History | Annotate | Download | only in gencfu
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2009-2010, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *
      7 * File gencfu.c
      8 */
      9 
     10 //--------------------------------------------------------------------
     11 //
     12 //   Tool for generating Unicode Confusable data files (.cfu files).
     13 //   .cfu files contain the compiled form of the confusable data
     14 //   derived from the Unicode Consortium data described in
     15 //   Unicode UAX 39.
     16 //
     17 //   Usage:  gencfu [options] -r confusables-file.txt -w whole-script-confusables.txt  -o output-file.cfu
     18 //
     19 //       options:   -v         verbose
     20 //                  -? or -h   help
     21 //
     22 //   The input rule files are plain text files containing confusable character
     23 //    definitions in the input format defined by Unicode UAX39 for the files
     24 //    confusables.txt and confusablesWholeScript.txt.  This source (.txt) format
     25 //    is also accepted directly by ICU spoof detectors.  The
     26 //    files must be encoded in utf-8 format, with or without a BOM.
     27 //
     28 //--------------------------------------------------------------------
     29 
     30 #include "unicode/utypes.h"
     31 #include "unicode/unistr.h"
     32 #include "unicode/uclean.h"
     33 #include "unicode/udata.h"
     34 #include "unicode/putil.h"
     35 
     36 #include "uoptions.h"
     37 #include "unewdata.h"
     38 #include "ucmndata.h"
     39 #include "uspoof_impl.h"
     40 #include "cmemory.h"
     41 
     42 #include <stdio.h>
     43 #include <stdlib.h>
     44 #include <string.h>
     45 
     46 U_NAMESPACE_USE
     47 
// Name the tool was invoked as.  Set from argv[0] at the start of main() and
// printed in the usage line by usageAndDie().
static char *progName;

// Command line option table passed to u_parseArgs() in main().
// main() refers to the entries by their position in this array, so the
// order of the entries (and the index noted beside each one) must stay in
// step with the options[N] references there.
//   0, 1  help             -> usageAndDie(0)
//   2     verbose          (parsed, but not consulted by the code in this file)
//   3     rules            confusables source file          (required)
//   4     wsrules          whole script confusables file    (required)
//   5     out              output file name                 (required)
//   6     icudatadir       passed to u_setDataDirectory()
//   7     destdir          output directory given to udata_create()
//   8     copyright        selects U_COPYRIGHT_STRING as the file comment
static UOption options[]={
    UOPTION_HELP_H,             /* 0 */
    UOPTION_HELP_QUESTION_MARK, /* 1 */
    UOPTION_VERBOSE,            /* 2 */
    { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */
    { "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0},  /* 4 */
    { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 5 */
    UOPTION_ICUDATADIR,         /* 6 */
    UOPTION_DESTDIR,            /* 7 */
    UOPTION_COPYRIGHT,          /* 8 */
};
     60 
     61 void usageAndDie(int retCode) {
     62         printf("Usage: %s [-v] [-options] -r confusablesRules.txt -w wholeScriptConfusables.txt -o output-file\n", progName);
     63         printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
     64             "options:\n"
     65             "\t-h or -? or --help  this usage text\n"
     66             "\t-V or --version     show a version message\n"
     67             "\t-c or --copyright   include a copyright notice\n"
     68             "\t-v or --verbose     turn on verbose output\n"
     69             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
     70             "\t                    followed by path, defaults to %s\n"
     71             "\t-d or --destdir     destination directory, followed by the path\n",
     72             u_getDataDirectory());
     73         exit (retCode);
     74 }
     75 
     76 
// With any of these build-time switches set, main() does not compile the
// confusables data; it only writes a placeholder file whose header is
// described by dummyDataInfo.  Otherwise the real header, dh, is used.
#if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO

/* dummy UDataInfo cf. udata.h */
/* Passed to udata_create() by main() when writing the placeholder file. */
static UDataInfo dummyDataInfo = {
    sizeof(UDataInfo),
    0,                              /* reserved */

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,                              /* reserved */

    { 0, 0, 0, 0 },                 /* dummy dataFormat */
    { 0, 0, 0, 0 },                 /* dummy formatVersion */
    { 0, 0, 0, 0 }                  /* dummy dataVersion */
};

#else

//
//  Set up the ICU data header, defined in ucmndata.h
//
//  Only the embedded UDataInfo (dh.info) is handed to udata_create() by
//  main().  Its formatVersion field is a placeholder: main() overwrites it
//  with the format version found in the serialized spoof data before the
//  output file is created.
//
DataHeader dh ={
    {sizeof(DataHeader),           // Struct MappedData
        0xda,                      //   NOTE(review): presumably the two ICU data
        0x27},                     //   magic bytes - confirm against ucmndata.h

    {                               // struct UDataInfo
        sizeof(UDataInfo),          //     size
        0,                          //     reserved
        U_IS_BIG_ENDIAN,            //     properties of the platform running this tool
        U_CHARSET_FAMILY,
        U_SIZEOF_UCHAR,
        0,                          //     reserved

    { 0x43, 0x66, 0x75, 0x20 },     //     dataFormat="Cfu "
    { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
                                    //      from the  builder.  The  values declared
                                    //      here should never appear in any real data.
        { 5, 1, 0, 0 }              //   dataVersion (Unicode version)
    }};

#endif
    120 
    121 // Forward declaration for function for reading source files.
    122 static const char *readFile(const char *fileName, int32_t *len);
    123 
    124 //----------------------------------------------------------------------------
    125 //
    126 //  main      for gencfu
    127 //
    128 //----------------------------------------------------------------------------
    129 int  main(int argc, char **argv) {
    130     UErrorCode  status = U_ZERO_ERROR;
    131     const char *confFileName;
    132     const char *confWSFileName;
    133     const char *outFileName;
    134     const char *outDir = NULL;
    135     const char *copyright = NULL;
    136 
    137     //
    138     // Pick up and check the command line arguments,
    139     //    using the standard ICU tool utils option handling.
    140     //
    141     U_MAIN_INIT_ARGS(argc, argv);
    142     progName = argv[0];
    143     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    144     if(argc<0) {
    145         // Unrecognized option
    146         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
    147         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    148     }
    149 
    150     if(options[0].doesOccur || options[1].doesOccur) {
    151         //  -? or -h for help.
    152         usageAndDie(0);
    153     }
    154 
    155     if (!(options[3].doesOccur && options[4].doesOccur && options[5].doesOccur)) {
    156         fprintf(stderr, "confusables file, whole script confusables file and output file must all be specified.\n");
    157         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    158     }
    159     confFileName   = options[3].value;
    160     confWSFileName = options[4].value;
    161     outFileName    = options[5].value;
    162 
    163     if (options[6].doesOccur) {
    164         u_setDataDirectory(options[6].value);
    165     }
    166 
    167     status = U_ZERO_ERROR;
    168 
    169     /* Combine the directory with the file name */
    170     if(options[7].doesOccur) {
    171         outDir = options[7].value;
    172     }
    173     if (options[8].doesOccur) {
    174         copyright = U_COPYRIGHT_STRING;
    175     }
    176 
    177 #if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
    178     // spoof detection data file parsing is dependent on regular expressions.
    179     // TODO: have the tool return an error status.  Requires fixing the ICU data build
    180     //       so that it doesn't abort entirely on that error.
    181 
    182     UNewDataMemory *pData;
    183     char msg[1024];
    184 
    185     /* write message with just the name */
    186     sprintf(msg, "gencfu writes dummy %s because of UCONFIG_NO_REGULAR_EXPRESSIONS and/or UCONFIG_NO_NORMALIZATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    187     fprintf(stderr, "%s\n", msg);
    188 
    189     /* write the dummy data file */
    190     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
    191     udata_writeBlock(pData, msg, strlen(msg));
    192     udata_finish(pData, &status);
    193     return (int)status;
    194 
    195 #else
    196     /* Initialize ICU */
    197     u_init(&status);
    198     if (U_FAILURE(status)) {
    199         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
    200             argv[0], u_errorName(status));
    201         exit(1);
    202     }
    203     status = U_ZERO_ERROR;
    204 
    205     //  Read in the confusables source file
    206 
    207     int32_t      confusablesLen = 0;
    208     const char  *confusables = readFile(confFileName, &confusablesLen);
    209     if (confusables == NULL) {
    210         printf("gencfu: error reading file  \"%s\"\n", confFileName);
    211         exit(-1);
    212     }
    213 
    214     int32_t     wsConfusablesLen = 0;
    215     const char *wsConfsables =  readFile(confWSFileName, &wsConfusablesLen);
    216     if (wsConfsables == NULL) {
    217         printf("gencfu: error reading file  \"%s\"\n", confFileName);
    218         exit(-1);
    219     }
    220 
    221     //
    222     //  Create the Spoof Detector from the source confusables files.
    223     //     This will compile the data.
    224     //
    225     UParseError parseError;
    226     parseError.line = 0;
    227     parseError.offset = 0;
    228     int32_t errType;
    229     USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
    230                                               wsConfsables, wsConfusablesLen,
    231                                               &errType, &parseError, &status);
    232     if (U_FAILURE(status)) {
    233         const char *errFile =
    234             (errType == USPOOF_WHOLE_SCRIPT_CONFUSABLE)? confWSFileName : confFileName;
    235         fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\"  at file %s, line %d, column %d\n",
    236                 u_errorName(status), errFile, (int)parseError.line, (int)parseError.offset);
    237         exit(status);
    238     };
    239 
    240 
    241     //
    242     //  Get the compiled rule data from the USpoofChecker.
    243     //
    244     uint32_t        outDataSize;
    245     uint8_t        *outData;
    246     outDataSize = uspoof_serialize(sc, NULL, 0, &status);
    247     if (status != U_BUFFER_OVERFLOW_ERROR) {
    248         fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status));
    249         exit(status);
    250     }
    251     status = U_ZERO_ERROR;
    252     outData = new uint8_t[outDataSize];
    253     uspoof_serialize(sc, outData, outDataSize, &status);
    254 
    255     // Copy the data format version numbers from the spoof data header into the UDataMemory header.
    256 
    257     uprv_memcpy(dh.info.formatVersion,
    258                 reinterpret_cast<SpoofDataHeader *>(outData)->fFormatVersion,
    259                 sizeof(dh.info.formatVersion));
    260 
    261     //
    262     //  Create the output file
    263     //
    264     size_t bytesWritten;
    265     UNewDataMemory *pData;
    266     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
    267     if(U_FAILURE(status)) {
    268         fprintf(stderr, "gencfu: Could not open output file \"%s\", \"%s\"\n",
    269                          outFileName, u_errorName(status));
    270         exit(status);
    271     }
    272 
    273 
    274     //  Write the data itself.
    275     udata_writeBlock(pData, outData, outDataSize);
    276     // finish up
    277     bytesWritten = udata_finish(pData, &status);
    278     if(U_FAILURE(status)) {
    279         fprintf(stderr, "gencfu: Error %d writing the output file\n", status);
    280         exit(status);
    281     }
    282 
    283     if (bytesWritten != outDataSize) {
    284         fprintf(stderr, "gencfu: Error writing to output file \"%s\"\n", outFileName);
    285         exit(-1);
    286     }
    287 
    288     uspoof_close(sc);
    289     delete [] outData;
    290     delete confusables;
    291     delete wsConfsables;
    292     u_cleanup();
    293     printf("gencfu: tool completed successfully.\n");
    294     return 0;
    295 #endif   // UCONFIG_NO_REGULAR_EXPRESSIONS
    296 }
    297 
    298 
    299  //
    300  //  Read in a confusables source file
    301  //
    302  static const char *readFile(const char *fileName, int32_t *len) {
    303     char       *result;
    304     long        fileSize;
    305     FILE        *file;
    306 
    307     file = fopen(fileName, "rb");
    308     if( file == 0 ) {
    309         return NULL;
    310     }
    311     fseek(file, 0, SEEK_END);
    312     fileSize = ftell(file);
    313     fseek(file, 0, SEEK_SET);
    314     result = new char[fileSize+10];
    315     if (result==NULL) {
    316         return result;
    317     }
    318 
    319     long t = fread(result, 1, fileSize, file);
    320     if (t != fileSize)  {
    321         delete [] result;
    322         fclose(file);
    323         return NULL;
    324     }
    325     result[fileSize]=0;
    326     *len = static_cast<int32_t>(fileSize);
    327     fclose(file);
    328     return result;
    329  }
    330