Home | History | Annotate | Download | only in gennorm2
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2009-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  gennorm2.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009nov25
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This program reads text files that define Unicode normalization,
     19 *   parses them, and builds a binary data file.
     20 */
     21 
     22 #include "unicode/utypes.h"
     23 #include "n2builder.h"
     24 
     25 #include <fstream>
     26 #include <stdio.h>
     27 #include <stdlib.h>
     28 #include <string>
     29 #include <string.h>
     30 #include "unicode/errorcode.h"
     31 #include "unicode/localpointer.h"
     32 #include "unicode/putil.h"
     33 #include "unicode/uchar.h"
     34 #include "unicode/unistr.h"
     35 #include "charstr.h"
     36 #include "normalizer2impl.h"
     37 #include "toolutil.h"
     38 #include "uoptions.h"
     39 #include "uparse.h"
     40 
     41 #if UCONFIG_NO_NORMALIZATION
     42 #include "unewdata.h"
     43 #endif
     44 
     45 U_NAMESPACE_BEGIN
     46 
     47 UBool beVerbose=FALSE, haveCopyright=TRUE;
     48 
     49 #if !UCONFIG_NO_NORMALIZATION
     50 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder);
     51 #endif
     52 
     53 /* -------------------------------------------------------------------------- */
     54 
     55 enum {
     56     HELP_H,
     57     HELP_QUESTION_MARK,
     58     VERBOSE,
     59     COPYRIGHT,
     60     SOURCEDIR,
     61     OUTPUT_FILENAME,
     62     UNICODE_VERSION,
     63     WRITE_C_SOURCE,
     64     WRITE_COMBINED_DATA,
     65     OPT_FAST
     66 };
     67 
// Command-line option table handed to u_parseArgs() in main().
// Entries are accessed through the index enum declared just before this table
// (HELP_H ... OPT_FAST), so the entry order here must stay in sync with it.
// The last three entries use '\1' as their short name; the usage text printed
// by main() lists them as long-only options (--csource, --combined, --fast).
static UOption options[]={
    UOPTION_HELP_H,                                 // HELP_H
    UOPTION_HELP_QUESTION_MARK,                     // HELP_QUESTION_MARK
    UOPTION_VERBOSE,                                // VERBOSE
    UOPTION_COPYRIGHT,                              // COPYRIGHT
    UOPTION_SOURCEDIR,                              // SOURCEDIR
    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),  // OUTPUT_FILENAME
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), // UNICODE_VERSION
    UOPTION_DEF("csource", '\1', UOPT_NO_ARG),      // WRITE_C_SOURCE
    UOPTION_DEF("combined", '\1', UOPT_NO_ARG),     // WRITE_COMBINED_DATA
    UOPTION_DEF("fast", '\1', UOPT_NO_ARG)          // OPT_FAST
};
     80 
     81 extern "C" int
     82 main(int argc, char* argv[]) {
     83     U_MAIN_INIT_ARGS(argc, argv);
     84 
     85     /* preset then read command line options */
     86     options[SOURCEDIR].value="";
     87     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
     88 
     89     /* error handling, printing usage message */
     90     if(argc<0) {
     91         fprintf(stderr,
     92             "error in command line argument \"%s\"\n",
     93             argv[-argc]);
     94     }
     95     if(!options[OUTPUT_FILENAME].doesOccur) {
     96         argc=-1;
     97     }
     98     if( argc<2 ||
     99         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
    100     ) {
    101         fprintf(stderr,
    102             "Usage: %s [-options] infiles+ -o outputfilename\n"
    103             "\n"
    104             "Reads the infiles with normalization data and\n"
    105             "creates a binary file, or a C source file (--csource), with the data,\n"
    106             "or writes a data file with the combined data (--combined).\n"
    107             "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
    108             "\n"
    109             "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
    110             "\n"
    111             "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
    112             "in input-file syntax to the outputfilename.\n"
    113             "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
    114             "(Useful for computing minimal incremental mapping data files.)\n"
    115             "\n",
    116             argv[0], argv[0]);
    117         fprintf(stderr,
    118             "Options:\n"
    119             "\t-h or -? or --help  this usage text\n"
    120             "\t-v or --verbose     verbose output\n"
    121             "\t-c or --copyright   include a copyright notice\n"
    122             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
    123         fprintf(stderr,
    124             "\t-s or --sourcedir   source directory, followed by the path\n"
    125             "\t-o or --output      output filename\n"
    126             "\t      --csource     writes a C source file with initializers\n"
    127             "\t      --combined    writes a .txt file (input-file syntax) with the\n"
    128             "\t                    combined data from all of the input files\n");
    129         fprintf(stderr,
    130             "\t      --fast        optimize the data for fast normalization,\n"
    131             "\t                    which might increase its size  (Writes fully decomposed\n"
    132             "\t                    regular mappings instead of delta mappings.\n"
    133             "\t                    You should measure the runtime speed to make sure that\n"
    134             "\t                    this is a good trade-off.)\n");
    135         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    136     }
    137 
    138     beVerbose=options[VERBOSE].doesOccur;
    139     haveCopyright=options[COPYRIGHT].doesOccur;
    140 
    141     IcuToolErrorCode errorCode("gennorm2/main()");
    142 
    143 #if UCONFIG_NO_NORMALIZATION
    144 
    145     fprintf(stderr,
    146         "gennorm2 writes a dummy binary data file "
    147         "because UCONFIG_NO_NORMALIZATION is set, \n"
    148         "see icu/source/common/unicode/uconfig.h\n");
    149     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
    150     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
    151     // return U_UNSUPPORTED_ERROR;
    152     return 0;
    153 
    154 #else
    155 
    156     LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
    157     LocalPointer<Normalizer2DataBuilder> b2;
    158     LocalPointer<Normalizer2DataBuilder> diff;
    159     Normalizer2DataBuilder *builder = b1.getAlias();
    160     errorCode.assertSuccess();
    161 
    162     if(options[UNICODE_VERSION].doesOccur) {
    163         builder->setUnicodeVersion(options[UNICODE_VERSION].value);
    164     }
    165 
    166     if(options[OPT_FAST].doesOccur) {
    167         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
    168     }
    169 
    170     // prepare the filename beginning with the source dir
    171     CharString filename(options[SOURCEDIR].value, errorCode);
    172     int32_t pathLength=filename.length();
    173     if( pathLength>0 &&
    174         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
    175         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
    176     ) {
    177         filename.append(U_FILE_SEP_CHAR, errorCode);
    178         pathLength=filename.length();
    179     }
    180 
    181     bool doMinus = false;
    182     for(int i=1; i<argc; ++i) {
    183         printf("gennorm2: processing %s\n", argv[i]);
    184         if(strcmp(argv[i], "minus") == 0) {
    185             if(doMinus) {
    186                 fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
    187                 exit(U_ILLEGAL_ARGUMENT_ERROR);
    188             }
    189             // Data from previous input files has been collected in b1.
    190             // Collect data from further input files in b2.
    191             b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
    192             diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
    193             errorCode.assertSuccess();
    194             builder = b2.getAlias();
    195             if(options[UNICODE_VERSION].doesOccur) {
    196                 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
    197             }
    198             if(options[OPT_FAST].doesOccur) {
    199                 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
    200             }
    201             doMinus = true;
    202             continue;
    203         }
    204         filename.append(argv[i], errorCode);
    205         std::ifstream f(filename.data());
    206         if(f.fail()) {
    207             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
    208             exit(U_FILE_ACCESS_ERROR);
    209         }
    210         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
    211         parseFile(f, *builder);
    212         filename.truncate(pathLength);
    213     }
    214 
    215     if(doMinus) {
    216         Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
    217         diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
    218     } else if(options[WRITE_COMBINED_DATA].doesOccur) {
    219         builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
    220     } else if(options[WRITE_C_SOURCE].doesOccur) {
    221         builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
    222     } else {
    223         builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
    224     }
    225 
    226     return errorCode.get();
    227 
    228 #endif
    229 }
    230 
    231 #if !UCONFIG_NO_NORMALIZATION
    232 
    233 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) {
    234     IcuToolErrorCode errorCode("gennorm2/parseFile()");
    235     std::string lineString;
    236     uint32_t startCP, endCP;
    237     while(std::getline(f, lineString)) {
    238         if (lineString.empty()) {
    239             continue;  // skip empty lines.
    240         }
    241 #if (U_CPLUSPLUS_VERSION >= 11)
    242         char *line = &lineString.front();
    243 #else
    244         char *line = &lineString.at(0);
    245 #endif
    246         char *comment=(char *)strchr(line, '#');
    247         if(comment!=NULL) {
    248             *comment=0;
    249         }
    250         u_rtrim(line);
    251         if(line[0]==0) {
    252             continue;  // skip empty and comment-only lines
    253         }
    254         if(line[0]=='*') {
    255             const char *s=u_skipWhitespace(line+1);
    256             if(0==strncmp(s, "Unicode", 7)) {
    257                 s=u_skipWhitespace(s+7);
    258                 builder.setUnicodeVersion(s);
    259             }
    260             continue;  // reserved syntax
    261         }
    262         const char *delimiter;
    263         int32_t rangeLength=
    264             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
    265         if(errorCode.isFailure()) {
    266             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
    267             exit(errorCode.reset());
    268         }
    269         delimiter=u_skipWhitespace(delimiter);
    270         if(*delimiter==':') {
    271             const char *s=u_skipWhitespace(delimiter+1);
    272             char *end;
    273             unsigned long value=strtoul(s, &end, 10);
    274             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
    275                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
    276                 exit(U_PARSE_ERROR);
    277             }
    278             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    279                 builder.setCC(c, (uint8_t)value);
    280             }
    281             continue;
    282         }
    283         if(*delimiter=='-') {
    284             if(*u_skipWhitespace(delimiter+1)!=0) {
    285                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
    286                 exit(U_PARSE_ERROR);
    287             }
    288             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    289                 builder.removeMapping(c);
    290             }
    291             continue;
    292         }
    293         if(*delimiter=='=' || *delimiter=='>') {
    294             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
    295             int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
    296             if(errorCode.isFailure()) {
    297                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
    298                 exit(errorCode.reset());
    299             }
    300             UnicodeString mapping(FALSE, uchars, length);
    301             if(*delimiter=='=') {
    302                 if(rangeLength!=1) {
    303                     fprintf(stderr,
    304                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
    305                             line);
    306                     exit(U_PARSE_ERROR);
    307                 }
    308                 builder.setRoundTripMapping((UChar32)startCP, mapping);
    309             } else {
    310                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    311                     builder.setOneWayMapping(c, mapping);
    312                 }
    313             }
    314             continue;
    315         }
    316         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
    317         exit(U_PARSE_ERROR);
    318     }
    319 }
    320 
    321 #endif // !UCONFIG_NO_NORMALIZATION
    322 
    323 U_NAMESPACE_END
    324 
    325 /*
    326  * Hey, Emacs, please set the following:
    327  *
    328  * Local Variables:
    329  * indent-tabs-mode: nil
    330  * End:
    331  *
    332  */
    333