Home | History | Annotate | Download | only in gennorm2
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2009-2014, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  gennorm2.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009nov25
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This program reads text files that define Unicode normalization,
     17 *   parses them, and builds a binary data file.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "n2builder.h"
     22 
     23 #include <stdio.h>
     24 #include <stdlib.h>
     25 #include <string.h>
     26 #include "unicode/errorcode.h"
     27 #include "unicode/localpointer.h"
     28 #include "unicode/putil.h"
     29 #include "unicode/uchar.h"
     30 #include "unicode/unistr.h"
     31 #include "charstr.h"
     32 #include "normalizer2impl.h"
     33 #include "toolutil.h"
     34 #include "uoptions.h"
     35 #include "uparse.h"
     36 
     37 #if UCONFIG_NO_NORMALIZATION
     38 #include "unewdata.h"
     39 #endif
     40 
     41 U_NAMESPACE_BEGIN
     42 
     43 UBool beVerbose=FALSE, haveCopyright=TRUE;
     44 
     45 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
     46 
     47 #if !UCONFIG_NO_NORMALIZATION
     48 void parseFile(FILE *f, Normalizer2DataBuilder &builder);
     49 #endif
     50 
     51 /* -------------------------------------------------------------------------- */
     52 
     53 enum {
     54     HELP_H,
     55     HELP_QUESTION_MARK,
     56     VERBOSE,
     57     COPYRIGHT,
     58     SOURCEDIR,
     59     OUTPUT_FILENAME,
     60     UNICODE_VERSION,
     61     WRITE_C_SOURCE,
     62     OPT_FAST
     63 };
     64 
     65 static UOption options[]={
     66     UOPTION_HELP_H,
     67     UOPTION_HELP_QUESTION_MARK,
     68     UOPTION_VERBOSE,
     69     UOPTION_COPYRIGHT,
     70     UOPTION_SOURCEDIR,
     71     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
     72     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
     73     UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
     74     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
     75 };
     76 
     77 extern "C" int
     78 main(int argc, char* argv[]) {
     79     U_MAIN_INIT_ARGS(argc, argv);
     80 
     81     /* preset then read command line options */
     82     options[SOURCEDIR].value="";
     83     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
     84 
     85     /* error handling, printing usage message */
     86     if(argc<0) {
     87         fprintf(stderr,
     88             "error in command line argument \"%s\"\n",
     89             argv[-argc]);
     90     }
     91     if(!options[OUTPUT_FILENAME].doesOccur) {
     92         argc=-1;
     93     }
     94     if( argc<2 ||
     95         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
     96     ) {
     97         /*
     98          * Broken into chunks because the C89 standard says the minimum
     99          * required supported string length is 509 bytes.
    100          */
    101         fprintf(stderr,
    102             "Usage: %s [-options] infiles+ -o outputfilename\n"
    103             "\n"
    104             "Reads the infiles with normalization data and\n"
    105             "creates a binary or C source file (outputfilename) with the data.\n"
    106             "\n",
    107             argv[0]);
    108         fprintf(stderr,
    109             "Options:\n"
    110             "\t-h or -? or --help  this usage text\n"
    111             "\t-v or --verbose     verbose output\n"
    112             "\t-c or --copyright   include a copyright notice\n"
    113             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
    114         fprintf(stderr,
    115             "\t-s or --sourcedir   source directory, followed by the path\n"
    116             "\t-o or --output      output filename\n"
    117             "\t      --csource     writes a C source file with initializers\n");
    118         fprintf(stderr,
    119             "\t      --fast        optimize the data for fast normalization,\n"
    120             "\t                    which might increase its size  (Writes fully decomposed\n"
    121             "\t                    regular mappings instead of delta mappings.\n"
    122             "\t                    You should measure the runtime speed to make sure that\n"
    123             "\t                    this is a good trade-off.)\n");
    124         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    125     }
    126 
    127     beVerbose=options[VERBOSE].doesOccur;
    128     haveCopyright=options[COPYRIGHT].doesOccur;
    129 
    130     IcuToolErrorCode errorCode("gennorm2/main()");
    131 
    132 #if UCONFIG_NO_NORMALIZATION
    133 
    134     fprintf(stderr,
    135         "gennorm2 writes a dummy binary data file "
    136         "because UCONFIG_NO_NORMALIZATION is set, \n"
    137         "see icu/source/common/unicode/uconfig.h\n");
    138     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
    139     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
    140     // return U_UNSUPPORTED_ERROR;
    141     return 0;
    142 
    143 #else
    144 
    145     LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
    146     errorCode.assertSuccess();
    147 
    148     if(options[UNICODE_VERSION].doesOccur) {
    149         builder->setUnicodeVersion(options[UNICODE_VERSION].value);
    150     }
    151 
    152     if(options[OPT_FAST].doesOccur) {
    153         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
    154     }
    155 
    156     // prepare the filename beginning with the source dir
    157     CharString filename(options[SOURCEDIR].value, errorCode);
    158     int32_t pathLength=filename.length();
    159     if( pathLength>0 &&
    160         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
    161         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
    162     ) {
    163         filename.append(U_FILE_SEP_CHAR, errorCode);
    164         pathLength=filename.length();
    165     }
    166 
    167     for(int i=1; i<argc; ++i) {
    168         printf("gennorm2: processing %s\n", argv[i]);
    169         filename.append(argv[i], errorCode);
    170         LocalStdioFilePointer f(fopen(filename.data(), "r"));
    171         if(f==NULL) {
    172             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
    173             exit(U_FILE_ACCESS_ERROR);
    174         }
    175         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
    176         parseFile(f.getAlias(), *builder);
    177         filename.truncate(pathLength);
    178     }
    179 
    180     if(options[WRITE_C_SOURCE].doesOccur) {
    181         builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
    182     } else {
    183         builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
    184     }
    185 
    186     return errorCode.get();
    187 
    188 #endif
    189 }
    190 
    191 #if !UCONFIG_NO_NORMALIZATION
    192 
    193 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
    194     IcuToolErrorCode errorCode("gennorm2/parseFile()");
    195     char line[300];
    196     uint32_t startCP, endCP;
    197     while(NULL!=fgets(line, (int)sizeof(line), f)) {
    198         char *comment=(char *)strchr(line, '#');
    199         if(comment!=NULL) {
    200             *comment=0;
    201         }
    202         u_rtrim(line);
    203         if(line[0]==0) {
    204             continue;  // skip empty and comment-only lines
    205         }
    206         if(line[0]=='*') {
    207             const char *s=u_skipWhitespace(line+1);
    208             if(0==strncmp(s, "Unicode", 7)) {
    209                 s=u_skipWhitespace(s+7);
    210                 builder.setUnicodeVersion(s);
    211             }
    212             continue;  // reserved syntax
    213         }
    214         const char *delimiter;
    215         int32_t rangeLength=
    216             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
    217         if(errorCode.isFailure()) {
    218             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
    219             exit(errorCode.reset());
    220         }
    221         delimiter=u_skipWhitespace(delimiter);
    222         if(*delimiter==':') {
    223             const char *s=u_skipWhitespace(delimiter+1);
    224             char *end;
    225             unsigned long value=strtoul(s, &end, 10);
    226             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
    227                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
    228                 exit(U_PARSE_ERROR);
    229             }
    230             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    231                 builder.setCC(c, (uint8_t)value);
    232             }
    233             continue;
    234         }
    235         if(*delimiter=='-') {
    236             if(*u_skipWhitespace(delimiter+1)!=0) {
    237                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
    238                 exit(U_PARSE_ERROR);
    239             }
    240             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    241                 builder.removeMapping(c);
    242             }
    243             continue;
    244         }
    245         if(*delimiter=='=' || *delimiter=='>') {
    246             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
    247             int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
    248             if(errorCode.isFailure()) {
    249                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
    250                 exit(errorCode.reset());
    251             }
    252             UnicodeString mapping(FALSE, uchars, length);
    253             if(*delimiter=='=') {
    254                 if(rangeLength!=1) {
    255                     fprintf(stderr,
    256                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
    257                             line);
    258                     exit(U_PARSE_ERROR);
    259                 }
    260                 builder.setRoundTripMapping((UChar32)startCP, mapping);
    261             } else {
    262                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    263                     builder.setOneWayMapping(c, mapping);
    264                 }
    265             }
    266             continue;
    267         }
    268         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
    269         exit(U_PARSE_ERROR);
    270     }
    271 }
    272 
    273 #endif // !UCONFIG_NO_NORMALIZATION
    274 
    275 U_NAMESPACE_END
    276 
    277 /*
    278  * Hey, Emacs, please set the following:
    279  *
    280  * Local Variables:
    281  * indent-tabs-mode: nil
    282  * End:
    283  *
    284  */
    285