Home | History | Annotate | Download | only in gennorm2
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2009-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  gennorm2.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009nov25
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This program reads text files that define Unicode normalization,
     17 *   parses them, and builds a binary data file.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "n2builder.h"
     22 
     23 #include <stdio.h>
     24 #include <stdlib.h>
     25 #include <string.h>
     26 #include "unicode/errorcode.h"
     27 #include "unicode/localpointer.h"
     28 #include "unicode/putil.h"
     29 #include "unicode/uchar.h"
     30 #include "unicode/unistr.h"
     31 #include "charstr.h"
     32 #include "normalizer2impl.h"
     33 #include "toolutil.h"
     34 #include "uoptions.h"
     35 #include "uparse.h"
     36 
     37 #if UCONFIG_NO_NORMALIZATION
     38 #include "unewdata.h"
     39 #endif
     40 
     41 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     42 
     43 U_NAMESPACE_BEGIN
     44 
     45 UBool beVerbose=FALSE, haveCopyright=TRUE;
     46 
// Defines LocalStdioFilePointer, a local-pointer wrapper type for FILE *
// that is paired with fclose. main() uses it to hold each opened input file.
// NOTE(review): presumably the wrapper calls fclose when it goes out of
// scope -- confirm against unicode/localpointer.h.
U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
     48 
#if !UCONFIG_NO_NORMALIZATION
// Forward declaration; defined after main(). Reads the normalization data
// lines from f and passes the parsed values to builder.
void parseFile(FILE *f, Normalizer2DataBuilder &builder);
#endif
     52 
     53 /* -------------------------------------------------------------------------- */
     54 
     55 enum {
     56     HELP_H,
     57     HELP_QUESTION_MARK,
     58     VERBOSE,
     59     COPYRIGHT,
     60     SOURCEDIR,
     61     OUTPUT_FILENAME,
     62     UNICODE_VERSION,
     63     OPT_FAST
     64 };
     65 
// Command line option table handed to u_parseArgs() in main().
// Entries are accessed through the enum constants above, so the entry order
// must stay in sync with that enum.
static UOption options[]={
    UOPTION_HELP_H,                                     // HELP_H
    UOPTION_HELP_QUESTION_MARK,                         // HELP_QUESTION_MARK
    UOPTION_VERBOSE,                                    // VERBOSE
    UOPTION_COPYRIGHT,                                  // COPYRIGHT
    UOPTION_SOURCEDIR,                                  // SOURCEDIR
    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),      // OUTPUT_FILENAME
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),     // UNICODE_VERSION
    UOPTION_DEF("fast", '\1', UOPT_NO_ARG)              // OPT_FAST
};
     76 
     77 extern "C" int
     78 main(int argc, char* argv[]) {
     79     U_MAIN_INIT_ARGS(argc, argv);
     80 
     81     /* preset then read command line options */
     82     options[SOURCEDIR].value="";
     83     options[UNICODE_VERSION].value=U_UNICODE_VERSION;
     84     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
     85 
     86     /* error handling, printing usage message */
     87     if(argc<0) {
     88         fprintf(stderr,
     89             "error in command line argument \"%s\"\n",
     90             argv[-argc]);
     91     }
     92     if(!options[OUTPUT_FILENAME].doesOccur) {
     93         argc=-1;
     94     }
     95     if( argc<2 ||
     96         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
     97     ) {
     98         /*
     99          * Broken into chunks because the C89 standard says the minimum
    100          * required supported string length is 509 bytes.
    101          */
    102         fprintf(stderr,
    103             "Usage: %s [-options] infiles+ -o outputfilename\n"
    104             "\n"
    105             "Reads the infiles with normalization data and\n"
    106             "creates a binary file (outputfilename) with the data.\n"
    107             "\n",
    108             argv[0]);
    109         fprintf(stderr,
    110             "Options:\n"
    111             "\t-h or -? or --help  this usage text\n"
    112             "\t-v or --verbose     verbose output\n"
    113             "\t-c or --copyright   include a copyright notice\n"
    114             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
    115         fprintf(stderr,
    116             "\t-s or --sourcedir   source directory, followed by the path\n"
    117             "\t-o or --output      output filename\n");
    118         fprintf(stderr,
    119             "\t      --fast        optimize the .nrm file for fast normalization,\n"
    120             "\t                    which might increase its size  (Writes fully decomposed\n"
    121             "\t                    regular mappings instead of delta mappings.\n"
    122             "\t                    You should measure the runtime speed to make sure that\n"
    123             "\t                    this is a good trade-off.)\n");
    124         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    125     }
    126 
    127     beVerbose=options[VERBOSE].doesOccur;
    128     haveCopyright=options[COPYRIGHT].doesOccur;
    129 
    130     IcuToolErrorCode errorCode("gennorm2/main()");
    131 
    132 #if UCONFIG_NO_NORMALIZATION
    133 
    134     fprintf(stderr,
    135         "gennorm2 writes a dummy binary data file "
    136         "because UCONFIG_NO_NORMALIZATION is set, \n"
    137         "see icu/source/common/unicode/uconfig.h\n");
    138     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
    139     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
    140     // return U_UNSUPPORTED_ERROR;
    141     return 0;
    142 
    143 #else
    144 
    145     LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
    146     errorCode.assertSuccess();
    147 
    148     builder->setUnicodeVersion(options[UNICODE_VERSION].value);
    149 
    150     if(options[OPT_FAST].doesOccur) {
    151         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
    152     }
    153 
    154     // prepare the filename beginning with the source dir
    155     CharString filename(options[SOURCEDIR].value, errorCode);
    156     int32_t pathLength=filename.length();
    157     if( pathLength>0 &&
    158         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
    159         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
    160     ) {
    161         filename.append(U_FILE_SEP_CHAR, errorCode);
    162         pathLength=filename.length();
    163     }
    164 
    165     for(int i=1; i<argc; ++i) {
    166         printf("gennorm2: processing %s\n", argv[i]);
    167         filename.append(argv[i], errorCode);
    168         LocalStdioFilePointer f(fopen(filename.data(), "r"));
    169         if(f==NULL) {
    170             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
    171             exit(U_FILE_ACCESS_ERROR);
    172         }
    173         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
    174         parseFile(f.getAlias(), *builder);
    175         filename.truncate(pathLength);
    176     }
    177 
    178     builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
    179 
    180     return errorCode.get();
    181 
    182 #endif
    183 }
    184 
    185 #if !UCONFIG_NO_NORMALIZATION
    186 
    187 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
    188     IcuToolErrorCode errorCode("gennorm2/parseFile()");
    189     char line[300];
    190     uint32_t startCP, endCP;
    191     while(NULL!=fgets(line, (int)sizeof(line), f)) {
    192         char *comment=(char *)strchr(line, '#');
    193         if(comment!=NULL) {
    194             *comment=0;
    195         }
    196         u_rtrim(line);
    197         if(line[0]==0) {
    198             continue;  // skip empty and comment-only lines
    199         }
    200         if(line[0]=='*') {
    201             continue;  // reserved syntax
    202         }
    203         const char *delimiter;
    204         int32_t rangeLength=
    205             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
    206         if(errorCode.isFailure()) {
    207             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
    208             exit(errorCode.reset());
    209         }
    210         delimiter=u_skipWhitespace(delimiter);
    211         if(*delimiter==':') {
    212             const char *s=u_skipWhitespace(delimiter+1);
    213             char *end;
    214             unsigned long value=strtoul(s, &end, 10);
    215             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
    216                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
    217                 exit(U_PARSE_ERROR);
    218             }
    219             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    220                 builder.setCC(c, (uint8_t)value);
    221             }
    222             continue;
    223         }
    224         if(*delimiter=='-') {
    225             if(*u_skipWhitespace(delimiter+1)!=0) {
    226                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
    227                 exit(U_PARSE_ERROR);
    228             }
    229             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    230                 builder.removeMapping(c);
    231             }
    232             continue;
    233         }
    234         if(*delimiter=='=' || *delimiter=='>') {
    235             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
    236             int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
    237             if(errorCode.isFailure()) {
    238                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
    239                 exit(errorCode.reset());
    240             }
    241             UnicodeString mapping(FALSE, uchars, length);
    242             if(*delimiter=='=') {
    243                 if(rangeLength!=1) {
    244                     fprintf(stderr,
    245                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
    246                             line);
    247                     exit(U_PARSE_ERROR);
    248                 }
    249                 builder.setRoundTripMapping((UChar32)startCP, mapping);
    250             } else {
    251                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    252                     builder.setOneWayMapping(c, mapping);
    253                 }
    254             }
    255             continue;
    256         }
    257         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
    258         exit(U_PARSE_ERROR);
    259     }
    260 }
    261 
    262 #endif // !UCONFIG_NO_NORMALIZATION
    263 
    264 U_NAMESPACE_END
    265 
    266 /*
    267  * Hey, Emacs, please set the following:
    268  *
    269  * Local Variables:
    270  * indent-tabs-mode: nil
    271  * End:
    272  *
    273  */
    274