Home | History | Annotate | Download | only in gennorm2
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2009-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  gennorm2.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009nov25
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This program reads text files that define Unicode normalization,
     17 *   parses them, and builds a binary data file.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "n2builder.h"
     22 
     23 #include <stdio.h>
     24 #include <stdlib.h>
     25 #include <string.h>
     26 #include "unicode/errorcode.h"
     27 #include "unicode/localpointer.h"
     28 #include "unicode/putil.h"
     29 #include "unicode/uchar.h"
     30 #include "unicode/unistr.h"
     31 #include "charstr.h"
     32 #include "normalizer2impl.h"
     33 #include "toolutil.h"
     34 #include "uoptions.h"
     35 #include "uparse.h"
     36 
     37 #if UCONFIG_NO_NORMALIZATION
     38 #include "unewdata.h"
     39 #endif
     40 
     41 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     42 
     43 U_NAMESPACE_BEGIN
     44 
     45 UBool beVerbose=FALSE, haveCopyright=TRUE;
     46 
     47 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
     48 
     49 #if !UCONFIG_NO_NORMALIZATION
     50 void parseFile(FILE *f, Normalizer2DataBuilder &builder);
     51 #endif
     52 
     53 /* -------------------------------------------------------------------------- */
     54 
     55 enum {
     56     HELP_H,
     57     HELP_QUESTION_MARK,
     58     VERBOSE,
     59     COPYRIGHT,
     60     SOURCEDIR,
     61     OUTPUT_FILENAME,
     62     UNICODE_VERSION,
     63     OPT_FAST
     64 };
     65 
     66 static UOption options[]={
     67     UOPTION_HELP_H,
     68     UOPTION_HELP_QUESTION_MARK,
     69     UOPTION_VERBOSE,
     70     UOPTION_COPYRIGHT,
     71     UOPTION_SOURCEDIR,
     72     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
     73     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
     74     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
     75 };
     76 
     77 extern "C" int
     78 main(int argc, char* argv[]) {
     79     U_MAIN_INIT_ARGS(argc, argv);
     80 
     81     /* preset then read command line options */
     82     options[SOURCEDIR].value="";
     83     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
     84 
     85     /* error handling, printing usage message */
     86     if(argc<0) {
     87         fprintf(stderr,
     88             "error in command line argument \"%s\"\n",
     89             argv[-argc]);
     90     }
     91     if(!options[OUTPUT_FILENAME].doesOccur) {
     92         argc=-1;
     93     }
     94     if( argc<2 ||
     95         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
     96     ) {
     97         /*
     98          * Broken into chunks because the C89 standard says the minimum
     99          * required supported string length is 509 bytes.
    100          */
    101         fprintf(stderr,
    102             "Usage: %s [-options] infiles+ -o outputfilename\n"
    103             "\n"
    104             "Reads the infiles with normalization data and\n"
    105             "creates a binary file (outputfilename) with the data.\n"
    106             "\n",
    107             argv[0]);
    108         fprintf(stderr,
    109             "Options:\n"
    110             "\t-h or -? or --help  this usage text\n"
    111             "\t-v or --verbose     verbose output\n"
    112             "\t-c or --copyright   include a copyright notice\n"
    113             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
    114         fprintf(stderr,
    115             "\t-s or --sourcedir   source directory, followed by the path\n"
    116             "\t-o or --output      output filename\n");
    117         fprintf(stderr,
    118             "\t      --fast        optimize the .nrm file for fast normalization,\n"
    119             "\t                    which might increase its size  (Writes fully decomposed\n"
    120             "\t                    regular mappings instead of delta mappings.\n"
    121             "\t                    You should measure the runtime speed to make sure that\n"
    122             "\t                    this is a good trade-off.)\n");
    123         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    124     }
    125 
    126     beVerbose=options[VERBOSE].doesOccur;
    127     haveCopyright=options[COPYRIGHT].doesOccur;
    128 
    129     IcuToolErrorCode errorCode("gennorm2/main()");
    130 
    131 #if UCONFIG_NO_NORMALIZATION
    132 
    133     fprintf(stderr,
    134         "gennorm2 writes a dummy binary data file "
    135         "because UCONFIG_NO_NORMALIZATION is set, \n"
    136         "see icu/source/common/unicode/uconfig.h\n");
    137     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
    138     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
    139     // return U_UNSUPPORTED_ERROR;
    140     return 0;
    141 
    142 #else
    143 
    144     LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
    145     errorCode.assertSuccess();
    146 
    147     if(options[UNICODE_VERSION].doesOccur) {
    148         builder->setUnicodeVersion(options[UNICODE_VERSION].value);
    149     }
    150 
    151     if(options[OPT_FAST].doesOccur) {
    152         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
    153     }
    154 
    155     // prepare the filename beginning with the source dir
    156     CharString filename(options[SOURCEDIR].value, errorCode);
    157     int32_t pathLength=filename.length();
    158     if( pathLength>0 &&
    159         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
    160         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
    161     ) {
    162         filename.append(U_FILE_SEP_CHAR, errorCode);
    163         pathLength=filename.length();
    164     }
    165 
    166     for(int i=1; i<argc; ++i) {
    167         printf("gennorm2: processing %s\n", argv[i]);
    168         filename.append(argv[i], errorCode);
    169         LocalStdioFilePointer f(fopen(filename.data(), "r"));
    170         if(f==NULL) {
    171             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
    172             exit(U_FILE_ACCESS_ERROR);
    173         }
    174         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
    175         parseFile(f.getAlias(), *builder);
    176         filename.truncate(pathLength);
    177     }
    178 
    179     builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
    180 
    181     return errorCode.get();
    182 
    183 #endif
    184 }
    185 
    186 #if !UCONFIG_NO_NORMALIZATION
    187 
    188 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
    189     IcuToolErrorCode errorCode("gennorm2/parseFile()");
    190     char line[300];
    191     uint32_t startCP, endCP;
    192     while(NULL!=fgets(line, (int)sizeof(line), f)) {
    193         char *comment=(char *)strchr(line, '#');
    194         if(comment!=NULL) {
    195             *comment=0;
    196         }
    197         u_rtrim(line);
    198         if(line[0]==0) {
    199             continue;  // skip empty and comment-only lines
    200         }
    201         if(line[0]=='*') {
    202             const char *s=u_skipWhitespace(line+1);
    203             if(0==strncmp(s, "Unicode", 7)) {
    204                 s=u_skipWhitespace(s+7);
    205                 builder.setUnicodeVersion(s);
    206             }
    207             continue;  // reserved syntax
    208         }
    209         const char *delimiter;
    210         int32_t rangeLength=
    211             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
    212         if(errorCode.isFailure()) {
    213             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
    214             exit(errorCode.reset());
    215         }
    216         delimiter=u_skipWhitespace(delimiter);
    217         if(*delimiter==':') {
    218             const char *s=u_skipWhitespace(delimiter+1);
    219             char *end;
    220             unsigned long value=strtoul(s, &end, 10);
    221             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
    222                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
    223                 exit(U_PARSE_ERROR);
    224             }
    225             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    226                 builder.setCC(c, (uint8_t)value);
    227             }
    228             continue;
    229         }
    230         if(*delimiter=='-') {
    231             if(*u_skipWhitespace(delimiter+1)!=0) {
    232                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
    233                 exit(U_PARSE_ERROR);
    234             }
    235             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    236                 builder.removeMapping(c);
    237             }
    238             continue;
    239         }
    240         if(*delimiter=='=' || *delimiter=='>') {
    241             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
    242             int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
    243             if(errorCode.isFailure()) {
    244                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
    245                 exit(errorCode.reset());
    246             }
    247             UnicodeString mapping(FALSE, uchars, length);
    248             if(*delimiter=='=') {
    249                 if(rangeLength!=1) {
    250                     fprintf(stderr,
    251                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
    252                             line);
    253                     exit(U_PARSE_ERROR);
    254                 }
    255                 builder.setRoundTripMapping((UChar32)startCP, mapping);
    256             } else {
    257                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
    258                     builder.setOneWayMapping(c, mapping);
    259                 }
    260             }
    261             continue;
    262         }
    263         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
    264         exit(U_PARSE_ERROR);
    265     }
    266 }
    267 
    268 #endif // !UCONFIG_NO_NORMALIZATION
    269 
    270 U_NAMESPACE_END
    271 
    272 /*
    273  * Hey, Emacs, please set the following:
    274  *
    275  * Local Variables:
    276  * indent-tabs-mode: nil
    277  * End:
    278  *
    279  */
    280