Home | History | Annotate | Download | only in gennorm
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2005, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  gennorm.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2001may25
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This program reads the Unicode character database text file,
     17 *   parses it, and extracts the data for normalization.
     18 *   It then preprocesses it and writes a binary file for efficient use
     19 *   in various Unicode text normalization processes.
     20 */
     21 
     22 #include <stdio.h>
     23 #include <stdlib.h>
     24 #include "unicode/utypes.h"
     25 #include "unicode/uchar.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/putil.h"
     28 #include "unicode/uclean.h"
     29 #include "unicode/udata.h"
     30 #include "unicode/uset.h"
     31 #include "cmemory.h"
     32 #include "cstring.h"
     33 #include "unewdata.h"
     34 #include "uoptions.h"
     35 #include "uparse.h"
     36 #include "unormimp.h"
     37 
     38 U_CDECL_BEGIN
     39 #include "gennorm.h"
     40 U_CDECL_END
     41 
     42 UBool beVerbose=FALSE, haveCopyright=TRUE;
     43 
     44 /* prototypes --------------------------------------------------------------- */
     45 
     46 static void
     47 parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError);
     48 
     49 static void
     50 parseDB(const char *filename, UErrorCode *pErrorCode);
     51 
     52 /* -------------------------------------------------------------------------- */
     53 
     54 enum {
     55     HELP_H,
     56     HELP_QUESTION_MARK,
     57     VERBOSE,
     58     COPYRIGHT,
     59     DESTDIR,
     60     SOURCEDIR,
     61     UNICODE_VERSION,
     62     ICUDATADIR,
     63     CSOURCE,
     64     STORE_FLAGS
     65 };
     66 
     67 static UOption options[]={
     68     UOPTION_HELP_H,
     69     UOPTION_HELP_QUESTION_MARK,
     70     UOPTION_VERBOSE,
     71     UOPTION_COPYRIGHT,
     72     UOPTION_DESTDIR,
     73     UOPTION_SOURCEDIR,
     74     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
     75     UOPTION_ICUDATADIR,
     76     UOPTION_DEF("csource", 'C', UOPT_NO_ARG),
     77     UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG)
     78 };
     79 
     80 extern int
     81 main(int argc, char* argv[]) {
     82 #if !UCONFIG_NO_NORMALIZATION
     83     char filename[300];
     84 #endif
     85     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
     86     char *basename=NULL;
     87     UErrorCode errorCode=U_ZERO_ERROR;
     88 
     89     U_MAIN_INIT_ARGS(argc, argv);
     90 
     91     /* preset then read command line options */
     92     options[4].value=u_getDataDirectory();
     93     options[5].value="";
     94     options[6].value="3.0.0";
     95     options[ICUDATADIR].value=u_getDataDirectory();
     96     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
     97 
     98     /* error handling, printing usage message */
     99     if(argc<0) {
    100         fprintf(stderr,
    101             "error in command line argument \"%s\"\n",
    102             argv[-argc]);
    103     }
    104     if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
    105         /*
    106          * Broken into chucks because the C89 standard says the minimum
    107          * required supported string length is 509 bytes.
    108          */
    109         fprintf(stderr,
    110             "Usage: %s [-options] [suffix]\n"
    111             "\n"
    112             "Read the UnicodeData.txt file and other Unicode properties files and\n"
    113             "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
    114             "\n",
    115             argv[0]);
    116         fprintf(stderr,
    117             "Options:\n"
    118             "\t-h or -? or --help  this usage text\n"
    119             "\t-v or --verbose     verbose output\n"
    120             "\t-c or --copyright   include a copyright notice\n"
    121             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
    122             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
    123         fprintf(stderr,
    124             "\t-p or --prune flags Prune for data modularization:\n"
    125             "\t                    Determine what data is to be stored.\n"
    126             "\t        0 (zero) stores minimal data (only for NFD)\n"
    127             "\t        lowercase letters turn off data, uppercase turn on (use with 0)\n");
    128         fprintf(stderr,
    129             "\t        k: compatibility decompositions (NFKC, NFKD)\n"
    130             "\t        c: composition data (NFC, NFKC)\n"
    131             "\t        f: FCD data (will be generated at load time)\n"
    132             "\t        a: auxiliary data (canonical closure etc.)\n"
    133             "\t        x: exclusion sets (Unicode 3.2-level normalization)\n");
    134         fprintf(stderr,
    135             "\t-d or --destdir     destination directory, followed by the path\n"
    136             "\t-s or --sourcedir   source directory, followed by the path\n"
    137             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
    138             "\t                    followed by path, defaults to <%s>\n"
    139             "\tsuffix              suffix that is to be appended with a '-'\n"
    140             "\t                    to the source file basenames before opening;\n"
    141             "\t                    'gennorm new' will read UnicodeData-new.txt etc.\n",
    142             u_getDataDirectory());
    143         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    144     }
    145 
    146     /* get the options values */
    147     beVerbose=options[2].doesOccur;
    148     haveCopyright=options[3].doesOccur;
    149     srcDir=options[5].value;
    150     destDir=options[4].value;
    151 
    152     if(argc>=2) {
    153         suffix=argv[1];
    154     } else {
    155         suffix=NULL;
    156     }
    157 
    158 #if UCONFIG_NO_NORMALIZATION
    159 
    160     fprintf(stderr,
    161         "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
    162         " because UCONFIG_NO_NORMALIZATION is set, \n"
    163         "see icu/source/common/unicode/uconfig.h\n");
    164     generateData(destDir, options[CSOURCE].doesOccur);
    165 
    166 #else
    167 
    168     setUnicodeVersion(options[6].value);
    169 
    170     if (options[ICUDATADIR].doesOccur) {
    171         u_setDataDirectory(options[ICUDATADIR].value);
    172     }
    173 
    174     if(options[STORE_FLAGS].doesOccur) {
    175         const char *s=options[STORE_FLAGS].value;
    176         char c;
    177 
    178         while((c=*s++)!=0) {
    179             switch(c) {
    180             case '0':
    181                 gStoreFlags=0;  /* store minimal data (only for NFD) */
    182                 break;
    183 
    184             /* lowercase letters: omit data */
    185             case 'k':
    186                 gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPAT);
    187                 break;
    188             case 'c':
    189                 gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPOSITION);
    190                 break;
    191             case 'f':
    192                 gStoreFlags&=~U_MASK(UGENNORM_STORE_FCD);
    193                 break;
    194             case 'a':
    195                 gStoreFlags&=~U_MASK(UGENNORM_STORE_AUX);
    196                 break;
    197             case 'x':
    198                 gStoreFlags&=~U_MASK(UGENNORM_STORE_EXCLUSIONS);
    199                 break;
    200 
    201             /* uppercase letters: include data (use with 0) */
    202             case 'K':
    203                 gStoreFlags|=U_MASK(UGENNORM_STORE_COMPAT);
    204                 break;
    205             case 'C':
    206                 gStoreFlags|=U_MASK(UGENNORM_STORE_COMPOSITION);
    207                 break;
    208             case 'F':
    209                 gStoreFlags|=U_MASK(UGENNORM_STORE_FCD);
    210                 break;
    211             case 'A':
    212                 gStoreFlags|=U_MASK(UGENNORM_STORE_AUX);
    213                 break;
    214             case 'X':
    215                 gStoreFlags|=U_MASK(UGENNORM_STORE_EXCLUSIONS);
    216                 break;
    217 
    218             default:
    219                 fprintf(stderr, "ignoring undefined prune flag '%c'\n", c);
    220                 break;
    221             }
    222         }
    223     }
    224 
    225     /*
    226      * Verify that we can work with properties
    227      * but don't call u_init() because that needs unorm.icu which we are just
    228      * going to build here.
    229      */
    230     {
    231         U_STRING_DECL(ideo, "[:Ideographic:]", 15);
    232         USet *set;
    233 
    234         U_STRING_INIT(ideo, "[:Ideographic:]", 15);
    235         set=uset_openPattern(ideo, -1, &errorCode);
    236         if(U_FAILURE(errorCode) || !uset_contains(set, 0xf900)) {
    237             fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode));
    238             exit(errorCode);
    239         }
    240         uset_close(set);
    241     }
    242 
    243     /* prepare the filename beginning with the source dir */
    244     uprv_strcpy(filename, srcDir);
    245     basename=filename+uprv_strlen(filename);
    246     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
    247         *basename++=U_FILE_SEP_CHAR;
    248     }
    249 
    250     /* initialize */
    251     init();
    252 
    253     /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
    254     if(suffix==NULL) {
    255         uprv_strcpy(basename, "DerivedNormalizationProps.txt");
    256     } else {
    257         uprv_strcpy(basename, "DerivedNormalizationProps");
    258         basename[30]='-';
    259         uprv_strcpy(basename+31, suffix);
    260         uprv_strcat(basename+31, ".txt");
    261     }
    262     parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
    263     if(U_FAILURE(errorCode)) {
    264         /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
    265         if(suffix==NULL) {
    266             uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
    267         } else {
    268             uprv_strcpy(basename, "DerivedNormalizationProperties");
    269             basename[30]='-';
    270             uprv_strcpy(basename+31, suffix);
    271             uprv_strcat(basename+31, ".txt");
    272         }
    273         parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
    274     }
    275 
    276     /* process UnicodeData.txt */
    277     if(suffix==NULL) {
    278         uprv_strcpy(basename, "UnicodeData.txt");
    279     } else {
    280         uprv_strcpy(basename, "UnicodeData");
    281         basename[11]='-';
    282         uprv_strcpy(basename+12, suffix);
    283         uprv_strcat(basename+12, ".txt");
    284     }
    285     parseDB(filename, &errorCode);
    286 
    287     /* process parsed data */
    288     if(U_SUCCESS(errorCode)) {
    289         processData();
    290 
    291         /* write the properties data file */
    292         generateData(destDir, options[CSOURCE].doesOccur);
    293 
    294         cleanUpData();
    295     }
    296 
    297 #endif
    298 
    299     return errorCode;
    300 }
    301 
    302 #if !UCONFIG_NO_NORMALIZATION
    303 
    304 /* parser for DerivedNormalizationProperties.txt ---------------------------- */
    305 
    306 static void U_CALLCONV
    307 derivedNormalizationPropertiesLineFn(void *context,
    308                                      char *fields[][2], int32_t fieldCount,
    309                                      UErrorCode *pErrorCode) {
    310     UChar string[32];
    311     char *s;
    312     uint32_t start, end;
    313     int32_t count;
    314     uint8_t qcFlags;
    315 
    316     /* get code point range */
    317     count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    318     if(U_FAILURE(*pErrorCode)) {
    319         fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
    320         exit(*pErrorCode);
    321     }
    322 
    323     /* ignore hangul - handle explicitly */
    324     if(start==0xac00) {
    325         return;
    326     }
    327 
    328     /* get property - ignore unrecognized ones */
    329     s=(char *)u_skipWhitespace(fields[1][0]);
    330     if(*s=='N' && s[1]=='F') {
    331         /* quick check flag */
    332         qcFlags=0x11;
    333         s+=2;
    334         if(*s=='K') {
    335             qcFlags<<=1;
    336             ++s;
    337         }
    338 
    339         if(*s=='C' && s[1]=='_') {
    340             s+=2;
    341         } else if(*s=='D' && s[1]=='_') {
    342             qcFlags<<=2;
    343             s+=2;
    344         } else {
    345             return;
    346         }
    347 
    348         if(0==uprv_strncmp(s, "NO", 2)) {
    349             qcFlags&=0xf;
    350         } else if(0==uprv_strncmp(s, "MAYBE", 5)) {
    351             qcFlags&=0x30;
    352         } else if(0==uprv_strncmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') {
    353             /*
    354              * Unicode 4.0.1:
    355              * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
    356              */
    357             /* start of the field */
    358             s=(char *)u_skipWhitespace(s+1);
    359             if(*s=='N') {
    360                 qcFlags&=0xf;
    361             } else if(*s=='M') {
    362                 qcFlags&=0x30;
    363             } else {
    364                 return; /* do nothing for "Yes" because it's the default value */
    365             }
    366         } else {
    367             return; /* do nothing for "Yes" because it's the default value */
    368         }
    369 
    370         /* set this flag for all code points in this range */
    371         while(start<=end) {
    372             setQCFlags(start++, qcFlags);
    373         }
    374     } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
    375         /* full composition exclusion */
    376         while(start<=end) {
    377             setCompositionExclusion(start++);
    378         }
    379     } else if(
    380         ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') ||
    381         (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';'))
    382 
    383     ) {
    384         /* FC_NFKC_Closure, parse field 2 to get the string */
    385         char *t;
    386 
    387         /* start of the field */
    388         s=(char *)u_skipWhitespace(s+1);
    389 
    390         /* find the end of the field */
    391         for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {}
    392         *t=0;
    393 
    394         string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
    395         if(U_FAILURE(*pErrorCode)) {
    396             fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
    397             exit(*pErrorCode);
    398         }
    399         while(start<=end) {
    400             setFNC(start++, string);
    401         }
    402     }
    403 }
    404 
    405 static void
    406 parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) {
    407     char *fields[2][2];
    408 
    409     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    410         return;
    411     }
    412 
    413     u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
    414     if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
    415         fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
    416         exit(*pErrorCode);
    417     }
    418 }
    419 
    420 /* parser for UnicodeData.txt ----------------------------------------------- */
    421 
    422 static void U_CALLCONV
    423 unicodeDataLineFn(void *context,
    424                   char *fields[][2], int32_t fieldCount,
    425                   UErrorCode *pErrorCode) {
    426     uint32_t decomp[40];
    427     Norm norm;
    428     const char *s;
    429     char *end;
    430     uint32_t code, value;
    431     int32_t length;
    432     UBool isCompat, something=FALSE;
    433 
    434     /* ignore First and Last entries for ranges */
    435     if( *fields[1][0]=='<' &&
    436         (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
    437         (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
    438     ) {
    439         return;
    440     }
    441 
    442     /* reset the properties */
    443     uprv_memset(&norm, 0, sizeof(Norm));
    444 
    445     /*
    446      * The combiningIndex must not be initialized to 0 because 0 is the
    447      * combiningIndex of the first forward-combining character.
    448      */
    449     norm.combiningIndex=0xffff;
    450 
    451     /* get the character code, field 0 */
    452     code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    453     if(end<=fields[0][0] || end!=fields[0][1]) {
    454         fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
    455         *pErrorCode=U_PARSE_ERROR;
    456         exit(U_PARSE_ERROR);
    457     }
    458 
    459     /* get canonical combining class, field 3 */
    460     value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
    461     if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
    462         fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
    463         *pErrorCode=U_PARSE_ERROR;
    464         exit(U_PARSE_ERROR);
    465     }
    466     if(value>0) {
    467         norm.udataCC=(uint8_t)value;
    468         something=TRUE;
    469     }
    470 
    471     /* get the decomposition, field 5 */
    472     if(fields[5][0]<fields[5][1]) {
    473         if(*(s=fields[5][0])=='<') {
    474             ++s;
    475             isCompat=TRUE;
    476 
    477             /* skip and ignore the compatibility type name */
    478             do {
    479                 if(s==fields[5][1]) {
    480                     /* missing '>' */
    481                     fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
    482                     *pErrorCode=U_PARSE_ERROR;
    483                     exit(U_PARSE_ERROR);
    484                 }
    485             } while(*s++!='>');
    486         } else {
    487             isCompat=FALSE;
    488         }
    489 
    490         /* parse the decomposition string */
    491         length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
    492         if(U_FAILURE(*pErrorCode)) {
    493             fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
    494                     (long)code, u_errorName(*pErrorCode));
    495             exit(*pErrorCode);
    496         }
    497 
    498         /* store the string */
    499         if(length>0) {
    500             something=TRUE;
    501             if(isCompat) {
    502                 norm.lenNFKD=(uint8_t)length;
    503                 norm.nfkd=decomp;
    504             } else {
    505                 if(length>2) {
    506                     fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
    507                             (long)code, (long)length);
    508                     *pErrorCode=U_PARSE_ERROR;
    509                     exit(U_PARSE_ERROR);
    510                 }
    511                 norm.lenNFD=(uint8_t)length;
    512                 norm.nfd=decomp;
    513             }
    514         }
    515     }
    516 
    517     /* check for non-character code points */
    518     if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) {
    519         fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
    520                 (long)code);
    521         *pErrorCode=U_PARSE_ERROR;
    522         exit(U_PARSE_ERROR);
    523     }
    524 
    525     if(something) {
    526         /* there are normalization values, so store them */
    527 #if 0
    528         if(beVerbose) {
    529             printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
    530                    (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
    531         }
    532 #endif
    533         storeNorm(code, &norm);
    534     }
    535 }
    536 
    537 static void
    538 parseDB(const char *filename, UErrorCode *pErrorCode) {
    539     char *fields[15][2];
    540 
    541     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    542         return;
    543     }
    544 
    545     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
    546     if(U_FAILURE(*pErrorCode)) {
    547         fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
    548         exit(*pErrorCode);
    549     }
    550 }
    551 
    552 #endif /* #if !UCONFIG_NO_NORMALIZATION */
    553 
    554 /*
    555  * Hey, Emacs, please set the following:
    556  *
    557  * Local Variables:
    558  * indent-tabs-mode: nil
    559  * End:
    560  *
    561  */
    562