Home | History | Annotate | Download | only in genprops
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2008, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  genprops.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999dec08
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This program reads several of the Unicode character database text files,
     17 *   parses them, and extracts most of the properties for each character.
     18 *   It then writes a binary file containing the properties
     19 *   that is designed to be used directly for random-access to
     20 *   the properties of each Unicode character.
     21 */
     22 
     23 #include <stdio.h>
     24 #include <stdlib.h>
     25 #include "unicode/utypes.h"
     26 #include "unicode/uchar.h"
     27 #include "unicode/putil.h"
     28 #include "unicode/uclean.h"
     29 #include "cmemory.h"
     30 #include "cstring.h"
     31 #include "unewdata.h"
     32 #include "uoptions.h"
     33 #include "uparse.h"
     34 #include "uprops.h"
     35 #include "propsvec.h"
     36 
     37 U_CDECL_BEGIN
     38 #include "genprops.h"
     39 U_CDECL_END
     40 
     41 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
     42 
     43 UBool beVerbose=FALSE, haveCopyright=TRUE;
     44 
     45 /* prototypes --------------------------------------------------------------- */
     46 
     47 static void
     48 parseDB(const char *filename, UErrorCode *pErrorCode);
     49 
     50 /* -------------------------------------------------------------------------- */
     51 
     52 enum
     53 {
     54     HELP_H,
     55     HELP_QUESTION_MARK,
     56     VERBOSE,
     57     COPYRIGHT,
     58     DESTDIR,
     59     SOURCEDIR,
     60     UNICODE_VERSION,
     61     ICUDATADIR,
     62     CSOURCE
     63 };
     64 
     65 /* Keep these values in sync with the above enums */
     66 static UOption options[]={
     67     UOPTION_HELP_H,
     68     UOPTION_HELP_QUESTION_MARK,
     69     UOPTION_VERBOSE,
     70     UOPTION_COPYRIGHT,
     71     UOPTION_DESTDIR,
     72     UOPTION_SOURCEDIR,
     73     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
     74     UOPTION_ICUDATADIR,
     75     UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
     76 };
     77 
     78 extern int
     79 main(int argc, char* argv[]) {
     80     char filename[300];
     81     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
     82     char *basename=NULL;
     83     UErrorCode errorCode=U_ZERO_ERROR;
     84 
     85     U_MAIN_INIT_ARGS(argc, argv);
     86 
     87     /* preset then read command line options */
     88     options[DESTDIR].value=u_getDataDirectory();
     89     options[SOURCEDIR].value="";
     90     options[UNICODE_VERSION].value="";
     91     options[ICUDATADIR].value=u_getDataDirectory();
     92     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
     93 
     94     /* error handling, printing usage message */
     95     if(argc<0) {
     96         fprintf(stderr,
     97             "error in command line argument \"%s\"\n",
     98             argv[-argc]);
     99     }
    100     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
    101         /*
    102          * Broken into chucks because the C89 standard says the minimum
    103          * required supported string length is 509 bytes.
    104          */
    105         fprintf(stderr,
    106             "Usage: %s [-options] [suffix]\n"
    107             "\n"
    108             "read the UnicodeData.txt file and other Unicode properties files and\n"
    109             "create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
    110             "\n",
    111             argv[0]);
    112         fprintf(stderr,
    113             "Options:\n"
    114             "\t-h or -? or --help  this usage text\n"
    115             "\t-v or --verbose     verbose output\n"
    116             "\t-c or --copyright   include a copyright notice\n"
    117             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
    118             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
    119         fprintf(stderr,
    120             "\t-d or --destdir     destination directory, followed by the path\n"
    121             "\t-s or --sourcedir   source directory, followed by the path\n"
    122             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
    123             "\t                    followed by path, defaults to %s\n"
    124             "\tsuffix              suffix that is to be appended with a '-'\n"
    125             "\t                    to the source file basenames before opening;\n"
    126             "\t                    'genprops new' will read UnicodeData-new.txt etc.\n",
    127             u_getDataDirectory());
    128         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    129     }
    130 
    131     /* get the options values */
    132     beVerbose=options[VERBOSE].doesOccur;
    133     haveCopyright=options[COPYRIGHT].doesOccur;
    134     srcDir=options[SOURCEDIR].value;
    135     destDir=options[DESTDIR].value;
    136 
    137     if(argc>=2) {
    138         suffix=argv[1];
    139     } else {
    140         suffix=NULL;
    141     }
    142 
    143     if(options[UNICODE_VERSION].doesOccur) {
    144         setUnicodeVersion(options[UNICODE_VERSION].value);
    145     }
    146     /* else use the default dataVersion in store.c */
    147 
    148     if (options[ICUDATADIR].doesOccur) {
    149         u_setDataDirectory(options[ICUDATADIR].value);
    150     }
    151 
    152     /* prepare the filename beginning with the source dir */
    153     uprv_strcpy(filename, srcDir);
    154     basename=filename+uprv_strlen(filename);
    155     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
    156         *basename++=U_FILE_SEP_CHAR;
    157     }
    158 
    159     /* initialize */
    160     initStore();
    161 
    162     /* process UnicodeData.txt */
    163     writeUCDFilename(basename, "UnicodeData", suffix);
    164     parseDB(filename, &errorCode);
    165 
    166     /* process additional properties files */
    167     *basename=0;
    168     generateAdditionalProperties(filename, suffix, &errorCode);
    169 
    170     /* process parsed data */
    171     if(U_SUCCESS(errorCode)) {
    172         /* write the properties data file */
    173         generateData(destDir, options[CSOURCE].doesOccur);
    174     }
    175 
    176     exitStore();
    177     u_cleanup();
    178     return errorCode;
    179 }
    180 
    181 U_CFUNC void
    182 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
    183     int32_t length=(int32_t)uprv_strlen(filename);
    184     uprv_strcpy(basename, filename);
    185     if(suffix!=NULL) {
    186         basename[length++]='-';
    187         uprv_strcpy(basename+length, suffix);
    188         length+=(int32_t)uprv_strlen(suffix);
    189     }
    190     uprv_strcpy(basename+length, ".txt");
    191 }
    192 
    193 U_CFUNC UBool
    194 isToken(const char *token, const char *s) {
    195     const char *z;
    196     int32_t j;
    197 
    198     s=u_skipWhitespace(s);
    199     for(j=0;; ++j) {
    200         if(token[j]!=0) {
    201             if(s[j]!=token[j]) {
    202                 break;
    203             }
    204         } else {
    205             z=u_skipWhitespace(s+j);
    206             if(*z==';' || *z==0) {
    207                 return TRUE;
    208             } else {
    209                 break;
    210             }
    211         }
    212     }
    213 
    214     return FALSE;
    215 }
    216 
    217 U_CFUNC int32_t
    218 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    219     const char *t, *z;
    220     int32_t i, j;
    221 
    222     s=u_skipWhitespace(s);
    223     for(i=0; i<countTokens; ++i) {
    224         t=tokens[i];
    225         if(t!=NULL) {
    226             for(j=0;; ++j) {
    227                 if(t[j]!=0) {
    228                     if(s[j]!=t[j]) {
    229                         break;
    230                     }
    231                 } else {
    232                     z=u_skipWhitespace(s+j);
    233                     if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
    234                         return i;
    235                     } else {
    236                         break;
    237                     }
    238                 }
    239             }
    240         }
    241     }
    242     return -1;
    243 }
    244 
    245 /* parser for UnicodeData.txt ----------------------------------------------- */
    246 
    247 /* general categories */
    248 const char *const
    249 genCategoryNames[U_CHAR_CATEGORY_COUNT]={
    250     "Cn",
    251     "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
    252     "Mc", "Nd", "Nl", "No",
    253     "Zs", "Zl", "Zp",
    254     "Cc", "Cf", "Co", "Cs",
    255     "Pd", "Ps", "Pe", "Pc", "Po",
    256     "Sm", "Sc", "Sk", "So",
    257     "Pi", "Pf"
    258 };
    259 
    260 const char *const
    261 decompositionTypeNames[U_DT_COUNT]={
    262     NULL,
    263     NULL,
    264     "compat",
    265     "circle",
    266     "final",
    267     "font",
    268     "fraction",
    269     "initial",
    270     "isolated",
    271     "medial",
    272     "narrow",
    273     "noBreak",
    274     "small",
    275     "square",
    276     "sub",
    277     "super",
    278     "vertical",
    279     "wide"
    280 };
    281 
    282 static struct {
    283     uint32_t first, last, props;
    284     char name[80];
    285 } unicodeAreas[32];
    286 
    287 static int32_t unicodeAreaIndex=0;
    288 
    289 static void U_CALLCONV
    290 unicodeDataLineFn(void *context,
    291                   char *fields[][2], int32_t fieldCount,
    292                   UErrorCode *pErrorCode) {
    293     Props p;
    294     char *end;
    295     static uint32_t prevCode=0;
    296     uint32_t value;
    297     int32_t i;
    298 
    299     /* reset the properties */
    300     uprv_memset(&p, 0, sizeof(Props));
    301 
    302     /* get the character code, field 0 */
    303     p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    304     if(end<=fields[0][0] || end!=fields[0][1]) {
    305         fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
    306         *pErrorCode=U_PARSE_ERROR;
    307         exit(U_PARSE_ERROR);
    308     }
    309 
    310     /* get general category, field 2 */
    311     i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
    312     if(i>=0) {
    313         p.generalCategory=(uint8_t)i;
    314     } else {
    315         fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
    316             fields[2][0], (unsigned long)p.code);
    317         *pErrorCode=U_PARSE_ERROR;
    318         exit(U_PARSE_ERROR);
    319     }
    320 
    321     /* get decomposition type, field 5 */
    322     if(fields[5][0]<fields[5][1]) {
    323         /* there is some decomposition */
    324         if(*fields[5][0]!='<') {
    325             /* canonical */
    326             i=U_DT_CANONICAL;
    327         } else {
    328             /* get compatibility type */
    329             end=fields[5][0]+1;
    330             while(end<fields[5][1] && *end!='>') {
    331                 ++end;
    332             }
    333             *end='#';
    334             i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
    335             if(i<0) {
    336                 fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
    337                     fields[5][0], (unsigned long)p.code);
    338                 *pErrorCode=U_PARSE_ERROR;
    339                 exit(U_PARSE_ERROR);
    340             }
    341         }
    342         upvec_setValue(pv, p.code, p.code, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode);
    343         if(U_FAILURE(*pErrorCode)) {
    344             fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
    345             exit(*pErrorCode);
    346         }
    347     }
    348 
    349     /* decimal digit value, field 6 */
    350     if(fields[6][0]<fields[6][1]) {
    351         value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
    352         if(end!=fields[6][1] || value>0x7fff) {
    353             fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
    354                 (unsigned long)p.code);
    355             *pErrorCode=U_PARSE_ERROR;
    356             exit(U_PARSE_ERROR);
    357         }
    358         p.numericValue=(int32_t)value;
    359         p.numericType=1;
    360     }
    361 
    362     /* digit value, field 7 */
    363     if(fields[7][0]<fields[7][1]) {
    364         value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
    365         if(end!=fields[7][1] || value>0x7fff) {
    366             fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
    367                 (unsigned long)p.code);
    368             *pErrorCode=U_PARSE_ERROR;
    369             exit(U_PARSE_ERROR);
    370         }
    371         if(p.numericType==0) {
    372             p.numericValue=(int32_t)value;
    373             p.numericType=2;
    374         } else if((int32_t)value!=p.numericValue) {
    375             fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
    376                 (unsigned long)p.code);
    377             *pErrorCode=U_PARSE_ERROR;
    378             exit(U_PARSE_ERROR);
    379         }
    380     }
    381 
    382     /* numeric value, field 8 */
    383     if(fields[8][0]<fields[8][1]) {
    384         char *s=fields[8][0];
    385         UBool isNegative;
    386 
    387         /* get a possible minus sign */
    388         if(*s=='-') {
    389             isNegative=TRUE;
    390             ++s;
    391         } else {
    392             isNegative=FALSE;
    393         }
    394 
    395         value=(uint32_t)uprv_strtoul(s, &end, 10);
    396         if(value>0 && *end=='/') {
    397             /* field 8 may contain a fractional value, get the denominator */
    398             if(p.numericType>0) {
    399                 fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
    400                     (unsigned long)p.code);
    401                 *pErrorCode=U_PARSE_ERROR;
    402                 exit(U_PARSE_ERROR);
    403             }
    404 
    405             p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
    406             if(p.denominator==0) {
    407                 fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
    408                     (unsigned long)p.code);
    409                 *pErrorCode=U_PARSE_ERROR;
    410                 exit(U_PARSE_ERROR);
    411             }
    412         }
    413         if(end!=fields[8][1] || value>0x7fffffff) {
    414             fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
    415                 (unsigned long)p.code);
    416             *pErrorCode=U_PARSE_ERROR;
    417             exit(U_PARSE_ERROR);
    418         }
    419 
    420         if(p.numericType==0) {
    421             if(isNegative) {
    422                 p.numericValue=-(int32_t)value;
    423             } else {
    424                 p.numericValue=(int32_t)value;
    425             }
    426             p.numericType=3;
    427         } else if((int32_t)value!=p.numericValue) {
    428             fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
    429                 (unsigned long)p.code);
    430             *pErrorCode=U_PARSE_ERROR;
    431             exit(U_PARSE_ERROR);
    432         }
    433     }
    434 
    435     value=makeProps(&p);
    436 
    437     if(*fields[1][0]=='<') {
    438         /* first or last entry of a Unicode area */
    439         size_t length=fields[1][1]-fields[1][0];
    440 
    441         if(length<9) {
    442             /* name too short for an area name */
    443         } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
    444             /* set the current area */
    445             if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
    446                 length-=9;
    447                 unicodeAreas[unicodeAreaIndex].first=p.code;
    448                 unicodeAreas[unicodeAreaIndex].props=value;
    449                 uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
    450                 unicodeAreas[unicodeAreaIndex].name[length]=0;
    451             } else {
    452                 /* error: a previous area is incomplete */
    453                 fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
    454                 *pErrorCode=U_PARSE_ERROR;
    455                 exit(U_PARSE_ERROR);
    456             }
    457             return;
    458         } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
    459             /* check that the current area matches, and complete it with the last code point */
    460             length-=8;
    461             if( unicodeAreas[unicodeAreaIndex].props==value &&
    462                 0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
    463                 unicodeAreas[unicodeAreaIndex].name[length]==0 &&
    464                 unicodeAreas[unicodeAreaIndex].first<p.code
    465             ) {
    466                 unicodeAreas[unicodeAreaIndex].last=p.code;
    467                 if(beVerbose) {
    468                     printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
    469                         (unsigned long)unicodeAreas[unicodeAreaIndex].first,
    470                         (unsigned long)unicodeAreas[unicodeAreaIndex].last,
    471                         unicodeAreas[unicodeAreaIndex].name);
    472                 }
    473                 unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
    474             } else {
    475                 /* error: different properties between first & last, different area name, first>=last */
    476                 fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
    477                 *pErrorCode=U_PARSE_ERROR;
    478                 exit(U_PARSE_ERROR);
    479             }
    480             return;
    481         } else {
    482             /* not an area name */
    483         }
    484     }
    485 
    486     /* check for non-character code points */
    487     if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
    488         fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
    489                 (unsigned long)p.code);
    490         *pErrorCode=U_PARSE_ERROR;
    491         exit(U_PARSE_ERROR);
    492     }
    493 
    494     /* check that the code points (p.code) are in ascending order */
    495     if(p.code<=prevCode && p.code>0) {
    496         fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
    497                 (unsigned long)p.code, (unsigned long)prevCode);
    498         *pErrorCode=U_PARSE_ERROR;
    499         exit(U_PARSE_ERROR);
    500     }
    501     prevCode=p.code;
    502 
    503     /* properties for a single code point */
    504     addProps(p.code, value);
    505 }
    506 
    507 /* set repeated properties for the areas */
    508 static void
    509 repeatAreaProps() {
    510     uint32_t puaProps;
    511     int32_t i;
    512     UBool hasPlane15PUA, hasPlane16PUA;
    513     UErrorCode errorCode;
    514 
    515     /*
    516      * UnicodeData.txt before 3.0.1 did not contain the PUAs on
    517      * planes 15 and 16.
    518      * If that is the case, then we add them here, using the properties
    519      * from the BMP PUA.
    520      */
    521     puaProps=0;
    522     hasPlane15PUA=hasPlane16PUA=FALSE;
    523 
    524     for(i=0; i<unicodeAreaIndex; ++i) {
    525         repeatProps(unicodeAreas[i].first,
    526                     unicodeAreas[i].last,
    527                     unicodeAreas[i].props);
    528         if(unicodeAreas[i].first==0xe000) {
    529             puaProps=unicodeAreas[i].props;
    530         } else if(unicodeAreas[i].first==0xf0000) {
    531             hasPlane15PUA=TRUE;
    532         } else if(unicodeAreas[i].first==0x100000) {
    533             hasPlane16PUA=TRUE;
    534         }
    535     }
    536 
    537     if(puaProps!=0) {
    538         if(!hasPlane15PUA) {
    539             repeatProps(0xf0000, 0xffffd, puaProps);
    540         }
    541         if(!hasPlane16PUA) {
    542             repeatProps(0x100000, 0x10fffd, puaProps);
    543         }
    544     }
    545 
    546     /* Hangul have canonical decompositions */
    547     errorCode=U_ZERO_ERROR;
    548     upvec_setValue(pv, 0xac00, 0xd7a3, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode);
    549     if(U_FAILURE(errorCode)) {
    550         fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
    551         exit(errorCode);
    552     }
    553 }
    554 
    555 static void
    556 parseDB(const char *filename, UErrorCode *pErrorCode) {
    557     char *fields[15][2];
    558 
    559     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    560         return;
    561     }
    562 
    563     /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
    564     unicodeAreas[0].first=0xffffffff;
    565 
    566     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
    567 
    568     if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
    569         fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
    570             unicodeAreas[unicodeAreaIndex].name,
    571             (unsigned long)unicodeAreas[unicodeAreaIndex].first);
    572         *pErrorCode=U_PARSE_ERROR;
    573         exit(U_PARSE_ERROR);
    574     }
    575 
    576     repeatAreaProps();
    577 
    578     if(U_FAILURE(*pErrorCode)) {
    579         return;
    580     }
    581 }
    582 
    583 /*
    584  * Hey, Emacs, please set the following:
    585  *
    586  * Local Variables:
    587  * indent-tabs-mode: nil
    588  * End:
    589  *
    590  */
    591