Home | History | Annotate | Download | only in gensprep
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2003-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  gensprep.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2003-02-06
     14 *   created by: Ram Viswanadha
     15 *
     16 *   This program reads the Profile.txt files,
     17 *   parses them, and extracts the data for StringPrep profile.
     18 *   It then preprocesses it and writes a binary file for efficient use
     19 *   in various StringPrep conversion processes.
     20 */
     21 
     22 #define USPREP_TYPE_NAMES_ARRAY 1
     23 
     24 #include <stdio.h>
     25 #include <stdlib.h>
     26 
     27 #include "cmemory.h"
     28 #include "cstring.h"
     29 #include "unewdata.h"
     30 #include "uoptions.h"
     31 #include "uparse.h"
     32 #include "sprpimpl.h"
     33 
     34 #include "unicode/uclean.h"
     35 #include "unicode/udata.h"
     36 #include "unicode/utypes.h"
     37 #include "unicode/putil.h"
     38 
     39 
     40 U_CDECL_BEGIN
     41 #include "gensprep.h"
     42 U_CDECL_END
     43 
/*
 * Tool-wide flags. main() overwrites both from the command line:
 * beVerbose from the verbose option, haveCopyright from the copyright option.
 * They have external linkage; presumably other gensprep source files read
 * them -- confirm against gensprep.h.
 */
UBool beVerbose=FALSE, haveCopyright=TRUE;

/* File name that main() appends to the normalization-corrections directory. */
#define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt"

/*
 * Names of the '@' directives recognized in the first field of a profile
 * line, together with their lengths in characters (used for prefix matching
 * in strprepProfileLineFn()).
 */
#define NORMALIZE_DIRECTIVE "normalize"
#define NORMALIZE_DIRECTIVE_LEN 9
#define CHECK_BIDI_DIRECTIVE "check-bidi"
#define CHECK_BIDI_DIRECTIVE_LEN 10

/* prototypes --------------------------------------------------------------- */

/* Parses a ';'-delimited StringPrep profile file with 3 fields per line. */
static void
parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode);

/* Parses a ';'-delimited NormalizationCorrections.txt file with 4 fields per line. */
static void
parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode);
     60 
     61 
     62 /* -------------------------------------------------------------------------- */
     63 
/*
 * Command-line option table handed to u_parseArgs() in main().
 * The entries are addressed by position through the anonymous enum below,
 * so the order here and the order of the enum constants must stay identical.
 */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    UOPTION_ICUDATADIR,
    UOPTION_BUNDLE_NAME,
    /* -n: force normalization; the argument is the directory of NormalizationCorrections.txt */
    { "normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0 },
    /* -m: directory of NormalizationCorrections.txt, used when normalization is not forced with -n */
    { "norm-correction", NULL, NULL, NULL, 'm', UOPT_REQUIRES_ARG, 0 },
    /* -k: turn on the BiDi check flag */
    { "check-bidi", NULL, NULL, NULL,  'k', UOPT_NO_ARG, 0},
    /* -u: Unicode version string; main() prints the usage text and stops when it is absent */
    { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
};

/* Indexes into options[]; one constant per table entry, in table order. */
enum{
    HELP,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    ICUDATADIR,
    BUNDLE_NAME,
    NORMALIZE,
    NORM_CORRECTION_DIR,
    CHECK_BIDI,
    UNICODE_VERSION
};
     93 
     94 static int printHelp(int argc, char* argv[]){
     95     /*
     96      * Broken into chucks because the C89 standard says the minimum
     97      * required supported string length is 509 bytes.
     98      */
     99     fprintf(stderr,
    100         "Usage: %s [-options] [file_name]\n"
    101         "\n"
    102         "Read the files specified and\n"
    103         "create a binary file [package-name]_[bundle-name]." DATA_TYPE " with the StringPrep profile data\n"
    104         "\n",
    105         argv[0]);
    106     fprintf(stderr,
    107         "Options:\n"
    108         "\t-h or -? or --help       print this usage text\n"
    109         "\t-v or --verbose          verbose output\n"
    110         "\t-c or --copyright        include a copyright notice\n");
    111     fprintf(stderr,
    112         "\t-d or --destdir          destination directory, followed by the path\n"
    113         "\t-s or --sourcedir        source directory of ICU data, followed by the path\n"
    114         "\t-b or --bundle-name      generate the ouput data file with the name specified\n"
    115         "\t-i or --icudatadir       directory for locating any needed intermediate data files,\n"
    116         "\t                         followed by path, defaults to %s\n",
    117         u_getDataDirectory());
    118     fprintf(stderr,
    119         "\t-n or --normalize        turn on the option for normalization and include mappings\n"
    120         "\t                         from NormalizationCorrections.txt from the given path,\n"
    121         "\t                         e.g: /test/icu/source/data/unidata\n");
    122     fprintf(stderr,
    123         "\t-m or --norm-correction  use NormalizationCorrections.txt from the given path\n"
    124         "\t                         when the input file contains a normalization directive.\n"
    125         "\t                         unlike -n/--normalize, this option does not force the\n"
    126         "\t                         normalization.\n");
    127     fprintf(stderr,
    128         "\t-k or --check-bidi       turn on the option for checking for BiDi in the profile\n"
    129         "\t-u or --unicode          version of Unicode to be used with this profile followed by the version\n"
    130         );
    131     return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    132 }
    133 
    134 
    135 extern int
    136 main(int argc, char* argv[]) {
    137 #if !UCONFIG_NO_IDNA
    138     char* filename = NULL;
    139 #endif
    140     const char *srcDir=NULL, *destDir=NULL, *icuUniDataDir=NULL;
    141     const char *bundleName=NULL, *inputFileName = NULL;
    142     char *basename=NULL;
    143     int32_t sprepOptions = 0;
    144 
    145     UErrorCode errorCode=U_ZERO_ERROR;
    146 
    147     U_MAIN_INIT_ARGS(argc, argv);
    148 
    149     /* preset then read command line options */
    150     options[DESTDIR].value=u_getDataDirectory();
    151     options[SOURCEDIR].value="";
    152     options[UNICODE_VERSION].value="0"; /* don't assume the unicode version */
    153     options[BUNDLE_NAME].value = DATA_NAME;
    154     options[NORMALIZE].value = "";
    155 
    156     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    157 
    158     /* error handling, printing usage message */
    159     if(argc<0) {
    160         fprintf(stderr,
    161             "error in command line argument \"%s\"\n",
    162             argv[-argc]);
    163     }
    164     if(argc<0 || options[HELP].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
    165         return printHelp(argc, argv);
    166 
    167     }
    168 
    169     /* get the options values */
    170     beVerbose=options[VERBOSE].doesOccur;
    171     haveCopyright=options[COPYRIGHT].doesOccur;
    172     srcDir=options[SOURCEDIR].value;
    173     destDir=options[DESTDIR].value;
    174     bundleName = options[BUNDLE_NAME].value;
    175     if(options[NORMALIZE].doesOccur) {
    176         icuUniDataDir = options[NORMALIZE].value;
    177     } else {
    178         icuUniDataDir = options[NORM_CORRECTION_DIR].value;
    179     }
    180 
    181     if(argc<2) {
    182         /* print the help message */
    183         return printHelp(argc, argv);
    184     } else {
    185         inputFileName = argv[1];
    186     }
    187     if(!options[UNICODE_VERSION].doesOccur){
    188         return printHelp(argc, argv);
    189     }
    190     if(options[ICUDATADIR].doesOccur) {
    191         u_setDataDirectory(options[ICUDATADIR].value);
    192     }
    193 #if UCONFIG_NO_IDNA
    194 
    195     fprintf(stderr,
    196         "gensprep writes dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
    197         " because UCONFIG_NO_IDNA is set, \n"
    198         "see icu/source/common/unicode/uconfig.h\n");
    199     generateData(destDir, bundleName);
    200 
    201 #else
    202 
    203     setUnicodeVersion(options[UNICODE_VERSION].value);
    204     filename = (char* ) uprv_malloc(uprv_strlen(srcDir) + 300); /* hopefully this should be enough */
    205 
    206     /* prepare the filename beginning with the source dir */
    207     if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL && uprv_strchr(srcDir,U_FILE_ALT_SEP_CHAR) == NULL){
    208         filename[0] = '.';
    209         filename[1] = U_FILE_SEP_CHAR;
    210         uprv_strcpy(filename+2,srcDir);
    211     }else{
    212         uprv_strcpy(filename, srcDir);
    213     }
    214 
    215     basename=filename+uprv_strlen(filename);
    216     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
    217         *basename++=U_FILE_SEP_CHAR;
    218     }
    219 
    220     /* initialize */
    221     init();
    222 
    223     /* process the file */
    224     uprv_strcpy(basename,inputFileName);
    225     parseMappings(filename,FALSE, &errorCode);
    226     if(U_FAILURE(errorCode)) {
    227         fprintf(stderr, "Could not open file %s for reading. Error: %s \n", filename, u_errorName(errorCode));
    228         return errorCode;
    229     }
    230 
    231     if(options[NORMALIZE].doesOccur){ /* this option might be set by @normalize;; in the source file */
    232         /* set up directory for NormalizationCorrections.txt */
    233         uprv_strcpy(filename,icuUniDataDir);
    234         basename=filename+uprv_strlen(filename);
    235         if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
    236             *basename++=U_FILE_SEP_CHAR;
    237         }
    238 
    239         *basename++=U_FILE_SEP_CHAR;
    240         uprv_strcpy(basename,NORM_CORRECTIONS_FILE_NAME);
    241 
    242         parseNormalizationCorrections(filename,&errorCode);
    243         if(U_FAILURE(errorCode)){
    244             fprintf(stderr,"Could not open file %s for reading \n", filename);
    245             return errorCode;
    246         }
    247         sprepOptions |= _SPREP_NORMALIZATION_ON;
    248     }
    249 
    250     if(options[CHECK_BIDI].doesOccur){ /* this option might be set by @check-bidi;; in the source file */
    251         sprepOptions |= _SPREP_CHECK_BIDI_ON;
    252     }
    253 
    254     setOptions(sprepOptions);
    255 
    256     /* process parsed data */
    257     if(U_SUCCESS(errorCode)) {
    258         /* write the data file */
    259         generateData(destDir, bundleName);
    260 
    261         cleanUpData();
    262     }
    263 
    264     uprv_free(filename);
    265 
    266     u_cleanup();
    267 
    268 #endif
    269 
    270     return errorCode;
    271 }
    272 
    273 #if !UCONFIG_NO_IDNA
    274 
    275 static void U_CALLCONV
    276 normalizationCorrectionsLineFn(void *context,
    277                     char *fields[][2], int32_t fieldCount,
    278                     UErrorCode *pErrorCode) {
    279     uint32_t mapping[40];
    280     char *end, *s;
    281     uint32_t code;
    282     int32_t length;
    283     UVersionInfo version;
    284     UVersionInfo thisVersion;
    285 
    286     /* get the character code, field 0 */
    287     code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    288     if(U_FAILURE(*pErrorCode)) {
    289         fprintf(stderr, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields[0][0]);
    290         exit(*pErrorCode);
    291     }
    292     /* Original (erroneous) decomposition */
    293     s = fields[1][0];
    294 
    295     /* parse the mapping string */
    296     length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode);
    297 
    298     /* ignore corrected decomposition */
    299 
    300     u_versionFromString(version,fields[3][0] );
    301     u_versionFromString(thisVersion, "3.2.0");
    302 
    303 
    304 
    305     if(U_FAILURE(*pErrorCode)) {
    306         fprintf(stderr, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n",
    307                 (long)code, u_errorName(*pErrorCode));
    308         exit(*pErrorCode);
    309     }
    310 
    311     /* store the mapping */
    312     if( version[0] > thisVersion[0] ||
    313         ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1]))
    314         ){
    315         storeMapping(code,mapping, length, USPREP_MAP, pErrorCode);
    316     }
    317     setUnicodeVersionNC(version);
    318 }
    319 
    320 static void
    321 parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) {
    322     char *fields[4][2];
    323 
    324     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    325         return;
    326     }
    327 
    328     u_parseDelimitedFile(filename, ';', fields, 4, normalizationCorrectionsLineFn, NULL, pErrorCode);
    329 
    330     /* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */
    331 
    332     if(U_FAILURE(*pErrorCode) && ( *pErrorCode!=U_FILE_ACCESS_ERROR)) {
    333         fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
    334         exit(*pErrorCode);
    335     }
    336 }
    337 
    338 static void U_CALLCONV
    339 strprepProfileLineFn(void *context,
    340               char *fields[][2], int32_t fieldCount,
    341               UErrorCode *pErrorCode) {
    342     uint32_t mapping[40];
    343     char *end, *map;
    344     uint32_t code;
    345     int32_t length;
    346    /*UBool* mapWithNorm = (UBool*) context;*/
    347     const char* typeName;
    348     uint32_t rangeStart=0,rangeEnd =0;
    349     const char* filename = (const char*) context;
    350     const char *s;
    351 
    352     s = u_skipWhitespace(fields[0][0]);
    353     if (*s == '@') {
    354         /* special directive */
    355         s++;
    356         length = fields[0][1] - s;
    357         if (length >= NORMALIZE_DIRECTIVE_LEN
    358             && uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) {
    359             options[NORMALIZE].doesOccur = TRUE;
    360             return;
    361         }
    362         else if (length >= CHECK_BIDI_DIRECTIVE_LEN
    363             && uprv_strncmp(s, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) {
    364             options[CHECK_BIDI].doesOccur = TRUE;
    365             return;
    366         }
    367         else {
    368             fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]);
    369         }
    370     }
    371 
    372     typeName = fields[2][0];
    373     map = fields[1][0];
    374 
    375     if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){
    376 
    377         u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);
    378         if(U_FAILURE(*pErrorCode)){
    379             fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
    380             return;
    381         }
    382 
    383         /* store the range */
    384         storeRange(rangeStart,rangeEnd,USPREP_UNASSIGNED, pErrorCode);
    385 
    386     }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){
    387 
    388         u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);
    389         if(U_FAILURE(*pErrorCode)){
    390             fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
    391             return;
    392         }
    393 
    394         /* store the range */
    395         storeRange(rangeStart,rangeEnd,USPREP_PROHIBITED, pErrorCode);
    396 
    397     }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){
    398 
    399         /* get the character code, field 0 */
    400         code=(uint32_t)uprv_strtoul(s, &end, 16);
    401         if(end<=s || end!=fields[0][1]) {
    402             fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]);
    403             *pErrorCode=U_PARSE_ERROR;
    404             exit(U_PARSE_ERROR);
    405         }
    406 
    407         /* parse the mapping string */
    408         length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode);
    409 
    410         /* store the mapping */
    411         storeMapping(code,mapping, length,USPREP_MAP, pErrorCode);
    412 
    413     }else{
    414         *pErrorCode = U_INVALID_FORMAT_ERROR;
    415     }
    416 
    417     if(U_FAILURE(*pErrorCode)) {
    418         fprintf(stderr, "gensprep error parsing  %s line %s at %s. Error: %s\n",filename,
    419                fields[0][0],fields[2][0],u_errorName(*pErrorCode));
    420         exit(*pErrorCode);
    421     }
    422 
    423 }
    424 
    425 static void
    426 parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode) {
    427     char *fields[3][2];
    428 
    429     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    430         return;
    431     }
    432 
    433     u_parseDelimitedFile(filename, ';', fields, 3, strprepProfileLineFn, (void*)filename, pErrorCode);
    434 
    435     /*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/
    436 
    437     if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
    438         fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
    439         exit(*pErrorCode);
    440     }
    441 }
    442 
    443 
    444 #endif /* #if !UCONFIG_NO_IDNA */
    445 
    446 /*
    447  * Hey, Emacs, please set the following:
    448  *
    449  * Local Variables:
    450  * indent-tabs-mode: nil
    451  * End:
    452  *
    453  */
    454