1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2003-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: gensprep.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2003-02-06 14 * created by: Ram Viswanadha 15 * 16 * This program reads the Profile.txt files, 17 * parses them, and extracts the data for StringPrep profile. 18 * It then preprocesses it and writes a binary file for efficient use 19 * in various StringPrep conversion processes. 20 */ 21 22 #define USPREP_TYPE_NAMES_ARRAY 1 23 24 #include <stdio.h> 25 #include <stdlib.h> 26 27 #include "cmemory.h" 28 #include "cstring.h" 29 #include "unewdata.h" 30 #include "uoptions.h" 31 #include "uparse.h" 32 #include "sprpimpl.h" 33 34 #include "unicode/udata.h" 35 #include "unicode/utypes.h" 36 #include "unicode/putil.h" 37 38 39 U_CDECL_BEGIN 40 #include "gensprep.h" 41 U_CDECL_END 42 43 UBool beVerbose=FALSE, haveCopyright=TRUE; 44 45 #define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt" 46 47 #define NORMALIZE_DIRECTIVE "normalize" 48 #define NORMALIZE_DIRECTIVE_LEN 9 49 #define CHECK_BIDI_DIRECTIVE "check-bidi" 50 #define CHECK_BIDI_DIRECTIVE_LEN 10 51 52 /* prototypes --------------------------------------------------------------- */ 53 54 static void 55 parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode); 56 57 static void 58 parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode); 59 60 61 /* -------------------------------------------------------------------------- */ 62 63 static UOption options[]={ 64 UOPTION_HELP_H, 65 UOPTION_HELP_QUESTION_MARK, 66 UOPTION_VERBOSE, 67 UOPTION_COPYRIGHT, 68 UOPTION_DESTDIR, 69 UOPTION_SOURCEDIR, 70 UOPTION_ICUDATADIR, 71 UOPTION_BUNDLE_NAME, 72 { "normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0 }, 73 { "norm-correction", NULL, NULL, NULL, 'm', UOPT_REQUIRES_ARG, 0 }, 74 { "check-bidi", NULL, NULL, NULL, 'k', UOPT_NO_ARG, 0}, 75 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, 76 }; 77 78 enum{ 79 HELP, 80 HELP_QUESTION_MARK, 81 VERBOSE, 82 COPYRIGHT, 83 DESTDIR, 84 SOURCEDIR, 85 ICUDATADIR, 86 BUNDLE_NAME, 87 NORMALIZE, 88 NORM_CORRECTION_DIR, 89 CHECK_BIDI, 90 UNICODE_VERSION 91 }; 92 93 static int printHelp(int argc, char* argv[]){ 94 /* 95 * Broken into chucks because the C89 standard says the minimum 96 * required supported string length is 509 bytes. 97 */ 98 fprintf(stderr, 99 "Usage: %s [-options] [file_name]\n" 100 "\n" 101 "Read the files specified and\n" 102 "create a binary file [package-name]_[bundle-name]." DATA_TYPE " with the StringPrep profile data\n" 103 "\n", 104 argv[0]); 105 fprintf(stderr, 106 "Options:\n" 107 "\t-h or -? or --help print this usage text\n" 108 "\t-v or --verbose verbose output\n" 109 "\t-c or --copyright include a copyright notice\n"); 110 fprintf(stderr, 111 "\t-d or --destdir destination directory, followed by the path\n" 112 "\t-s or --sourcedir source directory of ICU data, followed by the path\n" 113 "\t-b or --bundle-name generate the ouput data file with the name specified\n" 114 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 115 "\t followed by path, defaults to %s\n", 116 u_getDataDirectory()); 117 fprintf(stderr, 118 "\t-n or --normalize turn on the option for normalization and include mappings\n" 119 "\t from NormalizationCorrections.txt from the given path,\n" 120 "\t e.g: /test/icu/source/data/unidata\n"); 121 fprintf(stderr, 122 "\t-m or --norm-correction use NormalizationCorrections.txt from the given path\n" 123 "\t when the input file contains a normalization directive.\n" 124 "\t unlike -n/--normalize, this option does not force the\n" 125 "\t normalization.\n"); 126 fprintf(stderr, 127 "\t-k or --check-bidi turn on the option for checking for BiDi in the profile\n" 128 "\t-u or --unicode version of Unicode to be used with this profile followed by the version\n" 129 ); 130 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 131 } 132 133 134 extern int 135 main(int argc, char* argv[]) { 136 #if !UCONFIG_NO_IDNA 137 char* filename = NULL; 138 #endif 139 const char *srcDir=NULL, *destDir=NULL, *icuUniDataDir=NULL; 140 const char *bundleName=NULL, *inputFileName = NULL; 141 char *basename=NULL; 142 int32_t sprepOptions = 0; 143 144 UErrorCode errorCode=U_ZERO_ERROR; 145 146 U_MAIN_INIT_ARGS(argc, argv); 147 148 /* preset then read command line options */ 149 options[DESTDIR].value=u_getDataDirectory(); 150 options[SOURCEDIR].value=""; 151 options[UNICODE_VERSION].value="0"; /* don't assume the unicode version */ 152 options[BUNDLE_NAME].value = DATA_NAME; 153 options[NORMALIZE].value = ""; 154 155 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 156 157 /* error handling, printing usage message */ 158 if(argc<0) { 159 fprintf(stderr, 160 "error in command line argument \"%s\"\n", 161 argv[-argc]); 162 } 163 if(argc<0 || options[HELP].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { 164 return printHelp(argc, argv); 165 166 } 167 168 /* get the options values */ 169 beVerbose=options[VERBOSE].doesOccur; 170 haveCopyright=options[COPYRIGHT].doesOccur; 171 srcDir=options[SOURCEDIR].value; 172 destDir=options[DESTDIR].value; 173 bundleName = options[BUNDLE_NAME].value; 174 if(options[NORMALIZE].doesOccur) { 175 icuUniDataDir = options[NORMALIZE].value; 176 } else { 177 icuUniDataDir = options[NORM_CORRECTION_DIR].value; 178 } 179 180 if(argc<2) { 181 /* print the help message */ 182 return printHelp(argc, argv); 183 } else { 184 inputFileName = argv[1]; 185 } 186 if(!options[UNICODE_VERSION].doesOccur){ 187 return printHelp(argc, argv); 188 } 189 if(options[ICUDATADIR].doesOccur) { 190 u_setDataDirectory(options[ICUDATADIR].value); 191 } 192 #if UCONFIG_NO_IDNA 193 194 fprintf(stderr, 195 "gensprep writes dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE 196 " because UCONFIG_NO_IDNA is set, \n" 197 "see icu/source/common/unicode/uconfig.h\n"); 198 generateData(destDir, bundleName); 199 200 #else 201 202 setUnicodeVersion(options[UNICODE_VERSION].value); 203 filename = (char* ) uprv_malloc(uprv_strlen(srcDir) + 300); /* hopefully this should be enough */ 204 205 /* prepare the filename beginning with the source dir */ 206 if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL && uprv_strchr(srcDir,U_FILE_ALT_SEP_CHAR) == NULL){ 207 filename[0] = '.'; 208 filename[1] = U_FILE_SEP_CHAR; 209 uprv_strcpy(filename+2,srcDir); 210 }else{ 211 uprv_strcpy(filename, srcDir); 212 } 213 214 basename=filename+uprv_strlen(filename); 215 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { 216 *basename++=U_FILE_SEP_CHAR; 217 } 218 219 /* initialize */ 220 init(); 221 222 /* process the file */ 223 uprv_strcpy(basename,inputFileName); 224 parseMappings(filename,FALSE, &errorCode); 225 if(U_FAILURE(errorCode)) { 226 fprintf(stderr, "Could not open file %s for reading. Error: %s \n", filename, u_errorName(errorCode)); 227 return errorCode; 228 } 229 230 if(options[NORMALIZE].doesOccur){ /* this option might be set by @normalize;; in the source file */ 231 /* set up directory for NormalizationCorrections.txt */ 232 uprv_strcpy(filename,icuUniDataDir); 233 basename=filename+uprv_strlen(filename); 234 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { 235 *basename++=U_FILE_SEP_CHAR; 236 } 237 238 *basename++=U_FILE_SEP_CHAR; 239 uprv_strcpy(basename,NORM_CORRECTIONS_FILE_NAME); 240 241 parseNormalizationCorrections(filename,&errorCode); 242 if(U_FAILURE(errorCode)){ 243 fprintf(stderr,"Could not open file %s for reading \n", filename); 244 return errorCode; 245 } 246 sprepOptions |= _SPREP_NORMALIZATION_ON; 247 } 248 249 if(options[CHECK_BIDI].doesOccur){ /* this option might be set by @check-bidi;; in the source file */ 250 sprepOptions |= _SPREP_CHECK_BIDI_ON; 251 } 252 253 setOptions(sprepOptions); 254 255 /* process parsed data */ 256 if(U_SUCCESS(errorCode)) { 257 /* write the data file */ 258 generateData(destDir, bundleName); 259 260 cleanUpData(); 261 } 262 263 uprv_free(filename); 264 265 #endif 266 267 return errorCode; 268 } 269 270 #if !UCONFIG_NO_IDNA 271 272 static void U_CALLCONV 273 normalizationCorrectionsLineFn(void *context, 274 char *fields[][2], int32_t fieldCount, 275 UErrorCode *pErrorCode) { 276 uint32_t mapping[40]; 277 char *end, *s; 278 uint32_t code; 279 int32_t length; 280 UVersionInfo version; 281 UVersionInfo thisVersion; 282 283 /* get the character code, field 0 */ 284 code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); 285 if(U_FAILURE(*pErrorCode)) { 286 fprintf(stderr, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields[0][0]); 287 exit(*pErrorCode); 288 } 289 /* Original (erroneous) decomposition */ 290 s = fields[1][0]; 291 292 /* parse the mapping string */ 293 length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode); 294 295 /* ignore corrected decomposition */ 296 297 u_versionFromString(version,fields[3][0] ); 298 u_versionFromString(thisVersion, "3.2.0"); 299 300 301 302 if(U_FAILURE(*pErrorCode)) { 303 fprintf(stderr, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n", 304 (long)code, u_errorName(*pErrorCode)); 305 exit(*pErrorCode); 306 } 307 308 /* store the mapping */ 309 if( version[0] > thisVersion[0] || 310 ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1])) 311 ){ 312 storeMapping(code,mapping, length, USPREP_MAP, pErrorCode); 313 } 314 setUnicodeVersionNC(version); 315 } 316 317 static void 318 parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) { 319 char *fields[4][2]; 320 321 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 322 return; 323 } 324 325 u_parseDelimitedFile(filename, ';', fields, 4, normalizationCorrectionsLineFn, NULL, pErrorCode); 326 327 /* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */ 328 329 if(U_FAILURE(*pErrorCode) && ( *pErrorCode!=U_FILE_ACCESS_ERROR)) { 330 fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); 331 exit(*pErrorCode); 332 } 333 } 334 335 static void U_CALLCONV 336 strprepProfileLineFn(void *context, 337 char *fields[][2], int32_t fieldCount, 338 UErrorCode *pErrorCode) { 339 uint32_t mapping[40]; 340 char *end, *map; 341 uint32_t code; 342 int32_t length; 343 /*UBool* mapWithNorm = (UBool*) context;*/ 344 const char* typeName; 345 uint32_t rangeStart=0,rangeEnd =0; 346 const char* filename = (const char*) context; 347 const char *s; 348 349 s = u_skipWhitespace(fields[0][0]); 350 if (*s == '@') { 351 /* special directive */ 352 s++; 353 length = fields[0][1] - s; 354 if (length >= NORMALIZE_DIRECTIVE_LEN 355 && uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) { 356 options[NORMALIZE].doesOccur = TRUE; 357 return; 358 } 359 else if (length >= CHECK_BIDI_DIRECTIVE_LEN 360 && uprv_strncmp(s, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) { 361 options[CHECK_BIDI].doesOccur = TRUE; 362 return; 363 } 364 else { 365 fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]); 366 } 367 } 368 369 typeName = fields[2][0]; 370 map = fields[1][0]; 371 372 if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){ 373 374 u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); 375 if(U_FAILURE(*pErrorCode)){ 376 fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); 377 return; 378 } 379 380 /* store the range */ 381 storeRange(rangeStart,rangeEnd,USPREP_UNASSIGNED, pErrorCode); 382 383 }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){ 384 385 u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); 386 if(U_FAILURE(*pErrorCode)){ 387 fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); 388 return; 389 } 390 391 /* store the range */ 392 storeRange(rangeStart,rangeEnd,USPREP_PROHIBITED, pErrorCode); 393 394 }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){ 395 396 /* get the character code, field 0 */ 397 code=(uint32_t)uprv_strtoul(s, &end, 16); 398 if(end<=s || end!=fields[0][1]) { 399 fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]); 400 *pErrorCode=U_PARSE_ERROR; 401 exit(U_PARSE_ERROR); 402 } 403 404 /* parse the mapping string */ 405 length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode); 406 407 /* store the mapping */ 408 storeMapping(code,mapping, length,USPREP_MAP, pErrorCode); 409 410 }else{ 411 *pErrorCode = U_INVALID_FORMAT_ERROR; 412 } 413 414 if(U_FAILURE(*pErrorCode)) { 415 fprintf(stderr, "gensprep error parsing %s line %s at %s. Error: %s\n",filename, 416 fields[0][0],fields[2][0],u_errorName(*pErrorCode)); 417 exit(*pErrorCode); 418 } 419 420 } 421 422 static void 423 parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode) { 424 char *fields[3][2]; 425 426 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 427 return; 428 } 429 430 u_parseDelimitedFile(filename, ';', fields, 3, strprepProfileLineFn, (void*)filename, pErrorCode); 431 432 /*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/ 433 434 if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) { 435 fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); 436 exit(*pErrorCode); 437 } 438 } 439 440 441 #endif /* #if !UCONFIG_NO_IDNA */ 442 443 /* 444 * Hey, Emacs, please set the following: 445 * 446 * Local Variables: 447 * indent-tabs-mode: nil 448 * End: 449 * 450 */ 451