1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2003-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: gensprep.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2003-02-06 14 * created by: Ram Viswanadha 15 * 16 * This program reads the Profile.txt files, 17 * parses them, and extracts the data for StringPrep profile. 18 * It then preprocesses it and writes a binary file for efficient use 19 * in various StringPrep conversion processes. 20 */ 21 22 #define USPREP_TYPE_NAMES_ARRAY 1 23 24 #include <stdio.h> 25 #include <stdlib.h> 26 27 #include "cmemory.h" 28 #include "cstring.h" 29 #include "unewdata.h" 30 #include "uoptions.h" 31 #include "uparse.h" 32 #include "sprpimpl.h" 33 34 #include "unicode/uclean.h" 35 #include "unicode/udata.h" 36 #include "unicode/utypes.h" 37 #include "unicode/putil.h" 38 39 40 U_CDECL_BEGIN 41 #include "gensprep.h" 42 U_CDECL_END 43 44 UBool beVerbose=FALSE, haveCopyright=TRUE; 45 46 #define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt" 47 48 #define NORMALIZE_DIRECTIVE "normalize" 49 #define NORMALIZE_DIRECTIVE_LEN 9 50 #define CHECK_BIDI_DIRECTIVE "check-bidi" 51 #define CHECK_BIDI_DIRECTIVE_LEN 10 52 53 /* prototypes --------------------------------------------------------------- */ 54 55 static void 56 parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode); 57 58 static void 59 parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode); 60 61 62 /* -------------------------------------------------------------------------- */ 63 64 static UOption options[]={ 65 UOPTION_HELP_H, 66 UOPTION_HELP_QUESTION_MARK, 67 UOPTION_VERBOSE, 68 UOPTION_COPYRIGHT, 69 UOPTION_DESTDIR, 70 UOPTION_SOURCEDIR, 71 UOPTION_ICUDATADIR, 72 UOPTION_BUNDLE_NAME, 73 { "normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0 }, 74 { "norm-correction", NULL, NULL, NULL, 'm', UOPT_REQUIRES_ARG, 0 }, 75 { "check-bidi", NULL, NULL, NULL, 'k', UOPT_NO_ARG, 0}, 76 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, 77 }; 78 79 enum{ 80 HELP, 81 HELP_QUESTION_MARK, 82 VERBOSE, 83 COPYRIGHT, 84 DESTDIR, 85 SOURCEDIR, 86 ICUDATADIR, 87 BUNDLE_NAME, 88 NORMALIZE, 89 NORM_CORRECTION_DIR, 90 CHECK_BIDI, 91 UNICODE_VERSION 92 }; 93 94 static int printHelp(int argc, char* argv[]){ 95 /* 96 * Broken into chucks because the C89 standard says the minimum 97 * required supported string length is 509 bytes. 98 */ 99 fprintf(stderr, 100 "Usage: %s [-options] [file_name]\n" 101 "\n" 102 "Read the files specified and\n" 103 "create a binary file [package-name]_[bundle-name]." DATA_TYPE " with the StringPrep profile data\n" 104 "\n", 105 argv[0]); 106 fprintf(stderr, 107 "Options:\n" 108 "\t-h or -? or --help print this usage text\n" 109 "\t-v or --verbose verbose output\n" 110 "\t-c or --copyright include a copyright notice\n"); 111 fprintf(stderr, 112 "\t-d or --destdir destination directory, followed by the path\n" 113 "\t-s or --sourcedir source directory of ICU data, followed by the path\n" 114 "\t-b or --bundle-name generate the ouput data file with the name specified\n" 115 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 116 "\t followed by path, defaults to %s\n", 117 u_getDataDirectory()); 118 fprintf(stderr, 119 "\t-n or --normalize turn on the option for normalization and include mappings\n" 120 "\t from NormalizationCorrections.txt from the given path,\n" 121 "\t e.g: /test/icu/source/data/unidata\n"); 122 fprintf(stderr, 123 "\t-m or --norm-correction use NormalizationCorrections.txt from the given path\n" 124 "\t when the input file contains a normalization directive.\n" 125 "\t unlike -n/--normalize, this option does not force the\n" 126 "\t normalization.\n"); 127 fprintf(stderr, 128 "\t-k or --check-bidi turn on the option for checking for BiDi in the profile\n" 129 "\t-u or --unicode version of Unicode to be used with this profile followed by the version\n" 130 ); 131 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 132 } 133 134 135 extern int 136 main(int argc, char* argv[]) { 137 #if !UCONFIG_NO_IDNA 138 char* filename = NULL; 139 #endif 140 const char *srcDir=NULL, *destDir=NULL, *icuUniDataDir=NULL; 141 const char *bundleName=NULL, *inputFileName = NULL; 142 char *basename=NULL; 143 int32_t sprepOptions = 0; 144 145 UErrorCode errorCode=U_ZERO_ERROR; 146 147 U_MAIN_INIT_ARGS(argc, argv); 148 149 /* preset then read command line options */ 150 options[DESTDIR].value=u_getDataDirectory(); 151 options[SOURCEDIR].value=""; 152 options[UNICODE_VERSION].value="0"; /* don't assume the unicode version */ 153 options[BUNDLE_NAME].value = DATA_NAME; 154 options[NORMALIZE].value = ""; 155 156 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 157 158 /* error handling, printing usage message */ 159 if(argc<0) { 160 fprintf(stderr, 161 "error in command line argument \"%s\"\n", 162 argv[-argc]); 163 } 164 if(argc<0 || options[HELP].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { 165 return printHelp(argc, argv); 166 167 } 168 169 /* get the options values */ 170 beVerbose=options[VERBOSE].doesOccur; 171 haveCopyright=options[COPYRIGHT].doesOccur; 172 srcDir=options[SOURCEDIR].value; 173 destDir=options[DESTDIR].value; 174 bundleName = options[BUNDLE_NAME].value; 175 if(options[NORMALIZE].doesOccur) { 176 icuUniDataDir = options[NORMALIZE].value; 177 } else { 178 icuUniDataDir = options[NORM_CORRECTION_DIR].value; 179 } 180 181 if(argc<2) { 182 /* print the help message */ 183 return printHelp(argc, argv); 184 } else { 185 inputFileName = argv[1]; 186 } 187 if(!options[UNICODE_VERSION].doesOccur){ 188 return printHelp(argc, argv); 189 } 190 if(options[ICUDATADIR].doesOccur) { 191 u_setDataDirectory(options[ICUDATADIR].value); 192 } 193 #if UCONFIG_NO_IDNA 194 195 fprintf(stderr, 196 "gensprep writes dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE 197 " because UCONFIG_NO_IDNA is set, \n" 198 "see icu/source/common/unicode/uconfig.h\n"); 199 generateData(destDir, bundleName); 200 201 #else 202 203 setUnicodeVersion(options[UNICODE_VERSION].value); 204 filename = (char* ) uprv_malloc(uprv_strlen(srcDir) + 300); /* hopefully this should be enough */ 205 206 /* prepare the filename beginning with the source dir */ 207 if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL && uprv_strchr(srcDir,U_FILE_ALT_SEP_CHAR) == NULL){ 208 filename[0] = '.'; 209 filename[1] = U_FILE_SEP_CHAR; 210 uprv_strcpy(filename+2,srcDir); 211 }else{ 212 uprv_strcpy(filename, srcDir); 213 } 214 215 basename=filename+uprv_strlen(filename); 216 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { 217 *basename++=U_FILE_SEP_CHAR; 218 } 219 220 /* initialize */ 221 init(); 222 223 /* process the file */ 224 uprv_strcpy(basename,inputFileName); 225 parseMappings(filename,FALSE, &errorCode); 226 if(U_FAILURE(errorCode)) { 227 fprintf(stderr, "Could not open file %s for reading. Error: %s \n", filename, u_errorName(errorCode)); 228 return errorCode; 229 } 230 231 if(options[NORMALIZE].doesOccur){ /* this option might be set by @normalize;; in the source file */ 232 /* set up directory for NormalizationCorrections.txt */ 233 uprv_strcpy(filename,icuUniDataDir); 234 basename=filename+uprv_strlen(filename); 235 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { 236 *basename++=U_FILE_SEP_CHAR; 237 } 238 239 *basename++=U_FILE_SEP_CHAR; 240 uprv_strcpy(basename,NORM_CORRECTIONS_FILE_NAME); 241 242 parseNormalizationCorrections(filename,&errorCode); 243 if(U_FAILURE(errorCode)){ 244 fprintf(stderr,"Could not open file %s for reading \n", filename); 245 return errorCode; 246 } 247 sprepOptions |= _SPREP_NORMALIZATION_ON; 248 } 249 250 if(options[CHECK_BIDI].doesOccur){ /* this option might be set by @check-bidi;; in the source file */ 251 sprepOptions |= _SPREP_CHECK_BIDI_ON; 252 } 253 254 setOptions(sprepOptions); 255 256 /* process parsed data */ 257 if(U_SUCCESS(errorCode)) { 258 /* write the data file */ 259 generateData(destDir, bundleName); 260 261 cleanUpData(); 262 } 263 264 uprv_free(filename); 265 266 u_cleanup(); 267 268 #endif 269 270 return errorCode; 271 } 272 273 #if !UCONFIG_NO_IDNA 274 275 static void U_CALLCONV 276 normalizationCorrectionsLineFn(void *context, 277 char *fields[][2], int32_t fieldCount, 278 UErrorCode *pErrorCode) { 279 uint32_t mapping[40]; 280 char *end, *s; 281 uint32_t code; 282 int32_t length; 283 UVersionInfo version; 284 UVersionInfo thisVersion; 285 286 /* get the character code, field 0 */ 287 code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); 288 if(U_FAILURE(*pErrorCode)) { 289 fprintf(stderr, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields[0][0]); 290 exit(*pErrorCode); 291 } 292 /* Original (erroneous) decomposition */ 293 s = fields[1][0]; 294 295 /* parse the mapping string */ 296 length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode); 297 298 /* ignore corrected decomposition */ 299 300 u_versionFromString(version,fields[3][0] ); 301 u_versionFromString(thisVersion, "3.2.0"); 302 303 304 305 if(U_FAILURE(*pErrorCode)) { 306 fprintf(stderr, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n", 307 (long)code, u_errorName(*pErrorCode)); 308 exit(*pErrorCode); 309 } 310 311 /* store the mapping */ 312 if( version[0] > thisVersion[0] || 313 ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1])) 314 ){ 315 storeMapping(code,mapping, length, USPREP_MAP, pErrorCode); 316 } 317 setUnicodeVersionNC(version); 318 } 319 320 static void 321 parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) { 322 char *fields[4][2]; 323 324 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 325 return; 326 } 327 328 u_parseDelimitedFile(filename, ';', fields, 4, normalizationCorrectionsLineFn, NULL, pErrorCode); 329 330 /* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */ 331 332 if(U_FAILURE(*pErrorCode) && ( *pErrorCode!=U_FILE_ACCESS_ERROR)) { 333 fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); 334 exit(*pErrorCode); 335 } 336 } 337 338 static void U_CALLCONV 339 strprepProfileLineFn(void *context, 340 char *fields[][2], int32_t fieldCount, 341 UErrorCode *pErrorCode) { 342 uint32_t mapping[40]; 343 char *end, *map; 344 uint32_t code; 345 int32_t length; 346 /*UBool* mapWithNorm = (UBool*) context;*/ 347 const char* typeName; 348 uint32_t rangeStart=0,rangeEnd =0; 349 const char* filename = (const char*) context; 350 const char *s; 351 352 s = u_skipWhitespace(fields[0][0]); 353 if (*s == '@') { 354 /* special directive */ 355 s++; 356 length = fields[0][1] - s; 357 if (length >= NORMALIZE_DIRECTIVE_LEN 358 && uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) { 359 options[NORMALIZE].doesOccur = TRUE; 360 return; 361 } 362 else if (length >= CHECK_BIDI_DIRECTIVE_LEN 363 && uprv_strncmp(s, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) { 364 options[CHECK_BIDI].doesOccur = TRUE; 365 return; 366 } 367 else { 368 fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]); 369 } 370 } 371 372 typeName = fields[2][0]; 373 map = fields[1][0]; 374 375 if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){ 376 377 u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); 378 if(U_FAILURE(*pErrorCode)){ 379 fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); 380 return; 381 } 382 383 /* store the range */ 384 storeRange(rangeStart,rangeEnd,USPREP_UNASSIGNED, pErrorCode); 385 386 }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){ 387 388 u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); 389 if(U_FAILURE(*pErrorCode)){ 390 fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); 391 return; 392 } 393 394 /* store the range */ 395 storeRange(rangeStart,rangeEnd,USPREP_PROHIBITED, pErrorCode); 396 397 }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){ 398 399 /* get the character code, field 0 */ 400 code=(uint32_t)uprv_strtoul(s, &end, 16); 401 if(end<=s || end!=fields[0][1]) { 402 fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]); 403 *pErrorCode=U_PARSE_ERROR; 404 exit(U_PARSE_ERROR); 405 } 406 407 /* parse the mapping string */ 408 length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode); 409 410 /* store the mapping */ 411 storeMapping(code,mapping, length,USPREP_MAP, pErrorCode); 412 413 }else{ 414 *pErrorCode = U_INVALID_FORMAT_ERROR; 415 } 416 417 if(U_FAILURE(*pErrorCode)) { 418 fprintf(stderr, "gensprep error parsing %s line %s at %s. Error: %s\n",filename, 419 fields[0][0],fields[2][0],u_errorName(*pErrorCode)); 420 exit(*pErrorCode); 421 } 422 423 } 424 425 static void 426 parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode) { 427 char *fields[3][2]; 428 429 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 430 return; 431 } 432 433 u_parseDelimitedFile(filename, ';', fields, 3, strprepProfileLineFn, (void*)filename, pErrorCode); 434 435 /*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/ 436 437 if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) { 438 fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); 439 exit(*pErrorCode); 440 } 441 } 442 443 444 #endif /* #if !UCONFIG_NO_IDNA */ 445 446 /* 447 * Hey, Emacs, please set the following: 448 * 449 * Local Variables: 450 * indent-tabs-mode: nil 451 * End: 452 * 453 */ 454