1 /* 2 ********************************************************************** 3 * Copyright (C) 2002-2006, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * 7 * File genbrk.c 8 */ 9 10 //-------------------------------------------------------------------- 11 // 12 // Tool for generating RuleBasedBreakIterator data files (.brk files). 13 // .brk files contain the precompiled rules for standard types 14 // of iterators - word, line, sentence, etc. 15 // 16 // Usage: genbrk [options] -r rule-file.txt -o output-file.brk 17 // 18 // options: -v verbose 19 // -? or -h help 20 // 21 // The input rule file is a plain text file containing break rules 22 // in the input format accepted by RuleBasedBreakIterators. The 23 // file can be encoded as utf-8, or utf-16 (either endian), or 24 // in the default code page (platform dependent.). utf encoded 25 // files must include a BOM. 26 // 27 //-------------------------------------------------------------------- 28 29 #include "unicode/utypes.h" 30 #include "unicode/ucnv.h" 31 #include "unicode/unistr.h" 32 #include "unicode/rbbi.h" 33 #include "unicode/uclean.h" 34 #include "unicode/udata.h" 35 #include "unicode/putil.h" 36 37 #include "uoptions.h" 38 #include "unewdata.h" 39 #include "ucmndata.h" 40 #include "rbbidata.h" 41 #include "cmemory.h" 42 43 #include <stdio.h> 44 #include <stdlib.h> 45 #include <string.h> 46 47 U_NAMESPACE_USE 48 49 static char *progName; 50 static UOption options[]={ 51 UOPTION_HELP_H, /* 0 */ 52 UOPTION_HELP_QUESTION_MARK, /* 1 */ 53 UOPTION_VERBOSE, /* 2 */ 54 { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */ 55 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */ 56 UOPTION_ICUDATADIR, /* 5 */ 57 UOPTION_DESTDIR, /* 6 */ 58 UOPTION_COPYRIGHT, /* 7 */ 59 }; 60 61 void usageAndDie(int retCode) { 62 printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName); 63 printf("\tRead in break iteration rules text and write out the binary data\n" 64 "options:\n" 65 "\t-h or -? or --help this usage text\n" 66 "\t-V or --version show a version message\n" 67 "\t-c or --copyright include a copyright notice\n" 68 "\t-v or --verbose turn on verbose output\n" 69 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 70 "\t followed by path, defaults to %s\n" 71 "\t-d or --destdir destination directory, followed by the path\n", 72 u_getDataDirectory()); 73 exit (retCode); 74 } 75 76 77 #if UCONFIG_NO_BREAK_ITERATION 78 79 /* dummy UDataInfo cf. udata.h */ 80 static UDataInfo dummyDataInfo = { 81 sizeof(UDataInfo), 82 0, 83 84 U_IS_BIG_ENDIAN, 85 U_CHARSET_FAMILY, 86 U_SIZEOF_UCHAR, 87 0, 88 89 { 0, 0, 0, 0 }, /* dummy dataFormat */ 90 { 0, 0, 0, 0 }, /* dummy formatVersion */ 91 { 0, 0, 0, 0 } /* dummy dataVersion */ 92 }; 93 94 #else 95 96 // 97 // Set up the ICU data header, defined in ucmndata.h 98 // 99 DataHeader dh ={ 100 {sizeof(DataHeader), // Struct MappedData 101 0xda, 102 0x27}, 103 104 { // struct UDataInfo 105 sizeof(UDataInfo), // size 106 0, // reserved 107 U_IS_BIG_ENDIAN, 108 U_CHARSET_FAMILY, 109 U_SIZEOF_UCHAR, 110 0, // reserved 111 112 { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk " 113 { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values 114 // from the RBBI rule builder. The values declared 115 // here should never appear in any real RBBI data. 116 { 4, 1, 0, 0 } // dataVersion (Unicode version) 117 }}; 118 119 #endif 120 121 //---------------------------------------------------------------------------- 122 // 123 // main for genbrk 124 // 125 //---------------------------------------------------------------------------- 126 int main(int argc, char **argv) { 127 UErrorCode status = U_ZERO_ERROR; 128 const char *ruleFileName; 129 const char *outFileName; 130 const char *outDir = NULL; 131 const char *copyright = NULL; 132 133 // 134 // Pick up and check the command line arguments, 135 // using the standard ICU tool utils option handling. 136 // 137 U_MAIN_INIT_ARGS(argc, argv); 138 progName = argv[0]; 139 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 140 if(argc<0) { 141 // Unrecognized option 142 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); 143 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 144 } 145 146 if(options[0].doesOccur || options[1].doesOccur) { 147 // -? or -h for help. 148 usageAndDie(0); 149 } 150 151 if (!(options[3].doesOccur && options[4].doesOccur)) { 152 fprintf(stderr, "rule file and output file must both be specified.\n"); 153 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 154 } 155 ruleFileName = options[3].value; 156 outFileName = options[4].value; 157 158 if (options[5].doesOccur) { 159 u_setDataDirectory(options[5].value); 160 } 161 162 /* Initialize ICU */ 163 u_init(&status); 164 if (U_FAILURE(status)) { 165 fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 166 argv[0], u_errorName(status)); 167 exit(1); 168 } 169 status = U_ZERO_ERROR; 170 171 /* Combine the directory with the file name */ 172 if(options[6].doesOccur) { 173 outDir = options[6].value; 174 } 175 if (options[7].doesOccur) { 176 copyright = U_COPYRIGHT_STRING; 177 } 178 179 #if UCONFIG_NO_BREAK_ITERATION 180 181 UNewDataMemory *pData; 182 char msg[1024]; 183 184 /* write message with just the name */ 185 sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", outFileName); 186 fprintf(stderr, "%s\n", msg); 187 188 /* write the dummy data file */ 189 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); 190 udata_writeBlock(pData, msg, strlen(msg)); 191 udata_finish(pData, &status); 192 return (int)status; 193 194 #else 195 196 // 197 // Read in the rule source file 198 // 199 long result; 200 long ruleFileSize; 201 FILE *file; 202 char *ruleBufferC; 203 204 file = fopen(ruleFileName, "rb"); 205 if( file == 0 ) { 206 fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName); 207 exit(-1); 208 } 209 fseek(file, 0, SEEK_END); 210 ruleFileSize = ftell(file); 211 fseek(file, 0, SEEK_SET); 212 ruleBufferC = new char[ruleFileSize+10]; 213 214 result = (long)fread(ruleBufferC, 1, ruleFileSize, file); 215 if (result != ruleFileSize) { 216 fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName); 217 exit (-1); 218 } 219 ruleBufferC[ruleFileSize]=0; 220 fclose(file); 221 222 // 223 // Look for a Unicode Signature (BOM) on the rule file 224 // 225 int32_t signatureLength; 226 const char * ruleSourceC = ruleBufferC; 227 const char* encoding = ucnv_detectUnicodeSignature( 228 ruleSourceC, ruleFileSize, &signatureLength, &status); 229 if (U_FAILURE(status)) { 230 exit(status); 231 } 232 if(encoding!=NULL ){ 233 ruleSourceC += signatureLength; 234 ruleFileSize -= signatureLength; 235 } 236 237 // 238 // Open a converter to take the rule file to UTF-16 239 // 240 UConverter* conv; 241 conv = ucnv_open(encoding, &status); 242 if (U_FAILURE(status)) { 243 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); 244 exit(status); 245 } 246 247 // 248 // Convert the rules to UChar. 249 // Preflight first to determine required buffer size. 250 // 251 uint32_t destCap = ucnv_toUChars(conv, 252 NULL, // dest, 253 0, // destCapacity, 254 ruleSourceC, 255 ruleFileSize, 256 &status); 257 if (status != U_BUFFER_OVERFLOW_ERROR) { 258 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 259 exit(status); 260 }; 261 262 status = U_ZERO_ERROR; 263 UChar *ruleSourceU = new UChar[destCap+1]; 264 ucnv_toUChars(conv, 265 ruleSourceU, // dest, 266 destCap+1, 267 ruleSourceC, 268 ruleFileSize, 269 &status); 270 if (U_FAILURE(status)) { 271 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 272 exit(status); 273 }; 274 ucnv_close(conv); 275 276 277 // 278 // Put the source rules into a UnicodeString 279 // 280 UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap); 281 282 // 283 // Create the break iterator from the rules 284 // This will compile the rules. 285 // 286 UParseError parseError; 287 parseError.line = 0; 288 parseError.offset = 0; 289 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status); 290 if (U_FAILURE(status)) { 291 fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n", 292 u_errorName(status), (int)parseError.line, (int)parseError.offset); 293 exit(status); 294 }; 295 296 297 // 298 // Get the compiled rule data from the break iterator. 299 // 300 uint32_t outDataSize; 301 const uint8_t *outData; 302 outData = bi->getBinaryRules(outDataSize); 303 304 // Copy the data format version numbers from the RBBI data header into the UDataMemory header. 305 uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion)); 306 307 // 308 // Create the output file 309 // 310 size_t bytesWritten; 311 UNewDataMemory *pData; 312 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); 313 if(U_FAILURE(status)) { 314 fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n", 315 outFileName, u_errorName(status)); 316 exit(status); 317 } 318 319 320 // Write the data itself. 321 udata_writeBlock(pData, outData, outDataSize); 322 // finish up 323 bytesWritten = udata_finish(pData, &status); 324 if(U_FAILURE(status)) { 325 fprintf(stderr, "genbrk: error %d writing the output file\n", status); 326 exit(status); 327 } 328 329 if (bytesWritten != outDataSize) { 330 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); 331 exit(-1); 332 } 333 334 delete bi; 335 delete[] ruleSourceU; 336 delete[] ruleBufferC; 337 u_cleanup(); 338 339 340 printf("genbrk: tool completed successfully.\n"); 341 return 0; 342 343 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 344 } 345 346