1 /* 2 ********************************************************************** 3 * Copyright (C) 2002-2006,2008, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * 7 * File genctd.c 8 */ 9 10 //-------------------------------------------------------------------- 11 // 12 // Tool for generating CompactTrieDictionary data files (.ctd files). 13 // 14 // Usage: genctd [options] -o output-file.ctd input-file 15 // 16 // options: -v verbose 17 // -? or -h help 18 // 19 // The input file is a plain text file containing words, one per line. 20 // Words end at the first whitespace; lines beginning with whitespace 21 // are ignored. 22 // The file can be encoded as utf-8, or utf-16 (either endian), or 23 // in the default code page (platform dependent.). utf encoded 24 // files must include a BOM. 25 // 26 //-------------------------------------------------------------------- 27 28 #include "unicode/utypes.h" 29 #include "unicode/uchar.h" 30 #include "unicode/ucnv.h" 31 #include "unicode/uniset.h" 32 #include "unicode/unistr.h" 33 #include "unicode/uclean.h" 34 #include "unicode/udata.h" 35 #include "unicode/putil.h" 36 37 //#include "unicode/ustdio.h" 38 39 #include "uoptions.h" 40 #include "unewdata.h" 41 #include "ucmndata.h" 42 #include "rbbidata.h" 43 #include "triedict.h" 44 #include "cmemory.h" 45 #include "uassert.h" 46 47 #include <stdio.h> 48 #include <stdlib.h> 49 #include <string.h> 50 51 U_NAMESPACE_USE 52 53 static char *progName; 54 static UOption options[]={ 55 UOPTION_HELP_H, /* 0 */ 56 UOPTION_HELP_QUESTION_MARK, /* 1 */ 57 UOPTION_VERBOSE, /* 2 */ 58 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 3 */ 59 UOPTION_ICUDATADIR, /* 4 */ 60 UOPTION_DESTDIR, /* 5 */ 61 UOPTION_COPYRIGHT, /* 6 */ 62 }; 63 64 void usageAndDie(int retCode) { 65 printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName); 66 printf("\tRead in word list and write out compact trie dictionary\n" 67 "options:\n" 68 "\t-h or -? or --help this usage text\n" 69 "\t-V or --version show a version message\n" 70 "\t-c or --copyright include a copyright notice\n" 71 "\t-v or --verbose turn on verbose output\n" 72 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 73 "\t followed by path, defaults to %s\n" 74 "\t-d or --destdir destination directory, followed by the path\n", 75 u_getDataDirectory()); 76 exit (retCode); 77 } 78 79 80 #if UCONFIG_NO_BREAK_ITERATION 81 82 /* dummy UDataInfo cf. udata.h */ 83 static UDataInfo dummyDataInfo = { 84 sizeof(UDataInfo), 85 0, 86 87 U_IS_BIG_ENDIAN, 88 U_CHARSET_FAMILY, 89 U_SIZEOF_UCHAR, 90 0, 91 92 { 0, 0, 0, 0 }, /* dummy dataFormat */ 93 { 0, 0, 0, 0 }, /* dummy formatVersion */ 94 { 0, 0, 0, 0 } /* dummy dataVersion */ 95 }; 96 97 #else 98 99 // 100 // Set up the ICU data header, defined in ucmndata.h 101 // 102 DataHeader dh ={ 103 {sizeof(DataHeader), // Struct MappedData 104 0xda, 105 0x27}, 106 107 { // struct UDataInfo 108 sizeof(UDataInfo), // size 109 0, // reserved 110 U_IS_BIG_ENDIAN, 111 U_CHARSET_FAMILY, 112 U_SIZEOF_UCHAR, 113 0, // reserved 114 115 { 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary 116 { 1, 0, 0, 0 }, // 1.0.0.0 117 { 0, 0, 0, 0 }, // Irrelevant for this data type 118 }}; 119 120 #endif 121 122 //---------------------------------------------------------------------------- 123 // 124 // main for genctd 125 // 126 //---------------------------------------------------------------------------- 127 int main(int argc, char **argv) { 128 UErrorCode status = U_ZERO_ERROR; 129 const char *wordFileName; 130 const char *outFileName; 131 const char *outDir = NULL; 132 const char *copyright = NULL; 133 134 // 135 // Pick up and check the command line arguments, 136 // using the standard ICU tool utils option handling. 137 // 138 U_MAIN_INIT_ARGS(argc, argv); 139 progName = argv[0]; 140 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 141 if(argc<0) { 142 // Unrecognized option 143 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); 144 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 145 } 146 147 if(options[0].doesOccur || options[1].doesOccur) { 148 // -? or -h for help. 149 usageAndDie(0); 150 } 151 152 if (!options[3].doesOccur || argc < 2) { 153 fprintf(stderr, "input and output file must both be specified.\n"); 154 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 155 } 156 outFileName = options[3].value; 157 wordFileName = argv[1]; 158 159 if (options[4].doesOccur) { 160 u_setDataDirectory(options[4].value); 161 } 162 163 /* Initialize ICU */ 164 u_init(&status); 165 if (U_FAILURE(status)) { 166 fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 167 argv[0], u_errorName(status)); 168 exit(1); 169 } 170 status = U_ZERO_ERROR; 171 172 /* Combine the directory with the file name */ 173 if(options[5].doesOccur) { 174 outDir = options[5].value; 175 } 176 if (options[6].doesOccur) { 177 copyright = U_COPYRIGHT_STRING; 178 } 179 180 #if UCONFIG_NO_BREAK_ITERATION 181 182 UNewDataMemory *pData; 183 char msg[1024]; 184 185 /* write message with just the name */ 186 sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", outFileName); 187 fprintf(stderr, "%s\n", msg); 188 189 /* write the dummy data file */ 190 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); 191 udata_writeBlock(pData, msg, strlen(msg)); 192 udata_finish(pData, &status); 193 return (int)status; 194 195 #else 196 197 // 198 // Read in the dictionary source file 199 // 200 long result; 201 long wordFileSize; 202 FILE *file; 203 char *wordBufferC; 204 MutableTrieDictionary *mtd = NULL; 205 206 file = fopen(wordFileName, "rb"); 207 if( file == 0 ) { //cannot find file 208 //create 1-line dummy file: ie 1 char, 1 value 209 UNewDataMemory *pData; 210 char msg[1024]; 211 212 /* write message with just the name */ 213 sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName); 214 fprintf(stderr, "%s\n", msg); 215 216 UChar c = 0x0020; 217 mtd = new MutableTrieDictionary(c, status, TRUE); 218 mtd->addWord(&c, 1, status, 1); 219 220 } else { //read words in from input file 221 fseek(file, 0, SEEK_END); 222 wordFileSize = ftell(file); 223 fseek(file, 0, SEEK_SET); 224 wordBufferC = new char[wordFileSize+10]; 225 226 result = (long)fread(wordBufferC, 1, wordFileSize, file); 227 if (result != wordFileSize) { 228 fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); 229 exit (-1); 230 } 231 wordBufferC[wordFileSize]=0; 232 fclose(file); 233 234 // 235 // Look for a Unicode Signature (BOM) on the word file 236 // 237 int32_t signatureLength; 238 const char * wordSourceC = wordBufferC; 239 const char* encoding = ucnv_detectUnicodeSignature( 240 wordSourceC, wordFileSize, &signatureLength, &status); 241 if (U_FAILURE(status)) { 242 exit(status); 243 } 244 if(encoding!=NULL ){ 245 wordSourceC += signatureLength; 246 wordFileSize -= signatureLength; 247 } 248 249 // 250 // Open a converter to take the rule file to UTF-16 251 // 252 UConverter* conv; 253 conv = ucnv_open(encoding, &status); 254 if (U_FAILURE(status)) { 255 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); 256 exit(status); 257 } 258 259 // 260 // Convert the words to UChar. 261 // Preflight first to determine required buffer size. 262 // 263 uint32_t destCap = ucnv_toUChars(conv, 264 NULL, // dest, 265 0, // destCapacity, 266 wordSourceC, 267 wordFileSize, 268 &status); 269 if (status != U_BUFFER_OVERFLOW_ERROR) { 270 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 271 exit(status); 272 }; 273 274 status = U_ZERO_ERROR; 275 UChar *wordSourceU = new UChar[destCap+1]; 276 ucnv_toUChars(conv, 277 wordSourceU, // dest, 278 destCap+1, 279 wordSourceC, 280 wordFileSize, 281 &status); 282 if (U_FAILURE(status)) { 283 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 284 exit(status); 285 }; 286 ucnv_close(conv); 287 288 // Get rid of the original file buffer 289 delete[] wordBufferC; 290 291 // Create a MutableTrieDictionary, and loop through all the lines, inserting 292 // words. 293 294 // First, pick a median character. 295 UChar *current = wordSourceU + (destCap/2); 296 UChar uc = *current++; 297 UnicodeSet breaks; 298 breaks.add(0x000A); // Line Feed 299 breaks.add(0x000D); // Carriage Return 300 breaks.add(0x2028); // Line Separator 301 breaks.add(0x2029); // Paragraph Separator 302 303 do { 304 // Look for line break 305 while (uc && !breaks.contains(uc)) { 306 uc = *current++; 307 } 308 // Now skip to first non-line-break 309 while (uc && breaks.contains(uc)) { 310 uc = *current++; 311 } 312 } 313 while (uc && (breaks.contains(uc) || u_isspace(uc))); 314 315 mtd = new MutableTrieDictionary(uc, status); 316 317 if (U_FAILURE(status)) { 318 fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); 319 exit(status); 320 } 321 322 // Now add the words. Words are non-space characters at the beginning of 323 // lines, and must be at least one UChar. If a word has an associated value, 324 // the value should follow the word on the same line after a tab character. 325 current = wordSourceU; 326 UChar *candidate = current; 327 uc = *current++; 328 int32_t length = 0; 329 int count = 0; 330 331 while (uc) { 332 while (uc && !u_isspace(uc)) { 333 ++length; 334 uc = *current++; 335 } 336 337 UnicodeString valueString; 338 UChar candidateValue; 339 if(uc == 0x0009){ //separator is a tab char, read in number after space 340 while (uc && u_isspace(uc)) { 341 uc = *current++; 342 } 343 while (uc && !u_isspace(uc)) { 344 valueString.append(uc); 345 uc = *current++; 346 } 347 } 348 349 if (length > 0) { 350 count++; 351 if(valueString.length() > 0){ 352 mtd->setValued(TRUE); 353 354 uint32_t value = 0; 355 char* s = new char[valueString.length()]; 356 valueString.extract(0,valueString.length(), s, valueString.length()); 357 int n = sscanf(s, "%ud", &value); 358 U_ASSERT(n == 1); 359 U_ASSERT(value >= 0); 360 mtd->addWord(candidate, length, status, (uint16_t)value); 361 delete[] s; 362 } else { 363 mtd->addWord(candidate, length, status); 364 } 365 366 if (U_FAILURE(status)) { 367 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n", 368 u_errorName(status), count); 369 exit(status); 370 } 371 } 372 373 // Find beginning of next line 374 while (uc && !breaks.contains(uc)) { 375 uc = *current++; 376 } 377 // Find next non-line-breaking character 378 while (uc && breaks.contains(uc)) { 379 uc = *current++; 380 } 381 candidate = current-1; 382 length = 0; 383 } 384 385 // Get rid of the Unicode text buffer 386 delete[] wordSourceU; 387 } 388 389 // Now, create a CompactTrieDictionary from the mutable dictionary 390 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); 391 if (U_FAILURE(status)) { 392 fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); 393 exit(status); 394 } 395 396 // Get rid of the MutableTrieDictionary 397 delete mtd; 398 399 // 400 // Get the binary data from the dictionary. 401 // 402 uint32_t outDataSize = ctd->dataSize(); 403 const uint8_t *outData = (const uint8_t *)ctd->data(); 404 405 // 406 // Create the output file 407 // 408 size_t bytesWritten; 409 UNewDataMemory *pData; 410 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); 411 if(U_FAILURE(status)) { 412 fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n", 413 outFileName, u_errorName(status)); 414 exit(status); 415 } 416 417 418 // Write the data itself. 419 udata_writeBlock(pData, outData, outDataSize); 420 // finish up 421 bytesWritten = udata_finish(pData, &status); 422 if(U_FAILURE(status)) { 423 fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status)); 424 exit(status); 425 } 426 427 if (bytesWritten != outDataSize) { 428 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); 429 exit(-1); 430 } 431 432 // Get rid of the CompactTrieDictionary 433 delete ctd; 434 435 u_cleanup(); 436 437 printf("genctd: tool completed successfully.\n"); 438 return 0; 439 440 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 441 } 442