1 /* 2 ********************************************************************** 3 * Copyright (C) 2002-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * 7 * File genctd.c 8 */ 9 10 //-------------------------------------------------------------------- 11 // 12 // Tool for generating CompactTrieDictionary data files (.ctd files). 13 // 14 // Usage: genctd [options] -o output-file.ctd input-file 15 // 16 // options: -v verbose 17 // -? or -h help 18 // 19 // The input file is a plain text file containing words, one per line. 20 // Words end at the first whitespace; lines beginning with whitespace 21 // are ignored. 22 // The file can be encoded as utf-8, or utf-16 (either endian), or 23 // in the default code page (platform dependent.). utf encoded 24 // files must include a BOM. 25 // 26 //-------------------------------------------------------------------- 27 28 #include "unicode/utypes.h" 29 #include "unicode/uchar.h" 30 #include "unicode/ucnv.h" 31 #include "unicode/uniset.h" 32 #include "unicode/unistr.h" 33 #include "unicode/uclean.h" 34 #include "unicode/udata.h" 35 #include "unicode/putil.h" 36 37 //#include "unicode/ustdio.h" 38 39 #include "uoptions.h" 40 #include "unewdata.h" 41 #include "ucmndata.h" 42 #include "rbbidata.h" 43 #include "triedict.h" 44 #include "cmemory.h" 45 #include "uassert.h" 46 47 #include <stdio.h> 48 #include <stdlib.h> 49 #include <string.h> 50 51 U_NAMESPACE_USE 52 53 static char *progName; 54 static UOption options[]={ 55 UOPTION_HELP_H, /* 0 */ 56 UOPTION_HELP_QUESTION_MARK, /* 1 */ 57 UOPTION_VERBOSE, /* 2 */ 58 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 3 */ 59 UOPTION_ICUDATADIR, /* 4 */ 60 UOPTION_DESTDIR, /* 5 */ 61 UOPTION_COPYRIGHT, /* 6 */ 62 }; 63 64 void usageAndDie(int retCode) { 65 printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName); 66 printf("\tRead in word list and write out compact trie dictionary\n" 67 "options:\n" 68 "\t-h or -? or --help this usage text\n" 69 "\t-V or --version show a version message\n" 70 "\t-c or --copyright include a copyright notice\n" 71 "\t-v or --verbose turn on verbose output\n" 72 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 73 "\t followed by path, defaults to %s\n" 74 "\t-d or --destdir destination directory, followed by the path\n", 75 u_getDataDirectory()); 76 exit (retCode); 77 } 78 79 80 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 81 82 /* dummy UDataInfo cf. udata.h */ 83 static UDataInfo dummyDataInfo = { 84 sizeof(UDataInfo), 85 0, 86 87 U_IS_BIG_ENDIAN, 88 U_CHARSET_FAMILY, 89 U_SIZEOF_UCHAR, 90 0, 91 92 { 0, 0, 0, 0 }, /* dummy dataFormat */ 93 { 0, 0, 0, 0 }, /* dummy formatVersion */ 94 { 0, 0, 0, 0 } /* dummy dataVersion */ 95 }; 96 97 #else 98 99 // 100 // Set up the ICU data header, defined in ucmndata.h 101 // 102 DataHeader dh ={ 103 {sizeof(DataHeader), // Struct MappedData 104 0xda, 105 0x27}, 106 107 { // struct UDataInfo 108 sizeof(UDataInfo), // size 109 0, // reserved 110 U_IS_BIG_ENDIAN, 111 U_CHARSET_FAMILY, 112 U_SIZEOF_UCHAR, 113 0, // reserved 114 115 { 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary 116 { 1, 0, 0, 0 }, // 1.0.0.0 117 { 0, 0, 0, 0 }, // Irrelevant for this data type 118 }}; 119 120 #endif 121 122 //---------------------------------------------------------------------------- 123 // 124 // main for genctd 125 // 126 //---------------------------------------------------------------------------- 127 int main(int argc, char **argv) { 128 UErrorCode status = U_ZERO_ERROR; 129 const char *wordFileName; 130 const char *outFileName; 131 const char *outDir = NULL; 132 const char *copyright = NULL; 133 134 // 135 // Pick up and check the command line arguments, 136 // using the standard ICU tool utils option handling. 137 // 138 U_MAIN_INIT_ARGS(argc, argv); 139 progName = argv[0]; 140 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 141 if(argc<0) { 142 // Unrecognized option 143 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); 144 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 145 } 146 147 if(options[0].doesOccur || options[1].doesOccur) { 148 // -? or -h for help. 149 usageAndDie(0); 150 } 151 152 if (!options[3].doesOccur || argc < 2) { 153 fprintf(stderr, "input and output file must both be specified.\n"); 154 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 155 } 156 outFileName = options[3].value; 157 wordFileName = argv[1]; 158 159 if (options[4].doesOccur) { 160 u_setDataDirectory(options[4].value); 161 } 162 163 status = U_ZERO_ERROR; 164 165 /* Combine the directory with the file name */ 166 if(options[5].doesOccur) { 167 outDir = options[5].value; 168 } 169 if (options[6].doesOccur) { 170 copyright = U_COPYRIGHT_STRING; 171 } 172 173 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 174 175 UNewDataMemory *pData; 176 char msg[1024]; 177 178 /* write message with just the name */ 179 sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); 180 fprintf(stderr, "%s\n", msg); 181 182 /* write the dummy data file */ 183 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); 184 udata_writeBlock(pData, msg, strlen(msg)); 185 udata_finish(pData, &status); 186 return (int)status; 187 188 #else 189 /* Initialize ICU */ 190 u_init(&status); 191 if (U_FAILURE(status)) { 192 fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 193 argv[0], u_errorName(status)); 194 exit(1); 195 } 196 status = U_ZERO_ERROR; 197 198 // 199 // Read in the dictionary source file 200 // 201 long result; 202 long wordFileSize; 203 FILE *file; 204 char *wordBufferC; 205 MutableTrieDictionary *mtd = NULL; 206 207 file = fopen(wordFileName, "rb"); 208 if( file == 0 ) { //cannot find file 209 //create 1-line dummy file: ie 1 char, 1 value 210 UNewDataMemory *pData; 211 char msg[1024]; 212 213 /* write message with just the name */ 214 sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName); 215 fprintf(stderr, "%s\n", msg); 216 217 UChar c = 0x0020; 218 mtd = new MutableTrieDictionary(c, status, TRUE); 219 mtd->addWord(&c, 1, status, 1); 220 221 } else { //read words in from input file 222 fseek(file, 0, SEEK_END); 223 wordFileSize = ftell(file); 224 fseek(file, 0, SEEK_SET); 225 wordBufferC = new char[wordFileSize+10]; 226 227 result = (long)fread(wordBufferC, 1, wordFileSize, file); 228 if (result != wordFileSize) { 229 fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); 230 exit (-1); 231 } 232 wordBufferC[wordFileSize]=0; 233 fclose(file); 234 235 // 236 // Look for a Unicode Signature (BOM) on the word file 237 // 238 int32_t signatureLength; 239 const char * wordSourceC = wordBufferC; 240 const char* encoding = ucnv_detectUnicodeSignature( 241 wordSourceC, wordFileSize, &signatureLength, &status); 242 if (U_FAILURE(status)) { 243 exit(status); 244 } 245 if(encoding!=NULL ){ 246 wordSourceC += signatureLength; 247 wordFileSize -= signatureLength; 248 } 249 250 // 251 // Open a converter to take the rule file to UTF-16 252 // 253 UConverter* conv; 254 conv = ucnv_open(encoding, &status); 255 if (U_FAILURE(status)) { 256 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); 257 exit(status); 258 } 259 260 // 261 // Convert the words to UChar. 262 // Preflight first to determine required buffer size. 263 // 264 uint32_t destCap = ucnv_toUChars(conv, 265 NULL, // dest, 266 0, // destCapacity, 267 wordSourceC, 268 wordFileSize, 269 &status); 270 if (status != U_BUFFER_OVERFLOW_ERROR) { 271 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 272 exit(status); 273 }; 274 275 status = U_ZERO_ERROR; 276 UChar *wordSourceU = new UChar[destCap+1]; 277 ucnv_toUChars(conv, 278 wordSourceU, // dest, 279 destCap+1, 280 wordSourceC, 281 wordFileSize, 282 &status); 283 if (U_FAILURE(status)) { 284 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 285 exit(status); 286 }; 287 ucnv_close(conv); 288 289 // Get rid of the original file buffer 290 delete[] wordBufferC; 291 292 // Create a MutableTrieDictionary, and loop through all the lines, inserting 293 // words. 294 295 // First, pick a median character. 296 UChar *current = wordSourceU + (destCap/2); 297 UChar uc = *current++; 298 UnicodeSet breaks; 299 breaks.add(0x000A); // Line Feed 300 breaks.add(0x000D); // Carriage Return 301 breaks.add(0x2028); // Line Separator 302 breaks.add(0x2029); // Paragraph Separator 303 304 do { 305 // Look for line break 306 while (uc && !breaks.contains(uc)) { 307 uc = *current++; 308 } 309 // Now skip to first non-line-break 310 while (uc && breaks.contains(uc)) { 311 uc = *current++; 312 } 313 } 314 while (uc && (breaks.contains(uc) || u_isspace(uc))); 315 316 mtd = new MutableTrieDictionary(uc, status); 317 318 if (U_FAILURE(status)) { 319 fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); 320 exit(status); 321 } 322 323 // Now add the words. Words are non-space characters at the beginning of 324 // lines, and must be at least one UChar. If a word has an associated value, 325 // the value should follow the word on the same line after a tab character. 326 current = wordSourceU; 327 UChar *candidate = current; 328 uc = *current++; 329 int32_t length = 0; 330 int count = 0; 331 332 while (uc) { 333 while (uc && !u_isspace(uc)) { 334 ++length; 335 uc = *current++; 336 } 337 338 UnicodeString valueString; 339 UChar candidateValue; 340 if(uc == 0x0009){ //separator is a tab char, read in number after space 341 while (uc && u_isspace(uc)) { 342 uc = *current++; 343 } 344 while (uc && !u_isspace(uc)) { 345 valueString.append(uc); 346 uc = *current++; 347 } 348 } 349 350 if (length > 0) { 351 count++; 352 if(valueString.length() > 0){ 353 mtd->setValued(TRUE); 354 355 uint32_t value = 0; 356 char* s = new char[valueString.length()]; 357 valueString.extract(0,valueString.length(), s, valueString.length()); 358 int n = sscanf(s, "%ud", &value); 359 U_ASSERT(n == 1); 360 U_ASSERT(value >= 0); 361 mtd->addWord(candidate, length, status, (uint16_t)value); 362 delete[] s; 363 } else { 364 mtd->addWord(candidate, length, status); 365 } 366 367 if (U_FAILURE(status)) { 368 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n", 369 u_errorName(status), count); 370 exit(status); 371 } 372 } 373 374 // Find beginning of next line 375 while (uc && !breaks.contains(uc)) { 376 uc = *current++; 377 } 378 // Find next non-line-breaking character 379 while (uc && breaks.contains(uc)) { 380 uc = *current++; 381 } 382 candidate = current-1; 383 length = 0; 384 } 385 386 // Get rid of the Unicode text buffer 387 delete[] wordSourceU; 388 } 389 390 // Now, create a CompactTrieDictionary from the mutable dictionary 391 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); 392 if (U_FAILURE(status)) { 393 fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); 394 exit(status); 395 } 396 397 // Get rid of the MutableTrieDictionary 398 delete mtd; 399 400 // 401 // Get the binary data from the dictionary. 402 // 403 uint32_t outDataSize = ctd->dataSize(); 404 const uint8_t *outData = (const uint8_t *)ctd->data(); 405 406 // 407 // Create the output file 408 // 409 size_t bytesWritten; 410 UNewDataMemory *pData; 411 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); 412 if(U_FAILURE(status)) { 413 fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n", 414 outFileName, u_errorName(status)); 415 exit(status); 416 } 417 418 419 // Write the data itself. 420 udata_writeBlock(pData, outData, outDataSize); 421 // finish up 422 bytesWritten = udata_finish(pData, &status); 423 if(U_FAILURE(status)) { 424 fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status)); 425 exit(status); 426 } 427 428 if (bytesWritten != outDataSize) { 429 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); 430 exit(-1); 431 } 432 433 // Get rid of the CompactTrieDictionary 434 delete ctd; 435 436 u_cleanup(); 437 438 printf("genctd: tool completed successfully.\n"); 439 return 0; 440 441 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 442 } 443