Home | History | Annotate | Download | only in genctd
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2002-2010, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *
      7 * File genctd.c
      8 */
      9 
     10 //--------------------------------------------------------------------
     11 //
     12 //   Tool for generating CompactTrieDictionary data files (.ctd files).
     13 //
     14 //   Usage:  genctd [options] -o output-file.ctd input-file
     15 //
     16 //       options:   -v         verbose
     17 //                  -? or -h   help
     18 //
     19 //   The input  file is a plain text file containing words, one per line.
     20 //    Words end at the first whitespace; lines beginning with whitespace
     21 //    are ignored.
     22 //    The file can be encoded as UTF-8 or UTF-16 (either endian), or
     23 //    in the default code page (platform dependent).  UTF-encoded
     24 //    files must include a BOM.
     25 //
     26 //--------------------------------------------------------------------
     27 
     28 #include "unicode/utypes.h"
     29 #include "unicode/uchar.h"
     30 #include "unicode/ucnv.h"
     31 #include "unicode/uniset.h"
     32 #include "unicode/unistr.h"
     33 #include "unicode/uclean.h"
     34 #include "unicode/udata.h"
     35 #include "unicode/putil.h"
     36 
     37 //#include "unicode/ustdio.h"
     38 
     39 #include "uoptions.h"
     40 #include "unewdata.h"
     41 #include "ucmndata.h"
     42 #include "rbbidata.h"
     43 #include "triedict.h"
     44 #include "cmemory.h"
     45 #include "uassert.h"
     46 
     47 #include <stdio.h>
     48 #include <stdlib.h>
     49 #include <string.h>
     50 
     51 U_NAMESPACE_USE
     52 
     53 static char *progName;
     54 static UOption options[]={
     55     UOPTION_HELP_H,             /* 0 */
     56     UOPTION_HELP_QUESTION_MARK, /* 1 */
     57     UOPTION_VERBOSE,            /* 2 */
     58     { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 3 */
     59     UOPTION_ICUDATADIR,         /* 4 */
     60     UOPTION_DESTDIR,            /* 5 */
     61     UOPTION_COPYRIGHT,          /* 6 */
     62 };
     63 
     64 void usageAndDie(int retCode) {
     65         printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
     66         printf("\tRead in word list and write out compact trie dictionary\n"
     67             "options:\n"
     68             "\t-h or -? or --help  this usage text\n"
     69             "\t-V or --version     show a version message\n"
     70             "\t-c or --copyright   include a copyright notice\n"
     71             "\t-v or --verbose     turn on verbose output\n"
     72             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
     73             "\t                    followed by path, defaults to %s\n"
     74             "\t-d or --destdir     destination directory, followed by the path\n",
     75             u_getDataDirectory());
     76         exit (retCode);
     77 }
     78 
     79 
     80 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
     81 
     82 /* dummy UDataInfo cf. udata.h */
     83 static UDataInfo dummyDataInfo = {
     84     sizeof(UDataInfo),
     85     0,
     86 
     87     U_IS_BIG_ENDIAN,
     88     U_CHARSET_FAMILY,
     89     U_SIZEOF_UCHAR,
     90     0,
     91 
     92     { 0, 0, 0, 0 },                 /* dummy dataFormat */
     93     { 0, 0, 0, 0 },                 /* dummy formatVersion */
     94     { 0, 0, 0, 0 }                  /* dummy dataVersion */
     95 };
     96 
     97 #else
     98 
     99 //
    100 //  Set up the ICU data header, defined in ucmndata.h
    101 //
    102 DataHeader dh ={
    103     {sizeof(DataHeader),           // Struct MappedData
    104         0xda,
    105         0x27},
    106 
    107     {                               // struct UDataInfo
    108         sizeof(UDataInfo),          //     size
    109         0,                          //     reserved
    110         U_IS_BIG_ENDIAN,
    111         U_CHARSET_FAMILY,
    112         U_SIZEOF_UCHAR,
    113         0,                          //     reserved
    114 
    115     { 0x54, 0x72, 0x44, 0x63 },     // "TrDc" Trie Dictionary
    116     { 1, 0, 0, 0 },                 // 1.0.0.0
    117     { 0, 0, 0, 0 },                 // Irrelevant for this data type
    118     }};
    119 
    120 #endif
    121 
    122 //----------------------------------------------------------------------------
    123 //
    124 //  main      for genctd
    125 //
    126 //----------------------------------------------------------------------------
    127 int  main(int argc, char **argv) {
    128     UErrorCode  status = U_ZERO_ERROR;
    129     const char *wordFileName;
    130     const char *outFileName;
    131     const char *outDir = NULL;
    132     const char *copyright = NULL;
    133 
    134     //
    135     // Pick up and check the command line arguments,
    136     //    using the standard ICU tool utils option handling.
    137     //
    138     U_MAIN_INIT_ARGS(argc, argv);
    139     progName = argv[0];
    140     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    141     if(argc<0) {
    142         // Unrecognized option
    143         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
    144         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    145     }
    146 
    147     if(options[0].doesOccur || options[1].doesOccur) {
    148         //  -? or -h for help.
    149         usageAndDie(0);
    150     }
    151 
    152     if (!options[3].doesOccur || argc < 2) {
    153         fprintf(stderr, "input and output file must both be specified.\n");
    154         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    155     }
    156     outFileName  = options[3].value;
    157     wordFileName = argv[1];
    158 
    159     if (options[4].doesOccur) {
    160         u_setDataDirectory(options[4].value);
    161     }
    162 
    163     status = U_ZERO_ERROR;
    164 
    165     /* Combine the directory with the file name */
    166     if(options[5].doesOccur) {
    167         outDir = options[5].value;
    168     }
    169     if (options[6].doesOccur) {
    170         copyright = U_COPYRIGHT_STRING;
    171     }
    172 
    173 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
    174 
    175     UNewDataMemory *pData;
    176     char msg[1024];
    177 
    178     /* write message with just the name */
    179     sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    180     fprintf(stderr, "%s\n", msg);
    181 
    182     /* write the dummy data file */
    183     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
    184     udata_writeBlock(pData, msg, strlen(msg));
    185     udata_finish(pData, &status);
    186     return (int)status;
    187 
    188 #else
    189     /* Initialize ICU */
    190     u_init(&status);
    191     if (U_FAILURE(status)) {
    192         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
    193             argv[0], u_errorName(status));
    194         exit(1);
    195     }
    196     status = U_ZERO_ERROR;
    197 
    198     //
    199     //  Read in the dictionary source file
    200     //
    201     long        result;
    202     long        wordFileSize;
    203     FILE        *file;
    204     char        *wordBufferC;
    205     MutableTrieDictionary *mtd = NULL;
    206 
    207     file = fopen(wordFileName, "rb");
    208     if( file == 0 ) { //cannot find file
    209         //create 1-line dummy file: ie 1 char, 1 value
    210         UNewDataMemory *pData;
    211         char msg[1024];
    212 
    213         /* write message with just the name */
    214         sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName);
    215         fprintf(stderr, "%s\n", msg);
    216 
    217         UChar c = 0x0020;
    218         mtd = new MutableTrieDictionary(c, status, TRUE);
    219         mtd->addWord(&c, 1, status, 1);
    220 
    221     } else { //read words in from input file
    222         fseek(file, 0, SEEK_END);
    223         wordFileSize = ftell(file);
    224         fseek(file, 0, SEEK_SET);
    225         wordBufferC = new char[wordFileSize+10];
    226 
    227         result = (long)fread(wordBufferC, 1, wordFileSize, file);
    228         if (result != wordFileSize)  {
    229             fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
    230             exit (-1);
    231         }
    232         wordBufferC[wordFileSize]=0;
    233         fclose(file);
    234 
    235         //
    236         // Look for a Unicode Signature (BOM) on the word file
    237         //
    238         int32_t        signatureLength;
    239         const char *   wordSourceC = wordBufferC;
    240         const char*    encoding = ucnv_detectUnicodeSignature(
    241                                wordSourceC, wordFileSize, &signatureLength, &status);
    242         if (U_FAILURE(status)) {
    243             exit(status);
    244         }
    245         if(encoding!=NULL ){
    246             wordSourceC  += signatureLength;
    247             wordFileSize -= signatureLength;
    248         }
    249 
    250         //
    251         // Open a converter to take the rule file to UTF-16
    252         //
    253         UConverter* conv;
    254         conv = ucnv_open(encoding, &status);
    255         if (U_FAILURE(status)) {
    256             fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
    257             exit(status);
    258         }
    259 
    260         //
    261         // Convert the words to UChar.
    262         //  Preflight first to determine required buffer size.
    263         //
    264         uint32_t destCap = ucnv_toUChars(conv,
    265                            NULL,           //  dest,
    266                            0,              //  destCapacity,
    267                            wordSourceC,
    268                            wordFileSize,
    269                            &status);
    270         if (status != U_BUFFER_OVERFLOW_ERROR) {
    271             fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    272             exit(status);
    273         };
    274 
    275         status = U_ZERO_ERROR;
    276         UChar *wordSourceU = new UChar[destCap+1];
    277         ucnv_toUChars(conv,
    278                       wordSourceU,     //  dest,
    279                       destCap+1,
    280                       wordSourceC,
    281                       wordFileSize,
    282                       &status);
    283         if (U_FAILURE(status)) {
    284             fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    285             exit(status);
    286         };
    287         ucnv_close(conv);
    288 
    289         // Get rid of the original file buffer
    290         delete[] wordBufferC;
    291 
    292         // Create a MutableTrieDictionary, and loop through all the lines, inserting
    293         // words.
    294 
    295         // First, pick a median character.
    296         UChar *current = wordSourceU + (destCap/2);
    297         UChar uc = *current++;
    298         UnicodeSet breaks;
    299         breaks.add(0x000A);     // Line Feed
    300         breaks.add(0x000D);     // Carriage Return
    301         breaks.add(0x2028);     // Line Separator
    302         breaks.add(0x2029);     // Paragraph Separator
    303 
    304         do {
    305             // Look for line break
    306             while (uc && !breaks.contains(uc)) {
    307                 uc = *current++;
    308             }
    309             // Now skip to first non-line-break
    310             while (uc && breaks.contains(uc)) {
    311                 uc = *current++;
    312             }
    313         }
    314         while (uc && (breaks.contains(uc) || u_isspace(uc)));
    315 
    316         mtd = new MutableTrieDictionary(uc, status);
    317 
    318         if (U_FAILURE(status)) {
    319             fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
    320             exit(status);
    321         }
    322 
    323         // Now add the words. Words are non-space characters at the beginning of
    324         // lines, and must be at least one UChar. If a word has an associated value,
    325         // the value should follow the word on the same line after a tab character.
    326         current = wordSourceU;
    327         UChar *candidate = current;
    328         uc = *current++;
    329         int32_t length = 0;
    330         int count = 0;
    331 
    332         while (uc) {
    333             while (uc && !u_isspace(uc)) {
    334                 ++length;
    335                 uc = *current++;
    336             }
    337 
    338             UnicodeString valueString;
    339             UChar candidateValue;
    340             if(uc == 0x0009){ //separator is a tab char, read in number after space
    341             	while (uc && u_isspace(uc)) {
    342             		uc = *current++;
    343             	}
    344                 while (uc && !u_isspace(uc)) {
    345                     valueString.append(uc);
    346                     uc = *current++;
    347                 }
    348             }
    349 
    350             if (length > 0) {
    351                 count++;
    352                 if(valueString.length() > 0){
    353                     mtd->setValued(TRUE);
    354 
    355                     uint32_t value = 0;
    356                     char* s = new char[valueString.length()];
    357                     valueString.extract(0,valueString.length(), s, valueString.length());
    358                     int n = sscanf(s, "%ud", &value);
    359                     U_ASSERT(n == 1);
    360                     U_ASSERT(value >= 0);
    361                     mtd->addWord(candidate, length, status, (uint16_t)value);
    362                     delete[] s;
    363                 } else {
    364                     mtd->addWord(candidate, length, status);
    365                 }
    366 
    367                 if (U_FAILURE(status)) {
    368                     fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",
    369                             u_errorName(status), count);
    370                     exit(status);
    371                 }
    372             }
    373 
    374             // Find beginning of next line
    375             while (uc && !breaks.contains(uc)) {
    376                 uc = *current++;
    377             }
    378             // Find next non-line-breaking character
    379             while (uc && breaks.contains(uc)) {
    380                 uc = *current++;
    381             }
    382             candidate = current-1;
    383             length = 0;
    384         }
    385 
    386         // Get rid of the Unicode text buffer
    387         delete[] wordSourceU;
    388     }
    389 
    390     // Now, create a CompactTrieDictionary from the mutable dictionary
    391     CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
    392     if (U_FAILURE(status)) {
    393         fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
    394         exit(status);
    395     }
    396 
    397     // Get rid of the MutableTrieDictionary
    398     delete mtd;
    399 
    400     //
    401     //  Get the binary data from the dictionary.
    402     //
    403     uint32_t        outDataSize = ctd->dataSize();
    404     const uint8_t  *outData = (const uint8_t *)ctd->data();
    405 
    406     //
    407     //  Create the output file
    408     //
    409     size_t bytesWritten;
    410     UNewDataMemory *pData;
    411     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
    412     if(U_FAILURE(status)) {
    413         fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
    414                          outFileName, u_errorName(status));
    415         exit(status);
    416     }
    417 
    418 
    419     //  Write the data itself.
    420     udata_writeBlock(pData, outData, outDataSize);
    421     // finish up
    422     bytesWritten = udata_finish(pData, &status);
    423     if(U_FAILURE(status)) {
    424         fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
    425         exit(status);
    426     }
    427 
    428     if (bytesWritten != outDataSize) {
    429         fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
    430         exit(-1);
    431     }
    432 
    433     // Get rid of the CompactTrieDictionary
    434     delete ctd;
    435 
    436     u_cleanup();
    437 
    438     printf("genctd: tool completed successfully.\n");
    439     return 0;
    440 
    441 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    442 }
    443