Home | History | Annotate | Download | only in genctd
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2002-2009, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *
      7 * File genctd.c
      8 */
      9 
     10 //--------------------------------------------------------------------
     11 //
     12 //   Tool for generating CompactTrieDictionary data files (.ctd files).
     13 //
     14 //   Usage:  genctd [options] -o output-file.ctd input-file
     15 //
     16 //       options:   -v         verbose
     17 //                  -? or -h   help
     18 //
     19 //   The input  file is a plain text file containing words, one per line.
     20 //    Words end at the first whitespace; lines beginning with whitespace
     21 //    are ignored.
//    The file can be encoded as utf-8, or utf-16 (either endian), or
//    in the default code page (platform dependent).  UTF-encoded
//    files must include a BOM.
     25 //
     26 //--------------------------------------------------------------------
     27 
     28 #include "unicode/utypes.h"
     29 #include "unicode/uchar.h"
     30 #include "unicode/ucnv.h"
     31 #include "unicode/uniset.h"
     32 #include "unicode/unistr.h"
     33 #include "unicode/uclean.h"
     34 #include "unicode/udata.h"
     35 #include "unicode/putil.h"
     36 
     37 #include "uoptions.h"
     38 #include "unewdata.h"
     39 #include "ucmndata.h"
     40 #include "rbbidata.h"
     41 #include "triedict.h"
     42 #include "cmemory.h"
     43 
     44 #include <stdio.h>
     45 #include <stdlib.h>
     46 #include <string.h>
     47 
     48 U_NAMESPACE_USE
     49 
     50 static char *progName;
     51 static UOption options[]={
     52     UOPTION_HELP_H,             /* 0 */
     53     UOPTION_HELP_QUESTION_MARK, /* 1 */
     54     UOPTION_VERBOSE,            /* 2 */
     55     { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 3 */
     56     UOPTION_ICUDATADIR,         /* 4 */
     57     UOPTION_DESTDIR,            /* 5 */
     58     UOPTION_COPYRIGHT,          /* 6 */
     59 };
     60 
     61 void usageAndDie(int retCode) {
     62         printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
     63         printf("\tRead in word list and write out compact trie dictionary\n"
     64             "options:\n"
     65             "\t-h or -? or --help  this usage text\n"
     66             "\t-V or --version     show a version message\n"
     67             "\t-c or --copyright   include a copyright notice\n"
     68             "\t-v or --verbose     turn on verbose output\n"
     69             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
     70             "\t                    followed by path, defaults to %s\n"
     71             "\t-d or --destdir     destination directory, followed by the path\n",
     72             u_getDataDirectory());
     73         exit (retCode);
     74 }
     75 
     76 
     77 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
     78 
     79 /* dummy UDataInfo cf. udata.h */
     80 static UDataInfo dummyDataInfo = {
     81     sizeof(UDataInfo),
     82     0,
     83 
     84     U_IS_BIG_ENDIAN,
     85     U_CHARSET_FAMILY,
     86     U_SIZEOF_UCHAR,
     87     0,
     88 
     89     { 0, 0, 0, 0 },                 /* dummy dataFormat */
     90     { 0, 0, 0, 0 },                 /* dummy formatVersion */
     91     { 0, 0, 0, 0 }                  /* dummy dataVersion */
     92 };
     93 
     94 #else
     95 
     96 //
     97 //  Set up the ICU data header, defined in ucmndata.h
     98 //
     99 DataHeader dh ={
    100     {sizeof(DataHeader),           // Struct MappedData
    101         0xda,
    102         0x27},
    103 
    104     {                               // struct UDataInfo
    105         sizeof(UDataInfo),          //     size
    106         0,                          //     reserved
    107         U_IS_BIG_ENDIAN,
    108         U_CHARSET_FAMILY,
    109         U_SIZEOF_UCHAR,
    110         0,                          //     reserved
    111 
    112     { 0x54, 0x72, 0x44, 0x63 },     // "TrDc" Trie Dictionary
    113     { 1, 0, 0, 0 },                 // 1.0.0.0
    114     { 0, 0, 0, 0 },                 // Irrelevant for this data type
    115     }};
    116 
    117 #endif
    118 
    119 //----------------------------------------------------------------------------
    120 //
    121 //  main      for genctd
    122 //
    123 //----------------------------------------------------------------------------
    124 int  main(int argc, char **argv) {
    125     UErrorCode  status = U_ZERO_ERROR;
    126     const char *wordFileName;
    127     const char *outFileName;
    128     const char *outDir = NULL;
    129     const char *copyright = NULL;
    130 
    131     //
    132     // Pick up and check the command line arguments,
    133     //    using the standard ICU tool utils option handling.
    134     //
    135     U_MAIN_INIT_ARGS(argc, argv);
    136     progName = argv[0];
    137     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    138     if(argc<0) {
    139         // Unrecognized option
    140         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
    141         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    142     }
    143 
    144     if(options[0].doesOccur || options[1].doesOccur) {
    145         //  -? or -h for help.
    146         usageAndDie(0);
    147     }
    148 
    149     if (!options[3].doesOccur || argc < 2) {
    150         fprintf(stderr, "input and output file must both be specified.\n");
    151         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    152     }
    153     outFileName  = options[3].value;
    154     wordFileName = argv[1];
    155 
    156     if (options[4].doesOccur) {
    157         u_setDataDirectory(options[4].value);
    158     }
    159 
    160     status = U_ZERO_ERROR;
    161 
    162     /* Combine the directory with the file name */
    163     if(options[5].doesOccur) {
    164         outDir = options[5].value;
    165     }
    166     if (options[6].doesOccur) {
    167         copyright = U_COPYRIGHT_STRING;
    168     }
    169 
    170 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
    171 
    172     UNewDataMemory *pData;
    173     char msg[1024];
    174 
    175     /* write message with just the name */
    176     sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    177     fprintf(stderr, "%s\n", msg);
    178 
    179     /* write the dummy data file */
    180     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
    181     udata_writeBlock(pData, msg, strlen(msg));
    182     udata_finish(pData, &status);
    183     return (int)status;
    184 
    185 #else
    186     /* Initialize ICU */
    187     u_init(&status);
    188     if (U_FAILURE(status)) {
    189         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
    190             argv[0], u_errorName(status));
    191         exit(1);
    192     }
    193     status = U_ZERO_ERROR;
    194 
    195     //
    196     //  Read in the dictionary source file
    197     //
    198     long        result;
    199     long        wordFileSize;
    200     FILE        *file;
    201     char        *wordBufferC;
    202 
    203     file = fopen(wordFileName, "rb");
    204     if( file == 0 ) {
    205         fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
    206         exit(-1);
    207     }
    208     fseek(file, 0, SEEK_END);
    209     wordFileSize = ftell(file);
    210     fseek(file, 0, SEEK_SET);
    211     wordBufferC = new char[wordFileSize+10];
    212 
    213     result = (long)fread(wordBufferC, 1, wordFileSize, file);
    214     if (result != wordFileSize)  {
    215         fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
    216         exit (-1);
    217     }
    218     wordBufferC[wordFileSize]=0;
    219     fclose(file);
    220 
    221     //
    222     // Look for a Unicode Signature (BOM) on the word file
    223     //
    224     int32_t        signatureLength;
    225     const char *   wordSourceC = wordBufferC;
    226     const char*    encoding = ucnv_detectUnicodeSignature(
    227                            wordSourceC, wordFileSize, &signatureLength, &status);
    228     if (U_FAILURE(status)) {
    229         exit(status);
    230     }
    231     if(encoding!=NULL ){
    232         wordSourceC  += signatureLength;
    233         wordFileSize -= signatureLength;
    234     }
    235 
    236     //
    237     // Open a converter to take the rule file to UTF-16
    238     //
    239     UConverter* conv;
    240     conv = ucnv_open(encoding, &status);
    241     if (U_FAILURE(status)) {
    242         fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
    243         exit(status);
    244     }
    245 
    246     //
    247     // Convert the words to UChar.
    248     //  Preflight first to determine required buffer size.
    249     //
    250     uint32_t destCap = ucnv_toUChars(conv,
    251                        NULL,           //  dest,
    252                        0,              //  destCapacity,
    253                        wordSourceC,
    254                        wordFileSize,
    255                        &status);
    256     if (status != U_BUFFER_OVERFLOW_ERROR) {
    257         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    258         exit(status);
    259     };
    260 
    261     status = U_ZERO_ERROR;
    262     UChar *wordSourceU = new UChar[destCap+1];
    263     ucnv_toUChars(conv,
    264                   wordSourceU,     //  dest,
    265                   destCap+1,
    266                   wordSourceC,
    267                   wordFileSize,
    268                   &status);
    269     if (U_FAILURE(status)) {
    270         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    271         exit(status);
    272     };
    273     ucnv_close(conv);
    274 
    275     // Get rid of the original file buffer
    276     delete[] wordBufferC;
    277 
    278     // Create a MutableTrieDictionary, and loop through all the lines, inserting
    279     // words.
    280 
    281     // First, pick a median character.
    282     UChar *current = wordSourceU + (destCap/2);
    283     UChar uc = *current++;
    284     UnicodeSet breaks;
    285     breaks.add(0x000A);     // Line Feed
    286     breaks.add(0x000D);     // Carriage Return
    287     breaks.add(0x2028);     // Line Separator
    288     breaks.add(0x2029);     // Paragraph Separator
    289 
    290     do {
    291         // Look for line break
    292         while (uc && !breaks.contains(uc)) {
    293             uc = *current++;
    294         }
    295         // Now skip to first non-line-break
    296         while (uc && breaks.contains(uc)) {
    297             uc = *current++;
    298         }
    299     }
    300     while (uc && (breaks.contains(uc) || u_isspace(uc)));
    301 
    302     MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
    303 
    304     if (U_FAILURE(status)) {
    305         fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
    306         exit(status);
    307     }
    308 
    309     // Now add the words. Words are non-space characters at the beginning of
    310     // lines, and must be at least one UChar.
    311     current = wordSourceU;
    312     UChar *candidate = current;
    313     uc = *current++;
    314     int32_t length = 0;
    315 
    316     while (uc) {
    317         while (uc && !u_isspace(uc)) {
    318             ++length;
    319             uc = *current++;
    320         }
    321         if (length > 0) {
    322             mtd->addWord(candidate, length, status);
    323             if (U_FAILURE(status)) {
    324                 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
    325                         u_errorName(status));
    326                 exit(status);
    327             }
    328         }
    329         // Find beginning of next line
    330         while (uc && !breaks.contains(uc)) {
    331             uc = *current++;
    332         }
    333         while (uc && breaks.contains(uc)) {
    334             uc = *current++;
    335         }
    336         candidate = current-1;
    337         length = 0;
    338     }
    339 
    340     // Get rid of the Unicode text buffer
    341     delete[] wordSourceU;
    342 
    343     // Now, create a CompactTrieDictionary from the mutable dictionary
    344     CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
    345     if (U_FAILURE(status)) {
    346         fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
    347         exit(status);
    348     }
    349 
    350     // Get rid of the MutableTrieDictionary
    351     delete mtd;
    352 
    353     //
    354     //  Get the binary data from the dictionary.
    355     //
    356     uint32_t        outDataSize = ctd->dataSize();
    357     const uint8_t  *outData = (const uint8_t *)ctd->data();
    358 
    359     //
    360     //  Create the output file
    361     //
    362     size_t bytesWritten;
    363     UNewDataMemory *pData;
    364     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
    365     if(U_FAILURE(status)) {
    366         fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
    367                          outFileName, u_errorName(status));
    368         exit(status);
    369     }
    370 
    371 
    372     //  Write the data itself.
    373     udata_writeBlock(pData, outData, outDataSize);
    374     // finish up
    375     bytesWritten = udata_finish(pData, &status);
    376     if(U_FAILURE(status)) {
    377         fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
    378         exit(status);
    379     }
    380 
    381     if (bytesWritten != outDataSize) {
    382         fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
    383         exit(-1);
    384     }
    385 
    386     // Get rid of the CompactTrieDictionary
    387     delete ctd;
    388 
    389     u_cleanup();
    390 
    391     printf("genctd: tool completed successfully.\n");
    392     return 0;
    393 
    394 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    395 }
    396 
    397