Home | History | Annotate | Download | only in genbrk
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2002-2009, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *
      7 * File genbrk.c
      8 */
      9 
     10 //--------------------------------------------------------------------
     11 //
     12 //   Tool for generating RuleBasedBreakIterator data files (.brk files).
     13 //   .brk files contain the precompiled rules for standard types
     14 //   of iterators - word, line, sentence, etc.
     15 //
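//   Usage:  genbrk [options] -r rule-file.txt  -o output-file.brk
//
//       options:   -v         verbose
//                  -? or -h   help
//                  -c         include a copyright notice in the output file
//                  -i dir     ICU data directory
//                  -d dir     destination directory for the output file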
     20 //
//   The input rule file is a plain text file containing break rules
//    in the input format accepted by RuleBasedBreakIterators.  The
//    file can be encoded as UTF-8 or UTF-16 (either endian), or
//    in the platform-dependent default code page.  UTF-encoded
//    files must include a BOM.
     26 //
     27 //--------------------------------------------------------------------
     28 
     29 #include "unicode/utypes.h"
     30 #include "unicode/ucnv.h"
     31 #include "unicode/unistr.h"
     32 #include "unicode/rbbi.h"
     33 #include "unicode/uclean.h"
     34 #include "unicode/udata.h"
     35 #include "unicode/putil.h"
     36 
     37 #include "uoptions.h"
     38 #include "unewdata.h"
     39 #include "ucmndata.h"
     40 #include "rbbidata.h"
     41 #include "cmemory.h"
     42 
     43 #include <stdio.h>
     44 #include <stdlib.h>
     45 #include <string.h>
     46 
     47 U_NAMESPACE_USE
     48 
// Name this program was invoked with (argv[0]).  Assigned at the start of
// main() and printed by usageAndDie().
static char *progName;
// Command line option table passed to u_parseArgs().
// main() refers to the entries by position (options[0] ... options[7]), so the
// index noted beside each entry must stay in step with those uses.
static UOption options[]={
    UOPTION_HELP_H,             /* 0: help */
    UOPTION_HELP_QUESTION_MARK, /* 1: help */
    UOPTION_VERBOSE,            /* 2: verbose; not consulted by main() */
    { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3: rule source file, required */
    { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 4: output file, required */
    UOPTION_ICUDATADIR,         /* 5: value is passed to u_setDataDirectory() */
    UOPTION_DESTDIR,            /* 6: output directory handed to udata_create() */
    UOPTION_COPYRIGHT,          /* 7: write U_COPYRIGHT_STRING into the output */
};
     60 
     61 void usageAndDie(int retCode) {
     62         printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
     63         printf("\tRead in break iteration rules text and write out the binary data\n"
     64             "options:\n"
     65             "\t-h or -? or --help  this usage text\n"
     66             "\t-V or --version     show a version message\n"
     67             "\t-c or --copyright   include a copyright notice\n"
     68             "\t-v or --verbose     turn on verbose output\n"
     69             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
     70             "\t                    followed by path, defaults to %s\n"
     71             "\t-d or --destdir     destination directory, followed by the path\n",
     72             u_getDataDirectory());
     73         exit (retCode);
     74 }
     75 
     76 
#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO

/* dummy UDataInfo cf. udata.h */
/* main() passes this to udata_create() for the placeholder output file that */
/* is written when break iteration or file I/O is configured out.  The data */
/* format and both version fields are deliberately all zero. */
static UDataInfo dummyDataInfo = {
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    { 0, 0, 0, 0 },                 /* dummy dataFormat */
    { 0, 0, 0, 0 },                 /* dummy formatVersion */
    { 0, 0, 0, 0 }                  /* dummy dataVersion */
};

#else

//
//  Set up the ICU data header, defined in ucmndata.h
//
//  Only the UDataInfo part (dh.info) is used by main(): it is handed to
//  udata_create() after the formatVersion field has been overwritten with
//  the version reported by the compiled break rules.
//
DataHeader dh ={
    {sizeof(DataHeader),           // Struct MappedData
        0xda,
        0x27},

    {                               // struct UDataInfo
        sizeof(UDataInfo),          //     size
        0,                          //     reserved
        U_IS_BIG_ENDIAN,
        U_CHARSET_FAMILY,
        U_SIZEOF_UCHAR,
        0,                          //     reserved

    { 0x42, 0x72, 0x6b, 0x20 },     //     dataFormat="Brk " (ASCII 'B','r','k',' ')
    { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
                                    //      from the RBBI rule builder.  The  values declared
                                    //      here should never appear in any real RBBI data.
        { 4, 1, 0, 0 }              //   dataVersion (Unicode version)
    }};

#endif /* UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO */
    120 
    121 //----------------------------------------------------------------------------
    122 //
    123 //  main      for genbrk
    124 //
    125 //----------------------------------------------------------------------------
    126 int  main(int argc, char **argv) {
    127     UErrorCode  status = U_ZERO_ERROR;
    128     const char *ruleFileName;
    129     const char *outFileName;
    130     const char *outDir = NULL;
    131     const char *copyright = NULL;
    132 
    133     //
    134     // Pick up and check the command line arguments,
    135     //    using the standard ICU tool utils option handling.
    136     //
    137     U_MAIN_INIT_ARGS(argc, argv);
    138     progName = argv[0];
    139     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    140     if(argc<0) {
    141         // Unrecognized option
    142         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
    143         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    144     }
    145 
    146     if(options[0].doesOccur || options[1].doesOccur) {
    147         //  -? or -h for help.
    148         usageAndDie(0);
    149     }
    150 
    151     if (!(options[3].doesOccur && options[4].doesOccur)) {
    152         fprintf(stderr, "rule file and output file must both be specified.\n");
    153         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    154     }
    155     ruleFileName = options[3].value;
    156     outFileName  = options[4].value;
    157 
    158     if (options[5].doesOccur) {
    159         u_setDataDirectory(options[5].value);
    160     }
    161 
    162     status = U_ZERO_ERROR;
    163 
    164     /* Combine the directory with the file name */
    165     if(options[6].doesOccur) {
    166         outDir = options[6].value;
    167     }
    168     if (options[7].doesOccur) {
    169         copyright = U_COPYRIGHT_STRING;
    170     }
    171 
    172 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
    173 
    174     UNewDataMemory *pData;
    175     char msg[1024];
    176 
    177     /* write message with just the name */
    178     sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    179     fprintf(stderr, "%s\n", msg);
    180 
    181     /* write the dummy data file */
    182     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
    183     udata_writeBlock(pData, msg, strlen(msg));
    184     udata_finish(pData, &status);
    185     return (int)status;
    186 
    187 #else
    188     /* Initialize ICU */
    189     u_init(&status);
    190     if (U_FAILURE(status)) {
    191         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
    192             argv[0], u_errorName(status));
    193         exit(1);
    194     }
    195     status = U_ZERO_ERROR;
    196 
    197     //
    198     //  Read in the rule source file
    199     //
    200     long        result;
    201     long        ruleFileSize;
    202     FILE        *file;
    203     char        *ruleBufferC;
    204 
    205     file = fopen(ruleFileName, "rb");
    206     if( file == 0 ) {
    207         fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
    208         exit(-1);
    209     }
    210     fseek(file, 0, SEEK_END);
    211     ruleFileSize = ftell(file);
    212     fseek(file, 0, SEEK_SET);
    213     ruleBufferC = new char[ruleFileSize+10];
    214 
    215     result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
    216     if (result != ruleFileSize)  {
    217         fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
    218         exit (-1);
    219     }
    220     ruleBufferC[ruleFileSize]=0;
    221     fclose(file);
    222 
    223     //
    224     // Look for a Unicode Signature (BOM) on the rule file
    225     //
    226     int32_t        signatureLength;
    227     const char *   ruleSourceC = ruleBufferC;
    228     const char*    encoding = ucnv_detectUnicodeSignature(
    229                            ruleSourceC, ruleFileSize, &signatureLength, &status);
    230     if (U_FAILURE(status)) {
    231         exit(status);
    232     }
    233     if(encoding!=NULL ){
    234         ruleSourceC  += signatureLength;
    235         ruleFileSize -= signatureLength;
    236     }
    237 
    238     //
    239     // Open a converter to take the rule file to UTF-16
    240     //
    241     UConverter* conv;
    242     conv = ucnv_open(encoding, &status);
    243     if (U_FAILURE(status)) {
    244         fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
    245         exit(status);
    246     }
    247 
    248     //
    249     // Convert the rules to UChar.
    250     //  Preflight first to determine required buffer size.
    251     //
    252     uint32_t destCap = ucnv_toUChars(conv,
    253                        NULL,           //  dest,
    254                        0,              //  destCapacity,
    255                        ruleSourceC,
    256                        ruleFileSize,
    257                        &status);
    258     if (status != U_BUFFER_OVERFLOW_ERROR) {
    259         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    260         exit(status);
    261     };
    262 
    263     status = U_ZERO_ERROR;
    264     UChar *ruleSourceU = new UChar[destCap+1];
    265     ucnv_toUChars(conv,
    266                   ruleSourceU,     //  dest,
    267                   destCap+1,
    268                   ruleSourceC,
    269                   ruleFileSize,
    270                   &status);
    271     if (U_FAILURE(status)) {
    272         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    273         exit(status);
    274     };
    275     ucnv_close(conv);
    276 
    277 
    278     //
    279     //  Put the source rules into a UnicodeString
    280     //
    281     UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
    282 
    283     //
    284     //  Create the break iterator from the rules
    285     //     This will compile the rules.
    286     //
    287     UParseError parseError;
    288     parseError.line = 0;
    289     parseError.offset = 0;
    290     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
    291     if (U_FAILURE(status)) {
    292         fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
    293                 u_errorName(status), (int)parseError.line, (int)parseError.offset);
    294         exit(status);
    295     };
    296 
    297 
    298     //
    299     //  Get the compiled rule data from the break iterator.
    300     //
    301     uint32_t        outDataSize;
    302     const uint8_t  *outData;
    303     outData = bi->getBinaryRules(outDataSize);
    304 
    305     // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
    306     uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
    307 
    308     //
    309     //  Create the output file
    310     //
    311     size_t bytesWritten;
    312     UNewDataMemory *pData;
    313     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
    314     if(U_FAILURE(status)) {
    315         fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
    316                          outFileName, u_errorName(status));
    317         exit(status);
    318     }
    319 
    320 
    321     //  Write the data itself.
    322     udata_writeBlock(pData, outData, outDataSize);
    323     // finish up
    324     bytesWritten = udata_finish(pData, &status);
    325     if(U_FAILURE(status)) {
    326         fprintf(stderr, "genbrk: error %d writing the output file\n", status);
    327         exit(status);
    328     }
    329 
    330     if (bytesWritten != outDataSize) {
    331         fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
    332         exit(-1);
    333     }
    334 
    335     delete bi;
    336     delete[] ruleSourceU;
    337     delete[] ruleBufferC;
    338     u_cleanup();
    339 
    340 
    341     printf("genbrk: tool completed successfully.\n");
    342     return 0;
    343 
    344 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    345 }
    346 
    347