Home | History | Annotate | Download | only in genbrk
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2002-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *
      9 * File genbrk.c
     10 */
     11 
     12 //--------------------------------------------------------------------
     13 //
     14 //   Tool for generating RuleBasedBreakIterator data files (.brk files).
     15 //   .brk files contain the precompiled rules for standard types
     16 //   of iterators - word, line, sentence, etc.
     17 //
     18 //   Usage:  genbrk [options] -r rule-file.txt  -o output-file.brk
     19 //
     20 //       options:   -v         verbose
     21 //                  -? or -h   help
     22 //
     23 //   The input rule file is a plain text file containing break rules
     24 //    in the input format accepted by RuleBasedBreakIterators.  The
     25 //    file can be encoded as utf-8, or utf-16 (either endian), or
     26 //    in the default code page (platform dependent.).  utf encoded
     27 //    files must include a BOM.
     28 //
     29 //--------------------------------------------------------------------
     30 
     31 #include "unicode/utypes.h"
     32 #include "unicode/ucnv.h"
     33 #include "unicode/unistr.h"
     34 #include "unicode/rbbi.h"
     35 #include "unicode/uclean.h"
     36 #include "unicode/udata.h"
     37 #include "unicode/putil.h"
     38 
     39 #include "uoptions.h"
     40 #include "unewdata.h"
     41 #include "ucmndata.h"
     42 #include "rbbidata.h"
     43 #include "cmemory.h"
     44 
     45 #include <stdio.h>
     46 #include <stdlib.h>
     47 #include <string.h>
     48 
     49 U_NAMESPACE_USE
     50 
     51 static char *progName;
     52 static UOption options[]={
     53     UOPTION_HELP_H,             /* 0 */
     54     UOPTION_HELP_QUESTION_MARK, /* 1 */
     55     UOPTION_VERBOSE,            /* 2 */
     56     { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */
     57     { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 4 */
     58     UOPTION_ICUDATADIR,         /* 5 */
     59     UOPTION_DESTDIR,            /* 6 */
     60     UOPTION_COPYRIGHT,          /* 7 */
     61     UOPTION_QUIET,              /* 8 */
     62 };
     63 
     64 void usageAndDie(int retCode) {
     65         printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
     66         printf("\tRead in break iteration rules text and write out the binary data\n"
     67             "options:\n"
     68             "\t-h or -? or --help  this usage text\n"
     69             "\t-V or --version     show a version message\n"
     70             "\t-c or --copyright   include a copyright notice\n"
     71             "\t-v or --verbose     turn on verbose output\n"
     72             "\t-q or --quiet       do not display warnings and progress\n"
     73             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
     74             "\t                    followed by path, defaults to %s\n"
     75             "\t-d or --destdir     destination directory, followed by the path\n",
     76             u_getDataDirectory());
     77         exit (retCode);
     78 }
     79 
     80 
     81 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
     82 
     83 /* dummy UDataInfo cf. udata.h */
     84 static UDataInfo dummyDataInfo = {
     85     sizeof(UDataInfo),
     86     0,
     87 
     88     U_IS_BIG_ENDIAN,
     89     U_CHARSET_FAMILY,
     90     U_SIZEOF_UCHAR,
     91     0,
     92 
     93     { 0, 0, 0, 0 },                 /* dummy dataFormat */
     94     { 0, 0, 0, 0 },                 /* dummy formatVersion */
     95     { 0, 0, 0, 0 }                  /* dummy dataVersion */
     96 };
     97 
     98 #else
     99 
    100 //
    101 //  Set up the ICU data header, defined in ucmndata.h
    102 //
    103 DataHeader dh ={
    104     {sizeof(DataHeader),           // Struct MappedData
    105         0xda,
    106         0x27},
    107 
    108     {                               // struct UDataInfo
    109         sizeof(UDataInfo),          //     size
    110         0,                          //     reserved
    111         U_IS_BIG_ENDIAN,
    112         U_CHARSET_FAMILY,
    113         U_SIZEOF_UCHAR,
    114         0,                          //     reserved
    115 
    116     { 0x42, 0x72, 0x6b, 0x20 },     //     dataFormat="Brk "
    117     { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
    118                                     //      from the RBBI rule builder.  The  values declared
    119                                     //      here should never appear in any real RBBI data.
    120         { 4, 1, 0, 0 }              //   dataVersion (Unicode version)
    121     }};
    122 
    123 #endif
    124 
    125 //----------------------------------------------------------------------------
    126 //
    127 //  main      for genbrk
    128 //
    129 //----------------------------------------------------------------------------
    130 int  main(int argc, char **argv) {
    131     UErrorCode  status = U_ZERO_ERROR;
    132     const char *ruleFileName;
    133     const char *outFileName;
    134     const char *outDir = NULL;
    135     const char *copyright = NULL;
    136 
    137     //
    138     // Pick up and check the command line arguments,
    139     //    using the standard ICU tool utils option handling.
    140     //
    141     U_MAIN_INIT_ARGS(argc, argv);
    142     progName = argv[0];
    143     argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
    144     if(argc<0) {
    145         // Unrecognized option
    146         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
    147         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    148     }
    149 
    150     if(options[0].doesOccur || options[1].doesOccur) {
    151         //  -? or -h for help.
    152         usageAndDie(0);
    153     }
    154 
    155     if (!(options[3].doesOccur && options[4].doesOccur)) {
    156         fprintf(stderr, "rule file and output file must both be specified.\n");
    157         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    158     }
    159     ruleFileName = options[3].value;
    160     outFileName  = options[4].value;
    161 
    162     if (options[5].doesOccur) {
    163         u_setDataDirectory(options[5].value);
    164     }
    165 
    166     status = U_ZERO_ERROR;
    167 
    168     /* Combine the directory with the file name */
    169     if(options[6].doesOccur) {
    170         outDir = options[6].value;
    171     }
    172     if (options[7].doesOccur) {
    173         copyright = U_COPYRIGHT_STRING;
    174     }
    175 
    176 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
    177 
    178     UNewDataMemory *pData;
    179     char msg[1024];
    180 
    181     /* write message with just the name */
    182     sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    183     fprintf(stderr, "%s\n", msg);
    184 
    185     /* write the dummy data file */
    186     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
    187     udata_writeBlock(pData, msg, strlen(msg));
    188     udata_finish(pData, &status);
    189     return (int)status;
    190 
    191 #else
    192     /* Initialize ICU */
    193     u_init(&status);
    194     if (U_FAILURE(status)) {
    195         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
    196             argv[0], u_errorName(status));
    197         exit(1);
    198     }
    199     status = U_ZERO_ERROR;
    200 
    201     //
    202     //  Read in the rule source file
    203     //
    204     long        result;
    205     long        ruleFileSize;
    206     FILE        *file;
    207     char        *ruleBufferC;
    208 
    209     file = fopen(ruleFileName, "rb");
    210     if( file == 0 ) {
    211         fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
    212         exit(-1);
    213     }
    214     fseek(file, 0, SEEK_END);
    215     ruleFileSize = ftell(file);
    216     fseek(file, 0, SEEK_SET);
    217     ruleBufferC = new char[ruleFileSize+10];
    218 
    219     result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
    220     if (result != ruleFileSize)  {
    221         fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
    222         exit (-1);
    223     }
    224     ruleBufferC[ruleFileSize]=0;
    225     fclose(file);
    226 
    227     //
    228     // Look for a Unicode Signature (BOM) on the rule file
    229     //
    230     int32_t        signatureLength;
    231     const char *   ruleSourceC = ruleBufferC;
    232     const char*    encoding = ucnv_detectUnicodeSignature(
    233                            ruleSourceC, ruleFileSize, &signatureLength, &status);
    234     if (U_FAILURE(status)) {
    235         exit(status);
    236     }
    237     if(encoding!=NULL ){
    238         ruleSourceC  += signatureLength;
    239         ruleFileSize -= signatureLength;
    240     }
    241 
    242     //
    243     // Open a converter to take the rule file to UTF-16
    244     //
    245     UConverter* conv;
    246     conv = ucnv_open(encoding, &status);
    247     if (U_FAILURE(status)) {
    248         fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
    249         exit(status);
    250     }
    251 
    252     //
    253     // Convert the rules to UChar.
    254     //  Preflight first to determine required buffer size.
    255     //
    256     uint32_t destCap = ucnv_toUChars(conv,
    257                        NULL,           //  dest,
    258                        0,              //  destCapacity,
    259                        ruleSourceC,
    260                        ruleFileSize,
    261                        &status);
    262     if (status != U_BUFFER_OVERFLOW_ERROR) {
    263         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    264         exit(status);
    265     };
    266 
    267     status = U_ZERO_ERROR;
    268     UChar *ruleSourceU = new UChar[destCap+1];
    269     ucnv_toUChars(conv,
    270                   ruleSourceU,     //  dest,
    271                   destCap+1,
    272                   ruleSourceC,
    273                   ruleFileSize,
    274                   &status);
    275     if (U_FAILURE(status)) {
    276         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    277         exit(status);
    278     };
    279     ucnv_close(conv);
    280 
    281 
    282     //
    283     //  Put the source rules into a UnicodeString
    284     //
    285     UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
    286 
    287     //
    288     //  Create the break iterator from the rules
    289     //     This will compile the rules.
    290     //
    291     UParseError parseError;
    292     parseError.line = 0;
    293     parseError.offset = 0;
    294     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
    295     if (U_FAILURE(status)) {
    296         fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
    297                 u_errorName(status), (int)parseError.line, (int)parseError.offset);
    298         exit(status);
    299     };
    300 
    301 
    302     //
    303     //  Get the compiled rule data from the break iterator.
    304     //
    305     uint32_t        outDataSize;
    306     const uint8_t  *outData;
    307     outData = bi->getBinaryRules(outDataSize);
    308 
    309     // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
    310     uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
    311 
    312     //
    313     //  Create the output file
    314     //
    315     size_t bytesWritten;
    316     UNewDataMemory *pData;
    317     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
    318     if(U_FAILURE(status)) {
    319         fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
    320                          outFileName, u_errorName(status));
    321         exit(status);
    322     }
    323 
    324 
    325     //  Write the data itself.
    326     udata_writeBlock(pData, outData, outDataSize);
    327     // finish up
    328     bytesWritten = udata_finish(pData, &status);
    329     if(U_FAILURE(status)) {
    330         fprintf(stderr, "genbrk: error %d writing the output file\n", status);
    331         exit(status);
    332     }
    333 
    334     if (bytesWritten != outDataSize) {
    335         fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
    336         exit(-1);
    337     }
    338 
    339     delete bi;
    340     delete[] ruleSourceU;
    341     delete[] ruleBufferC;
    342     u_cleanup();
    343 
    344 
    345     if(!options[8].doesOccur) {
    346         printf("genbrk: tool completed successfully.\n");
    347     }
    348     return 0;
    349 
    350 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    351 }
    352 
    353