Home | History | Annotate | Download | only in toolutil
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /******************************************************************************
      4  *   Copyright (C) 2008-2012, International Business Machines
      5  *   Corporation and others.  All Rights Reserved.
      6  *******************************************************************************
      7  */
      8 #include "unicode/utypes.h"
      9 
     10 #include <stdio.h>
     11 #include <stdlib.h>
     12 #include "unicode/utypes.h"
     13 #include "unicode/putil.h"
     14 #include "cmemory.h"
     15 #include "cstring.h"
     16 #include "filestrm.h"
     17 #include "toolutil.h"
     18 #include "unicode/uclean.h"
     19 #include "unewdata.h"
     20 #include "putilimp.h"
     21 #include "pkg_gencmn.h"
     22 
/* Capacity, in bytes, of the static string pool (stringStore) that allocString() hands out. */
#define STRING_STORE_SIZE 200000

/* Defaults that createCommonDataFile() substitutes when its name/type arguments are NULL. */
#define COMMON_DATA_NAME U_ICUDATA_NAME
#define DATA_TYPE "dat"
     27 
     28 /* ICU package data file format (.dat files) ------------------------------- ***
     29 
     30 Description of the data format after the usual ICU data file header
     31 (UDataInfo etc.).
     32 
     33 Format version 1
     34 
     35 A .dat package file contains a simple Table of Contents of item names,
     36 followed by the items themselves:
     37 
     38 1. ToC table
     39 
     40 uint32_t count; - number of items
     41 UDataOffsetTOCEntry entry[count]; - pair of uint32_t values per item:
     42     uint32_t nameOffset; - offset of the item name
     43     uint32_t dataOffset; - offset of the item data
     44 both are byte offsets from the beginning of the data
     45 
     46 2. item name strings
     47 
     48 All item names are stored as char * strings in one block between the ToC table
     49 and the data items.
     50 
     51 3. data items
     52 
     53 The data items are stored following the item names block.
     54 Each data item is 16-aligned.
     55 The data items are stored in the sorted order of their names.
     56 
Therefore, the end of the name strings block (rounded up to a multiple of 16)
is the offset of the first item, the length of the last item is the difference
between the .dat file length and its offset, and the length of every other item
is the difference between the next item's offset and its own offset.
     61 
     62 ----------------------------------------------------------------------------- */
     63 
/*
 * UDataInfo cf. udata.h
 *
 * Header descriptor passed to udata_create() when the common data (.dat)
 * file is written. The initializers are positional; the field names in the
 * comments below follow the UDataInfo declaration in udata.h.
 */
static const UDataInfo dataInfo={
    sizeof(UDataInfo),            /* size of this structure */
    0,                            /* reserved */

    U_IS_BIG_ENDIAN,              /* endianness of the build platform */
    U_CHARSET_FAMILY,             /* charset family of the build platform */
    sizeof(UChar),                /* size of a UChar on the build platform */
    0,                            /* reserved */

    {0x43, 0x6d, 0x6e, 0x44},     /* dataFormat="CmnD" */
    {1, 0, 0, 0},                 /* formatVersion */
    {3, 0, 0, 0}                  /* dataVersion */
};
     78 
/* Size limit for a single item, set from createCommonDataFile(max_size); 0 disables the check in addFile(). */
static uint32_t maxSize;

/* Fixed pool from which allocString() carves NUL-terminated strings; nothing is ever returned to it. */
static char stringStore[STRING_STORE_SIZE];
/*
 * stringTop:     number of bytes of stringStore already handed out.
 * basenameTotal: sum of the item name lengths (each including its NUL)
 *                accumulated by addFile(); used to size the name block of
 *                the .dat file.
 */
static uint32_t stringTop=0, basenameTotal=0;

/*
 * One listed item.
 *   pathname       - path used to open the item file; in table-of-contents
 *                    source mode it holds the generated C symbol name instead
 *   basename       - item name as stored in the table of contents
 *                    (package name, tree separator, item path)
 *   basenameLength - length of basename including the terminating NUL
 *                    (only set when a .dat file is being built)
 *   basenameOffset - offset of the name inside the .dat data
 *   fileSize       - size of the item file in bytes
 *   fileOffset     - offset of the item data inside the .dat data
 */
typedef struct {
    char *pathname, *basename;
    uint32_t basenameLength, basenameOffset, fileSize, fileOffset;
} File;

/* Number of File slots added each time addFile() grows the files array. */
#define CHUNK_FILE_COUNT 256
static File *files = NULL;       /* growable array of listed items */
static uint32_t fileCount=0;     /* number of entries in use */
static uint32_t fileMax = 0;     /* number of entries allocated */


/*
 * Optional prefix written in front of every symbol name in the generated
 * table-of-contents source. Nothing in this part of the file assigns it, so
 * it is NULL (treated as "") unless set elsewhere.
 */
static char *symPrefix = NULL;

/* Size of the buffer used to read one line of the item list. */
#define LINE_BUFFER_SIZE 512
     98 /* prototypes --------------------------------------------------------------- */
     99 
/* Register one listed item in the files array; exits the process on error. */
static void
addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose);

/* Reserve length bytes in the static string pool; exits the process when the pool is exhausted. */
static char *
allocString(uint32_t length);

/* qsort() comparator for File entries; declared with C linkage because it is passed to qsort(). */
U_CDECL_BEGIN
static int
compareFiles(const void *file1, const void *file2);
U_CDECL_END

/* Build a newly allocated "source + file separator + path" string for opening an item file. */
static char *
pathToFullPath(const char *path, const char *source);

/* map non-tree separator (such as '\') to tree separator ('/') inplace. */
static void
fixDirToTreePath(char *s);
    117 /* -------------------------------------------------------------------------- */
    118 
    119 U_CAPI void U_EXPORT2
    120 createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight,
    121                      const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName) {
    122     static char buffer[4096];
    123     char *line;
    124     char *linePtr;
    125     char *s = NULL;
    126     UErrorCode errorCode=U_ZERO_ERROR;
    127     uint32_t i, fileOffset, basenameOffset, length, nread;
    128     FileStream *in, *file;
    129 
    130     line = (char *)uprv_malloc(sizeof(char) * LINE_BUFFER_SIZE);
    131     if (line == NULL) {
    132         fprintf(stderr, "gencmn: unable to allocate memory for line buffer of size %d\n", LINE_BUFFER_SIZE);
    133         exit(U_MEMORY_ALLOCATION_ERROR);
    134     }
    135 
    136     linePtr = line;
    137 
    138     maxSize = max_size;
    139 
    140     if (destDir == NULL) {
    141         destDir = u_getDataDirectory();
    142     }
    143     if (name == NULL) {
    144         name = COMMON_DATA_NAME;
    145     }
    146     if (type == NULL) {
    147         type = DATA_TYPE;
    148     }
    149     if (source == NULL) {
    150         source = ".";
    151     }
    152 
    153     if (dataFile == NULL) {
    154         in = T_FileStream_stdin();
    155     } else {
    156         in = T_FileStream_open(dataFile, "r");
    157         if(in == NULL) {
    158             fprintf(stderr, "gencmn: unable to open input file %s\n", dataFile);
    159             exit(U_FILE_ACCESS_ERROR);
    160         }
    161     }
    162 
    163     if (verbose) {
    164         if(sourceTOC) {
    165             printf("generating %s_%s.c (table of contents source file)\n", name, type);
    166         } else {
    167             printf("generating %s.%s (common data file with table of contents)\n", name, type);
    168         }
    169     }
    170 
    171     /* read the list of files and get their lengths */
    172     while((s != NULL && *s != 0) || (s=T_FileStream_readLine(in, (line=linePtr),
    173                                                              LINE_BUFFER_SIZE))!=NULL) {
    174         /* remove trailing newline characters and parse space separated items */
    175         if (s != NULL && *s != 0) {
    176             line=s;
    177         } else {
    178             s=line;
    179         }
    180         while(*s!=0) {
    181             if(*s==' ') {
    182                 *s=0;
    183                 ++s;
    184                 break;
    185             } else if(*s=='\r' || *s=='\n') {
    186                 *s=0;
    187                 break;
    188             }
    189             ++s;
    190         }
    191 
    192         /* check for comment */
    193 
    194         if (*line == '#') {
    195             continue;
    196         }
    197 
    198         /* add the file */
    199 #if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR)
    200         {
    201           char *t;
    202           while((t = uprv_strchr(line,U_FILE_ALT_SEP_CHAR))) {
    203             *t = U_FILE_SEP_CHAR;
    204           }
    205         }
    206 #endif
    207         addFile(getLongPathname(line), name, source, sourceTOC, verbose);
    208     }
    209 
    210     uprv_free(linePtr);
    211 
    212     if(in!=T_FileStream_stdin()) {
    213         T_FileStream_close(in);
    214     }
    215 
    216     if(fileCount==0) {
    217         fprintf(stderr, "gencmn: no files listed in %s\n", dataFile == NULL ? "<stdin>" : dataFile);
    218         return;
    219     }
    220 
    221     /* sort the files by basename */
    222     qsort(files, fileCount, sizeof(File), compareFiles);
    223 
    224     if(!sourceTOC) {
    225         UNewDataMemory *out;
    226 
    227         /* determine the offsets of all basenames and files in this common one */
    228         basenameOffset=4+8*fileCount;
    229         fileOffset=(basenameOffset+(basenameTotal+15))&~0xf;
    230         for(i=0; i<fileCount; ++i) {
    231             files[i].fileOffset=fileOffset;
    232             fileOffset+=(files[i].fileSize+15)&~0xf;
    233             files[i].basenameOffset=basenameOffset;
    234             basenameOffset+=files[i].basenameLength;
    235         }
    236 
    237         /* create the output file */
    238         out=udata_create(destDir, type, name,
    239                          &dataInfo,
    240                          copyRight == NULL ? U_COPYRIGHT_STRING : copyRight,
    241                          &errorCode);
    242         if(U_FAILURE(errorCode)) {
    243             fprintf(stderr, "gencmn: udata_create(-d %s -n %s -t %s) failed - %s\n",
    244                 destDir, name, type,
    245                 u_errorName(errorCode));
    246             exit(errorCode);
    247         }
    248 
    249         /* write the table of contents */
    250         udata_write32(out, fileCount);
    251         for(i=0; i<fileCount; ++i) {
    252             udata_write32(out, files[i].basenameOffset);
    253             udata_write32(out, files[i].fileOffset);
    254         }
    255 
    256         /* write the basenames */
    257         for(i=0; i<fileCount; ++i) {
    258             udata_writeString(out, files[i].basename, files[i].basenameLength);
    259         }
    260         length=4+8*fileCount+basenameTotal;
    261 
    262         /* copy the files */
    263         for(i=0; i<fileCount; ++i) {
    264             /* pad to 16-align the next file */
    265             length&=0xf;
    266             if(length!=0) {
    267                 udata_writePadding(out, 16-length);
    268             }
    269 
    270             if (verbose) {
    271                 printf("adding %s (%ld byte%s)\n", files[i].pathname, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
    272             }
    273 
    274             /* copy the next file */
    275             file=T_FileStream_open(files[i].pathname, "rb");
    276             if(file==NULL) {
    277                 fprintf(stderr, "gencmn: unable to open listed file %s\n", files[i].pathname);
    278                 exit(U_FILE_ACCESS_ERROR);
    279             }
    280             for(nread = 0;;) {
    281                 length=T_FileStream_read(file, buffer, sizeof(buffer));
    282                 if(length <= 0) {
    283                     break;
    284                 }
    285                 nread += length;
    286                 udata_writeBlock(out, buffer, length);
    287             }
    288             T_FileStream_close(file);
    289             length=files[i].fileSize;
    290 
    291             if (nread != files[i].fileSize) {
    292               fprintf(stderr, "gencmn: unable to read %s properly (got %ld/%ld byte%s)\n", files[i].pathname,  (long)nread, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
    293                 exit(U_FILE_ACCESS_ERROR);
    294             }
    295         }
    296 
    297         /* pad to 16-align the last file (cleaner, avoids growing .dat files in icuswap) */
    298         length&=0xf;
    299         if(length!=0) {
    300             udata_writePadding(out, 16-length);
    301         }
    302 
    303         /* finish */
    304         udata_finish(out, &errorCode);
    305         if(U_FAILURE(errorCode)) {
    306             fprintf(stderr, "gencmn: udata_finish() failed - %s\n", u_errorName(errorCode));
    307             exit(errorCode);
    308         }
    309     } else {
    310         /* write a .c source file with the table of contents */
    311         char *filename;
    312         FileStream *out;
    313 
    314         /* create the output filename */
    315         filename=s=buffer;
    316         uprv_strcpy(filename, destDir);
    317         s=filename+uprv_strlen(filename);
    318         if(s>filename && *(s-1)!=U_FILE_SEP_CHAR) {
    319             *s++=U_FILE_SEP_CHAR;
    320         }
    321         uprv_strcpy(s, name);
    322         if(*(type)!=0) {
    323             s+=uprv_strlen(s);
    324             *s++='_';
    325             uprv_strcpy(s, type);
    326         }
    327         s+=uprv_strlen(s);
    328         uprv_strcpy(s, ".c");
    329 
    330         /* open the output file */
    331         out=T_FileStream_open(filename, "w");
    332         if (gencmnFileName != NULL) {
    333             uprv_strcpy(gencmnFileName, filename);
    334         }
    335         if(out==NULL) {
    336             fprintf(stderr, "gencmn: unable to open .c output file %s\n", filename);
    337             exit(U_FILE_ACCESS_ERROR);
    338         }
    339 
    340         /* write the source file */
    341         sprintf(buffer,
    342             "/*\n"
    343             " * ICU common data table of contents for %s.%s\n"
    344             " * Automatically generated by icu/source/tools/gencmn/gencmn .\n"
    345             " */\n\n"
    346             "#include \"unicode/utypes.h\"\n"
    347             "#include \"unicode/udata.h\"\n"
    348             "\n"
    349             "/* external symbol declarations for data (%d files) */\n",
    350                 name, type, fileCount);
    351         T_FileStream_writeLine(out, buffer);
    352 
    353         sprintf(buffer, "extern const char\n    %s%s[]", symPrefix?symPrefix:"", files[0].pathname);
    354         T_FileStream_writeLine(out, buffer);
    355         for(i=1; i<fileCount; ++i) {
    356             sprintf(buffer, ",\n    %s%s[]", symPrefix?symPrefix:"", files[i].pathname);
    357             T_FileStream_writeLine(out, buffer);
    358         }
    359         T_FileStream_writeLine(out, ";\n\n");
    360 
    361         sprintf(
    362             buffer,
    363             "U_EXPORT struct {\n"
    364             "    uint16_t headerSize;\n"
    365             "    uint8_t magic1, magic2;\n"
    366             "    UDataInfo info;\n"
    367             "    char padding[%lu];\n"
    368             "    uint32_t count, reserved;\n"
    369             "    struct {\n"
    370             "        const char *name;\n"
    371             "        const void *data;\n"
    372             "    } toc[%lu];\n"
    373             "} U_EXPORT2 %s_dat = {\n"
    374             "    32, 0xda, 0x27, {\n"
    375             "        %lu, 0,\n"
    376             "        %u, %u, %u, 0,\n"
    377             "        {0x54, 0x6f, 0x43, 0x50},\n"
    378             "        {1, 0, 0, 0},\n"
    379             "        {0, 0, 0, 0}\n"
    380             "    },\n"
    381             "    \"\", %lu, 0, {\n",
    382             (unsigned long)32-4-sizeof(UDataInfo),
    383             (unsigned long)fileCount,
    384             entrypointName,
    385             (unsigned long)sizeof(UDataInfo),
    386             U_IS_BIG_ENDIAN,
    387             U_CHARSET_FAMILY,
    388             U_SIZEOF_UCHAR,
    389             (unsigned long)fileCount
    390         );
    391         T_FileStream_writeLine(out, buffer);
    392 
    393         sprintf(buffer, "        { \"%s\", %s%s }", files[0].basename, symPrefix?symPrefix:"", files[0].pathname);
    394         T_FileStream_writeLine(out, buffer);
    395         for(i=1; i<fileCount; ++i) {
    396             sprintf(buffer, ",\n        { \"%s\", %s%s }", files[i].basename, symPrefix?symPrefix:"", files[i].pathname);
    397             T_FileStream_writeLine(out, buffer);
    398         }
    399 
    400         T_FileStream_writeLine(out, "\n    }\n};\n");
    401         T_FileStream_close(out);
    402 
    403         uprv_free(symPrefix);
    404     }
    405 }
    406 
    407 static void
    408 addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose) {
    409     char *s;
    410     uint32_t length;
    411     char *fullPath = NULL;
    412 
    413     if(fileCount==fileMax) {
    414       fileMax += CHUNK_FILE_COUNT;
    415       files = (File *)uprv_realloc(files, fileMax*sizeof(files[0])); /* note: never freed. */
    416       if(files==NULL) {
    417         fprintf(stderr, "pkgdata/gencmn: Could not allocate %u bytes for %d files\n", (unsigned int)(fileMax*sizeof(files[0])), fileCount);
    418         exit(U_MEMORY_ALLOCATION_ERROR);
    419       }
    420     }
    421 
    422     if(!sourceTOC) {
    423         FileStream *file;
    424 
    425         if(uprv_pathIsAbsolute(filename)) {
    426             fprintf(stderr, "gencmn: Error: absolute path encountered. Old style paths are not supported. Use relative paths such as 'fur.res' or 'translit%cfur.res'.\n\tBad path: '%s'\n", U_FILE_SEP_CHAR, filename);
    427             exit(U_ILLEGAL_ARGUMENT_ERROR);
    428         }
    429         fullPath = pathToFullPath(filename, source);
    430         /* store the pathname */
    431         length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
    432         s=allocString(length);
    433         uprv_strcpy(s, name);
    434         uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
    435         uprv_strcat(s, filename);
    436 
    437         /* get the basename */
    438         fixDirToTreePath(s);
    439         files[fileCount].basename=s;
    440         files[fileCount].basenameLength=length;
    441 
    442         files[fileCount].pathname=fullPath;
    443 
    444         basenameTotal+=length;
    445 
    446         /* try to open the file */
    447         file=T_FileStream_open(fullPath, "rb");
    448         if(file==NULL) {
    449             fprintf(stderr, "gencmn: unable to open listed file %s\n", fullPath);
    450             exit(U_FILE_ACCESS_ERROR);
    451         }
    452 
    453         /* get the file length */
    454         length=T_FileStream_size(file);
    455         if(T_FileStream_error(file) || length<=20) {
    456             fprintf(stderr, "gencmn: unable to get length of listed file %s\n", fullPath);
    457             exit(U_FILE_ACCESS_ERROR);
    458         }
    459 
    460         T_FileStream_close(file);
    461 
    462         /* do not add files that are longer than maxSize */
    463         if(maxSize && length>maxSize) {
    464             if (verbose) {
    465                 printf("%s ignored (size %ld > %ld)\n", fullPath, (long)length, (long)maxSize);
    466             }
    467             return;
    468         }
    469         files[fileCount].fileSize=length;
    470     } else {
    471         char *t;
    472         /* get and store the basename */
    473         /* need to include the package name */
    474         length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
    475         s=allocString(length);
    476         uprv_strcpy(s, name);
    477         uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
    478         uprv_strcat(s, filename);
    479         fixDirToTreePath(s);
    480         files[fileCount].basename=s;
    481         /* turn the basename into an entry point name and store in the pathname field */
    482         t=files[fileCount].pathname=allocString(length);
    483         while(--length>0) {
    484             if(*s=='.' || *s=='-' || *s=='/') {
    485                 *t='_';
    486             } else {
    487                 *t=*s;
    488             }
    489             ++s;
    490             ++t;
    491         }
    492         *t=0;
    493     }
    494     ++fileCount;
    495 }
    496 
    497 static char *
    498 allocString(uint32_t length) {
    499     uint32_t top=stringTop+length;
    500     char *p;
    501 
    502     if(top>STRING_STORE_SIZE) {
    503         fprintf(stderr, "gencmn: out of memory\n");
    504         exit(U_MEMORY_ALLOCATION_ERROR);
    505     }
    506     p=stringStore+stringTop;
    507     stringTop=top;
    508     return p;
    509 }
    510 
    511 static char *
    512 pathToFullPath(const char *path, const char *source) {
    513     int32_t length;
    514     int32_t newLength;
    515     char *fullPath;
    516     int32_t n;
    517 
    518     length = (uint32_t)(uprv_strlen(path) + 1);
    519     newLength = (length + 1 + (int32_t)uprv_strlen(source));
    520     fullPath = (char *)uprv_malloc(newLength);
    521     if(source != NULL) {
    522         uprv_strcpy(fullPath, source);
    523         uprv_strcat(fullPath, U_FILE_SEP_STRING);
    524     } else {
    525         fullPath[0] = 0;
    526     }
    527     n = (int32_t)uprv_strlen(fullPath);
    528     fullPath[n] = 0;       /* Suppress compiler warning for unused variable n    */
    529                            /*  when conditional code below is not compiled.      */
    530     uprv_strcat(fullPath, path);
    531 
    532 #if (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    533 #if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR)
    534     /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
    535     for(;fullPath[n];n++) {
    536         if(fullPath[n] == U_FILE_ALT_SEP_CHAR) {
    537             fullPath[n] = U_FILE_SEP_CHAR;
    538         }
    539     }
    540 #endif
    541 #endif
    542 #if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    543     /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
    544     for(;fullPath[n];n++) {
    545         if(fullPath[n] == U_TREE_ENTRY_SEP_CHAR) {
    546             fullPath[n] = U_FILE_SEP_CHAR;
    547         }
    548     }
    549 #endif
    550     return fullPath;
    551 }
    552 
    553 U_CDECL_BEGIN
    554 static int
    555 compareFiles(const void *file1, const void *file2) {
    556     /* sort by basename */
    557     return uprv_strcmp(((File *)file1)->basename, ((File *)file2)->basename);
    558 }
    559 U_CDECL_END
    560 
    561 static void
    562 fixDirToTreePath(char *s)
    563 {
    564     (void)s;
    565 #if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) || ((U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR))
    566     char *t;
    567 #endif
    568 #if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    569     for(t=s;t=uprv_strchr(t,U_FILE_SEP_CHAR);) {
    570         *t = U_TREE_ENTRY_SEP_CHAR;
    571     }
    572 #endif
    573 #if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    574     for(t=s;t=uprv_strchr(t,U_FILE_ALT_SEP_CHAR);) {
    575         *t = U_TREE_ENTRY_SEP_CHAR;
    576     }
    577 #endif
    578 }
    579