Home | History | Annotate | Download | only in toolutil
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /******************************************************************************
      4  *   Copyright (C) 2008-2012, International Business Machines
      5  *   Corporation and others.  All Rights Reserved.
      6  *******************************************************************************
      7  */
      8 #include "unicode/utypes.h"
      9 
     10 #include <stdio.h>
     11 #include <stdlib.h>
     12 #include "unicode/utypes.h"
     13 #include "unicode/putil.h"
     14 #include "cmemory.h"
     15 #include "cstring.h"
     16 #include "filestrm.h"
     17 #include "toolutil.h"
     18 #include "unicode/uclean.h"
     19 #include "unewdata.h"
     20 #include "putilimp.h"
     21 #include "pkg_gencmn.h"
     22 
/* Capacity, in bytes, of the static stringStore arena handed out by allocString(). */
#define STRING_STORE_SIZE 200000

/* Package name used by createCommonDataFile() when the caller passes name==NULL. */
#define COMMON_DATA_NAME U_ICUDATA_NAME
/* Package type (output file suffix) used when the caller passes type==NULL. */
#define DATA_TYPE "dat"
     27 
     28 /* ICU package data file format (.dat files) ------------------------------- ***
     29 
     30 Description of the data format after the usual ICU data file header
     31 (UDataInfo etc.).
     32 
     33 Format version 1
     34 
     35 A .dat package file contains a simple Table of Contents of item names,
     36 followed by the items themselves:
     37 
     38 1. ToC table
     39 
     40 uint32_t count; - number of items
     41 UDataOffsetTOCEntry entry[count]; - pair of uint32_t values per item:
     42     uint32_t nameOffset; - offset of the item name
     43     uint32_t dataOffset; - offset of the item data
     44 both are byte offsets from the beginning of the data
     45 
     46 2. item name strings
     47 
     48 All item names are stored as char * strings in one block between the ToC table
     49 and the data items.
     50 
     51 3. data items
     52 
     53 The data items are stored following the item names block.
     54 Each data item is 16-aligned.
     55 The data items are stored in the sorted order of their names.
     56 
Therefore, the top of the name strings block is the offset of the first item.
The length of the last item is the difference between the .dat file length
and its offset; the length of every earlier item is the difference between
the offset of the item that follows it and its own offset.
     61 
     62 ----------------------------------------------------------------------------- */
     63 
/*
 * UDataInfo header passed to udata_create() for the generated .dat file;
 * cf. udata.h for the authoritative field definitions.
 * NOTE(review): the per-field remarks below follow the UDataInfo layout as
 * declared in udata.h - confirm against that header.
 */
static const UDataInfo dataInfo={
    sizeof(UDataInfo),            /* size of this structure */
    0,                            /* presumably a reserved field */

    U_IS_BIG_ENDIAN,              /* byte order of the build platform */
    U_CHARSET_FAMILY,             /* charset family of the build platform */
    sizeof(UChar),                /* size of a UChar on the build platform */
    0,                            /* presumably a reserved field */

    {0x43, 0x6d, 0x6e, 0x44},     /* dataFormat="CmnD" */
    {1, 0, 0, 0},                 /* formatVersion (format version 1, described above) */
    {3, 0, 0, 0}                  /* dataVersion */
};
     78 
     79 static uint32_t maxSize;
     80 
     81 static char stringStore[STRING_STORE_SIZE];
     82 static uint32_t stringTop=0, basenameTotal=0;
     83 
     84 typedef struct {
     85     char *pathname, *basename;
     86     uint32_t basenameLength, basenameOffset, fileSize, fileOffset;
     87 } File;
     88 
     89 #define CHUNK_FILE_COUNT 256
     90 static File *files = NULL;
     91 static uint32_t fileCount=0;
     92 static uint32_t fileMax = 0;
     93 
     94 
     95 static char *symPrefix = NULL;
     96 
     97 #define LINE_BUFFER_SIZE 512
     98 /* prototypes --------------------------------------------------------------- */
     99 
    100 static void
    101 addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose);
    102 
    103 static char *
    104 allocString(uint32_t length);
    105 
    106 static int
    107 compareFiles(const void *file1, const void *file2);
    108 
    109 static char *
    110 pathToFullPath(const char *path, const char *source);
    111 
    112 /* map non-tree separator (such as '\') to tree separator ('/') inplace. */
    113 static void
    114 fixDirToTreePath(char *s);
    115 /* -------------------------------------------------------------------------- */
    116 
    117 U_CAPI void U_EXPORT2
    118 createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight,
    119                      const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName) {
    120     static char buffer[4096];
    121     char *line;
    122     char *linePtr;
    123     char *s = NULL;
    124     UErrorCode errorCode=U_ZERO_ERROR;
    125     uint32_t i, fileOffset, basenameOffset, length, nread;
    126     FileStream *in, *file;
    127 
    128     line = (char *)uprv_malloc(sizeof(char) * LINE_BUFFER_SIZE);
    129     if (line == NULL) {
    130         fprintf(stderr, "gencmn: unable to allocate memory for line buffer of size %d\n", LINE_BUFFER_SIZE);
    131         exit(U_MEMORY_ALLOCATION_ERROR);
    132     }
    133 
    134     linePtr = line;
    135 
    136     maxSize = max_size;
    137 
    138     if (destDir == NULL) {
    139         destDir = u_getDataDirectory();
    140     }
    141     if (name == NULL) {
    142         name = COMMON_DATA_NAME;
    143     }
    144     if (type == NULL) {
    145         type = DATA_TYPE;
    146     }
    147     if (source == NULL) {
    148         source = ".";
    149     }
    150 
    151     if (dataFile == NULL) {
    152         in = T_FileStream_stdin();
    153     } else {
    154         in = T_FileStream_open(dataFile, "r");
    155         if(in == NULL) {
    156             fprintf(stderr, "gencmn: unable to open input file %s\n", dataFile);
    157             exit(U_FILE_ACCESS_ERROR);
    158         }
    159     }
    160 
    161     if (verbose) {
    162         if(sourceTOC) {
    163             printf("generating %s_%s.c (table of contents source file)\n", name, type);
    164         } else {
    165             printf("generating %s.%s (common data file with table of contents)\n", name, type);
    166         }
    167     }
    168 
    169     /* read the list of files and get their lengths */
    170     while((s != NULL && *s != 0) || (s=T_FileStream_readLine(in, (line=linePtr),
    171                                                              LINE_BUFFER_SIZE))!=NULL) {
    172         /* remove trailing newline characters and parse space separated items */
    173         if (s != NULL && *s != 0) {
    174             line=s;
    175         } else {
    176             s=line;
    177         }
    178         while(*s!=0) {
    179             if(*s==' ') {
    180                 *s=0;
    181                 ++s;
    182                 break;
    183             } else if(*s=='\r' || *s=='\n') {
    184                 *s=0;
    185                 break;
    186             }
    187             ++s;
    188         }
    189 
    190         /* check for comment */
    191 
    192         if (*line == '#') {
    193             continue;
    194         }
    195 
    196         /* add the file */
    197 #if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR)
    198         {
    199           char *t;
    200           while((t = uprv_strchr(line,U_FILE_ALT_SEP_CHAR))) {
    201             *t = U_FILE_SEP_CHAR;
    202           }
    203         }
    204 #endif
    205         addFile(getLongPathname(line), name, source, sourceTOC, verbose);
    206     }
    207 
    208     uprv_free(linePtr);
    209 
    210     if(in!=T_FileStream_stdin()) {
    211         T_FileStream_close(in);
    212     }
    213 
    214     if(fileCount==0) {
    215         fprintf(stderr, "gencmn: no files listed in %s\n", dataFile == NULL ? "<stdin>" : dataFile);
    216         return;
    217     }
    218 
    219     /* sort the files by basename */
    220     qsort(files, fileCount, sizeof(File), compareFiles);
    221 
    222     if(!sourceTOC) {
    223         UNewDataMemory *out;
    224 
    225         /* determine the offsets of all basenames and files in this common one */
    226         basenameOffset=4+8*fileCount;
    227         fileOffset=(basenameOffset+(basenameTotal+15))&~0xf;
    228         for(i=0; i<fileCount; ++i) {
    229             files[i].fileOffset=fileOffset;
    230             fileOffset+=(files[i].fileSize+15)&~0xf;
    231             files[i].basenameOffset=basenameOffset;
    232             basenameOffset+=files[i].basenameLength;
    233         }
    234 
    235         /* create the output file */
    236         out=udata_create(destDir, type, name,
    237                          &dataInfo,
    238                          copyRight == NULL ? U_COPYRIGHT_STRING : copyRight,
    239                          &errorCode);
    240         if(U_FAILURE(errorCode)) {
    241             fprintf(stderr, "gencmn: udata_create(-d %s -n %s -t %s) failed - %s\n",
    242                 destDir, name, type,
    243                 u_errorName(errorCode));
    244             exit(errorCode);
    245         }
    246 
    247         /* write the table of contents */
    248         udata_write32(out, fileCount);
    249         for(i=0; i<fileCount; ++i) {
    250             udata_write32(out, files[i].basenameOffset);
    251             udata_write32(out, files[i].fileOffset);
    252         }
    253 
    254         /* write the basenames */
    255         for(i=0; i<fileCount; ++i) {
    256             udata_writeString(out, files[i].basename, files[i].basenameLength);
    257         }
    258         length=4+8*fileCount+basenameTotal;
    259 
    260         /* copy the files */
    261         for(i=0; i<fileCount; ++i) {
    262             /* pad to 16-align the next file */
    263             length&=0xf;
    264             if(length!=0) {
    265                 udata_writePadding(out, 16-length);
    266             }
    267 
    268             if (verbose) {
    269                 printf("adding %s (%ld byte%s)\n", files[i].pathname, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
    270             }
    271 
    272             /* copy the next file */
    273             file=T_FileStream_open(files[i].pathname, "rb");
    274             if(file==NULL) {
    275                 fprintf(stderr, "gencmn: unable to open listed file %s\n", files[i].pathname);
    276                 exit(U_FILE_ACCESS_ERROR);
    277             }
    278             for(nread = 0;;) {
    279                 length=T_FileStream_read(file, buffer, sizeof(buffer));
    280                 if(length <= 0) {
    281                     break;
    282                 }
    283                 nread += length;
    284                 udata_writeBlock(out, buffer, length);
    285             }
    286             T_FileStream_close(file);
    287             length=files[i].fileSize;
    288 
    289             if (nread != files[i].fileSize) {
    290               fprintf(stderr, "gencmn: unable to read %s properly (got %ld/%ld byte%s)\n", files[i].pathname,  (long)nread, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
    291                 exit(U_FILE_ACCESS_ERROR);
    292             }
    293         }
    294 
    295         /* pad to 16-align the last file (cleaner, avoids growing .dat files in icuswap) */
    296         length&=0xf;
    297         if(length!=0) {
    298             udata_writePadding(out, 16-length);
    299         }
    300 
    301         /* finish */
    302         udata_finish(out, &errorCode);
    303         if(U_FAILURE(errorCode)) {
    304             fprintf(stderr, "gencmn: udata_finish() failed - %s\n", u_errorName(errorCode));
    305             exit(errorCode);
    306         }
    307     } else {
    308         /* write a .c source file with the table of contents */
    309         char *filename;
    310         FileStream *out;
    311 
    312         /* create the output filename */
    313         filename=s=buffer;
    314         uprv_strcpy(filename, destDir);
    315         s=filename+uprv_strlen(filename);
    316         if(s>filename && *(s-1)!=U_FILE_SEP_CHAR) {
    317             *s++=U_FILE_SEP_CHAR;
    318         }
    319         uprv_strcpy(s, name);
    320         if(*(type)!=0) {
    321             s+=uprv_strlen(s);
    322             *s++='_';
    323             uprv_strcpy(s, type);
    324         }
    325         s+=uprv_strlen(s);
    326         uprv_strcpy(s, ".c");
    327 
    328         /* open the output file */
    329         out=T_FileStream_open(filename, "w");
    330         if (gencmnFileName != NULL) {
    331             uprv_strcpy(gencmnFileName, filename);
    332         }
    333         if(out==NULL) {
    334             fprintf(stderr, "gencmn: unable to open .c output file %s\n", filename);
    335             exit(U_FILE_ACCESS_ERROR);
    336         }
    337 
    338         /* write the source file */
    339         sprintf(buffer,
    340             "/*\n"
    341             " * ICU common data table of contents for %s.%s\n"
    342             " * Automatically generated by icu/source/tools/gencmn/gencmn .\n"
    343             " */\n\n"
    344             "#include \"unicode/utypes.h\"\n"
    345             "#include \"unicode/udata.h\"\n"
    346             "\n"
    347             "/* external symbol declarations for data (%d files) */\n",
    348                 name, type, fileCount);
    349         T_FileStream_writeLine(out, buffer);
    350 
    351         sprintf(buffer, "extern const char\n    %s%s[]", symPrefix?symPrefix:"", files[0].pathname);
    352         T_FileStream_writeLine(out, buffer);
    353         for(i=1; i<fileCount; ++i) {
    354             sprintf(buffer, ",\n    %s%s[]", symPrefix?symPrefix:"", files[i].pathname);
    355             T_FileStream_writeLine(out, buffer);
    356         }
    357         T_FileStream_writeLine(out, ";\n\n");
    358 
    359         sprintf(
    360             buffer,
    361             "U_EXPORT struct {\n"
    362             "    uint16_t headerSize;\n"
    363             "    uint8_t magic1, magic2;\n"
    364             "    UDataInfo info;\n"
    365             "    char padding[%lu];\n"
    366             "    uint32_t count, reserved;\n"
    367             "    struct {\n"
    368             "        const char *name;\n"
    369             "        const void *data;\n"
    370             "    } toc[%lu];\n"
    371             "} U_EXPORT2 %s_dat = {\n"
    372             "    32, 0xda, 0x27, {\n"
    373             "        %lu, 0,\n"
    374             "        %u, %u, %u, 0,\n"
    375             "        {0x54, 0x6f, 0x43, 0x50},\n"
    376             "        {1, 0, 0, 0},\n"
    377             "        {0, 0, 0, 0}\n"
    378             "    },\n"
    379             "    \"\", %lu, 0, {\n",
    380             (unsigned long)32-4-sizeof(UDataInfo),
    381             (unsigned long)fileCount,
    382             entrypointName,
    383             (unsigned long)sizeof(UDataInfo),
    384             U_IS_BIG_ENDIAN,
    385             U_CHARSET_FAMILY,
    386             U_SIZEOF_UCHAR,
    387             (unsigned long)fileCount
    388         );
    389         T_FileStream_writeLine(out, buffer);
    390 
    391         sprintf(buffer, "        { \"%s\", %s%s }", files[0].basename, symPrefix?symPrefix:"", files[0].pathname);
    392         T_FileStream_writeLine(out, buffer);
    393         for(i=1; i<fileCount; ++i) {
    394             sprintf(buffer, ",\n        { \"%s\", %s%s }", files[i].basename, symPrefix?symPrefix:"", files[i].pathname);
    395             T_FileStream_writeLine(out, buffer);
    396         }
    397 
    398         T_FileStream_writeLine(out, "\n    }\n};\n");
    399         T_FileStream_close(out);
    400 
    401         uprv_free(symPrefix);
    402     }
    403 }
    404 
    405 static void
    406 addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose) {
    407     char *s;
    408     uint32_t length;
    409     char *fullPath = NULL;
    410 
    411     if(fileCount==fileMax) {
    412       fileMax += CHUNK_FILE_COUNT;
    413       files = uprv_realloc(files, fileMax*sizeof(files[0])); /* note: never freed. */
    414       if(files==NULL) {
    415         fprintf(stderr, "pkgdata/gencmn: Could not allocate %u bytes for %d files\n", (unsigned int)(fileMax*sizeof(files[0])), fileCount);
    416         exit(U_MEMORY_ALLOCATION_ERROR);
    417       }
    418     }
    419 
    420     if(!sourceTOC) {
    421         FileStream *file;
    422 
    423         if(uprv_pathIsAbsolute(filename)) {
    424             fprintf(stderr, "gencmn: Error: absolute path encountered. Old style paths are not supported. Use relative paths such as 'fur.res' or 'translit%cfur.res'.\n\tBad path: '%s'\n", U_FILE_SEP_CHAR, filename);
    425             exit(U_ILLEGAL_ARGUMENT_ERROR);
    426         }
    427         fullPath = pathToFullPath(filename, source);
    428         /* store the pathname */
    429         length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
    430         s=allocString(length);
    431         uprv_strcpy(s, name);
    432         uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
    433         uprv_strcat(s, filename);
    434 
    435         /* get the basename */
    436         fixDirToTreePath(s);
    437         files[fileCount].basename=s;
    438         files[fileCount].basenameLength=length;
    439 
    440         files[fileCount].pathname=fullPath;
    441 
    442         basenameTotal+=length;
    443 
    444         /* try to open the file */
    445         file=T_FileStream_open(fullPath, "rb");
    446         if(file==NULL) {
    447             fprintf(stderr, "gencmn: unable to open listed file %s\n", fullPath);
    448             exit(U_FILE_ACCESS_ERROR);
    449         }
    450 
    451         /* get the file length */
    452         length=T_FileStream_size(file);
    453         if(T_FileStream_error(file) || length<=20) {
    454             fprintf(stderr, "gencmn: unable to get length of listed file %s\n", fullPath);
    455             exit(U_FILE_ACCESS_ERROR);
    456         }
    457 
    458         T_FileStream_close(file);
    459 
    460         /* do not add files that are longer than maxSize */
    461         if(maxSize && length>maxSize) {
    462             if (verbose) {
    463                 printf("%s ignored (size %ld > %ld)\n", fullPath, (long)length, (long)maxSize);
    464             }
    465             return;
    466         }
    467         files[fileCount].fileSize=length;
    468     } else {
    469         char *t;
    470         /* get and store the basename */
    471         /* need to include the package name */
    472         length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
    473         s=allocString(length);
    474         uprv_strcpy(s, name);
    475         uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
    476         uprv_strcat(s, filename);
    477         fixDirToTreePath(s);
    478         files[fileCount].basename=s;
    479         /* turn the basename into an entry point name and store in the pathname field */
    480         t=files[fileCount].pathname=allocString(length);
    481         while(--length>0) {
    482             if(*s=='.' || *s=='-' || *s=='/') {
    483                 *t='_';
    484             } else {
    485                 *t=*s;
    486             }
    487             ++s;
    488             ++t;
    489         }
    490         *t=0;
    491     }
    492     ++fileCount;
    493 }
    494 
    495 static char *
    496 allocString(uint32_t length) {
    497     uint32_t top=stringTop+length;
    498     char *p;
    499 
    500     if(top>STRING_STORE_SIZE) {
    501         fprintf(stderr, "gencmn: out of memory\n");
    502         exit(U_MEMORY_ALLOCATION_ERROR);
    503     }
    504     p=stringStore+stringTop;
    505     stringTop=top;
    506     return p;
    507 }
    508 
    509 static char *
    510 pathToFullPath(const char *path, const char *source) {
    511     int32_t length;
    512     int32_t newLength;
    513     char *fullPath;
    514     int32_t n;
    515 
    516     length = (uint32_t)(uprv_strlen(path) + 1);
    517     newLength = (length + 1 + (int32_t)uprv_strlen(source));
    518     fullPath = uprv_malloc(newLength);
    519     if(source != NULL) {
    520         uprv_strcpy(fullPath, source);
    521         uprv_strcat(fullPath, U_FILE_SEP_STRING);
    522     } else {
    523         fullPath[0] = 0;
    524     }
    525     n = (int32_t)uprv_strlen(fullPath);
    526     fullPath[n] = 0;       /* Suppress compiler warning for unused variable n    */
    527                            /*  when conditional code below is not compiled.      */
    528     uprv_strcat(fullPath, path);
    529 
    530 #if (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    531 #if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR)
    532     /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
    533     for(;fullPath[n];n++) {
    534         if(fullPath[n] == U_FILE_ALT_SEP_CHAR) {
    535             fullPath[n] = U_FILE_SEP_CHAR;
    536         }
    537     }
    538 #endif
    539 #endif
    540 #if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    541     /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
    542     for(;fullPath[n];n++) {
    543         if(fullPath[n] == U_TREE_ENTRY_SEP_CHAR) {
    544             fullPath[n] = U_FILE_SEP_CHAR;
    545         }
    546     }
    547 #endif
    548     return fullPath;
    549 }
    550 
    551 static int
    552 compareFiles(const void *file1, const void *file2) {
    553     /* sort by basename */
    554     return uprv_strcmp(((File *)file1)->basename, ((File *)file2)->basename);
    555 }
    556 
    557 static void
    558 fixDirToTreePath(char *s)
    559 {
    560 #if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) || ((U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR))
    561     char *t;
    562 #endif
    563 #if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    564     for(t=s;t=uprv_strchr(t,U_FILE_SEP_CHAR);) {
    565         *t = U_TREE_ENTRY_SEP_CHAR;
    566     }
    567 #endif
    568 #if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    569     for(t=s;t=uprv_strchr(t,U_FILE_ALT_SEP_CHAR);) {
    570         *t = U_TREE_ENTRY_SEP_CHAR;
    571     }
    572 #endif
    573 }
    574