Home | History | Annotate | Download | only in toolutil
      1 /******************************************************************************
      2  *   Copyright (C) 2008-2012, International Business Machines
      3  *   Corporation and others.  All Rights Reserved.
      4  *******************************************************************************
      5  */
      6 #include "unicode/utypes.h"
      7 
      8 #include <stdio.h>
      9 #include <stdlib.h>
     10 #include "unicode/utypes.h"
     11 #include "unicode/putil.h"
     12 #include "cmemory.h"
     13 #include "cstring.h"
     14 #include "filestrm.h"
     15 #include "toolutil.h"
     16 #include "unicode/uclean.h"
     17 #include "unewdata.h"
     18 #include "putilimp.h"
     19 #include "pkg_gencmn.h"
     20 
/* capacity in bytes of the static stringStore buffer that allocString() hands out pieces of */
#define STRING_STORE_SIZE 200000

/* defaults that createCommonDataFile() substitutes when its name/type arguments are NULL */
#define COMMON_DATA_NAME U_ICUDATA_NAME
#define DATA_TYPE "dat"
     25 
     26 /* ICU package data file format (.dat files) ------------------------------- ***
     27 
     28 Description of the data format after the usual ICU data file header
     29 (UDataInfo etc.).
     30 
     31 Format version 1
     32 
     33 A .dat package file contains a simple Table of Contents of item names,
     34 followed by the items themselves:
     35 
     36 1. ToC table
     37 
     38 uint32_t count; - number of items
     39 UDataOffsetTOCEntry entry[count]; - pair of uint32_t values per item:
     40     uint32_t nameOffset; - offset of the item name
     41     uint32_t dataOffset; - offset of the item data
     42 both are byte offsets from the beginning of the data
     43 
     44 2. item name strings
     45 
     46 All item names are stored as char * strings in one block between the ToC table
     47 and the data items.
     48 
     49 3. data items
     50 
     51 The data items are stored following the item names block.
     52 Each data item is 16-aligned.
     53 The data items are stored in the sorted order of their names.
     54 
Therefore, the top of the name strings block is the offset of the first item,
the length of the last item is the difference between its offset and
the .dat file length, and the length of each earlier item is the difference
between its offset and the offset of the next item.
     59 
     60 ----------------------------------------------------------------------------- */
     61 
/* UDataInfo cf. udata.h */
/*
 * Header description handed to udata_create() when the .dat package is written.
 * The per-field notes below follow the initializer order; the field names are
 * taken from udata.h - TODO confirm against the header in use.
 */
static const UDataInfo dataInfo={
    sizeof(UDataInfo),            /* size of this structure */
    0,                            /* presumably a reserved field - see udata.h */

    U_IS_BIG_ENDIAN,              /* byte order of the platform this tool is built for */
    U_CHARSET_FAMILY,             /* charset family of the platform this tool is built for */
    sizeof(UChar),                /* size of a UChar on this platform */
    0,                            /* presumably a reserved field - see udata.h */

    {0x43, 0x6d, 0x6e, 0x44},     /* dataFormat="CmnD" */
    {1, 0, 0, 0},                 /* formatVersion */
    {3, 0, 0, 0}                  /* dataVersion */
};
     76 
/* size limit for listed files, copied from max_size; addFile() treats 0 as "no limit" */
static uint32_t maxSize;

/* fixed pool that allocString() hands out name strings from */
static char stringStore[STRING_STORE_SIZE];
/*
 * stringTop: number of bytes of stringStore handed out so far.
 * basenameTotal: running total of the basename lengths added up by addFile().
 */
static uint32_t stringTop=0, basenameTotal=0;
     81 
     82 typedef struct {
     83     char *pathname, *basename;
     84     uint32_t basenameLength, basenameOffset, fileSize, fileOffset;
     85 } File;
     86 
/* number of File slots by which addFile() grows the files array whenever it is full */
#define CHUNK_FILE_COUNT 256
static File *files = NULL;     /* growable array of listed files, (re)allocated in addFile() */
static uint32_t fileCount=0;   /* number of entries of files that are in use */
static uint32_t fileMax = 0;   /* number of entries of files that are allocated */


/* optional prefix put in front of symbol names in the generated .c file; never assigned in the code visible here */
static char *symPrefix = NULL;
     94 
/* size in bytes of the buffer that holds one line of the file list while it is parsed */
#define LINE_BUFFER_SIZE 512
/* prototypes --------------------------------------------------------------- */

/* register one listed file name in the files array */
static void
addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose);

/* reserve length bytes of stringStore and return a pointer to them */
static char *
allocString(uint32_t length);

/* qsort() comparison function: orders File entries by basename */
static int
compareFiles(const void *file1, const void *file2);

/* build a newly allocated path from source, a file separator and path */
static char *
pathToFullPath(const char *path, const char *source);

/* map non-tree separator (such as '\') to tree separator ('/') inplace. */
static void
fixDirToTreePath(char *s);
/* -------------------------------------------------------------------------- */
    114 
    115 U_CAPI void U_EXPORT2
    116 createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight,
    117                      const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName) {
    118     static char buffer[4096];
    119     char *line;
    120     char *linePtr;
    121     char *s = NULL;
    122     UErrorCode errorCode=U_ZERO_ERROR;
    123     uint32_t i, fileOffset, basenameOffset, length, nread;
    124     FileStream *in, *file;
    125 
    126     line = (char *)uprv_malloc(sizeof(char) * LINE_BUFFER_SIZE);
    127     if (line == NULL) {
    128         fprintf(stderr, "gencmn: unable to allocate memory for line buffer of size %d\n", LINE_BUFFER_SIZE);
    129         exit(U_MEMORY_ALLOCATION_ERROR);
    130     }
    131 
    132     linePtr = line;
    133 
    134     maxSize = max_size;
    135 
    136     if (destDir == NULL) {
    137         destDir = u_getDataDirectory();
    138     }
    139     if (name == NULL) {
    140         name = COMMON_DATA_NAME;
    141     }
    142     if (type == NULL) {
    143         type = DATA_TYPE;
    144     }
    145     if (source == NULL) {
    146         source = ".";
    147     }
    148 
    149     if (dataFile == NULL) {
    150         in = T_FileStream_stdin();
    151     } else {
    152         in = T_FileStream_open(dataFile, "r");
    153         if(in == NULL) {
    154             fprintf(stderr, "gencmn: unable to open input file %s\n", dataFile);
    155             exit(U_FILE_ACCESS_ERROR);
    156         }
    157     }
    158 
    159     if (verbose) {
    160         if(sourceTOC) {
    161             printf("generating %s_%s.c (table of contents source file)\n", name, type);
    162         } else {
    163             printf("generating %s.%s (common data file with table of contents)\n", name, type);
    164         }
    165     }
    166 
    167     /* read the list of files and get their lengths */
    168     while((s != NULL && *s != 0) || (s=T_FileStream_readLine(in, (line=linePtr),
    169                                                              LINE_BUFFER_SIZE))!=NULL) {
    170         /* remove trailing newline characters and parse space separated items */
    171         if (s != NULL && *s != 0) {
    172             line=s;
    173         } else {
    174             s=line;
    175         }
    176         while(*s!=0) {
    177             if(*s==' ') {
    178                 *s=0;
    179                 ++s;
    180                 break;
    181             } else if(*s=='\r' || *s=='\n') {
    182                 *s=0;
    183                 break;
    184             }
    185             ++s;
    186         }
    187 
    188         /* check for comment */
    189 
    190         if (*line == '#') {
    191             continue;
    192         }
    193 
    194         /* add the file */
    195 #if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR)
    196         {
    197           char *t;
    198           while((t = uprv_strchr(line,U_FILE_ALT_SEP_CHAR))) {
    199             *t = U_FILE_SEP_CHAR;
    200           }
    201         }
    202 #endif
    203         addFile(getLongPathname(line), name, source, sourceTOC, verbose);
    204     }
    205 
    206     uprv_free(linePtr);
    207 
    208     if(in!=T_FileStream_stdin()) {
    209         T_FileStream_close(in);
    210     }
    211 
    212     if(fileCount==0) {
    213         fprintf(stderr, "gencmn: no files listed in %s\n", dataFile == NULL ? "<stdin>" : dataFile);
    214         return;
    215     }
    216 
    217     /* sort the files by basename */
    218     qsort(files, fileCount, sizeof(File), compareFiles);
    219 
    220     if(!sourceTOC) {
    221         UNewDataMemory *out;
    222 
    223         /* determine the offsets of all basenames and files in this common one */
    224         basenameOffset=4+8*fileCount;
    225         fileOffset=(basenameOffset+(basenameTotal+15))&~0xf;
    226         for(i=0; i<fileCount; ++i) {
    227             files[i].fileOffset=fileOffset;
    228             fileOffset+=(files[i].fileSize+15)&~0xf;
    229             files[i].basenameOffset=basenameOffset;
    230             basenameOffset+=files[i].basenameLength;
    231         }
    232 
    233         /* create the output file */
    234         out=udata_create(destDir, type, name,
    235                          &dataInfo,
    236                          copyRight == NULL ? U_COPYRIGHT_STRING : copyRight,
    237                          &errorCode);
    238         if(U_FAILURE(errorCode)) {
    239             fprintf(stderr, "gencmn: udata_create(-d %s -n %s -t %s) failed - %s\n",
    240                 destDir, name, type,
    241                 u_errorName(errorCode));
    242             exit(errorCode);
    243         }
    244 
    245         /* write the table of contents */
    246         udata_write32(out, fileCount);
    247         for(i=0; i<fileCount; ++i) {
    248             udata_write32(out, files[i].basenameOffset);
    249             udata_write32(out, files[i].fileOffset);
    250         }
    251 
    252         /* write the basenames */
    253         for(i=0; i<fileCount; ++i) {
    254             udata_writeString(out, files[i].basename, files[i].basenameLength);
    255         }
    256         length=4+8*fileCount+basenameTotal;
    257 
    258         /* copy the files */
    259         for(i=0; i<fileCount; ++i) {
    260             /* pad to 16-align the next file */
    261             length&=0xf;
    262             if(length!=0) {
    263                 udata_writePadding(out, 16-length);
    264             }
    265 
    266             if (verbose) {
    267                 printf("adding %s (%ld byte%s)\n", files[i].pathname, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
    268             }
    269 
    270             /* copy the next file */
    271             file=T_FileStream_open(files[i].pathname, "rb");
    272             if(file==NULL) {
    273                 fprintf(stderr, "gencmn: unable to open listed file %s\n", files[i].pathname);
    274                 exit(U_FILE_ACCESS_ERROR);
    275             }
    276             for(nread = 0;;) {
    277                 length=T_FileStream_read(file, buffer, sizeof(buffer));
    278                 if(length <= 0) {
    279                     break;
    280                 }
    281                 nread += length;
    282                 udata_writeBlock(out, buffer, length);
    283             }
    284             T_FileStream_close(file);
    285             length=files[i].fileSize;
    286 
    287             if (nread != files[i].fileSize) {
    288               fprintf(stderr, "gencmn: unable to read %s properly (got %ld/%ld byte%s)\n", files[i].pathname,  (long)nread, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
    289                 exit(U_FILE_ACCESS_ERROR);
    290             }
    291         }
    292 
    293         /* pad to 16-align the last file (cleaner, avoids growing .dat files in icuswap) */
    294         length&=0xf;
    295         if(length!=0) {
    296             udata_writePadding(out, 16-length);
    297         }
    298 
    299         /* finish */
    300         udata_finish(out, &errorCode);
    301         if(U_FAILURE(errorCode)) {
    302             fprintf(stderr, "gencmn: udata_finish() failed - %s\n", u_errorName(errorCode));
    303             exit(errorCode);
    304         }
    305     } else {
    306         /* write a .c source file with the table of contents */
    307         char *filename;
    308         FileStream *out;
    309 
    310         /* create the output filename */
    311         filename=s=buffer;
    312         uprv_strcpy(filename, destDir);
    313         s=filename+uprv_strlen(filename);
    314         if(s>filename && *(s-1)!=U_FILE_SEP_CHAR) {
    315             *s++=U_FILE_SEP_CHAR;
    316         }
    317         uprv_strcpy(s, name);
    318         if(*(type)!=0) {
    319             s+=uprv_strlen(s);
    320             *s++='_';
    321             uprv_strcpy(s, type);
    322         }
    323         s+=uprv_strlen(s);
    324         uprv_strcpy(s, ".c");
    325 
    326         /* open the output file */
    327         out=T_FileStream_open(filename, "w");
    328         if (gencmnFileName != NULL) {
    329             uprv_strcpy(gencmnFileName, filename);
    330         }
    331         if(out==NULL) {
    332             fprintf(stderr, "gencmn: unable to open .c output file %s\n", filename);
    333             exit(U_FILE_ACCESS_ERROR);
    334         }
    335 
    336         /* write the source file */
    337         sprintf(buffer,
    338             "/*\n"
    339             " * ICU common data table of contents for %s.%s\n"
    340             " * Automatically generated by icu/source/tools/gencmn/gencmn .\n"
    341             " */\n\n"
    342             "#include \"unicode/utypes.h\"\n"
    343             "#include \"unicode/udata.h\"\n"
    344             "\n"
    345             "/* external symbol declarations for data (%d files) */\n",
    346                 name, type, fileCount);
    347         T_FileStream_writeLine(out, buffer);
    348 
    349         sprintf(buffer, "extern const char\n    %s%s[]", symPrefix?symPrefix:"", files[0].pathname);
    350         T_FileStream_writeLine(out, buffer);
    351         for(i=1; i<fileCount; ++i) {
    352             sprintf(buffer, ",\n    %s%s[]", symPrefix?symPrefix:"", files[i].pathname);
    353             T_FileStream_writeLine(out, buffer);
    354         }
    355         T_FileStream_writeLine(out, ";\n\n");
    356 
    357         sprintf(
    358             buffer,
    359             "U_EXPORT struct {\n"
    360             "    uint16_t headerSize;\n"
    361             "    uint8_t magic1, magic2;\n"
    362             "    UDataInfo info;\n"
    363             "    char padding[%lu];\n"
    364             "    uint32_t count, reserved;\n"
    365             "    struct {\n"
    366             "        const char *name;\n"
    367             "        const void *data;\n"
    368             "    } toc[%lu];\n"
    369             "} U_EXPORT2 %s_dat = {\n"
    370             "    32, 0xda, 0x27, {\n"
    371             "        %lu, 0,\n"
    372             "        %u, %u, %u, 0,\n"
    373             "        {0x54, 0x6f, 0x43, 0x50},\n"
    374             "        {1, 0, 0, 0},\n"
    375             "        {0, 0, 0, 0}\n"
    376             "    },\n"
    377             "    \"\", %lu, 0, {\n",
    378             (unsigned long)32-4-sizeof(UDataInfo),
    379             (unsigned long)fileCount,
    380             entrypointName,
    381             (unsigned long)sizeof(UDataInfo),
    382             U_IS_BIG_ENDIAN,
    383             U_CHARSET_FAMILY,
    384             U_SIZEOF_UCHAR,
    385             (unsigned long)fileCount
    386         );
    387         T_FileStream_writeLine(out, buffer);
    388 
    389         sprintf(buffer, "        { \"%s\", %s%s }", files[0].basename, symPrefix?symPrefix:"", files[0].pathname);
    390         T_FileStream_writeLine(out, buffer);
    391         for(i=1; i<fileCount; ++i) {
    392             sprintf(buffer, ",\n        { \"%s\", %s%s }", files[i].basename, symPrefix?symPrefix:"", files[i].pathname);
    393             T_FileStream_writeLine(out, buffer);
    394         }
    395 
    396         T_FileStream_writeLine(out, "\n    }\n};\n");
    397         T_FileStream_close(out);
    398 
    399         uprv_free(symPrefix);
    400     }
    401 }
    402 
    403 static void
    404 addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose) {
    405     char *s;
    406     uint32_t length;
    407     char *fullPath = NULL;
    408 
    409     if(fileCount==fileMax) {
    410       fileMax += CHUNK_FILE_COUNT;
    411       files = uprv_realloc(files, fileMax*sizeof(files[0])); /* note: never freed. */
    412       if(files==NULL) {
    413         fprintf(stderr, "pkgdata/gencmn: Could not allocate %u bytes for %d files\n", (unsigned int)(fileMax*sizeof(files[0])), fileCount);
    414         exit(U_MEMORY_ALLOCATION_ERROR);
    415       }
    416     }
    417 
    418     if(!sourceTOC) {
    419         FileStream *file;
    420 
    421         if(uprv_pathIsAbsolute(filename)) {
    422             fprintf(stderr, "gencmn: Error: absolute path encountered. Old style paths are not supported. Use relative paths such as 'fur.res' or 'translit%cfur.res'.\n\tBad path: '%s'\n", U_FILE_SEP_CHAR, filename);
    423             exit(U_ILLEGAL_ARGUMENT_ERROR);
    424         }
    425         fullPath = pathToFullPath(filename, source);
    426         /* store the pathname */
    427         length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
    428         s=allocString(length);
    429         uprv_strcpy(s, name);
    430         uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
    431         uprv_strcat(s, filename);
    432 
    433         /* get the basename */
    434         fixDirToTreePath(s);
    435         files[fileCount].basename=s;
    436         files[fileCount].basenameLength=length;
    437 
    438         files[fileCount].pathname=fullPath;
    439 
    440         basenameTotal+=length;
    441 
    442         /* try to open the file */
    443         file=T_FileStream_open(fullPath, "rb");
    444         if(file==NULL) {
    445             fprintf(stderr, "gencmn: unable to open listed file %s\n", fullPath);
    446             exit(U_FILE_ACCESS_ERROR);
    447         }
    448 
    449         /* get the file length */
    450         length=T_FileStream_size(file);
    451         if(T_FileStream_error(file) || length<=20) {
    452             fprintf(stderr, "gencmn: unable to get length of listed file %s\n", fullPath);
    453             exit(U_FILE_ACCESS_ERROR);
    454         }
    455 
    456         T_FileStream_close(file);
    457 
    458         /* do not add files that are longer than maxSize */
    459         if(maxSize && length>maxSize) {
    460             if (verbose) {
    461                 printf("%s ignored (size %ld > %ld)\n", fullPath, (long)length, (long)maxSize);
    462             }
    463             return;
    464         }
    465         files[fileCount].fileSize=length;
    466     } else {
    467         char *t;
    468         /* get and store the basename */
    469         /* need to include the package name */
    470         length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
    471         s=allocString(length);
    472         uprv_strcpy(s, name);
    473         uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
    474         uprv_strcat(s, filename);
    475         fixDirToTreePath(s);
    476         files[fileCount].basename=s;
    477         /* turn the basename into an entry point name and store in the pathname field */
    478         t=files[fileCount].pathname=allocString(length);
    479         while(--length>0) {
    480             if(*s=='.' || *s=='-' || *s=='/') {
    481                 *t='_';
    482             } else {
    483                 *t=*s;
    484             }
    485             ++s;
    486             ++t;
    487         }
    488         *t=0;
    489     }
    490     ++fileCount;
    491 }
    492 
    493 static char *
    494 allocString(uint32_t length) {
    495     uint32_t top=stringTop+length;
    496     char *p;
    497 
    498     if(top>STRING_STORE_SIZE) {
    499         fprintf(stderr, "gencmn: out of memory\n");
    500         exit(U_MEMORY_ALLOCATION_ERROR);
    501     }
    502     p=stringStore+stringTop;
    503     stringTop=top;
    504     return p;
    505 }
    506 
    507 static char *
    508 pathToFullPath(const char *path, const char *source) {
    509     int32_t length;
    510     int32_t newLength;
    511     char *fullPath;
    512     int32_t n;
    513 
    514     length = (uint32_t)(uprv_strlen(path) + 1);
    515     newLength = (length + 1 + (int32_t)uprv_strlen(source));
    516     fullPath = uprv_malloc(newLength);
    517     if(source != NULL) {
    518         uprv_strcpy(fullPath, source);
    519         uprv_strcat(fullPath, U_FILE_SEP_STRING);
    520     } else {
    521         fullPath[0] = 0;
    522     }
    523     n = (int32_t)uprv_strlen(fullPath);
    524     fullPath[n] = 0;       /* Suppress compiler warning for unused variable n    */
    525                            /*  when conditional code below is not compiled.      */
    526     uprv_strcat(fullPath, path);
    527 
    528 #if (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    529 #if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR)
    530     /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
    531     for(;fullPath[n];n++) {
    532         if(fullPath[n] == U_FILE_ALT_SEP_CHAR) {
    533             fullPath[n] = U_FILE_SEP_CHAR;
    534         }
    535     }
    536 #endif
    537 #endif
    538 #if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    539     /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
    540     for(;fullPath[n];n++) {
    541         if(fullPath[n] == U_TREE_ENTRY_SEP_CHAR) {
    542             fullPath[n] = U_FILE_SEP_CHAR;
    543         }
    544     }
    545 #endif
    546     return fullPath;
    547 }
    548 
    549 static int
    550 compareFiles(const void *file1, const void *file2) {
    551     /* sort by basename */
    552     return uprv_strcmp(((File *)file1)->basename, ((File *)file2)->basename);
    553 }
    554 
    555 static void
    556 fixDirToTreePath(char *s)
    557 {
    558 #if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) || ((U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR))
    559     char *t;
    560 #endif
    561 #if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    562     for(t=s;t=uprv_strchr(t,U_FILE_SEP_CHAR);) {
    563         *t = U_TREE_ENTRY_SEP_CHAR;
    564     }
    565 #endif
    566 #if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
    567     for(t=s;t=uprv_strchr(t,U_FILE_ALT_SEP_CHAR);) {
    568         *t = U_TREE_ENTRY_SEP_CHAR;
    569     }
    570 #endif
    571 }
    572