Home | History | Annotate | Download | only in makeconv
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  ********************************************************************************
      5  *
      6  *   Copyright (C) 1998-2015, International Business Machines
      7  *   Corporation and others.  All Rights Reserved.
      8  *
      9  ********************************************************************************
     10  *
     11  *
     12  *  makeconv.cpp:
     13  *  tool creating a binary (compressed) representation of the conversion mapping
     14  *  table (IBM NLTC ucmap format).
     15  *
     16  *  05/04/2000    helena     Added fallback mapping into the picture...
     17  *  06/29/2000  helena      Major rewrite of the callback APIs.
     18  */
     19 
     20 #include <stdio.h>
     21 #include "unicode/putil.h"
     22 #include "unicode/ucnv_err.h"
     23 #include "charstr.h"
     24 #include "ucnv_bld.h"
     25 #include "ucnv_imp.h"
     26 #include "ucnv_cnv.h"
     27 #include "cstring.h"
     28 #include "cmemory.h"
     29 #include "uinvchar.h"
     30 #include "filestrm.h"
     31 #include "toolutil.h"
     32 #include "uoptions.h"
     33 #include "unicode/udata.h"
     34 #include "unewdata.h"
     35 #include "uparse.h"
     36 #include "ucm.h"
     37 #include "makeconv.h"
     38 #include "genmbcs.h"
     39 
/* Set to 1 to make main() print the input files as it processes them. */
#define DEBUG 0
     41 
     42 typedef struct ConvData {
     43     UCMFile *ucm;
     44     NewConverter *cnvData, *extData;
     45     UConverterSharedData sharedData;
     46     UConverterStaticData staticData;
     47 } ConvData;
     48 
     49 static void
     50 initConvData(ConvData *data) {
     51     uprv_memset(data, 0, sizeof(ConvData));
     52     data->sharedData.structSize=sizeof(UConverterSharedData);
     53     data->staticData.structSize=sizeof(UConverterStaticData);
     54     data->sharedData.staticData=&data->staticData;
     55 }
     56 
     57 static void
     58 cleanupConvData(ConvData *data) {
     59     if(data!=NULL) {
     60         if(data->cnvData!=NULL) {
     61             data->cnvData->close(data->cnvData);
     62             data->cnvData=NULL;
     63         }
     64         if(data->extData!=NULL) {
     65             data->extData->close(data->extData);
     66             data->extData=NULL;
     67         }
     68         ucm_close(data->ucm);
     69         data->ucm=NULL;
     70     }
     71 }
     72 
/*
 * from ucnvstat.c - static prototypes of data-based converters
 *
 * readHeader() indexes this array by conversion type to fill in header
 * fields that the .ucm file leaves unset.
 */
U_CAPI const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES];
     77 
/*
 * Global flags; main() sets them from the command line options.
 */
UBool VERBOSE = FALSE;            /* --verbose: print progress messages */
UBool QUIET = FALSE;              /* --quiet: suppresses the name-mismatch warning in main() */
UBool SMALL = FALSE;              /* --small: set in main(), not read in this file */
UBool IGNORE_SISO_CHECK = FALSE;  /* --ignore-siso-check: passed to ucm_processStates() */
     85 
     86 static void
     87 createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
     88 
     89 /*
 * Set up the UNewData and write the converter.
     91  */
     92 static void
     93 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status);
     94 
/*
 * Whether writeConverterData() passes U_COPYRIGHT_STRING to udata_create();
 * main() overwrites this with the state of the --copyright option.
 */
UBool haveCopyright=TRUE;
     96 
/*
 * Data header description handed to udata_create() in writeConverterData().
 * main() copies the ICU version into dataVersion before any file is written,
 * and prints formatVersion[0].formatVersion[1] for --version.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    sizeof(UChar),
    0,

    {0x63, 0x6e, 0x76, 0x74},     /* dataFormat="cnvt" */
    {6, 2, 0, 0},                 /* formatVersion */
    {0, 0, 0, 0}                  /* dataVersion (calculated at runtime) */
};
    110 
    111 static void
    112 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status)
    113 {
    114     UNewDataMemory *mem = NULL;
    115     uint32_t sz2;
    116     uint32_t size = 0;
    117     int32_t tableType;
    118 
    119     if(U_FAILURE(*status))
    120       {
    121         return;
    122       }
    123 
    124     tableType=TABLE_NONE;
    125     if(data->cnvData!=NULL) {
    126         tableType|=TABLE_BASE;
    127     }
    128     if(data->extData!=NULL) {
    129         tableType|=TABLE_EXT;
    130     }
    131 
    132     mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status);
    133 
    134     if(U_FAILURE(*status))
    135       {
    136         fprintf(stderr, "Couldn't create the udata %s.%s: %s\n",
    137                 cnvName,
    138                 "cnv",
    139                 u_errorName(*status));
    140         return;
    141       }
    142 
    143     if(VERBOSE)
    144       {
    145         printf("- Opened udata %s.%s\n", cnvName, "cnv");
    146       }
    147 
    148 
    149     /* all read only, clean, platform independent data.  Mmmm. :)  */
    150     udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData));
    151     size += sizeof(UConverterStaticData); /* Is 4-aligned  - by size */
    152     /* Now, write the table */
    153     if(tableType&TABLE_BASE) {
    154         size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType);
    155     }
    156     if(tableType&TABLE_EXT) {
    157         size += data->extData->write(data->extData, &data->staticData, mem, tableType);
    158     }
    159 
    160     sz2 = udata_finish(mem, status);
    161     if(size != sz2)
    162     {
    163         fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size);
    164         *status=U_INTERNAL_PROGRAM_ERROR;
    165     }
    166     if(VERBOSE)
    167     {
    168       printf("- Wrote %u bytes to the udata.\n", (int)sz2);
    169     }
    170 }
    171 
/*
 * Indexes into options[] below; the two lists must be kept in the same order.
 */
enum {
    OPT_HELP_H,
    OPT_HELP_QUESTION_MARK,
    OPT_COPYRIGHT,
    OPT_VERSION,
    OPT_DESTDIR,
    OPT_VERBOSE,
    OPT_SMALL,
    OPT_IGNORE_SISO_CHECK,
    OPT_QUIET,

    OPT_COUNT
};
    185 
/*
 * Command line options, parsed by u_parseArgs() in main().
 * Entries are addressed with the OPT_* constants above, so the order here
 * must match that enum.
 */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_COPYRIGHT,
    UOPTION_VERSION,
    UOPTION_DESTDIR,
    UOPTION_VERBOSE,
    /* the usage text in main() lists the next two with a long name only */
    { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
    { "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
    UOPTION_QUIET,
};
    197 
    198 int main(int argc, char* argv[])
    199 {
    200     ConvData data;
    201     char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
    202 
    203     U_MAIN_INIT_ARGS(argc, argv);
    204 
    205     /* Set up the ICU version number */
    206     UVersionInfo icuVersion;
    207     u_getVersion(icuVersion);
    208     uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
    209 
    210     /* preset then read command line options */
    211     options[OPT_DESTDIR].value=u_getDataDirectory();
    212     argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
    213 
    214     /* error handling, printing usage message */
    215     if(argc<0) {
    216         fprintf(stderr,
    217             "error in command line argument \"%s\"\n",
    218             argv[-argc]);
    219     } else if(argc<2) {
    220         argc=-1;
    221     }
    222     if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
    223         FILE *stdfile=argc<0 ? stderr : stdout;
    224         fprintf(stdfile,
    225             "usage: %s [-options] files...\n"
    226             "\tread .ucm codepage mapping files and write .cnv files\n"
    227             "options:\n"
    228             "\t-h or -? or --help  this usage text\n"
    229             "\t-V or --version     show a version message\n"
    230             "\t-c or --copyright   include a copyright notice\n"
    231             "\t-d or --destdir     destination directory, followed by the path\n"
    232             "\t-v or --verbose     Turn on verbose output\n"
    233             "\t-q or --quiet       do not display warnings and progress\n",
    234             argv[0]);
    235         fprintf(stdfile,
    236             "\t      --small       Generate smaller .cnv files. They will be\n"
    237             "\t                    significantly smaller but may not be compatible with\n"
    238             "\t                    older versions of ICU and will require heap memory\n"
    239             "\t                    allocation when loaded.\n"
    240             "\t      --ignore-siso-check         Use SI/SO other than 0xf/0xe.\n");
    241         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    242     }
    243 
    244     if(options[OPT_VERSION].doesOccur) {
    245         printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
    246                dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
    247         printf("%s\n", U_COPYRIGHT_STRING);
    248         exit(0);
    249     }
    250 
    251     /* get the options values */
    252     haveCopyright = options[OPT_COPYRIGHT].doesOccur;
    253     const char *destdir = options[OPT_DESTDIR].value;
    254     VERBOSE = options[OPT_VERBOSE].doesOccur;
    255     QUIET = options[OPT_QUIET].doesOccur;
    256     SMALL = options[OPT_SMALL].doesOccur;
    257 
    258     if (options[OPT_IGNORE_SISO_CHECK].doesOccur) {
    259         IGNORE_SISO_CHECK = TRUE;
    260     }
    261 
    262     icu::CharString outFileName;
    263     UErrorCode err = U_ZERO_ERROR;
    264     if (destdir != NULL && *destdir != 0) {
    265         outFileName.append(destdir, err).ensureEndsWithFileSeparator(err);
    266         if (U_FAILURE(err)) {
    267             return err;
    268         }
    269     }
    270     int32_t outBasenameStart = outFileName.length();
    271 
    272 #if DEBUG
    273     {
    274       int i;
    275       printf("makeconv: processing %d files...\n", argc - 1);
    276       for(i=1; i<argc; ++i) {
    277         printf("%s ", argv[i]);
    278       }
    279       printf("\n");
    280       fflush(stdout);
    281     }
    282 #endif
    283 
    284     UBool printFilename = (UBool) (argc > 2 || VERBOSE);
    285     for (++argv; --argc; ++argv)
    286     {
    287         UErrorCode localError = U_ZERO_ERROR;
    288         const char *arg = getLongPathname(*argv);
    289 
    290         /*produces the right destination path for display*/
    291         outFileName.truncate(outBasenameStart);
    292         if (outBasenameStart != 0)
    293         {
    294             /* find the last file sepator */
    295             const char *basename = findBasename(arg);
    296             outFileName.append(basename, localError);
    297         }
    298         else
    299         {
    300             outFileName.append(arg, localError);
    301         }
    302         if (U_FAILURE(localError)) {
    303             return localError;
    304         }
    305 
    306         /*removes the extension if any is found*/
    307         int32_t lastDotIndex = outFileName.lastIndexOf('.');
    308         if (lastDotIndex >= outBasenameStart) {
    309             outFileName.truncate(lastDotIndex);
    310         }
    311 
    312         /* the basename without extension is the converter name */
    313         if ((outFileName.length() - outBasenameStart) >= UPRV_LENGTHOF(cnvName)) {
    314             fprintf(stderr, "converter name %s too long\n", outFileName.data() + outBasenameStart);
    315             return U_BUFFER_OVERFLOW_ERROR;
    316         }
    317         uprv_strcpy(cnvName, outFileName.data() + outBasenameStart);
    318 
    319         /*Adds the target extension*/
    320         outFileName.append(CONVERTER_FILE_EXTENSION, localError);
    321         if (U_FAILURE(localError)) {
    322             return localError;
    323         }
    324 
    325 #if DEBUG
    326         printf("makeconv: processing %s  ...\n", arg);
    327         fflush(stdout);
    328 #endif
    329         initConvData(&data);
    330         createConverter(&data, arg, &localError);
    331 
    332         if (U_FAILURE(localError))
    333         {
    334             /* if an error is found, print out an error msg and keep going */
    335             fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n",
    336                     outFileName.data(), arg, u_errorName(localError));
    337             if(U_SUCCESS(err)) {
    338                 err = localError;
    339             }
    340         }
    341         else
    342         {
    343             /* Insure the static data name matches the  file name */
    344             /* Changed to ignore directory and only compare base name
    345              LDH 1/2/08*/
    346             char *p;
    347             p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */
    348 
    349             if(p == NULL)            /* OK, try alternate */
    350             {
    351                 p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR);
    352                 if(p == NULL)
    353                 {
    354                     p=cnvName; /* If no separators, no problem */
    355                 }
    356             }
    357             else
    358             {
    359                 p++;   /* If found separator, don't include it in compare */
    360             }
    361             if(uprv_stricmp(p,data.staticData.name) && !QUIET)
    362             {
    363                 fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
    364                     cnvName,  CONVERTER_FILE_EXTENSION,
    365                     data.staticData.name);
    366             }
    367 
    368             uprv_strcpy((char*)data.staticData.name, cnvName);
    369 
    370             if(!uprv_isInvariantString((char*)data.staticData.name, -1)) {
    371                 fprintf(stderr,
    372                     "Error: A converter name must contain only invariant characters.\n"
    373                     "%s is not a valid converter name.\n",
    374                     data.staticData.name);
    375                 if(U_SUCCESS(err)) {
    376                     err = U_INVALID_TABLE_FORMAT;
    377                 }
    378             }
    379 
    380             localError = U_ZERO_ERROR;
    381             writeConverterData(&data, cnvName, destdir, &localError);
    382 
    383             if(U_FAILURE(localError))
    384             {
    385                 /* if an error is found, print out an error msg and keep going*/
    386                 fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName.data(), arg,
    387                     u_errorName(localError));
    388                 if(U_SUCCESS(err)) {
    389                     err = localError;
    390                 }
    391             }
    392             else if (printFilename)
    393             {
    394                 puts(outFileName.data() + outBasenameStart);
    395             }
    396         }
    397         fflush(stdout);
    398         fflush(stderr);
    399 
    400         cleanupConvData(&data);
    401     }
    402 
    403     return err;
    404 }
    405 
    406 static void
    407 getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) {
    408     if( (name[0]=='i' || name[0]=='I') &&
    409         (name[1]=='b' || name[1]=='B') &&
    410         (name[2]=='m' || name[2]=='M')
    411     ) {
    412         name+=3;
    413         if(*name=='-') {
    414             ++name;
    415         }
    416         *pPlatform=UCNV_IBM;
    417         *pCCSID=(int32_t)uprv_strtoul(name, NULL, 10);
    418     } else {
    419         *pPlatform=UCNV_UNKNOWN;
    420         *pCCSID=0;
    421     }
    422 }
    423 
    424 static void
    425 readHeader(ConvData *data,
    426            FileStream* convFile,
    427            UErrorCode *pErrorCode) {
    428     char line[1024];
    429     char *s, *key, *value;
    430     const UConverterStaticData *prototype;
    431     UConverterStaticData *staticData;
    432 
    433     if(U_FAILURE(*pErrorCode)) {
    434         return;
    435     }
    436 
    437     staticData=&data->staticData;
    438     staticData->platform=UCNV_IBM;
    439     staticData->subCharLen=0;
    440 
    441     while(T_FileStream_readLine(convFile, line, sizeof(line))) {
    442         /* basic parsing and handling of state-related items */
    443         if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) {
    444             continue;
    445         }
    446 
    447         /* stop at the beginning of the mapping section */
    448         if(uprv_strcmp(line, "CHARMAP")==0) {
    449             break;
    450         }
    451 
    452         /* collect the information from the header field, ignore unknown keys */
    453         if(uprv_strcmp(key, "code_set_name")==0) {
    454             if(*value!=0) {
    455                 uprv_strcpy((char *)staticData->name, value);
    456                 getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage);
    457             }
    458         } else if(uprv_strcmp(key, "subchar")==0) {
    459             uint8_t bytes[UCNV_EXT_MAX_BYTES];
    460             int8_t length;
    461 
    462             s=value;
    463             length=ucm_parseBytes(bytes, line, (const char **)&s);
    464             if(1<=length && length<=4 && *s==0) {
    465                 staticData->subCharLen=length;
    466                 uprv_memcpy(staticData->subChar, bytes, length);
    467             } else {
    468                 fprintf(stderr, "error: illegal <subchar> %s\n", value);
    469                 *pErrorCode=U_INVALID_TABLE_FORMAT;
    470                 return;
    471             }
    472         } else if(uprv_strcmp(key, "subchar1")==0) {
    473             uint8_t bytes[UCNV_EXT_MAX_BYTES];
    474 
    475             s=value;
    476             if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) {
    477                 staticData->subChar1=bytes[0];
    478             } else {
    479                 fprintf(stderr, "error: illegal <subchar1> %s\n", value);
    480                 *pErrorCode=U_INVALID_TABLE_FORMAT;
    481                 return;
    482             }
    483         }
    484     }
    485 
    486     /* copy values from the UCMFile to the static data */
    487     staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength;
    488     staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength;
    489     staticData->conversionType=data->ucm->states.conversionType;
    490 
    491     if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
    492         fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
    493         *pErrorCode=U_INVALID_TABLE_FORMAT;
    494         return;
    495     }
    496 
    497     /*
    498      * Now that we know the type, copy any 'default' values from the table.
    499      * We need not check the type any further because the parser only
    500      * recognizes what we have prototypes for.
    501      *
    502      * For delta (extension-only) tables, copy values from the base file
    503      * instead, see createConverter().
    504      */
    505     if(data->ucm->baseName[0]==0) {
    506         prototype=ucnv_converterStaticData[staticData->conversionType];
    507         if(prototype!=NULL) {
    508             if(staticData->name[0]==0) {
    509                 uprv_strcpy((char *)staticData->name, prototype->name);
    510             }
    511 
    512             if(staticData->codepage==0) {
    513                 staticData->codepage=prototype->codepage;
    514             }
    515 
    516             if(staticData->platform==0) {
    517                 staticData->platform=prototype->platform;
    518             }
    519 
    520             if(staticData->minBytesPerChar==0) {
    521                 staticData->minBytesPerChar=prototype->minBytesPerChar;
    522             }
    523 
    524             if(staticData->maxBytesPerChar==0) {
    525                 staticData->maxBytesPerChar=prototype->maxBytesPerChar;
    526             }
    527 
    528             if(staticData->subCharLen==0) {
    529                 staticData->subCharLen=prototype->subCharLen;
    530                 if(prototype->subCharLen>0) {
    531                     uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
    532                 }
    533             }
    534         }
    535     }
    536 
    537     if(data->ucm->states.outputType<0) {
    538         data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1;
    539     }
    540 
    541     if( staticData->subChar1!=0 &&
    542             (staticData->minBytesPerChar>1 ||
    543                 (staticData->conversionType!=UCNV_MBCS &&
    544                  staticData->conversionType!=UCNV_EBCDIC_STATEFUL))
    545     ) {
    546         fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
    547         *pErrorCode=U_INVALID_TABLE_FORMAT;
    548     }
    549 }
    550 
    551 /* return TRUE if a base table was read, FALSE for an extension table */
    552 static UBool
    553 readFile(ConvData *data, const char* converterName,
    554          UErrorCode *pErrorCode) {
    555     char line[1024];
    556     char *end;
    557     FileStream *convFile;
    558 
    559     UCMStates *baseStates;
    560     UBool dataIsBase;
    561 
    562     if(U_FAILURE(*pErrorCode)) {
    563         return FALSE;
    564     }
    565 
    566     data->ucm=ucm_open();
    567 
    568     convFile=T_FileStream_open(converterName, "r");
    569     if(convFile==NULL) {
    570         *pErrorCode=U_FILE_ACCESS_ERROR;
    571         return FALSE;
    572     }
    573 
    574     readHeader(data, convFile, pErrorCode);
    575     if(U_FAILURE(*pErrorCode)) {
    576         return FALSE;
    577     }
    578 
    579     if(data->ucm->baseName[0]==0) {
    580         dataIsBase=TRUE;
    581         baseStates=&data->ucm->states;
    582         ucm_processStates(baseStates, IGNORE_SISO_CHECK);
    583     } else {
    584         dataIsBase=FALSE;
    585         baseStates=NULL;
    586     }
    587 
    588     /* read the base table */
    589     ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
    590     if(U_FAILURE(*pErrorCode)) {
    591         return FALSE;
    592     }
    593 
    594     /* read an extension table if there is one */
    595     while(T_FileStream_readLine(convFile, line, sizeof(line))) {
    596         end=uprv_strchr(line, 0);
    597         while(line<end &&
    598               (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
    599             --end;
    600         }
    601         *end=0;
    602 
    603         if(line[0]=='#' || u_skipWhitespace(line)==end) {
    604             continue; /* ignore empty and comment lines */
    605         }
    606 
    607         if(0==uprv_strcmp(line, "CHARMAP")) {
    608             /* read the extension table */
    609             ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode);
    610         } else {
    611             fprintf(stderr, "unexpected text after the base mapping table\n");
    612         }
    613         break;
    614     }
    615 
    616     T_FileStream_close(convFile);
    617 
    618     if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
    619         fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
    620         *pErrorCode=U_INVALID_TABLE_FORMAT;
    621     }
    622 
    623     return dataIsBase;
    624 }
    625 
    626 static void
    627 createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) {
    628     ConvData baseData;
    629     UBool dataIsBase;
    630 
    631     UConverterStaticData *staticData;
    632     UCMStates *states, *baseStates;
    633 
    634     if(U_FAILURE(*pErrorCode)) {
    635         return;
    636     }
    637 
    638     initConvData(data);
    639 
    640     dataIsBase=readFile(data, converterName, pErrorCode);
    641     if(U_FAILURE(*pErrorCode)) {
    642         return;
    643     }
    644 
    645     staticData=&data->staticData;
    646     states=&data->ucm->states;
    647 
    648     if(dataIsBase) {
    649         /*
    650          * Build a normal .cnv file with a base table
    651          * and an optional extension table.
    652          */
    653         data->cnvData=MBCSOpen(data->ucm);
    654         if(data->cnvData==NULL) {
    655             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    656 
    657         } else if(!data->cnvData->isValid(data->cnvData,
    658                             staticData->subChar, staticData->subCharLen)
    659         ) {
    660             fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
    661             *pErrorCode=U_INVALID_TABLE_FORMAT;
    662 
    663         } else if(staticData->subChar1!=0 &&
    664                     !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1)
    665         ) {
    666             fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
    667             *pErrorCode=U_INVALID_TABLE_FORMAT;
    668 
    669         } else if(
    670             data->ucm->ext->mappingsLength>0 &&
    671             !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
    672         ) {
    673             *pErrorCode=U_INVALID_TABLE_FORMAT;
    674         } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
    675             /* sort the table so that it can be turned into UTF-8-friendly data */
    676             ucm_sortTable(data->ucm->base);
    677         }
    678 
    679         if(U_SUCCESS(*pErrorCode)) {
    680             if(
    681                 /* add the base table after ucm_checkBaseExt()! */
    682                 !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
    683             ) {
    684                 *pErrorCode=U_INVALID_TABLE_FORMAT;
    685             } else {
    686                 /*
    687                  * addTable() may have requested moving more mappings to the extension table
    688                  * if they fit into the base toUnicode table but not into the
    689                  * base fromUnicode table.
    690                  * (Especially for UTF-8-friendly fromUnicode tables.)
    691                  * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
    692                  * to be excluded from the extension toUnicode data.
    693                  * See MBCSOkForBaseFromUnicode() for which mappings do not fit into
    694                  * the base fromUnicode table.
    695                  */
    696                 ucm_moveMappings(data->ucm->base, data->ucm->ext);
    697                 ucm_sortTable(data->ucm->ext);
    698                 if(data->ucm->ext->mappingsLength>0) {
    699                     /* prepare the extension table, if there is one */
    700                     data->extData=CnvExtOpen(data->ucm);
    701                     if(data->extData==NULL) {
    702                         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    703                     } else if(
    704                         !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
    705                     ) {
    706                         *pErrorCode=U_INVALID_TABLE_FORMAT;
    707                     }
    708                 }
    709             }
    710         }
    711     } else {
    712         /* Build an extension-only .cnv file. */
    713         char baseFilename[500];
    714         char *basename;
    715 
    716         initConvData(&baseData);
    717 
    718         /* assemble a path/filename for data->ucm->baseName */
    719         uprv_strcpy(baseFilename, converterName);
    720         basename=(char *)findBasename(baseFilename);
    721         uprv_strcpy(basename, data->ucm->baseName);
    722         uprv_strcat(basename, ".ucm");
    723 
    724         /* read the base table */
    725         dataIsBase=readFile(&baseData, baseFilename, pErrorCode);
    726         if(U_FAILURE(*pErrorCode)) {
    727             return;
    728         } else if(!dataIsBase) {
    729             fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename);
    730             *pErrorCode=U_INVALID_TABLE_FORMAT;
    731         } else {
    732             /* prepare the extension table */
    733             data->extData=CnvExtOpen(data->ucm);
    734             if(data->extData==NULL) {
    735                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    736             } else {
    737                 /* fill in gaps in extension file header fields */
    738                 UCMapping *m, *mLimit;
    739                 uint8_t fallbackFlags;
    740 
    741                 baseStates=&baseData.ucm->states;
    742                 if(states->conversionType==UCNV_DBCS) {
    743                     staticData->minBytesPerChar=(int8_t)(states->minCharLength=2);
    744                 } else if(states->minCharLength==0) {
    745                     staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength);
    746                 }
    747                 if(states->maxCharLength<states->minCharLength) {
    748                     staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength);
    749                 }
    750 
    751                 if(staticData->subCharLen==0) {
    752                     uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4);
    753                     staticData->subCharLen=baseData.staticData.subCharLen;
    754                 }
    755                 /*
    756                  * do not copy subChar1 -
    757                  * only use what is explicitly specified
    758                  * because it cannot be unset in the extension file header
    759                  */
    760 
    761                 /* get the fallback flags */
    762                 fallbackFlags=0;
    763                 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
    764                     m<mLimit && fallbackFlags!=3;
    765                     ++m
    766                 ) {
    767                     if(m->f==1) {
    768                         fallbackFlags|=1;
    769                     } else if(m->f==3) {
    770                         fallbackFlags|=2;
    771                     }
    772                 }
    773 
    774                 if(fallbackFlags&1) {
    775                     staticData->hasFromUnicodeFallback=TRUE;
    776                 }
    777                 if(fallbackFlags&2) {
    778                     staticData->hasToUnicodeFallback=TRUE;
    779                 }
    780 
    781                 if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) {
    782                     fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
    783                     *pErrorCode=U_INVALID_TABLE_FORMAT;
    784 
    785                 } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
    786                     fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
    787                     *pErrorCode=U_INVALID_TABLE_FORMAT;
    788 
    789                 } else if(
    790                     !ucm_checkValidity(data->ucm->ext, baseStates) ||
    791                     !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
    792                 ) {
    793                     *pErrorCode=U_INVALID_TABLE_FORMAT;
    794                 } else {
    795                     if(states->maxCharLength>1) {
    796                         /*
    797                          * When building a normal .cnv file with a base table
    798                          * for an MBCS (not SBCS) table with explicit precision flags,
    799                          * the MBCSAddTable() function marks some mappings for moving
    800                          * to the extension table.
    801                          * They fit into the base toUnicode table but not into the
    802                          * base fromUnicode table.
    803                          * (Note: We do have explicit precision flags because they are
    804                          * required for extension table generation, and
    805                          * ucm_checkBaseExt() verified it.)
    806                          *
    807                          * We do not call MBCSAddTable() here (we probably could)
    808                          * so we need to do the analysis before building the extension table.
    809                          * We assume that MBCSAddTable() will build a UTF-8-friendly table.
    810                          * Redundant mappings in the extension table are ok except they cost some size.
    811                          *
    812                          * Do this after ucm_checkBaseExt().
    813                          */
    814                         const MBCSData *mbcsData=MBCSGetDummy();
    815                         int32_t needsMove=0;
    816                         for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
    817                             m<mLimit;
    818                             ++m
    819                         ) {
    820                             if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
    821                                 m->f|=MBCS_FROM_U_EXT_FLAG;
    822                                 m->moveFlag=UCM_MOVE_TO_EXT;
    823                                 ++needsMove;
    824                             }
    825                         }
    826 
    827                         if(needsMove!=0) {
    828                             ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
    829                             ucm_sortTable(data->ucm->ext);
    830                         }
    831                     }
    832                     if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
    833                         *pErrorCode=U_INVALID_TABLE_FORMAT;
    834                     }
    835                 }
    836             }
    837         }
    838 
    839         cleanupConvData(&baseData);
    840     }
    841 }
    842 
    843 /*
    844  * Hey, Emacs, please set the following:
    845  *
    846  * Local Variables:
    847  * indent-tabs-mode: nil
    848  * End:
    849  *
    850  */
    851