Home | History | Annotate | Download | only in makeconv
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2000-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  genmbcs.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2000jul06
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include <stdio.h>
     20 #include "unicode/utypes.h"
     21 #include "cstring.h"
     22 #include "cmemory.h"
     23 #include "unewdata.h"
     24 #include "ucnv_cnv.h"
     25 #include "ucnvmbcs.h"
     26 #include "ucm.h"
     27 #include "makeconv.h"
     28 #include "genmbcs.h"
     29 
     30 /*
     31  * TODO: Split this file into toUnicode, SBCSFromUnicode and MBCSFromUnicode files.
     32  * Reduce tests for maxCharLength.
     33  */
     34 
     35 struct MBCSData {
     36     NewConverter newConverter;
     37 
     38     UCMFile *ucm;
     39 
     40     /* toUnicode (state table in ucm->states) */
     41     _MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT];
     42     int32_t countToUFallbacks;
     43     uint16_t *unicodeCodeUnits;
     44 
     45     /* fromUnicode */
     46     uint16_t stage1[MBCS_STAGE_1_SIZE];
     47     uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */
     48     uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */
     49     uint8_t *fromUBytes;
     50     uint32_t stage2Top, stage3Top;
     51 
     52     /* fromUTF8 */
     53     uint16_t stageUTF8[0x10000>>MBCS_UTF8_STAGE_SHIFT];  /* allow for utf8Max=0xffff */
     54 
     55     /*
     56      * Maximum UTF-8-friendly code point.
     57      * 0 if !utf8Friendly, otherwise 0x01ff..0xffff in steps of 0x100.
     58      * If utf8Friendly, utf8Max is normally either MBCS_UTF8_MAX or 0xffff.
     59      */
     60     uint16_t utf8Max;
     61 
     62     UBool utf8Friendly;
     63     UBool omitFromU;
     64 };
     65 
     66 /* prototypes */
     67 U_CDECL_BEGIN
     68 static void
     69 MBCSClose(NewConverter *cnvData);
     70 
     71 static UBool
     72 MBCSStartMappings(MBCSData *mbcsData);
     73 
     74 static UBool
     75 MBCSAddToUnicode(MBCSData *mbcsData,
     76                  const uint8_t *bytes, int32_t length,
     77                  UChar32 c,
     78                  int8_t flag);
     79 
     80 static UBool
     81 MBCSIsValid(NewConverter *cnvData,
     82             const uint8_t *bytes, int32_t length);
     83 
     84 static UBool
     85 MBCSSingleAddFromUnicode(MBCSData *mbcsData,
     86                          const uint8_t *bytes, int32_t length,
     87                          UChar32 c,
     88                          int8_t flag);
     89 
     90 static UBool
     91 MBCSAddFromUnicode(MBCSData *mbcsData,
     92                    const uint8_t *bytes, int32_t length,
     93                    UChar32 c,
     94                    int8_t flag);
     95 
     96 static void
     97 MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData);
     98 
     99 static UBool
    100 MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
    101 
    102 static uint32_t
    103 MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
    104           UNewDataMemory *pData, int32_t tableType);
    105 U_CDECL_END
    106 
    107 /* helper ------------------------------------------------------------------- */
    108 
    /* Convert a nibble value (0..15) into its lowercase hexadecimal digit. */
    static inline char
    hexDigit(uint8_t digit) {
        if(digit<10) {
            return (char)('0'+digit);
        }
        return (char)('a'+(digit-10));
    }
    113 
    /*
     * Write the bytes as lowercase hex text, two characters per byte, followed
     * by a terminating NUL. The caller must supply at least 2*length+1 chars.
     * Returns buffer so that the call can be used directly as a printf argument.
     */
    static inline char *
    printBytes(char *buffer, const uint8_t *bytes, int32_t length) {
        static const char digits[]="0123456789abcdef";
        char *out=buffer;

        for(int32_t k=0; k<length; ++k) {
            uint8_t b=bytes[k];
            *out++=digits[b>>4];   /* high nibble first */
            *out++=digits[b&0xf];
        }

        *out=0;
        return buffer;
    }
    127 
    128 /* implementation ----------------------------------------------------------- */
    129 
    130 static MBCSData gDummy;
    131 
    132 
    133 U_CFUNC const MBCSData *
    134 MBCSGetDummy() {
    135     uprv_memset(&gDummy, 0, sizeof(MBCSData));
    136 
    137     /*
    138      * Set "pessimistic" values which may sometimes move too many
    139      * mappings to the extension table (but never too few).
    140      * These values cause MBCSOkForBaseFromUnicode() to return FALSE for the
    141      * largest set of mappings.
    142      * Assume maxCharLength>1.
    143      */
    144     gDummy.utf8Friendly=TRUE;
    145     if(SMALL) {
    146         gDummy.utf8Max=0xffff;
    147         gDummy.omitFromU=TRUE;
    148     } else {
    149         gDummy.utf8Max=MBCS_UTF8_MAX;
    150     }
    151     return &gDummy;
    152 }
    153 
    154 static void
    155 MBCSInit(MBCSData *mbcsData, UCMFile *ucm) {
    156     uprv_memset(mbcsData, 0, sizeof(MBCSData));
    157 
    158     mbcsData->ucm=ucm; /* aliased, not owned */
    159 
    160     mbcsData->newConverter.close=MBCSClose;
    161     mbcsData->newConverter.isValid=MBCSIsValid;
    162     mbcsData->newConverter.addTable=MBCSAddTable;
    163     mbcsData->newConverter.write=MBCSWrite;
    164 }
    165 
    166 U_CFUNC NewConverter *
    167 MBCSOpen(UCMFile *ucm) {
    168     MBCSData *mbcsData=(MBCSData *)uprv_malloc(sizeof(MBCSData));
    169     if(mbcsData==NULL) {
    170         printf("out of memory\n");
    171         exit(U_MEMORY_ALLOCATION_ERROR);
    172     }
    173 
    174     MBCSInit(mbcsData, ucm);
    175     return &mbcsData->newConverter;
    176 }
    177 
    178 static void
    179 MBCSDestruct(MBCSData *mbcsData) {
    180     uprv_free(mbcsData->unicodeCodeUnits);
    181     uprv_free(mbcsData->fromUBytes);
    182 }
    183 
    184 U_CDECL_BEGIN
    185 static void
    186 MBCSClose(NewConverter *cnvData) {
    187     MBCSData *mbcsData=(MBCSData *)cnvData;
    188     if(mbcsData!=NULL) {
    189         MBCSDestruct(mbcsData);
    190         uprv_free(mbcsData);
    191     }
    192 }
    193 U_CDECL_END
    194 
    195 static UBool
    196 MBCSStartMappings(MBCSData *mbcsData) {
    197     int32_t i, sum, maxCharLength,
    198             stage2NullLength, stage2AllocLength,
    199             stage3NullLength, stage3AllocLength;
    200 
    201     /* toUnicode */
    202 
    203     /* allocate the code unit array and prefill it with "unassigned" values */
    204     sum=mbcsData->ucm->states.countToUCodeUnits;
    205     if(VERBOSE) {
    206         printf("the total number of offsets is 0x%lx=%ld\n", (long)sum, (long)sum);
    207     }
    208 
    209     if(sum>0) {
    210         mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
    211         if(mbcsData->unicodeCodeUnits==NULL) {
    212             fprintf(stderr, "error: out of memory allocating %ld 16-bit code units\n",
    213                 (long)sum);
    214             return FALSE;
    215         }
    216         for(i=0; i<sum; ++i) {
    217             mbcsData->unicodeCodeUnits[i]=0xfffe;
    218         }
    219     }
    220 
    221     /* fromUnicode */
    222     maxCharLength=mbcsData->ucm->states.maxCharLength;
    223 
    224     /* allocate the codepage mappings and preset the first 16 characters to 0 */
    225     if(maxCharLength==1) {
    226         /* allocate 64k 16-bit results for single-byte codepages */
    227         sum=0x20000;
    228     } else {
    229         /* allocate 1M * maxCharLength bytes for at most 1M mappings */
    230         sum=0x100000*maxCharLength;
    231     }
    232     mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum);
    233     if(mbcsData->fromUBytes==NULL) {
    234         fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", (long)sum);
    235         return FALSE;
    236     }
    237     uprv_memset(mbcsData->fromUBytes, 0, sum);
    238 
    239     /*
    240      * UTF-8-friendly fromUnicode tries: allocate multiple blocks at a time.
    241      * See ucnvmbcs.h for details.
    242      *
    243      * There is code, for example in ucnv_MBCSGetUnicodeSetForUnicode(), which
    244      * assumes that the initial stage 2/3 blocks are the all-unassigned ones.
    245      * Therefore, we refine the data structure while maintaining this placement
    246      * even though it would be convenient to allocate the ASCII block at the
    247      * beginning of stage 3, for example.
    248      *
    249      * UTF-8-friendly fromUnicode tries work from sorted tables and are built
    250      * pre-compacted, overlapping adjacent stage 2/3 blocks.
    251      * This is necessary because the block allocation and compaction changes
    252      * at SBCS_UTF8_MAX or MBCS_UTF8_MAX, and for MBCS tables the additional
    253      * stage table uses direct indexes into stage 3, without a multiplier and
    254      * thus with a smaller reach.
    255      *
    256      * Non-UTF-8-friendly fromUnicode tries work from unsorted tables
    257      * (because implicit precision is used), and are compacted
    258      * in post-processing.
    259      *
    260      * Preallocation for UTF-8-friendly fromUnicode tries:
    261      *
    262      * Stage 3:
    263      * 64-entry all-unassigned first block followed by ASCII (128 entries).
    264      *
    265      * Stage 2:
    266      * 64-entry all-unassigned first block followed by preallocated
    267      * 64-block for ASCII.
    268      */
    269 
    270     /* Preallocate ASCII as a linear 128-entry stage 3 block. */
    271     stage2NullLength=MBCS_STAGE_2_BLOCK_SIZE;
    272     stage2AllocLength=MBCS_STAGE_2_BLOCK_SIZE;
    273 
    274     stage3NullLength=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
    275     stage3AllocLength=128; /* ASCII U+0000..U+007f */
    276 
    277     /* Initialize stage 1 for the preallocated blocks. */
    278     sum=stage2NullLength;
    279     for(i=0; i<(stage2AllocLength>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT); ++i) {
    280         mbcsData->stage1[i]=sum;
    281         sum+=MBCS_STAGE_2_BLOCK_SIZE;
    282     }
    283     mbcsData->stage2Top=stage2NullLength+stage2AllocLength; /* ==sum */
    284 
    285     /*
    286      * Stage 2 indexes count 16-blocks in stage 3 as follows:
    287      * SBCS: directly, indexes increment by 16
    288      * MBCS: indexes need to be multiplied by 16*maxCharLength, indexes increment by 1
    289      * MBCS UTF-8: directly, indexes increment by 16
    290      */
    291     if(maxCharLength==1) {
    292         sum=stage3NullLength;
    293         for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
    294             mbcsData->stage2Single[mbcsData->stage1[0]+i]=sum;
    295             sum+=MBCS_STAGE_3_BLOCK_SIZE;
    296         }
    297     } else {
    298         sum=stage3NullLength/MBCS_STAGE_3_GRANULARITY;
    299         for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
    300             mbcsData->stage2[mbcsData->stage1[0]+i]=sum;
    301             sum+=MBCS_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_GRANULARITY;
    302         }
    303     }
    304 
    305     sum=stage3NullLength;
    306     for(i=0; i<(stage3AllocLength/MBCS_UTF8_STAGE_3_BLOCK_SIZE); ++i) {
    307         mbcsData->stageUTF8[i]=sum;
    308         sum+=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
    309     }
    310 
    311     /*
    312      * Allocate a 64-entry all-unassigned first stage 3 block,
    313      * for UTF-8-friendly lookup with a trail byte,
    314      * plus 128 entries for ASCII.
    315      */
    316     mbcsData->stage3Top=(stage3NullLength+stage3AllocLength)*maxCharLength; /* ==sum*maxCharLength */
    317 
    318     return TRUE;
    319 }
    320 
    321 /* return TRUE for success */
    322 static UBool
    323 setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) {
    324     int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
    325     if(i>=0) {
    326         /* if there is already a fallback for this offset, then overwrite it */
    327         mbcsData->toUFallbacks[i].codePoint=c;
    328         return TRUE;
    329     } else {
    330         /* if there is no fallback for this offset, then add one */
    331         i=mbcsData->countToUFallbacks;
    332         if(i>=MBCS_MAX_FALLBACK_COUNT) {
    333             fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", (int)c);
    334             return FALSE;
    335         } else {
    336             mbcsData->toUFallbacks[i].offset=offset;
    337             mbcsData->toUFallbacks[i].codePoint=c;
    338             mbcsData->countToUFallbacks=i+1;
    339             return TRUE;
    340         }
    341     }
    342 }
    343 
    344 /* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */
    345 static int32_t
    346 removeFallback(MBCSData *mbcsData, uint32_t offset) {
    347     int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
    348     if(i>=0) {
    349         _MBCSToUFallback *toUFallbacks;
    350         int32_t limit, old;
    351 
    352         toUFallbacks=mbcsData->toUFallbacks;
    353         limit=mbcsData->countToUFallbacks;
    354         old=(int32_t)toUFallbacks[i].codePoint;
    355 
    356         /* copy the last fallback entry here to keep the list contiguous */
    357         toUFallbacks[i].offset=toUFallbacks[limit-1].offset;
    358         toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint;
    359         mbcsData->countToUFallbacks=limit-1;
    360         return old;
    361     } else {
    362         return -1;
    363     }
    364 }
    365 
    366 /*
         * Record one toUnicode mapping: the codepage byte sequence
         * bytes[0..length) maps to the code point c.
         *
         * The function walks the state table in ucm->states the way conversion
         * does, starting in state 0 (or in state 1 for 2-byte sequences when the
         * output type is MBCS_OUTPUT_2_SISO), and stores c where the final state
         * table entry says it belongs:
         * - "direct" actions: in the state table entry itself
         * - MBCS_STATE_VALID_16: in unicodeCodeUnits[offset], or in the
         *   toUFallbacks list when flag>0
         * - MBCS_STATE_VALID_16_PAIR: in one or two units at unicodeCodeUnits[offset]
         *
         * flag is almost a boolean:
         * >0 (TRUE)  this is a fallback mapping
         * 0 (FALSE)  this is a precise mapping
         * <0         the precision of this mapping is not specified; a duplicate
         *            byte sequence is then only reported when VERBOSE, and the
         *            new value replaces the old one
         * The value 3 additionally selects a different "direct" action code, see
         * the note at the MBCS_ENTRY_FINAL_SET_ACTION() call below.
         *
         * Returns TRUE for success. Returns FALSE after printing a message to
         * stderr if there is no state table, if the byte sequence is too short or
         * too long for the state table, if it ends in an illegal, state-change-only,
         * unassigned or reserved action, if it duplicates an earlier mapping while
         * flag>=0, if c>=0x10000 for a valid-16 state, or if setFallback() fails.
         *
         * We assume that c<=0x10ffff.
    371  */
    372 static UBool
    373 MBCSAddToUnicode(MBCSData *mbcsData,
    374                  const uint8_t *bytes, int32_t length,
    375                  UChar32 c,
    376                  int8_t flag) {
            /* receives the hex text from printBytes() for messages: 2 chars per byte plus NUL */
    377     char buffer[10];
    378     uint32_t offset=0;
    379     int32_t i=0, entry, old;
    380     uint8_t state=0;
    381 
    382     if(mbcsData->ucm->states.countStates==0) {
    383         fprintf(stderr, "error: there is no state information!\n");
    384         return FALSE;
    385     }
    386 
    387     /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
    388     if(length==2 && mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO) {
    389         state=1;
    390     }
    391 
    392     /*
    393      * Walk down the state table like in conversion,
    394      * much like getNextUChar().
    395      * We assume that c<=0x10ffff.
    396      */
            /* the loop is left only through the return statements below */
    397     for(i=0;;) {
    398         entry=mbcsData->ucm->states.stateTable[state][bytes[i++]];
    399         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
                    /* a transition entry needs at least one more byte */
    400             if(i==length) {
    401                 fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n",
    402                     (short)state, printBytes(buffer, bytes, length), (int)c);
    403                 return FALSE;
    404             }
    405             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
    406             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
    407         } else {
                    /* a final entry must be reached exactly on the last byte */
    408             if(i<length) {
    409                 fprintf(stderr, "error: byte sequence too long by %d bytes, final state %u: 0x%s (U+%x)\n",
    410                     (int)(length-i), state, printBytes(buffer, bytes, length), (int)c);
    411                 return FALSE;
    412             }
    413             switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
    414             case MBCS_STATE_ILLEGAL:
    415                 fprintf(stderr, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n",
    416                     (int)c, printBytes(buffer, bytes, length));
    417                 return FALSE;
    418             case MBCS_STATE_CHANGE_ONLY:
    419                 fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n",
    420                     (int)c, printBytes(buffer, bytes, length));
    421                 return FALSE;
    422             case MBCS_STATE_UNASSIGNED:
    423                 fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n",
    424                     (int)c, printBytes(buffer, bytes, length));
    425                 return FALSE;
    426             case MBCS_STATE_FALLBACK_DIRECT_16:
    427             case MBCS_STATE_VALID_DIRECT_16:
    428             case MBCS_STATE_FALLBACK_DIRECT_20:
    429             case MBCS_STATE_VALID_DIRECT_20:
    430                 if(MBCS_ENTRY_SET_STATE(entry, 0)!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
    431                     /* the "direct" action's value is not "valid-direct-16-unassigned" any more */
                            /* recover the previously stored code point for the message */
    432                     if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_DIRECT_16 || MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_FALLBACK_DIRECT_16) {
    433                         old=MBCS_ENTRY_FINAL_VALUE(entry);
    434                     } else {
    435                         old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
    436                     }
    437                     if(flag>=0) {
    438                         fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
    439                             (int)c, printBytes(buffer, bytes, length), (int)old);
    440                         return FALSE;
    441                     } else if(VERBOSE) {
    442                         fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
    443                             (int)c, printBytes(buffer, bytes, length), (int)old);
    444                     }
    445                     /*
    446                      * Continue after the above warning
    447                      * if the precision of the mapping is unspecified.
    448                      */
    449                 }
    450                 /* reassign the correct action code */
                        /*
                         * The action is computed as VALID_DIRECT_16, +2 when flag==3,
                         * +1 when c is supplementary. NOTE(review): this relies on the
                         * numeric order of the four *_DIRECT_* action codes (presumably
                         * +1 is VALID_DIRECT_20 and +2/+3 are the FALLBACK variants);
                         * confirm against their definitions.
                         */
    451                 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(flag==3 ? 2 : 0)+(c>=0x10000 ? 1 : 0)));
    452 
    453                 /* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */
    454                 if(c<=0xffff) {
    455                     entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c);
    456                 } else {
    457                     entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000);
    458                 }
                        /* write the updated entry back; i was already advanced past the last byte */
    459                 mbcsData->ucm->states.stateTable[state][bytes[i-1]]=entry;
    460                 break;
    461             case MBCS_STATE_VALID_16:
    462                 /* bits 26..16 are not used, 0 */
    463                 /* bits 15..7 contain the final offset delta to one 16-bit code unit */
    464                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
    465                 /* check that this byte sequence is still unassigned */
                        /*
                         * old receives the existing code unit if there is one; otherwise
                         * removeFallback() is called, which deletes any fallback stored
                         * for this offset and yields its code point (or -1 if none).
                         */
    466                 if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) {
    467                     if(flag>=0) {
    468                         fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
    469                             (int)c, printBytes(buffer, bytes, length), (int)old);
    470                         return FALSE;
    471                     } else if(VERBOSE) {
    472                         fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
    473                             (int)c, printBytes(buffer, bytes, length), (int)old);
    474                     }
    475                 }
    476                 if(c>=0x10000) {
    477                     fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n",
    478                         (int)c, printBytes(buffer, bytes, length));
    479                     return FALSE;
    480                 }
    481                 if(flag>0) {
    482                     /* assign only if there is no precise mapping */
                            /* fallbacks go into the toUFallbacks list, not into the code unit array */
    483                     if(mbcsData->unicodeCodeUnits[offset]==0xfffe) {
    484                         return setFallback(mbcsData, offset, c);
    485                     }
    486                 } else {
    487                     mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
    488                 }
    489                 break;
    490             case MBCS_STATE_VALID_16_PAIR:
    491                 /* bits 26..16 are not used, 0 */
    492                 /* bits 15..7 contain the final offset delta to two 16-bit code units */
    493                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
    494                 /* check that this byte sequence is still unassigned */
    495                 old=mbcsData->unicodeCodeUnits[offset];
                        /* a first unit below 0xfffe means that something was stored here before */
    496                 if(old<0xfffe) {
                            /* decode the stored value (see the encodings written further below) for the message */
    497                     int32_t real;
    498                     if(old<0xd800) {
                                /* a single-unit BMP code point */
    499                         real=old;
    500                     } else if(old<=0xdfff) {
                                /* a supplementary code point stored as a pair; 10 bits come from each unit */
    501                         real=0x10000+((old&0x3ff)<<10)+((mbcsData->unicodeCodeUnits[offset+1])&0x3ff);
    502                     } else /* old<=0xe001 */ {
                                /* marker 0xe000 or 0xe001 followed by the BMP code point */
    503                         real=mbcsData->unicodeCodeUnits[offset+1];
    504                     }
    505                     if(flag>=0) {
    506                         fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
    507                             (int)c, printBytes(buffer, bytes, length), (int)real);
    508                         return FALSE;
    509                     } else if(VERBOSE) {
    510                         fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
    511                             (int)c, printBytes(buffer, bytes, length), (int)real);
    512                     }
    513                 }
    514                 if(flag>0) {
    515                     /* assign only if there is no precise mapping */
                            /*
                             * First units up to 0xdbff and the marker 0xe000 are exactly
                             * what the non-fallback branch below writes, so such a value
                             * is kept and the fallback is dropped.
                             */
    516                     if(old<=0xdbff || old==0xe000) {
    517                         /* do nothing */
    518                     } else if(c<=0xffff) {
    519                         /* set a BMP fallback code point as a pair with 0xe001 */
    520                         mbcsData->unicodeCodeUnits[offset++]=0xe001;
    521                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
    522                     } else {
    523                         /* set a fallback surrogate pair with two second surrogates */
    524                         mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xdbc0+(c>>10));
    525                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff));
    526                     }
    527                 } else {
    528                     if(c<0xd800) {
    529                         /* set a BMP code point */
    530                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
    531                     } else if(c<=0xffff) {
    532                         /* set a BMP code point above 0xd800 as a pair with 0xe000 */
    533                         mbcsData->unicodeCodeUnits[offset++]=0xe000;
    534                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
    535                     } else {
    536                         /* set a surrogate pair */
    537                         mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xd7c0+(c>>10));
    538                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff));
    539                     }
    540                 }
    541                 break;
    542             default:
    543                 /* reserved, must never occur */
    544                 fprintf(stderr, "internal error: byte sequence reached reserved action code, entry 0x%02x: 0x%s (U+%x)\n",
    545                     (int)entry, printBytes(buffer, bytes, length), (int)c);
    546                 return FALSE;
    547             }
    548 
    549             return TRUE;
    550         }
    551     }
    552 }
    553 
    554 U_CDECL_BEGIN
    555 /* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */
    556 static UBool
    557 MBCSIsValid(NewConverter *cnvData,
    558             const uint8_t *bytes, int32_t length) {
    559     MBCSData *mbcsData=(MBCSData *)cnvData;
    560 
    561     return (UBool)(1==ucm_countChars(&mbcsData->ucm->states, bytes, length));
    562 }
    563 U_CDECL_END
    /*
     * Record one fromUnicode mapping for a single-byte codepage:
     * code point c maps to the byte bytes[0] (the length parameter is unused).
     *
     * The function walks stage1 -> stage2Single -> stage 3 (fromUBytes viewed
     * as an array of 16-bit words) and allocates stage 2 and stage 3 blocks on
     * demand. For UTF-8-friendly tables a new block may start below the
     * current top, overlapping trailing zero entries of the previous block.
     *
     * The stage 3 word holds the byte in its low 8 bits, combined with
     * 0xf00 when flag<=0, 0xc00 for a fallback (flag>0) from a private-use
     * code point, and 0x800 for any other fallback.
     * Mappings with flag==2 are ignored, and TRUE is returned for them.
     *
     * Returns TRUE for success. Returns FALSE after printing a message to
     * stderr if stage 2 or stage 3 would overflow, or if c already had a
     * mapping while flag>=0. In the duplicate case the new value has already
     * been written to stage 3 when the error is reported.
     *
     * We assume that c<=0x10ffff.
     */
    564 static UBool
    565 MBCSSingleAddFromUnicode(MBCSData *mbcsData,
    566                          const uint8_t *bytes, int32_t /*length*/,
    567                          UChar32 c,
    568                          int8_t flag) {
    569     uint16_t *stage3, *p;
    570     uint32_t idx;
    571     uint16_t old;
    572     uint8_t b;
    573 
    574     uint32_t blockSize, newTop, i, nextOffset, newBlock, min;
    575 
    576     /* ignore |2 SUB mappings */
    577     if(flag==2) {
    578         return TRUE;
    579     }
    580 
    581     /*
    582      * Walk down the triple-stage compact array ("trie") and
    583      * allocate parts as necessary.
    584      * Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
    585      * We assume that length<=maxCharLength and that c<=0x10ffff.
    586      */
    587     stage3=(uint16_t *)mbcsData->fromUBytes;
    588     b=*bytes;
    589 
    590     /* inspect stage 1 */
    591     idx=c>>MBCS_STAGE_1_SHIFT;
            /*
             * nextOffset is the position inside the stage 2 block. In the
             * UTF-8-friendly range the low bits are masked off, so that the
             * offset addresses the first stage 2 entry of the larger stage 3 block
             * (the allocation loop below fills consecutive stage 2 entries from there).
             */
    592     if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
    593         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
    594     } else {
    595         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
    596     }
    597     if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
    598         /* allocate another block in stage 2 */
    599         newBlock=mbcsData->stage2Top;
    600         if(mbcsData->utf8Friendly) {
                    /*
                     * Move the block start down over trailing zero stage 2 entries,
                     * but by no more than nextOffset entries.
                     */
    601             min=newBlock-nextOffset; /* minimum block start with overlap */
    602             while(min<newBlock && mbcsData->stage2Single[newBlock-1]==0) {
    603                 --newBlock;
    604             }
    605         }
    606         newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
    607 
    608         if(newTop>MBCS_MAX_STAGE_2_TOP) {
    609             fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c, b);
    610             return FALSE;
    611         }
    612 
    613         /*
    614          * each stage 2 block contains 64 16-bit words:
    615          * 6 code point bits 9..4 with 1 stage 3 index
    616          */
    617         mbcsData->stage1[idx]=(uint16_t)newBlock;
    618         mbcsData->stage2Top=newTop;
    619     }
    620 
    621     /* inspect stage 2 */
            /* from here on idx is the stage 2 index and nextOffset the position inside the stage 3 block */
    622     idx=mbcsData->stage1[idx]+nextOffset;
    623     if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
    624         /* allocate 64-entry blocks for UTF-8-friendly lookup */
    625         blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
    626         nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
    627     } else {
    628         blockSize=MBCS_STAGE_3_BLOCK_SIZE;
    629         nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
    630     }
            /* a stage 2 value of 0 means that no stage 3 block has been allocated for this entry yet */
    631     if(mbcsData->stage2Single[idx]==0) {
    632         /* allocate another block in stage 3 */
    633         newBlock=mbcsData->stage3Top;
    634         if(mbcsData->utf8Friendly) {
                    /* same overlap rule as for stage 2, applied to trailing zero stage 3 words */
    635             min=newBlock-nextOffset; /* minimum block start with overlap */
    636             while(min<newBlock && stage3[newBlock-1]==0) {
    637                 --newBlock;
    638             }
    639         }
    640         newTop=newBlock+blockSize;
    641 
    642         if(newTop>MBCS_STAGE_3_SBCS_SIZE) {
    643             fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", (int)c, b);
    644             return FALSE;
    645         }
    646         /* each block has 16 uint16_t entries */
                /* one stage 2 entry is written per MBCS_STAGE_3_BLOCK_SIZE words of the new block */
    647         i=idx;
    648         while(newBlock<newTop) {
    649             mbcsData->stage2Single[i++]=(uint16_t)newBlock;
    650             newBlock+=MBCS_STAGE_3_BLOCK_SIZE;
    651         }
    652         mbcsData->stage3Top=newTop; /* ==newBlock */
    653     }
    654 
    655     /* write the codepage entry into stage 3 and get the previous entry */
    656     p=stage3+mbcsData->stage2Single[idx]+nextOffset;
    657     old=*p;
            /* the bits above the byte distinguish flag<=0, private-use fallbacks and other fallbacks */
    658     if(flag<=0) {
    659         *p=(uint16_t)(0xf00|b);
    660     } else if(IS_PRIVATE_USE(c)) {
    661         *p=(uint16_t)(0xc00|b);
    662     } else {
    663         *p=(uint16_t)(0x800|b);
    664     }
    665 
    666     /* check that this Unicode code point was still unassigned */
            /* every value written above is at least 0x800, so old>=0x100 indicates an earlier assignment */
    667     if(old>=0x100) {
    668         if(flag>=0) {
    669             fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
    670                 (int)c, b, old&0xff);
    671             return FALSE;
    672         } else if(VERBOSE) {
    673             fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
    674                 (int)c, b, old&0xff);
    675         }
    676         /* continue after the above warning if the precision of the mapping is unspecified */
    677     }
    678 
    679     return TRUE;
    680 }
    681 
    682 static UBool
    683 MBCSAddFromUnicode(MBCSData *mbcsData,
    684                    const uint8_t *bytes, int32_t length,
    685                    UChar32 c,
    686                    int8_t flag) {
    687     char buffer[10];
    688     const uint8_t *pb;
    689     uint8_t *stage3, *p;
    690     uint32_t idx, b, old, stage3Index;
    691     int32_t maxCharLength;
    692 
    693     uint32_t blockSize, newTop, i, nextOffset, newBlock, min, overlap, maxOverlap;
    694 
    695     maxCharLength=mbcsData->ucm->states.maxCharLength;
    696 
    697     if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO &&
    698         (!IGNORE_SISO_CHECK && (*bytes==0xe || *bytes==0xf))
    699     ) {
    700         fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n",
    701             (int)c, printBytes(buffer, bytes, length));
    702         return FALSE;
    703     }
    704 
    705     if(flag==1 && length==1 && *bytes==0) {
    706         fprintf(stderr, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n",
    707             (int)c, *bytes);
    708         return FALSE;
    709     }
    710 
    711     /*
    712      * Walk down the triple-stage compact array ("trie") and
    713      * allocate parts as necessary.
    714      * Note that the first stage 2 and 3 blocks are reserved for
    715      * all-unassigned mappings.
    716      * We assume that length<=maxCharLength and that c<=0x10ffff.
    717      */
    718     stage3=mbcsData->fromUBytes;
    719 
    720     /* inspect stage 1 */
    721     idx=c>>MBCS_STAGE_1_SHIFT;
    722     if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
    723         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
    724     } else {
    725         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
    726     }
    727     if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
    728         /* allocate another block in stage 2 */
    729         newBlock=mbcsData->stage2Top;
    730         if(mbcsData->utf8Friendly) {
    731             min=newBlock-nextOffset; /* minimum block start with overlap */
    732             while(min<newBlock && mbcsData->stage2[newBlock-1]==0) {
    733                 --newBlock;
    734             }
    735         }
    736         newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
    737 
    738         if(newTop>MBCS_MAX_STAGE_2_TOP) {
    739             fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n",
    740                 (int)c, printBytes(buffer, bytes, length));
    741             return FALSE;
    742         }
    743 
    744         /*
    745          * each stage 2 block contains 64 32-bit words:
    746          * 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index
    747          */
    748         i=idx;
    749         while(newBlock<newTop) {
    750             mbcsData->stage1[i++]=(uint16_t)newBlock;
    751             newBlock+=MBCS_STAGE_2_BLOCK_SIZE;
    752         }
    753         mbcsData->stage2Top=newTop; /* ==newBlock */
    754     }
    755 
    756     /* inspect stage 2 */
    757     idx=mbcsData->stage1[idx]+nextOffset;
    758     if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
    759         /* allocate 64-entry blocks for UTF-8-friendly lookup */
    760         blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE*maxCharLength;
    761         nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
    762     } else {
    763         blockSize=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
    764         nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
    765     }
    766     if(mbcsData->stage2[idx]==0) {
    767         /* allocate another block in stage 3 */
    768         newBlock=mbcsData->stage3Top;
    769         if(mbcsData->utf8Friendly && nextOffset>=MBCS_STAGE_3_GRANULARITY) {
    770             /*
    771              * Overlap stage 3 blocks only in multiples of 16-entry blocks
    772              * because of the indexing granularity in stage 2.
    773              */
    774             maxOverlap=(nextOffset&~(MBCS_STAGE_3_GRANULARITY-1))*maxCharLength;
    775             for(overlap=0;
    776                 overlap<maxOverlap && stage3[newBlock-overlap-1]==0;
    777                 ++overlap) {}
    778 
    779             overlap=(overlap/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
    780             overlap=(overlap*MBCS_STAGE_3_GRANULARITY)*maxCharLength;
    781 
    782             newBlock-=overlap;
    783         }
    784         newTop=newBlock+blockSize;
    785 
    786         if(newTop>MBCS_STAGE_3_MBCS_SIZE*(uint32_t)maxCharLength) {
    787             fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n",
    788                 (int)c, printBytes(buffer, bytes, length));
    789             return FALSE;
    790         }
    791         /* each block has 16*maxCharLength bytes */
    792         i=idx;
    793         while(newBlock<newTop) {
    794             mbcsData->stage2[i++]=(newBlock/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
    795             newBlock+=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
    796         }
    797         mbcsData->stage3Top=newTop; /* ==newBlock */
    798     }
    799 
    800     stage3Index=MBCS_STAGE_3_GRANULARITY*(uint32_t)(uint16_t)mbcsData->stage2[idx];
    801 
    802     /* Build an alternate, UTF-8-friendly stage table as well. */
    803     if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
    804         /* Overflow for uint16_t entries in stageUTF8? */
    805         if(stage3Index>0xffff) {
    806             /*
    807              * This can occur only if the mapping table is nearly perfectly filled and if
    808              * utf8Max==0xffff.
    809              * (There is no known charset like this. GB 18030 does not map
    810              * surrogate code points and LMBCS does not map 256 PUA code points.)
    811              *
    812              * Otherwise, stage3Index<=MBCS_UTF8_LIMIT<0xffff
    813              * (stage3Index can at most reach exactly MBCS_UTF8_LIMIT)
    814              * because we have a sorted table and there are at most MBCS_UTF8_LIMIT
    815              * mappings with 0<=c<MBCS_UTF8_LIMIT, and there is only also
    816              * the initial all-unassigned block in stage3.
    817              *
    818              * Solution for the overflow: Reduce utf8Max to the next lower value, 0xfeff.
    819              *
    820              * (See svn revision 20866 of the markus/ucnvutf8 feature branch for
    821              * code that causes MBCSAddTable() to rebuild the table not utf8Friendly
    822              * in case of overflow. That code was not tested.)
    823              */
    824             mbcsData->utf8Max=0xfeff;
    825         } else {
    826             /*
    827              * The stage 3 block has been assigned for the regular trie.
    828              * Just copy its index into stageUTF8[], without the granularity.
    829              */
    830             mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index;
    831         }
    832     }
    833 
    834     /* write the codepage bytes into stage 3 and get the previous bytes */
    835 
    836     /* assemble the bytes into a single integer */
    837     pb=bytes;
    838     b=0;
    839     switch(length) {
    840     case 4:
    841         b=*pb++;
    842         U_FALLTHROUGH;
    843     case 3:
    844         b=(b<<8)|*pb++;
    845         U_FALLTHROUGH;
    846     case 2:
    847         b=(b<<8)|*pb++;
    848         U_FALLTHROUGH;
    849     case 1:
    850     default:
    851         b=(b<<8)|*pb++;
    852         break;
    853     }
    854 
    855     old=0;
    856     p=stage3+(stage3Index+nextOffset)*maxCharLength;
    857     switch(maxCharLength) {
    858     case 2:
    859         old=*(uint16_t *)p;
    860         *(uint16_t *)p=(uint16_t)b;
    861         break;
    862     case 3:
    863         old=(uint32_t)*p<<16;
    864         *p++=(uint8_t)(b>>16);
    865         old|=(uint32_t)*p<<8;
    866         *p++=(uint8_t)(b>>8);
    867         old|=*p;
    868         *p=(uint8_t)b;
    869         break;
    870     case 4:
    871         old=*(uint32_t *)p;
    872         *(uint32_t *)p=b;
    873         break;
    874     default:
    875         /* will never occur */
    876         break;
    877     }
    878 
    879     /* check that this Unicode code point was still unassigned */
    880     if((mbcsData->stage2[idx+(nextOffset>>MBCS_STAGE_2_SHIFT)]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
    881         if(flag>=0) {
    882             fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
    883                 (int)c, printBytes(buffer, bytes, length), (int)old);
    884             return FALSE;
    885         } else if(VERBOSE) {
    886             fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
    887                 (int)c, printBytes(buffer, bytes, length), (int)old);
    888         }
    889         /* continue after the above warning if the precision of the mapping is
    890            unspecified */
    891     }
    892     if(flag<=0) {
    893         /* set the roundtrip flag */
    894         mbcsData->stage2[idx+(nextOffset>>4)]|=(1UL<<(16+(c&0xf)));
    895     }
    896 
    897     return TRUE;
    898 }
    899 
    900 U_CFUNC UBool
    901 MBCSOkForBaseFromUnicode(const MBCSData *mbcsData,
    902                          const uint8_t *bytes, int32_t length,
    903                          UChar32 c, int8_t flag) {
    904     /*
    905      * A 1:1 mapping does not fit into the MBCS base table's fromUnicode table under
    906      * the following conditions:
    907      *
    908      * - a |2 SUB mapping for <subchar1> (no base table data structure for them)
    909      * - a |1 fallback to 0x00 (result value 0, indistinguishable from unmappable entry)
    910      * - a multi-byte mapping with leading 0x00 bytes (no explicit length field)
    911      *
    912      * Some of these tests are redundant with ucm_mappingType().
    913      */
    914     if( (flag==2 && length==1) ||
    915         (flag==1 && bytes[0]==0) || /* testing length==1 would be redundant with the next test */
    916         (flag<=1 && length>1 && bytes[0]==0)
    917     ) {
    918         return FALSE;
    919     }
    920 
    921     /*
    922      * Additional restrictions for UTF-8-friendly fromUnicode tables,
    923      * for code points up to the maximum optimized one:
    924      *
    925      * - any mapping to 0x00 (result value 0, indistinguishable from unmappable entry)
    926      * - any |1 fallback (no roundtrip flags in the optimized table)
    927      */
    928     if(mbcsData->utf8Friendly && flag<=1 && c<=mbcsData->utf8Max && (bytes[0]==0 || flag==1)) {
    929         return FALSE;
    930     }
    931 
    932     /*
    933      * If we omit the fromUnicode data, we can only store roundtrips there
    934      * because only they are recoverable from the toUnicode data.
    935      * Fallbacks must go into the extension table.
    936      */
    937     if(mbcsData->omitFromU && flag!=0) {
    938         return FALSE;
    939     }
    940 
    941     /* All other mappings do fit into the base table. */
    942     return TRUE;
    943 }
    944 
    945 U_CDECL_BEGIN
    946 /* we can assume that the table only contains 1:1 mappings with <=4 bytes each */
    947 static UBool
    948 MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
    949     MBCSData *mbcsData;
    950     UCMapping *m;
    951     UChar32 c;
    952     int32_t i, maxCharLength;
    953     int8_t f;
    954     UBool isOK, utf8Friendly;
    955 
    956     staticData->unicodeMask=table->unicodeMask;
    957     if(staticData->unicodeMask==3) {
    958         fprintf(stderr, "error: contains mappings for both supplementary and surrogate code points\n");
    959         return FALSE;
    960     }
    961 
    962     staticData->conversionType=UCNV_MBCS;
    963 
    964     mbcsData=(MBCSData *)cnvData;
    965     maxCharLength=mbcsData->ucm->states.maxCharLength;
    966 
    967     /*
    968      * Generation of UTF-8-friendly data requires
    969      * a sorted table, which makeconv generates when explicit precision
    970      * indicators are used.
    971      */
    972     mbcsData->utf8Friendly=utf8Friendly=(UBool)((table->flagsType&UCM_FLAGS_EXPLICIT)!=0);
    973     if(utf8Friendly) {
    974         mbcsData->utf8Max=MBCS_UTF8_MAX;
    975         if(SMALL && maxCharLength>1) {
    976             mbcsData->omitFromU=TRUE;
    977         }
    978     } else {
    979         mbcsData->utf8Max=0;
    980         if(SMALL && maxCharLength>1) {
    981             fprintf(stderr,
    982                 "makeconv warning: --small not available for .ucm files without |0 etc.\n");
    983         }
    984     }
    985 
    986     if(!MBCSStartMappings(mbcsData)) {
    987         return FALSE;
    988     }
    989 
    990     staticData->hasFromUnicodeFallback=FALSE;
    991     staticData->hasToUnicodeFallback=FALSE;
    992 
    993     isOK=TRUE;
    994 
    995     m=table->mappings;
    996     for(i=0; i<table->mappingsLength; ++m, ++i) {
    997         c=m->u;
    998         f=m->f;
    999 
   1000         /*
   1001          * Small optimization for --small .cnv files:
   1002          *
   1003          * If there are fromUnicode mappings above MBCS_UTF8_MAX,
   1004          * then the file size will be smaller if we make utf8Max larger
   1005          * because the size increase in stageUTF8 will be more than balanced by
   1006          * how much less of stage2 needs to be stored.
   1007          *
   1008          * There is no point in doing this incrementally because stageUTF8
   1009          * uses so much less space per block than stage2,
   1010          * so we immediately increase utf8Max to 0xffff.
   1011          *
   1012          * Do not increase utf8Max if it is already at 0xfeff because MBCSAddFromUnicode()
   1013          * sets it to that value when stageUTF8 overflows.
   1014          */
   1015         if( mbcsData->omitFromU && f<=1 &&
   1016             mbcsData->utf8Max<c && c<=0xffff &&
   1017             mbcsData->utf8Max<0xfeff
   1018         ) {
   1019             mbcsData->utf8Max=0xffff;
   1020         }
   1021 
   1022         switch(f) {
   1023         case -1:
   1024             /* there was no precision/fallback indicator */
   1025             /* fall through to set the mappings */
   1026             U_FALLTHROUGH;
   1027         case 0:
   1028             /* set roundtrip mappings */
   1029             isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1030 
   1031             if(maxCharLength==1) {
   1032                 isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1033             } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
   1034                 isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1035             } else {
   1036                 m->f|=MBCS_FROM_U_EXT_FLAG;
   1037                 m->moveFlag=UCM_MOVE_TO_EXT;
   1038             }
   1039             break;
   1040         case 1:
   1041             /* set only a fallback mapping from Unicode to codepage */
   1042             if(maxCharLength==1) {
   1043                 staticData->hasFromUnicodeFallback=TRUE;
   1044                 isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1045             } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
   1046                 staticData->hasFromUnicodeFallback=TRUE;
   1047                 isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1048             } else {
   1049                 m->f|=MBCS_FROM_U_EXT_FLAG;
   1050                 m->moveFlag=UCM_MOVE_TO_EXT;
   1051             }
   1052             break;
   1053         case 2:
   1054             /* ignore |2 SUB mappings, except to move <subchar1> mappings to the extension table */
   1055             if(maxCharLength>1 && m->bLen==1) {
   1056                 m->f|=MBCS_FROM_U_EXT_FLAG;
   1057                 m->moveFlag=UCM_MOVE_TO_EXT;
   1058             }
   1059             break;
   1060         case 3:
   1061             /* set only a fallback mapping from codepage to Unicode */
   1062             staticData->hasToUnicodeFallback=TRUE;
   1063             isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1064             break;
   1065         case 4:
   1066             /* move "good one-way" mappings to the extension table */
   1067             m->f|=MBCS_FROM_U_EXT_FLAG;
   1068             m->moveFlag=UCM_MOVE_TO_EXT;
   1069             break;
   1070         default:
   1071             /* will not occur because the parser checked it already */
   1072             fprintf(stderr, "error: illegal fallback indicator %d\n", f);
   1073             return FALSE;
   1074         }
   1075     }
   1076 
   1077     MBCSPostprocess(mbcsData, staticData);
   1078 
   1079     return isOK;
   1080 }
   1081 U_CDECL_END
   1082 static UBool
   1083 transformEUC(MBCSData *mbcsData) {
   1084     uint8_t *p8;
   1085     uint32_t i, value, oldLength, old3Top;
   1086     uint8_t b;
   1087 
   1088     oldLength=mbcsData->ucm->states.maxCharLength;
   1089     if(oldLength<3) {
   1090         return FALSE;
   1091     }
   1092 
   1093     old3Top=mbcsData->stage3Top;
   1094 
   1095     /* careful: 2-byte and 4-byte codes are stored in platform endianness! */
   1096 
   1097     /* test if all first bytes are in {0, 0x8e, 0x8f} */
   1098     p8=mbcsData->fromUBytes;
   1099 
   1100 #if !U_IS_BIG_ENDIAN
   1101     if(oldLength==4) {
   1102         p8+=3;
   1103     }
   1104 #endif
   1105 
   1106     for(i=0; i<old3Top; i+=oldLength) {
   1107         b=p8[i];
   1108         if(b!=0 && b!=0x8e && b!=0x8f) {
   1109             /* some first byte does not fit the EUC pattern, nothing to be done */
   1110             return FALSE;
   1111         }
   1112     }
   1113     /* restore p if it was modified above */
   1114     p8=mbcsData->fromUBytes;
   1115 
   1116     /* modify outputType and adjust stage3Top */
   1117     mbcsData->ucm->states.outputType=(int8_t)(MBCS_OUTPUT_3_EUC+oldLength-3);
   1118     mbcsData->stage3Top=(old3Top*(oldLength-1))/oldLength;
   1119 
   1120     /*
   1121      * EUC-encode all byte sequences;
   1122      * see "CJKV Information Processing" (1st ed. 1999) from Ken Lunde, O'Reilly,
   1123      * p. 161 in chapter 4 "Encoding Methods"
   1124      *
   1125      * This also must reverse the byte order if the platform is little-endian!
   1126      */
   1127     if(oldLength==3) {
   1128         uint16_t *q=(uint16_t *)p8;
   1129         for(i=0; i<old3Top; i+=oldLength) {
   1130             b=*p8;
   1131             if(b==0) {
   1132                 /* short sequences are stored directly */
   1133                 /* code set 0 or 1 */
   1134                 (*q++)=(uint16_t)((p8[1]<<8)|p8[2]);
   1135             } else if(b==0x8e) {
   1136                 /* code set 2 */
   1137                 (*q++)=(uint16_t)(((p8[1]&0x7f)<<8)|p8[2]);
   1138             } else /* b==0x8f */ {
   1139                 /* code set 3 */
   1140                 (*q++)=(uint16_t)((p8[1]<<8)|(p8[2]&0x7f));
   1141             }
   1142             p8+=3;
   1143         }
   1144     } else /* oldLength==4 */ {
   1145         uint8_t *q=p8;
   1146         uint32_t *p32=(uint32_t *)p8;
   1147         for(i=0; i<old3Top; i+=4) {
   1148             value=(*p32++);
   1149             if(value<=0xffffff) {
   1150                 /* short sequences are stored directly */
   1151                 /* code set 0 or 1 */
   1152                 (*q++)=(uint8_t)(value>>16);
   1153                 (*q++)=(uint8_t)(value>>8);
   1154                 (*q++)=(uint8_t)value;
   1155             } else if(value<=0x8effffff) {
   1156                 /* code set 2 */
   1157                 (*q++)=(uint8_t)((value>>16)&0x7f);
   1158                 (*q++)=(uint8_t)(value>>8);
   1159                 (*q++)=(uint8_t)value;
   1160             } else /* first byte is 0x8f */ {
   1161                 /* code set 3 */
   1162                 (*q++)=(uint8_t)(value>>16);
   1163                 (*q++)=(uint8_t)((value>>8)&0x7f);
   1164                 (*q++)=(uint8_t)value;
   1165             }
   1166         }
   1167     }
   1168 
   1169     return TRUE;
   1170 }
   1171 
   1172 /*
   1173  * Compact stage 2 for SBCS by overlapping adjacent stage 2 blocks as far
   1174  * as possible. Overlapping is done on unassigned head and tail
   1175  * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
   1176  * Stage 1 indexes need to be adjusted accordingly.
   1177  * This function is very similar to genprops/store.c/compactStage().
   1178  */
   1179 static void
   1180 singleCompactStage2(MBCSData *mbcsData) {
   1181     /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
   1182     uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
   1183     uint16_t i, start, prevEnd, newStart;
   1184 
   1185     /* enter the all-unassigned first stage 2 block into the map */
   1186     map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
   1187 
   1188     /* begin with the first block after the all-unassigned one */
   1189     start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
   1190     while(start<mbcsData->stage2Top) {
   1191         prevEnd=(uint16_t)(newStart-1);
   1192 
   1193         /* find the size of the overlap */
   1194         for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2Single[start+i]==0 && mbcsData->stage2Single[prevEnd-i]==0; ++i) {}
   1195 
   1196         if(i>0) {
   1197             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i);
   1198 
   1199             /* move the non-overlapping indexes to their new positions */
   1200             start+=i;
   1201             for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) {
   1202                 mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
   1203             }
   1204         } else if(newStart<start) {
   1205             /* move the indexes to their new positions */
   1206             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
   1207             for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
   1208                 mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
   1209             }
   1210         } else /* no overlap && newStart==start */ {
   1211             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
   1212             start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
   1213         }
   1214     }
   1215 
   1216     /* adjust stage2Top */
   1217     if(VERBOSE && newStart<mbcsData->stage2Top) {
   1218         printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
   1219                 (unsigned long)mbcsData->stage2Top, (unsigned long)newStart,
   1220                 (long)(mbcsData->stage2Top-newStart)*2);
   1221     }
   1222     mbcsData->stage2Top=newStart;
   1223 
   1224     /* now adjust stage 1 */
   1225     for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
   1226         mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
   1227     }
   1228 }
   1229 
   1230 /* Compact stage 3 for SBCS - same algorithm as above. */
   1231 static void
   1232 singleCompactStage3(MBCSData *mbcsData) {
   1233     uint16_t *stage3=(uint16_t *)mbcsData->fromUBytes;
   1234 
   1235     /* this array maps the ordinal number of a stage 3 block to its new stage 2 index */
   1236     uint16_t map[0x1000];
   1237     uint16_t i, start, prevEnd, newStart;
   1238 
   1239     /* enter the all-unassigned first stage 3 block into the map */
   1240     map[0]=0;
   1241 
   1242     /* begin with the first block after the all-unassigned one */
   1243     start=newStart=16;
   1244     while(start<mbcsData->stage3Top) {
   1245         prevEnd=(uint16_t)(newStart-1);
   1246 
   1247         /* find the size of the overlap */
   1248         for(i=0; i<16 && stage3[start+i]==0 && stage3[prevEnd-i]==0; ++i) {}
   1249 
   1250         if(i>0) {
   1251             map[start>>4]=(uint16_t)(newStart-i);
   1252 
   1253             /* move the non-overlapping indexes to their new positions */
   1254             start+=i;
   1255             for(i=(uint16_t)(16-i); i>0; --i) {
   1256                 stage3[newStart++]=stage3[start++];
   1257             }
   1258         } else if(newStart<start) {
   1259             /* move the indexes to their new positions */
   1260             map[start>>4]=newStart;
   1261             for(i=16; i>0; --i) {
   1262                 stage3[newStart++]=stage3[start++];
   1263             }
   1264         } else /* no overlap && newStart==start */ {
   1265             map[start>>4]=start;
   1266             start=newStart+=16;
   1267         }
   1268     }
   1269 
   1270     /* adjust stage3Top */
   1271     if(VERBOSE && newStart<mbcsData->stage3Top) {
   1272         printf("compacting stage 3 from stage3Top=0x%lx to 0x%lx, saving %ld bytes\n",
   1273                 (unsigned long)mbcsData->stage3Top, (unsigned long)newStart,
   1274                 (long)(mbcsData->stage3Top-newStart)*2);
   1275     }
   1276     mbcsData->stage3Top=newStart;
   1277 
   1278     /* now adjust stage 2 */
   1279     for(i=0; i<mbcsData->stage2Top; ++i) {
   1280         mbcsData->stage2Single[i]=map[mbcsData->stage2Single[i]>>4];
   1281     }
   1282 }
   1283 
   1284 /*
   1285  * Compact stage 2 by overlapping adjacent stage 2 blocks as far
   1286  * as possible. Overlapping is done on unassigned head and tail
   1287  * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
   1288  * Stage 1 indexes need to be adjusted accordingly.
   1289  * This function is very similar to genprops/store.c/compactStage().
   1290  */
   1291 static void
   1292 compactStage2(MBCSData *mbcsData) {
   1293     /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
   1294     uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
   1295     uint16_t i, start, prevEnd, newStart;
   1296 
   1297     /* enter the all-unassigned first stage 2 block into the map */
   1298     map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
   1299 
   1300     /* begin with the first block after the all-unassigned one */
   1301     start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
   1302     while(start<mbcsData->stage2Top) {
   1303         prevEnd=(uint16_t)(newStart-1);
   1304 
   1305         /* find the size of the overlap */
   1306         for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2[start+i]==0 && mbcsData->stage2[prevEnd-i]==0; ++i) {}
   1307 
   1308         if(i>0) {
   1309             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i);
   1310 
   1311             /* move the non-overlapping indexes to their new positions */
   1312             start+=i;
   1313             for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) {
   1314                 mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
   1315             }
   1316         } else if(newStart<start) {
   1317             /* move the indexes to their new positions */
   1318             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
   1319             for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
   1320                 mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
   1321             }
   1322         } else /* no overlap && newStart==start */ {
   1323             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
   1324             start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
   1325         }
   1326     }
   1327 
   1328     /* adjust stage2Top */
   1329     if(VERBOSE && newStart<mbcsData->stage2Top) {
   1330         printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
   1331                 (unsigned long)mbcsData->stage2Top, (unsigned long)newStart,
   1332                 (long)(mbcsData->stage2Top-newStart)*4);
   1333     }
   1334     mbcsData->stage2Top=newStart;
   1335 
   1336     /* now adjust stage 1 */
   1337     for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
   1338         mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
   1339     }
   1340 }
   1341 
   1342 static void
   1343 MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData * /*staticData*/) {
   1344     UCMStates *states;
   1345     int32_t maxCharLength, stage3Width;
   1346 
   1347     states=&mbcsData->ucm->states;
   1348     stage3Width=maxCharLength=states->maxCharLength;
   1349 
   1350     ucm_optimizeStates(states,
   1351                        &mbcsData->unicodeCodeUnits,
   1352                        mbcsData->toUFallbacks, mbcsData->countToUFallbacks,
   1353                        VERBOSE);
   1354 
   1355     /* try to compact the fromUnicode tables */
   1356     if(transformEUC(mbcsData)) {
   1357         --stage3Width;
   1358     }
   1359 
   1360     /*
   1361      * UTF-8-friendly tries are built precompacted, to cope with variable
   1362      * stage 3 allocation block sizes.
   1363      *
   1364      * Tables without precision indicators cannot be built that way,
   1365      * because if a block was overlapped with a previous one, then a smaller
   1366      * code point for the same block would not fit.
   1367      * Therefore, such tables are not marked UTF-8-friendly and must be
   1368      * compacted after all mappings are entered.
   1369      */
   1370     if(!mbcsData->utf8Friendly) {
   1371         if(maxCharLength==1) {
   1372             singleCompactStage3(mbcsData);
   1373             singleCompactStage2(mbcsData);
   1374         } else {
   1375             compactStage2(mbcsData);
   1376         }
   1377     }
   1378 
   1379     if(VERBOSE) {
   1380         /*uint32_t c, i1, i2, i2Limit, i3;*/
   1381 
   1382         printf("fromUnicode number of uint%s_t in stage 2: 0x%lx=%lu\n",
   1383                maxCharLength==1 ? "16" : "32",
   1384                (unsigned long)mbcsData->stage2Top,
   1385                (unsigned long)mbcsData->stage2Top);
   1386         printf("fromUnicode number of %d-byte stage 3 mapping entries: 0x%lx=%lu\n",
   1387                (int)stage3Width,
   1388                (unsigned long)mbcsData->stage3Top/stage3Width,
   1389                (unsigned long)mbcsData->stage3Top/stage3Width);
   1390 #if 0
   1391         c=0;
   1392         for(i1=0; i1<MBCS_STAGE_1_SIZE; ++i1) {
   1393             i2=mbcsData->stage1[i1];
   1394             if(i2==0) {
   1395                 c+=MBCS_STAGE_2_BLOCK_SIZE*MBCS_STAGE_3_BLOCK_SIZE;
   1396                 continue;
   1397             }
   1398             for(i2Limit=i2+MBCS_STAGE_2_BLOCK_SIZE; i2<i2Limit; ++i2) {
   1399                 if(maxCharLength==1) {
   1400                     i3=mbcsData->stage2Single[i2];
   1401                 } else {
   1402                     i3=(uint16_t)mbcsData->stage2[i2];
   1403                 }
   1404                 if(i3==0) {
   1405                     c+=MBCS_STAGE_3_BLOCK_SIZE;
   1406                     continue;
   1407                 }
   1408                 printf("U+%04lx i1=0x%02lx i2=0x%04lx i3=0x%04lx\n",
   1409                        (unsigned long)c,
   1410                        (unsigned long)i1,
   1411                        (unsigned long)i2,
   1412                        (unsigned long)i3);
   1413                 c+=MBCS_STAGE_3_BLOCK_SIZE;
   1414             }
   1415         }
   1416 #endif
   1417     }
   1418 }
   1419 
   1420 U_CDECL_BEGIN
   1421 static uint32_t
   1422 MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
   1423           UNewDataMemory *pData, int32_t tableType) {
   1424     MBCSData *mbcsData=(MBCSData *)cnvData;
   1425     uint32_t stage2Start, stage2Length;
   1426     uint32_t top, stageUTF8Length=0;
   1427     int32_t i, stage1Top;
   1428     uint32_t headerLength;
   1429 
   1430     _MBCSHeader header=UCNV_MBCS_HEADER_INITIALIZER;
   1431 
   1432     stage2Length=mbcsData->stage2Top;
   1433     if(mbcsData->omitFromU) {
   1434         /* find how much of stage2 can be omitted */
   1435         int32_t utf8Limit=(int32_t)mbcsData->utf8Max+1;
   1436         uint32_t st2=0; /*initialized it to avoid compiler warnings */
   1437 
   1438         i=utf8Limit>>MBCS_STAGE_1_SHIFT;
   1439         if((utf8Limit&((1<<MBCS_STAGE_1_SHIFT)-1))!=0 && (st2=mbcsData->stage1[i])!=0) {
   1440             /* utf8Limit is in the middle of an existing stage 2 block */
   1441             stage2Start=st2+((utf8Limit>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK);
   1442         } else {
   1443             /* find the last stage2 block with mappings before utf8Limit */
   1444             while(i>0 && (st2=mbcsData->stage1[--i])==0) {}
   1445             /* stage2 up to the end of this block corresponds to stageUTF8 */
   1446             stage2Start=st2+MBCS_STAGE_2_BLOCK_SIZE;
   1447         }
   1448         header.options|=MBCS_OPT_NO_FROM_U;
   1449         header.fullStage2Length=stage2Length;
   1450         stage2Length-=stage2Start;
   1451         if(VERBOSE) {
   1452             printf("+ omitting %lu out of %lu stage2 entries and %lu fromUBytes\n",
   1453                     (unsigned long)stage2Start,
   1454                     (unsigned long)mbcsData->stage2Top,
   1455                     (unsigned long)mbcsData->stage3Top);
   1456             printf("+ total size savings: %lu bytes\n", (unsigned long)stage2Start*4+mbcsData->stage3Top);
   1457         }
   1458     } else {
   1459         stage2Start=0;
   1460     }
   1461 
   1462     if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
   1463         stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
   1464     } else {
   1465         stage1Top=0x40; /* 0x40==64 */
   1466     }
   1467 
   1468     /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
   1469     if(mbcsData->ucm->states.maxCharLength==1) {
   1470         for(i=0; i<stage1Top; ++i) {
   1471             mbcsData->stage1[i]+=(uint16_t)stage1Top;
   1472         }
   1473 
   1474         /* stage2Top/Length have counted 16-bit results, now we need to count bytes */
   1475         /* also round up to a multiple of 4 bytes */
   1476         stage2Length=(stage2Length*2+1)&~1;
   1477 
   1478         /* stage3Top has counted 16-bit results, now we need to count bytes */
   1479         mbcsData->stage3Top*=2;
   1480 
   1481         if(mbcsData->utf8Friendly) {
   1482             header.version[2]=(uint8_t)(SBCS_UTF8_MAX>>8); /* store 0x1f for max==0x1fff */
   1483         }
   1484     } else {
   1485         for(i=0; i<stage1Top; ++i) {
   1486             mbcsData->stage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */
   1487         }
   1488 
   1489         /* stage2Top/Length have counted 32-bit results, now we need to count bytes */
   1490         stage2Length*=4;
   1491         /* leave stage2Start counting 32-bit units */
   1492 
   1493         if(mbcsData->utf8Friendly) {
   1494             stageUTF8Length=(mbcsData->utf8Max+1)>>MBCS_UTF8_STAGE_SHIFT;
   1495             header.version[2]=(uint8_t)(mbcsData->utf8Max>>8); /* store 0xd7 for max==0xd7ff */
   1496         }
   1497 
   1498         /* stage3Top has already counted bytes */
   1499     }
   1500 
   1501     /* round up stage3Top so that the sizes of all data blocks are multiples of 4 */
   1502     mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3;
   1503 
   1504     /* fill the header */
   1505     if(header.options&MBCS_OPT_INCOMPATIBLE_MASK) {
   1506         header.version[0]=5;
   1507         if(header.options&MBCS_OPT_NO_FROM_U) {
   1508             headerLength=10;  /* include fullStage2Length */
   1509         } else {
   1510             headerLength=MBCS_HEADER_V5_MIN_LENGTH;  /* 9 */
   1511         }
   1512     } else {
   1513         header.version[0]=4;
   1514         headerLength=MBCS_HEADER_V4_LENGTH;  /* 8 */
   1515     }
   1516     header.version[1]=4;
   1517     /* header.version[2] set above for utf8Friendly data */
   1518 
   1519     header.options|=(uint32_t)headerLength;
   1520 
   1521     header.countStates=mbcsData->ucm->states.countStates;
   1522     header.countToUFallbacks=mbcsData->countToUFallbacks;
   1523 
   1524     header.offsetToUCodeUnits=
   1525         headerLength*4+
   1526         mbcsData->ucm->states.countStates*1024+
   1527         mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback);
   1528     header.offsetFromUTable=
   1529         header.offsetToUCodeUnits+
   1530         mbcsData->ucm->states.countToUCodeUnits*2;
   1531     header.offsetFromUBytes=
   1532         header.offsetFromUTable+
   1533         stage1Top*2+
   1534         stage2Length;
   1535     header.fromUBytesLength=mbcsData->stage3Top;
   1536 
   1537     top=header.offsetFromUBytes+stageUTF8Length*2;
   1538     if(!(header.options&MBCS_OPT_NO_FROM_U)) {
   1539         top+=header.fromUBytesLength;
   1540     }
   1541 
   1542     header.flags=(uint8_t)(mbcsData->ucm->states.outputType);
   1543 
   1544     if(tableType&TABLE_EXT) {
   1545         if(top>0xffffff) {
   1546             fprintf(stderr, "error: offset 0x%lx to extension table exceeds 0xffffff\n", (long)top);
   1547             return 0;
   1548         }
   1549 
   1550         header.flags|=top<<8;
   1551     }
   1552 
   1553     /* write the MBCS data */
   1554     udata_writeBlock(pData, &header, headerLength*4);
   1555     udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024);
   1556     udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback));
   1557     udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2);
   1558     udata_writeBlock(pData, mbcsData->stage1, stage1Top*2);
   1559     if(mbcsData->ucm->states.maxCharLength==1) {
   1560         udata_writeBlock(pData, mbcsData->stage2Single+stage2Start, stage2Length);
   1561     } else {
   1562         udata_writeBlock(pData, mbcsData->stage2+stage2Start, stage2Length);
   1563     }
   1564     if(!(header.options&MBCS_OPT_NO_FROM_U)) {
   1565         udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
   1566     }
   1567 
   1568     if(stageUTF8Length>0) {
   1569         udata_writeBlock(pData, mbcsData->stageUTF8, stageUTF8Length*2);
   1570     }
   1571 
   1572     /* return the number of bytes that should have been written */
   1573     return top;
   1574 }
   1575 U_CDECL_END
   1576