Home | History | Annotate | Download | only in toolutil
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2003-2013, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  ucm.c
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2003jun20
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This file reads a .ucm file, stores its mappings and sorts them.
     19 *   It implements handling of Unicode conversion mappings from .ucm files
     20 *   for makeconv, canonucm, rptp2ucm, etc.
     21 *
     22 *   Unicode code point sequences with a length of more than 1,
     23 *   as well as byte sequences with more than 4 bytes or more than one complete
     24 *   character sequence are handled to support m:n mappings.
     25 */
     26 
     27 #include "unicode/utypes.h"
     28 #include "unicode/ustring.h"
     29 #include "cstring.h"
     30 #include "cmemory.h"
     31 #include "filestrm.h"
     32 #include "uarrsort.h"
     33 #include "ucnvmbcs.h"
     34 #include "ucnv_bld.h"
     35 #include "ucnv_ext.h"
     36 #include "uparse.h"
     37 #include "ucm.h"
     38 #include <stdio.h>
     39 
     40 #if !UCONFIG_NO_CONVERSION
     41 
     42 /* -------------------------------------------------------------------------- */
     43 
     44 static void
     45 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
     46     int32_t j;
     47 
     48     for(j=0; j<m->uLen; ++j) {
     49         fprintf(f, "<U%04lX>", (long)codePoints[j]);
     50     }
     51 
     52     fputc(' ', f);
     53 
     54     for(j=0; j<m->bLen; ++j) {
     55         fprintf(f, "\\x%02X", bytes[j]);
     56     }
     57 
     58     if(m->f>=0) {
     59         fprintf(f, " |%u\n", m->f);
     60     } else {
     61         fputs("\n", f);
     62     }
     63 }
     64 
     65 U_CAPI void U_EXPORT2
     66 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
     67     printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
     68 }
     69 
     70 U_CAPI void U_EXPORT2
     71 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
     72     UCMapping *m;
     73     int32_t i, length;
     74 
     75     m=table->mappings;
     76     length=table->mappingsLength;
     77     if(byUnicode) {
     78         for(i=0; i<length; ++m, ++i) {
     79             ucm_printMapping(table, m, f);
     80         }
     81     } else {
     82         const int32_t *map=table->reverseMap;
     83         for(i=0; i<length; ++i) {
     84             ucm_printMapping(table, m+map[i], f);
     85         }
     86     }
     87 }
     88 
     89 /* mapping comparisons ------------------------------------------------------ */
     90 
     91 static int32_t
     92 compareUnicode(UCMTable *lTable, const UCMapping *l,
     93                UCMTable *rTable, const UCMapping *r) {
     94     const UChar32 *lu, *ru;
     95     int32_t result, i, length;
     96 
     97     if(l->uLen==1 && r->uLen==1) {
     98         /* compare two single code points */
     99         return l->u-r->u;
    100     }
    101 
    102     /* get pointers to the code point sequences */
    103     lu=UCM_GET_CODE_POINTS(lTable, l);
    104     ru=UCM_GET_CODE_POINTS(rTable, r);
    105 
    106     /* get the minimum length */
    107     if(l->uLen<=r->uLen) {
    108         length=l->uLen;
    109     } else {
    110         length=r->uLen;
    111     }
    112 
    113     /* compare the code points */
    114     for(i=0; i<length; ++i) {
    115         result=lu[i]-ru[i];
    116         if(result!=0) {
    117             return result;
    118         }
    119     }
    120 
    121     /* compare the lengths */
    122     return l->uLen-r->uLen;
    123 }
    124 
    125 static int32_t
    126 compareBytes(UCMTable *lTable, const UCMapping *l,
    127              UCMTable *rTable, const UCMapping *r,
    128              UBool lexical) {
    129     const uint8_t *lb, *rb;
    130     int32_t result, i, length;
    131 
    132     /*
    133      * A lexical comparison is used for sorting in the builder, to allow
    134      * an efficient search for a byte sequence that could be a prefix
    135      * of a previously entered byte sequence.
    136      *
    137      * Comparing by lengths first is for compatibility with old .ucm tools
    138      * like canonucm and rptp2ucm.
    139      */
    140     if(lexical) {
    141         /* get the minimum length and continue */
    142         if(l->bLen<=r->bLen) {
    143             length=l->bLen;
    144         } else {
    145             length=r->bLen;
    146         }
    147     } else {
    148         /* compare lengths first */
    149         result=l->bLen-r->bLen;
    150         if(result!=0) {
    151             return result;
    152         } else {
    153             length=l->bLen;
    154         }
    155     }
    156 
    157     /* get pointers to the byte sequences */
    158     lb=UCM_GET_BYTES(lTable, l);
    159     rb=UCM_GET_BYTES(rTable, r);
    160 
    161     /* compare the bytes */
    162     for(i=0; i<length; ++i) {
    163         result=lb[i]-rb[i];
    164         if(result!=0) {
    165             return result;
    166         }
    167     }
    168 
    169     /* compare the lengths */
    170     return l->bLen-r->bLen;
    171 }
    172 
    173 /* compare UCMappings for sorting */
    174 static int32_t
    175 compareMappings(UCMTable *lTable, const UCMapping *l,
    176                 UCMTable *rTable, const UCMapping *r,
    177                 UBool uFirst) {
    178     int32_t result;
    179 
    180     /* choose which side to compare first */
    181     if(uFirst) {
    182         /* Unicode then bytes */
    183         result=compareUnicode(lTable, l, rTable, r);
    184         if(result==0) {
    185             result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
    186         }
    187     } else {
    188         /* bytes then Unicode */
    189         result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
    190         if(result==0) {
    191             result=compareUnicode(lTable, l, rTable, r);
    192         }
    193     }
    194 
    195     if(result!=0) {
    196         return result;
    197     }
    198 
    199     /* compare the flags */
    200     return l->f-r->f;
    201 }
    202 U_CDECL_BEGIN
    203 /* sorting by Unicode first sorts mappings directly */
    204 static int32_t  U_CALLCONV
    205 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
    206     return compareMappings(
    207         (UCMTable *)context, (const UCMapping *)left,
    208         (UCMTable *)context, (const UCMapping *)right, TRUE);
    209 }
    210 
    211 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
    212 static int32_t U_CALLCONV
    213 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
    214     UCMTable *table=(UCMTable *)context;
    215     int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
    216     return compareMappings(
    217         table, table->mappings+l,
    218         table, table->mappings+r, FALSE);
    219 }
    220 U_CDECL_END
    221 
    222 U_CAPI void U_EXPORT2
    223 ucm_sortTable(UCMTable *t) {
    224     UErrorCode errorCode;
    225     int32_t i;
    226 
    227     if(t->isSorted) {
    228         return;
    229     }
    230 
    231     errorCode=U_ZERO_ERROR;
    232 
    233     /* 1. sort by Unicode first */
    234     uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
    235                    compareMappingsUnicodeFirst, t,
    236                    FALSE, &errorCode);
    237 
    238     /* build the reverseMap */
    239     if(t->reverseMap==NULL) {
    240         /*
    241          * allocate mappingsCapacity instead of mappingsLength so that
    242          * if mappings are added, the reverseMap need not be
    243          * reallocated each time
    244          * (see ucm_moveMappings() and ucm_addMapping())
    245          */
    246         t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
    247         if(t->reverseMap==NULL) {
    248             fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
    249             exit(U_MEMORY_ALLOCATION_ERROR);
    250         }
    251     }
    252     for(i=0; i<t->mappingsLength; ++i) {
    253         t->reverseMap[i]=i;
    254     }
    255 
    256     /* 2. sort reverseMap by mappings bytes first */
    257     uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
    258                    compareMappingsBytesFirst, t,
    259                    FALSE, &errorCode);
    260 
    261     if(U_FAILURE(errorCode)) {
    262         fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
    263                 u_errorName(errorCode));
    264         exit(errorCode);
    265     }
    266 
    267     t->isSorted=TRUE;
    268 }
    269 
    270 /*
    271  * remove mappings with their move flag set from the base table
    272  * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
    273  */
    274 U_CAPI void U_EXPORT2
    275 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
    276     UCMapping *mb, *mbLimit;
    277     int8_t flag;
    278 
    279     mb=base->mappings;
    280     mbLimit=mb+base->mappingsLength;
    281 
    282     while(mb<mbLimit) {
    283         flag=mb->moveFlag;
    284         if(flag!=0) {
    285             /* reset the move flag */
    286             mb->moveFlag=0;
    287 
    288             if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
    289                 /* add the mapping to the extension table */
    290                 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
    291             }
    292 
    293             /* remove this mapping: move the last base mapping down and overwrite the current one */
    294             if(mb<(mbLimit-1)) {
    295                 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
    296             }
    297             --mbLimit;
    298             --base->mappingsLength;
    299             base->isSorted=FALSE;
    300         } else {
    301             ++mb;
    302         }
    303     }
    304 }
    305 
/*
 * Result bits returned by checkBaseExtUnicode() and checkBaseExtBytes()
 * and evaluated by ucm_checkBaseExt().
 */
enum {
    /* a mapping was flagged via moveFlag; ucm_moveMappings() must run */
    NEEDS_MOVE=1,
    /* an error was reported on stderr; ucm_checkBaseExt() returns FALSE */
    HAS_ERRORS=2
};
    310 
/*
 * Check the Unicode (input) sides of the base mappings against those of the
 * extension mappings.
 *
 * Both mappings arrays are walked in parallel in array order and compared
 * with compareUnicode(); ucm_checkBaseExt() sorts both tables before calling.
 * Only mappings with a flag f of 0, 1, 2 or 4 take part; others are skipped.
 * The walk ends as soon as either table is exhausted.
 *
 * Mappings are not moved here; they are only marked via moveFlag
 * (UCM_MOVE_TO_EXT on base mappings, UCM_REMOVE_MAPPING on ext mappings).
 *
 * baseStates is unused.
 * moveToExt: a base mapping whose input is a prefix of an ext mapping's input
 *            is marked for moving rather than reported as an error.
 * intersectBase: if nonzero, base mappings without an identical ext
 *            counterpart are marked for moving; per the comment below, the
 *            value 2 indicates a DBCS ext table, in which case single-byte
 *            base mappings are not moved but only checked as prefixes.
 *
 * Returns a combination of the NEEDS_MOVE and HAS_ERRORS bits.
 */
static uint8_t
checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
                    UBool moveToExt, UBool intersectBase) {
    (void)baseStates;

    UCMapping *mb, *me, *mbLimit, *meLimit;
    int32_t cmp;
    uint8_t result;

    mb=base->mappings;
    mbLimit=mb+base->mappingsLength;

    me=ext->mappings;
    meLimit=me+ext->mappingsLength;

    result=0;

    for(;;) {
        /* skip irrelevant mappings on both sides */
        for(;;) {
            if(mb==mbLimit) {
                return result;
            }

            if((0<=mb->f && mb->f<=2) || mb->f==4) {
                break;
            }

            ++mb;
        }

        for(;;) {
            if(me==meLimit) {
                return result;
            }

            if((0<=me->f && me->f<=2) || me->f==4) {
                break;
            }

            ++me;
        }

        /* compare the base and extension mappings */
        cmp=compareUnicode(base, mb, ext, me);
        if(cmp<0) {
            if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
                /*
                 * mapping in base but not in ext, move it
                 *
                 * if ext is DBCS, move DBCS mappings here
                 * and check SBCS ones for Unicode prefix below
                 */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;

            /* does mb map from an input sequence that is a prefix of me's? */
            /* (the memcmp length is in bytes; 4 is presumably sizeof(UChar32)) */
            } else if( mb->uLen<me->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
            ) {
                if(moveToExt) {
                    /* mark this mapping to be moved to the extension table */
                    mb->moveFlag|=UCM_MOVE_TO_EXT;
                    result|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            "           is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, mb, stderr);
                    ucm_printMapping(ext, me, stderr);
                    result|=HAS_ERRORS;
                }
            }

            /* the base mapping sorts before the ext mapping: advance in base only */
            ++mb;
        } else if(cmp==0) {
            /*
             * same output: remove the extension mapping,
             * otherwise treat as an error
             */
            if( mb->f==me->f && mb->bLen==me->bLen &&
                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
            ) {
                me->moveFlag|=UCM_REMOVE_MAPPING;
                result|=NEEDS_MOVE;
            } else if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;
            } else {
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        "           is the same as the input sequence of an extension mapping\n"
                        "           but it maps differently\n");
                ucm_printMapping(base, mb, stderr);
                ucm_printMapping(ext, me, stderr);
                result|=HAS_ERRORS;
            }

            /* only the base side advances; me is compared with the next base mapping too */
            ++mb;
        } else /* cmp>0 */ {
            ++me;
        }
    }
}
    416 
/*
 * Check the byte (input) sides of the base mappings against those of the
 * extension mappings.
 *
 * Both tables are walked in parallel in the order of their reverseMap
 * indexes and compared lexically with compareBytes(); ucm_checkBaseExt()
 * sorts both tables before calling.
 * Only mappings with a flag f of 0 or 3 take part; others are skipped.
 * The walk ends as soon as either table is exhausted.
 *
 * Mappings are not moved here; they are only marked via moveFlag
 * (UCM_MOVE_TO_EXT on base mappings, UCM_REMOVE_MAPPING on ext mappings).
 *
 * baseStates: only its outputType is read, to detect SI/SO tables.
 * moveToExt: a base mapping whose bytes are a prefix of an ext mapping's bytes
 *            is marked for moving rather than reported as an error.
 * intersectBase: if nonzero, base mappings without an identical ext
 *            counterpart are marked for moving; with the value 2,
 *            single-byte base mappings are left alone entirely.
 *
 * Returns a combination of the NEEDS_MOVE and HAS_ERRORS bits.
 */
static uint8_t
checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
                  UBool moveToExt, UBool intersectBase) {
    UCMapping *mb, *me;
    int32_t *baseMap, *extMap;
    int32_t b, e, bLimit, eLimit, cmp;
    uint8_t result;
    UBool isSISO;

    baseMap=base->reverseMap;
    extMap=ext->reverseMap;

    /* b and e are positions in the reverseMaps, not indexes into the mappings arrays */
    b=e=0;
    bLimit=base->mappingsLength;
    eLimit=ext->mappingsLength;

    result=0;

    isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);

    for(;;) {
        /* skip irrelevant mappings on both sides */
        for(;; ++b) {
            if(b==bLimit) {
                return result;
            }
            mb=base->mappings+baseMap[b];

            if(intersectBase==2 && mb->bLen==1) {
                /*
                 * comparing a base against a DBCS extension:
                 * leave SBCS base mappings alone
                 */
                continue;
            }

            if(mb->f==0 || mb->f==3) {
                break;
            }
        }

        for(;;) {
            if(e==eLimit) {
                return result;
            }
            me=ext->mappings+extMap[e];

            if(me->f==0 || me->f==3) {
                break;
            }

            ++e;
        }

        /* compare the base and extension mappings */
        cmp=compareBytes(base, mb, ext, me, TRUE);
        if(cmp<0) {
            if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;

            /*
             * does mb map from an input sequence that is a prefix of me's?
             * for SI/SO tables, a single byte is never a prefix because it
             * occurs in a separate single-byte state
             */
            } else if( mb->bLen<me->bLen &&
                (!isSISO || mb->bLen>1) &&
                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
            ) {
                if(moveToExt) {
                    /* mark this mapping to be moved to the extension table */
                    mb->moveFlag|=UCM_MOVE_TO_EXT;
                    result|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            "           is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, mb, stderr);
                    ucm_printMapping(ext, me, stderr);
                    result|=HAS_ERRORS;
                }
            }

            /* the base mapping sorts before the ext mapping: advance in base only */
            ++b;
        } else if(cmp==0) {
            /*
             * same output: remove the extension mapping,
             * otherwise treat as an error
             */
            /* (the memcmp length is in bytes; 4 is presumably sizeof(UChar32)) */
            if( mb->f==me->f && mb->uLen==me->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
            ) {
                me->moveFlag|=UCM_REMOVE_MAPPING;
                result|=NEEDS_MOVE;
            } else if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;
            } else {
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        "           is the same as the input sequence of an extension mapping\n"
                        "           but it maps differently\n");
                ucm_printMapping(base, mb, stderr);
                ucm_printMapping(ext, me, stderr);
                result|=HAS_ERRORS;
            }

            /* only the base side advances; me is compared with the next base mapping too */
            ++b;
        } else /* cmp>0 */ {
            ++e;
        }
    }
}
    533 
    534 U_CAPI UBool U_EXPORT2
    535 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
    536     UCMapping *m, *mLimit;
    537     int32_t count;
    538     UBool isOK;
    539 
    540     m=table->mappings;
    541     mLimit=m+table->mappingsLength;
    542     isOK=TRUE;
    543 
    544     while(m<mLimit) {
    545         count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
    546         if(count<1) {
    547             ucm_printMapping(table, m, stderr);
    548             isOK=FALSE;
    549         }
    550         ++m;
    551     }
    552 
    553     return isOK;
    554 }
    555 
    556 U_CAPI UBool U_EXPORT2
    557 ucm_checkBaseExt(UCMStates *baseStates,
    558                  UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
    559                  UBool intersectBase) {
    560     uint8_t result;
    561 
    562     /* if we have an extension table, we must always use precision flags */
    563     if(base->flagsType&UCM_FLAGS_IMPLICIT) {
    564         fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
    565         return FALSE;
    566     }
    567     if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
    568         fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
    569         return FALSE;
    570     }
    571 
    572     /* checking requires both tables to be sorted */
    573     ucm_sortTable(base);
    574     ucm_sortTable(ext);
    575 
    576     /* check */
    577     result=
    578         checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
    579         checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
    580 
    581     if(result&HAS_ERRORS) {
    582         return FALSE;
    583     }
    584 
    585     if(result&NEEDS_MOVE) {
    586         ucm_moveMappings(ext, NULL);
    587         ucm_moveMappings(base, moveTarget);
    588         ucm_sortTable(base);
    589         ucm_sortTable(ext);
    590         if(moveTarget!=NULL) {
    591             ucm_sortTable(moveTarget);
    592         }
    593     }
    594 
    595     return TRUE;
    596 }
    597 
    598 /* merge tables for rptp2ucm ------------------------------------------------ */
    599 
/*
 * Merge a toUnicode table into a fromUnicode table and set precision flags.
 *
 * Both tables are sorted, then walked in parallel in mappings order and
 * compared with compareMappings() (Unicode first):
 * - equal mappings are roundtrips and keep their flags
 * - a mapping only in fromUTable gets f=2 if its bytes equal the subchar
 *   (or, if subchar1 is not 0, if it is the single byte subchar1),
 *   otherwise f=1
 * - a mapping only in toUTable gets f=3 and is added to fromUTable,
 *   unless it maps to the single code point U+FFFD or U+001A
 *
 * Note that flags are also modified in toUTable.
 * fromUTable is marked as not sorted at the end.
 */
U_CAPI void U_EXPORT2
ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
                const uint8_t *subchar, int32_t subcharLength,
                uint8_t subchar1) {
    UCMapping *fromUMapping, *toUMapping;
    int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;

    ucm_sortTable(fromUTable);
    ucm_sortTable(toUTable);

    fromUMapping=fromUTable->mappings;
    toUMapping=toUTable->mappings;

    /*
     * the limits are fixed here: mappings that are appended to fromUTable
     * below lie beyond fromUTop and are not visited by the loops
     */
    fromUTop=fromUTable->mappingsLength;
    toUTop=toUTable->mappingsLength;

    fromUIndex=toUIndex=0;

    while(fromUIndex<fromUTop && toUIndex<toUTop) {
        cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
        if(cmp==0) {
            /* equal: roundtrip, nothing to do (flags are initially 0) */
            ++fromUMapping;
            ++toUMapping;

            ++fromUIndex;
            ++toUIndex;
        } else if(cmp<0) {
            /*
             * the fromU mapping does not have a toU counterpart:
             * fallback Unicode->codepage
             */
            if( (fromUMapping->bLen==subcharLength &&
                 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
                (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
            ) {
                fromUMapping->f=2; /* SUB mapping */
            } else {
                fromUMapping->f=1; /* normal fallback */
            }

            ++fromUMapping;
            ++fromUIndex;
        } else {
            /*
             * the toU mapping does not have a fromU counterpart:
             * (reverse) fallback codepage->Unicode, copy it to the fromU table
             */

            /* ignore reverse fallbacks to Unicode SUB */
            if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
                toUMapping->f=3; /* reverse fallback */
                ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));

                /* the table may have been reallocated */
                fromUMapping=fromUTable->mappings+fromUIndex;
            }

            ++toUMapping;
            ++toUIndex;
        }
    }

    /* either one or both tables are exhausted */
    while(fromUIndex<fromUTop) {
        /* leftover fromU mappings are fallbacks */
        if( (fromUMapping->bLen==subcharLength &&
             0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
            (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
        ) {
            fromUMapping->f=2; /* SUB mapping */
        } else {
            fromUMapping->f=1; /* normal fallback */
        }

        ++fromUMapping;
        ++fromUIndex;
    }

    while(toUIndex<toUTop) {
        /* leftover toU mappings are reverse fallbacks */

        /* ignore reverse fallbacks to Unicode SUB */
        if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
            toUMapping->f=3; /* reverse fallback */
            /* fromUMapping is not used any more, so it need not be refreshed here */
            ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
        }

        ++toUMapping;
        ++toUIndex;
    }

    /* flags were changed and mappings may have been appended */
    fromUTable->isSorted=FALSE;
}
    694 
    695 /* separate extension mappings out of base table for rptp2ucm --------------- */
    696 
    697 U_CAPI UBool U_EXPORT2
    698 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
    699     UCMTable *table;
    700     UCMapping *m, *mLimit;
    701     int32_t type;
    702     UBool needsMove, isOK;
    703 
    704     table=ucm->base;
    705     m=table->mappings;
    706     mLimit=m+table->mappingsLength;
    707 
    708     needsMove=FALSE;
    709     isOK=TRUE;
    710 
    711     for(; m<mLimit; ++m) {
    712         if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
    713             fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
    714             ucm_printMapping(table, m, stderr);
    715             m->moveFlag|=UCM_REMOVE_MAPPING;
    716             needsMove=TRUE;
    717             continue;
    718         }
    719 
    720         type=ucm_mappingType(
    721                 &ucm->states, m,
    722                 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
    723         if(type<0) {
    724             /* illegal byte sequence */
    725             printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
    726             isOK=FALSE;
    727         } else if(type>0) {
    728             m->moveFlag|=UCM_MOVE_TO_EXT;
    729             needsMove=TRUE;
    730         }
    731     }
    732 
    733     if(!isOK) {
    734         return FALSE;
    735     }
    736     if(needsMove) {
    737         ucm_moveMappings(ucm->base, ucm->ext);
    738         return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
    739     } else {
    740         ucm_sortTable(ucm->base);
    741         return TRUE;
    742     }
    743 }
    744 
    745 /* ucm parser --------------------------------------------------------------- */
    746 
    747 U_CAPI int8_t U_EXPORT2
    748 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
    749     const char *s=*ps;
    750     char *end;
    751     uint8_t byte;
    752     int8_t bLen;
    753 
    754     bLen=0;
    755     for(;;) {
    756         /* skip an optional plus sign */
    757         if(bLen>0 && *s=='+') {
    758             ++s;
    759         }
    760         if(*s!='\\') {
    761             break;
    762         }
    763 
    764         if( s[1]!='x' ||
    765             (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
    766         ) {
    767             fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
    768             return -1;
    769         }
    770 
    771         if(bLen==UCNV_EXT_MAX_BYTES) {
    772             fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
    773             return -1;
    774         }
    775         bytes[bLen++]=byte;
    776         s=end;
    777     }
    778 
    779     *ps=s;
    780     return bLen;
    781 }
    782 
    783 /* parse a mapping line; must not be empty */
    784 U_CAPI UBool U_EXPORT2
    785 ucm_parseMappingLine(UCMapping *m,
    786                      UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    787                      uint8_t bytes[UCNV_EXT_MAX_BYTES],
    788                      const char *line) {
    789     const char *s;
    790     char *end;
    791     UChar32 cp;
    792     int32_t u16Length;
    793     int8_t uLen, bLen, f;
    794 
    795     s=line;
    796     uLen=bLen=0;
    797 
    798     /* parse code points */
    799     for(;;) {
    800         /* skip an optional plus sign */
    801         if(uLen>0 && *s=='+') {
    802             ++s;
    803         }
    804         if(*s!='<') {
    805             break;
    806         }
    807 
    808         if( s[1]!='U' ||
    809             (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
    810             *end!='>'
    811         ) {
    812             fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
    813             return FALSE;
    814         }
    815         if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
    816             fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
    817             return FALSE;
    818         }
    819 
    820         if(uLen==UCNV_EXT_MAX_UCHARS) {
    821             fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
    822             return FALSE;
    823         }
    824         codePoints[uLen++]=cp;
    825         s=end+1;
    826     }
    827 
    828     if(uLen==0) {
    829         fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
    830         return FALSE;
    831     } else if(uLen==1) {
    832         m->u=codePoints[0];
    833     } else {
    834         UErrorCode errorCode=U_ZERO_ERROR;
    835         u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
    836         if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
    837             u16Length>UCNV_EXT_MAX_UCHARS
    838         ) {
    839             fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
    840             return FALSE;
    841         }
    842     }
    843 
    844     s=u_skipWhitespace(s);
    845 
    846     /* parse bytes */
    847     bLen=ucm_parseBytes(bytes, line, &s);
    848 
    849     if(bLen<0) {
    850         return FALSE;
    851     } else if(bLen==0) {
    852         fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
    853         return FALSE;
    854     } else if(bLen<=4) {
    855         uprv_memcpy(m->b.bytes, bytes, bLen);
    856     }
    857 
    858     /* skip everything until the fallback indicator, even the start of a comment */
    859     for(;;) {
    860         if(*s==0) {
    861             f=-1; /* no fallback indicator */
    862             break;
    863         } else if(*s=='|') {
    864             f=(int8_t)(s[1]-'0');
    865             if((uint8_t)f>4) {
    866                 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
    867                 return FALSE;
    868             }
    869             break;
    870         }
    871         ++s;
    872     }
    873 
    874     m->uLen=uLen;
    875     m->bLen=bLen;
    876     m->f=f;
    877     return TRUE;
    878 }
    879 
    880 /* general APIs ------------------------------------------------------------- */
    881 
    882 U_CAPI UCMTable * U_EXPORT2
    883 ucm_openTable() {
    884     UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
    885     if(table==NULL) {
    886         fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
    887         exit(U_MEMORY_ALLOCATION_ERROR);
    888     }
    889 
    890     memset(table, 0, sizeof(UCMTable));
    891     return table;
    892 }
    893 
    894 U_CAPI void U_EXPORT2
    895 ucm_closeTable(UCMTable *table) {
    896     if(table!=NULL) {
    897         uprv_free(table->mappings);
    898         uprv_free(table->codePoints);
    899         uprv_free(table->bytes);
    900         uprv_free(table->reverseMap);
    901         uprv_free(table);
    902     }
    903 }
    904 
    905 U_CAPI void U_EXPORT2
    906 ucm_resetTable(UCMTable *table) {
    907     if(table!=NULL) {
    908         table->mappingsLength=0;
    909         table->flagsType=0;
    910         table->unicodeMask=0;
    911         table->bytesLength=table->codePointsLength=0;
    912         table->isSorted=FALSE;
    913     }
    914 }
    915 
    916 U_CAPI void U_EXPORT2
    917 ucm_addMapping(UCMTable *table,
    918                UCMapping *m,
    919                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    920                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    921     UCMapping *tm;
    922     UChar32 c;
    923     int32_t idx;
    924 
    925     if(table->mappingsLength>=table->mappingsCapacity) {
    926         /* make the mappings array larger */
    927         if(table->mappingsCapacity==0) {
    928             table->mappingsCapacity=1000;
    929         } else {
    930             table->mappingsCapacity*=10;
    931         }
    932         table->mappings=(UCMapping *)uprv_realloc(table->mappings,
    933                                              table->mappingsCapacity*sizeof(UCMapping));
    934         if(table->mappings==NULL) {
    935             fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
    936                             (int)table->mappingsCapacity);
    937             exit(U_MEMORY_ALLOCATION_ERROR);
    938         }
    939 
    940         if(table->reverseMap!=NULL) {
    941             /* the reverseMap must be reallocated in a new sort */
    942             uprv_free(table->reverseMap);
    943             table->reverseMap=NULL;
    944         }
    945     }
    946 
    947     if(m->uLen>1 && table->codePointsCapacity==0) {
    948         table->codePointsCapacity=10000;
    949         table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
    950         if(table->codePoints==NULL) {
    951             fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
    952                             (int)table->codePointsCapacity);
    953             exit(U_MEMORY_ALLOCATION_ERROR);
    954         }
    955     }
    956 
    957     if(m->bLen>4 && table->bytesCapacity==0) {
    958         table->bytesCapacity=10000;
    959         table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
    960         if(table->bytes==NULL) {
    961             fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
    962                             (int)table->bytesCapacity);
    963             exit(U_MEMORY_ALLOCATION_ERROR);
    964         }
    965     }
    966 
    967     if(m->uLen>1) {
    968         idx=table->codePointsLength;
    969         table->codePointsLength+=m->uLen;
    970         if(table->codePointsLength>table->codePointsCapacity) {
    971             fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
    972             exit(U_MEMORY_ALLOCATION_ERROR);
    973         }
    974 
    975         uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4);
    976         m->u=idx;
    977     }
    978 
    979     if(m->bLen>4) {
    980         idx=table->bytesLength;
    981         table->bytesLength+=m->bLen;
    982         if(table->bytesLength>table->bytesCapacity) {
    983             fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
    984             exit(U_MEMORY_ALLOCATION_ERROR);
    985         }
    986 
    987         uprv_memcpy(table->bytes+idx, bytes, m->bLen);
    988         m->b.idx=idx;
    989     }
    990 
    991     /* set unicodeMask */
    992     for(idx=0; idx<m->uLen; ++idx) {
    993         c=codePoints[idx];
    994         if(c>=0x10000) {
    995             table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
    996         } else if(U_IS_SURROGATE(c)) {
    997             table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
    998         }
    999     }
   1000 
   1001     /* set flagsType */
   1002     if(m->f<0) {
   1003         table->flagsType|=UCM_FLAGS_IMPLICIT;
   1004     } else {
   1005         table->flagsType|=UCM_FLAGS_EXPLICIT;
   1006     }
   1007 
   1008     tm=table->mappings+table->mappingsLength++;
   1009     uprv_memcpy(tm, m, sizeof(UCMapping));
   1010 
   1011     table->isSorted=FALSE;
   1012 }
   1013 
   1014 U_CAPI UCMFile * U_EXPORT2
   1015 ucm_open() {
   1016     UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
   1017     if(ucm==NULL) {
   1018         fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
   1019         exit(U_MEMORY_ALLOCATION_ERROR);
   1020     }
   1021 
   1022     memset(ucm, 0, sizeof(UCMFile));
   1023 
   1024     ucm->base=ucm_openTable();
   1025     ucm->ext=ucm_openTable();
   1026 
   1027     ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
   1028     ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
   1029     ucm->states.outputType=-1;
   1030     ucm->states.minCharLength=ucm->states.maxCharLength=1;
   1031 
   1032     return ucm;
   1033 }
   1034 
   1035 U_CAPI void U_EXPORT2
   1036 ucm_close(UCMFile *ucm) {
   1037     if(ucm!=NULL) {
   1038         ucm_closeTable(ucm->base);
   1039         ucm_closeTable(ucm->ext);
   1040         uprv_free(ucm);
   1041     }
   1042 }
   1043 
   1044 U_CAPI int32_t U_EXPORT2
   1045 ucm_mappingType(UCMStates *baseStates,
   1046                 UCMapping *m,
   1047                 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   1048                 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
   1049     (void)codePoints;
   1050     /* check validity of the bytes and count the characters in them */
   1051     int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
   1052     if(count<1) {
   1053         /* illegal byte sequence */
   1054         return -1;
   1055     }
   1056 
   1057     /*
   1058      * Suitable for an ICU conversion base table means:
   1059      * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
   1060      * - precision flag 0..3
   1061      * - SBCS: any 1:1 mapping
   1062      *         (the table stores additional bits to distinguish mapping types)
   1063      * - MBCS: not a |2 SUB mapping for <subchar1>
   1064      * - MBCS: not a |1 fallback to 0x00
   1065      * - MBCS: not a multi-byte mapping with leading 0x00 bytes
   1066      *
   1067      * Further restrictions for fromUnicode tables
   1068      * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
   1069      *
   1070      * All of the MBCS fromUnicode specific tests could be removed from here,
   1071      * but the ones above are for unusual mappings, and removing the tests
   1072      * from here would change canonucm output which seems gratuitous.
   1073      * (Markus Scherer 2006-nov-28)
   1074      *
   1075      * Exception: All implicit mappings (f<0) that need to be moved
   1076      * because of fromUnicode restrictions _must_ be moved here because
   1077      * makeconv uses a hack for moving mappings only for the fromUnicode table
   1078      * that only works with non-negative values of f.
   1079      */
   1080     if( m->uLen==1 && count==1 && m->f<=3 &&
   1081         (baseStates->maxCharLength==1 ||
   1082             !((m->f==2 && m->bLen==1) ||
   1083               (m->f==1 && bytes[0]==0) ||
   1084               (m->f<=1 && m->bLen>1 && bytes[0]==0)))
   1085     ) {
   1086         return 0; /* suitable for a base table */
   1087     } else {
   1088         return 1; /* needs to go into an extension table */
   1089     }
   1090 }
   1091 
   1092 U_CAPI UBool U_EXPORT2
   1093 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
   1094                    UCMapping *m,
   1095                    UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   1096                    uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
   1097     int32_t type;
   1098 
   1099     if(m->f==2 && m->uLen>1) {
   1100         fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
   1101         printMapping(m, codePoints, bytes, stderr);
   1102         return FALSE;
   1103     }
   1104 
   1105     if(baseStates!=NULL) {
   1106         /* check validity of the bytes and count the characters in them */
   1107         type=ucm_mappingType(baseStates, m, codePoints, bytes);
   1108         if(type<0) {
   1109             /* illegal byte sequence */
   1110             printMapping(m, codePoints, bytes, stderr);
   1111             return FALSE;
   1112         }
   1113     } else {
   1114         /* not used - adding a mapping for an extension-only table before its base table is read */
   1115         type=1;
   1116     }
   1117 
   1118     /*
   1119      * Add the mapping to the base table if this is requested and suitable.
   1120      * Otherwise, add it to the extension table.
   1121      */
   1122     if(forBase && type==0) {
   1123         ucm_addMapping(ucm->base, m, codePoints, bytes);
   1124     } else {
   1125         ucm_addMapping(ucm->ext, m, codePoints, bytes);
   1126     }
   1127 
   1128     return TRUE;
   1129 }
   1130 
   1131 U_CAPI UBool U_EXPORT2
   1132 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
   1133   UCMapping m={ 0, {0}, 0, 0, 0, 0 };
   1134     UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
   1135     uint8_t bytes[UCNV_EXT_MAX_BYTES];
   1136 
   1137     const char *s;
   1138 
   1139     /* ignore empty and comment lines */
   1140     if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
   1141         return TRUE;
   1142     }
   1143 
   1144     return
   1145         ucm_parseMappingLine(&m, codePoints, bytes, line) &&
   1146         ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
   1147 }
   1148 
   1149 U_CAPI void U_EXPORT2
   1150 ucm_readTable(UCMFile *ucm, FileStream* convFile,
   1151               UBool forBase, UCMStates *baseStates,
   1152               UErrorCode *pErrorCode) {
   1153     char line[500];
   1154     char *end;
   1155     UBool isOK;
   1156 
   1157     if(U_FAILURE(*pErrorCode)) {
   1158         return;
   1159     }
   1160 
   1161     isOK=TRUE;
   1162 
   1163     for(;;) {
   1164         /* read the next line */
   1165         if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
   1166             fprintf(stderr, "incomplete charmap section\n");
   1167             isOK=FALSE;
   1168             break;
   1169         }
   1170 
   1171         /* remove CR LF */
   1172         end=uprv_strchr(line, 0);
   1173         while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
   1174             --end;
   1175         }
   1176         *end=0;
   1177 
   1178         /* ignore empty and comment lines */
   1179         if(line[0]==0 || line[0]=='#') {
   1180             continue;
   1181         }
   1182 
   1183         /* stop at the end of the mapping table */
   1184         if(0==uprv_strcmp(line, "END CHARMAP")) {
   1185             break;
   1186         }
   1187 
   1188         isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
   1189     }
   1190 
   1191     if(!isOK) {
   1192         *pErrorCode=U_INVALID_TABLE_FORMAT;
   1193     }
   1194 }
   1195 #endif
   1196