Home | History | Annotate | Download | only in toolutil
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2003-2013, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  ucm.c
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2003jun20
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This file reads a .ucm file, stores its mappings and sorts them.
     19 *   It implements handling of Unicode conversion mappings from .ucm files
     20 *   for makeconv, canonucm, rptp2ucm, etc.
     21 *
     22 *   Unicode code point sequences with a length of more than 1,
     23 *   as well as byte sequences with more than 4 bytes or more than one complete
     24 *   character sequence are handled to support m:n mappings.
     25 */
     26 
     27 #include "unicode/utypes.h"
     28 #include "unicode/ustring.h"
     29 #include "cstring.h"
     30 #include "cmemory.h"
     31 #include "filestrm.h"
     32 #include "uarrsort.h"
     33 #include "ucnvmbcs.h"
     34 #include "ucnv_bld.h"
     35 #include "ucnv_ext.h"
     36 #include "uparse.h"
     37 #include "ucm.h"
     38 #include <stdio.h>
     39 
     40 #if !UCONFIG_NO_CONVERSION
     41 
     42 /* -------------------------------------------------------------------------- */
     43 
     44 static void
     45 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
     46     int32_t j;
     47 
     48     for(j=0; j<m->uLen; ++j) {
     49         fprintf(f, "<U%04lX>", (long)codePoints[j]);
     50     }
     51 
     52     fputc(' ', f);
     53 
     54     for(j=0; j<m->bLen; ++j) {
     55         fprintf(f, "\\x%02X", bytes[j]);
     56     }
     57 
     58     if(m->f>=0) {
     59         fprintf(f, " |%u\n", m->f);
     60     } else {
     61         fputs("\n", f);
     62     }
     63 }
     64 
     65 U_CAPI void U_EXPORT2
     66 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
     67     printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
     68 }
     69 
     70 U_CAPI void U_EXPORT2
     71 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
     72     UCMapping *m;
     73     int32_t i, length;
     74 
     75     m=table->mappings;
     76     length=table->mappingsLength;
     77     if(byUnicode) {
     78         for(i=0; i<length; ++m, ++i) {
     79             ucm_printMapping(table, m, f);
     80         }
     81     } else {
     82         const int32_t *map=table->reverseMap;
     83         for(i=0; i<length; ++i) {
     84             ucm_printMapping(table, m+map[i], f);
     85         }
     86     }
     87 }
     88 
     89 /* mapping comparisons ------------------------------------------------------ */
     90 
     91 static int32_t
     92 compareUnicode(UCMTable *lTable, const UCMapping *l,
     93                UCMTable *rTable, const UCMapping *r) {
     94     const UChar32 *lu, *ru;
     95     int32_t result, i, length;
     96 
     97     if(l->uLen==1 && r->uLen==1) {
     98         /* compare two single code points */
     99         return l->u-r->u;
    100     }
    101 
    102     /* get pointers to the code point sequences */
    103     lu=UCM_GET_CODE_POINTS(lTable, l);
    104     ru=UCM_GET_CODE_POINTS(rTable, r);
    105 
    106     /* get the minimum length */
    107     if(l->uLen<=r->uLen) {
    108         length=l->uLen;
    109     } else {
    110         length=r->uLen;
    111     }
    112 
    113     /* compare the code points */
    114     for(i=0; i<length; ++i) {
    115         result=lu[i]-ru[i];
    116         if(result!=0) {
    117             return result;
    118         }
    119     }
    120 
    121     /* compare the lengths */
    122     return l->uLen-r->uLen;
    123 }
    124 
    125 static int32_t
    126 compareBytes(UCMTable *lTable, const UCMapping *l,
    127              UCMTable *rTable, const UCMapping *r,
    128              UBool lexical) {
    129     const uint8_t *lb, *rb;
    130     int32_t result, i, length;
    131 
    132     /*
    133      * A lexical comparison is used for sorting in the builder, to allow
    134      * an efficient search for a byte sequence that could be a prefix
    135      * of a previously entered byte sequence.
    136      *
    137      * Comparing by lengths first is for compatibility with old .ucm tools
    138      * like canonucm and rptp2ucm.
    139      */
    140     if(lexical) {
    141         /* get the minimum length and continue */
    142         if(l->bLen<=r->bLen) {
    143             length=l->bLen;
    144         } else {
    145             length=r->bLen;
    146         }
    147     } else {
    148         /* compare lengths first */
    149         result=l->bLen-r->bLen;
    150         if(result!=0) {
    151             return result;
    152         } else {
    153             length=l->bLen;
    154         }
    155     }
    156 
    157     /* get pointers to the byte sequences */
    158     lb=UCM_GET_BYTES(lTable, l);
    159     rb=UCM_GET_BYTES(rTable, r);
    160 
    161     /* compare the bytes */
    162     for(i=0; i<length; ++i) {
    163         result=lb[i]-rb[i];
    164         if(result!=0) {
    165             return result;
    166         }
    167     }
    168 
    169     /* compare the lengths */
    170     return l->bLen-r->bLen;
    171 }
    172 
    173 /* compare UCMappings for sorting */
    174 static int32_t
    175 compareMappings(UCMTable *lTable, const UCMapping *l,
    176                 UCMTable *rTable, const UCMapping *r,
    177                 UBool uFirst) {
    178     int32_t result;
    179 
    180     /* choose which side to compare first */
    181     if(uFirst) {
    182         /* Unicode then bytes */
    183         result=compareUnicode(lTable, l, rTable, r);
    184         if(result==0) {
    185             result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
    186         }
    187     } else {
    188         /* bytes then Unicode */
    189         result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
    190         if(result==0) {
    191             result=compareUnicode(lTable, l, rTable, r);
    192         }
    193     }
    194 
    195     if(result!=0) {
    196         return result;
    197     }
    198 
    199     /* compare the flags */
    200     return l->f-r->f;
    201 }
    202 
    203 /* sorting by Unicode first sorts mappings directly */
    204 static int32_t
    205 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
    206     return compareMappings(
    207         (UCMTable *)context, (const UCMapping *)left,
    208         (UCMTable *)context, (const UCMapping *)right, TRUE);
    209 }
    210 
    211 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
    212 static int32_t
    213 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
    214     UCMTable *table=(UCMTable *)context;
    215     int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
    216     return compareMappings(
    217         table, table->mappings+l,
    218         table, table->mappings+r, FALSE);
    219 }
    220 
    221 U_CAPI void U_EXPORT2
    222 ucm_sortTable(UCMTable *t) {
    223     UErrorCode errorCode;
    224     int32_t i;
    225 
    226     if(t->isSorted) {
    227         return;
    228     }
    229 
    230     errorCode=U_ZERO_ERROR;
    231 
    232     /* 1. sort by Unicode first */
    233     uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
    234                    compareMappingsUnicodeFirst, t,
    235                    FALSE, &errorCode);
    236 
    237     /* build the reverseMap */
    238     if(t->reverseMap==NULL) {
    239         /*
    240          * allocate mappingsCapacity instead of mappingsLength so that
    241          * if mappings are added, the reverseMap need not be
    242          * reallocated each time
    243          * (see ucm_moveMappings() and ucm_addMapping())
    244          */
    245         t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
    246         if(t->reverseMap==NULL) {
    247             fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
    248             exit(U_MEMORY_ALLOCATION_ERROR);
    249         }
    250     }
    251     for(i=0; i<t->mappingsLength; ++i) {
    252         t->reverseMap[i]=i;
    253     }
    254 
    255     /* 2. sort reverseMap by mappings bytes first */
    256     uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
    257                    compareMappingsBytesFirst, t,
    258                    FALSE, &errorCode);
    259 
    260     if(U_FAILURE(errorCode)) {
    261         fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
    262                 u_errorName(errorCode));
    263         exit(errorCode);
    264     }
    265 
    266     t->isSorted=TRUE;
    267 }
    268 
    269 /*
    270  * remove mappings with their move flag set from the base table
    271  * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
    272  */
    273 U_CAPI void U_EXPORT2
    274 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
    275     UCMapping *mb, *mbLimit;
    276     int8_t flag;
    277 
    278     mb=base->mappings;
    279     mbLimit=mb+base->mappingsLength;
    280 
    281     while(mb<mbLimit) {
    282         flag=mb->moveFlag;
    283         if(flag!=0) {
    284             /* reset the move flag */
    285             mb->moveFlag=0;
    286 
    287             if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
    288                 /* add the mapping to the extension table */
    289                 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
    290             }
    291 
    292             /* remove this mapping: move the last base mapping down and overwrite the current one */
    293             if(mb<(mbLimit-1)) {
    294                 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
    295             }
    296             --mbLimit;
    297             --base->mappingsLength;
    298             base->isSorted=FALSE;
    299         } else {
    300             ++mb;
    301         }
    302     }
    303 }
    304 
    305 enum {
    306     NEEDS_MOVE=1,
    307     HAS_ERRORS=2
    308 };
    309 
    310 static uint8_t
    311 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
    312                     UBool moveToExt, UBool intersectBase) {
    313     UCMapping *mb, *me, *mbLimit, *meLimit;
    314     int32_t cmp;
    315     uint8_t result;
    316 
    317     mb=base->mappings;
    318     mbLimit=mb+base->mappingsLength;
    319 
    320     me=ext->mappings;
    321     meLimit=me+ext->mappingsLength;
    322 
    323     result=0;
    324 
    325     for(;;) {
    326         /* skip irrelevant mappings on both sides */
    327         for(;;) {
    328             if(mb==mbLimit) {
    329                 return result;
    330             }
    331 
    332             if((0<=mb->f && mb->f<=2) || mb->f==4) {
    333                 break;
    334             }
    335 
    336             ++mb;
    337         }
    338 
    339         for(;;) {
    340             if(me==meLimit) {
    341                 return result;
    342             }
    343 
    344             if((0<=me->f && me->f<=2) || me->f==4) {
    345                 break;
    346             }
    347 
    348             ++me;
    349         }
    350 
    351         /* compare the base and extension mappings */
    352         cmp=compareUnicode(base, mb, ext, me);
    353         if(cmp<0) {
    354             if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
    355                 /*
    356                  * mapping in base but not in ext, move it
    357                  *
    358                  * if ext is DBCS, move DBCS mappings here
    359                  * and check SBCS ones for Unicode prefix below
    360                  */
    361                 mb->moveFlag|=UCM_MOVE_TO_EXT;
    362                 result|=NEEDS_MOVE;
    363 
    364             /* does mb map from an input sequence that is a prefix of me's? */
    365             } else if( mb->uLen<me->uLen &&
    366                 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
    367             ) {
    368                 if(moveToExt) {
    369                     /* mark this mapping to be moved to the extension table */
    370                     mb->moveFlag|=UCM_MOVE_TO_EXT;
    371                     result|=NEEDS_MOVE;
    372                 } else {
    373                     fprintf(stderr,
    374                             "ucm error: the base table contains a mapping whose input sequence\n"
    375                             "           is a prefix of the input sequence of an extension mapping\n");
    376                     ucm_printMapping(base, mb, stderr);
    377                     ucm_printMapping(ext, me, stderr);
    378                     result|=HAS_ERRORS;
    379                 }
    380             }
    381 
    382             ++mb;
    383         } else if(cmp==0) {
    384             /*
    385              * same output: remove the extension mapping,
    386              * otherwise treat as an error
    387              */
    388             if( mb->f==me->f && mb->bLen==me->bLen &&
    389                 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
    390             ) {
    391                 me->moveFlag|=UCM_REMOVE_MAPPING;
    392                 result|=NEEDS_MOVE;
    393             } else if(intersectBase) {
    394                 /* mapping in base but not in ext, move it */
    395                 mb->moveFlag|=UCM_MOVE_TO_EXT;
    396                 result|=NEEDS_MOVE;
    397             } else {
    398                 fprintf(stderr,
    399                         "ucm error: the base table contains a mapping whose input sequence\n"
    400                         "           is the same as the input sequence of an extension mapping\n"
    401                         "           but it maps differently\n");
    402                 ucm_printMapping(base, mb, stderr);
    403                 ucm_printMapping(ext, me, stderr);
    404                 result|=HAS_ERRORS;
    405             }
    406 
    407             ++mb;
    408         } else /* cmp>0 */ {
    409             ++me;
    410         }
    411     }
    412 }
    413 
    414 static uint8_t
    415 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
    416                   UBool moveToExt, UBool intersectBase) {
    417     UCMapping *mb, *me;
    418     int32_t *baseMap, *extMap;
    419     int32_t b, e, bLimit, eLimit, cmp;
    420     uint8_t result;
    421     UBool isSISO;
    422 
    423     baseMap=base->reverseMap;
    424     extMap=ext->reverseMap;
    425 
    426     b=e=0;
    427     bLimit=base->mappingsLength;
    428     eLimit=ext->mappingsLength;
    429 
    430     result=0;
    431 
    432     isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
    433 
    434     for(;;) {
    435         /* skip irrelevant mappings on both sides */
    436         for(;; ++b) {
    437             if(b==bLimit) {
    438                 return result;
    439             }
    440             mb=base->mappings+baseMap[b];
    441 
    442             if(intersectBase==2 && mb->bLen==1) {
    443                 /*
    444                  * comparing a base against a DBCS extension:
    445                  * leave SBCS base mappings alone
    446                  */
    447                 continue;
    448             }
    449 
    450             if(mb->f==0 || mb->f==3) {
    451                 break;
    452             }
    453         }
    454 
    455         for(;;) {
    456             if(e==eLimit) {
    457                 return result;
    458             }
    459             me=ext->mappings+extMap[e];
    460 
    461             if(me->f==0 || me->f==3) {
    462                 break;
    463             }
    464 
    465             ++e;
    466         }
    467 
    468         /* compare the base and extension mappings */
    469         cmp=compareBytes(base, mb, ext, me, TRUE);
    470         if(cmp<0) {
    471             if(intersectBase) {
    472                 /* mapping in base but not in ext, move it */
    473                 mb->moveFlag|=UCM_MOVE_TO_EXT;
    474                 result|=NEEDS_MOVE;
    475 
    476             /*
    477              * does mb map from an input sequence that is a prefix of me's?
    478              * for SI/SO tables, a single byte is never a prefix because it
    479              * occurs in a separate single-byte state
    480              */
    481             } else if( mb->bLen<me->bLen &&
    482                 (!isSISO || mb->bLen>1) &&
    483                 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
    484             ) {
    485                 if(moveToExt) {
    486                     /* mark this mapping to be moved to the extension table */
    487                     mb->moveFlag|=UCM_MOVE_TO_EXT;
    488                     result|=NEEDS_MOVE;
    489                 } else {
    490                     fprintf(stderr,
    491                             "ucm error: the base table contains a mapping whose input sequence\n"
    492                             "           is a prefix of the input sequence of an extension mapping\n");
    493                     ucm_printMapping(base, mb, stderr);
    494                     ucm_printMapping(ext, me, stderr);
    495                     result|=HAS_ERRORS;
    496                 }
    497             }
    498 
    499             ++b;
    500         } else if(cmp==0) {
    501             /*
    502              * same output: remove the extension mapping,
    503              * otherwise treat as an error
    504              */
    505             if( mb->f==me->f && mb->uLen==me->uLen &&
    506                 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
    507             ) {
    508                 me->moveFlag|=UCM_REMOVE_MAPPING;
    509                 result|=NEEDS_MOVE;
    510             } else if(intersectBase) {
    511                 /* mapping in base but not in ext, move it */
    512                 mb->moveFlag|=UCM_MOVE_TO_EXT;
    513                 result|=NEEDS_MOVE;
    514             } else {
    515                 fprintf(stderr,
    516                         "ucm error: the base table contains a mapping whose input sequence\n"
    517                         "           is the same as the input sequence of an extension mapping\n"
    518                         "           but it maps differently\n");
    519                 ucm_printMapping(base, mb, stderr);
    520                 ucm_printMapping(ext, me, stderr);
    521                 result|=HAS_ERRORS;
    522             }
    523 
    524             ++b;
    525         } else /* cmp>0 */ {
    526             ++e;
    527         }
    528     }
    529 }
    530 
    531 U_CAPI UBool U_EXPORT2
    532 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
    533     UCMapping *m, *mLimit;
    534     int32_t count;
    535     UBool isOK;
    536 
    537     m=table->mappings;
    538     mLimit=m+table->mappingsLength;
    539     isOK=TRUE;
    540 
    541     while(m<mLimit) {
    542         count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
    543         if(count<1) {
    544             ucm_printMapping(table, m, stderr);
    545             isOK=FALSE;
    546         }
    547         ++m;
    548     }
    549 
    550     return isOK;
    551 }
    552 
    553 U_CAPI UBool U_EXPORT2
    554 ucm_checkBaseExt(UCMStates *baseStates,
    555                  UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
    556                  UBool intersectBase) {
    557     uint8_t result;
    558 
    559     /* if we have an extension table, we must always use precision flags */
    560     if(base->flagsType&UCM_FLAGS_IMPLICIT) {
    561         fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
    562         return FALSE;
    563     }
    564     if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
    565         fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
    566         return FALSE;
    567     }
    568 
    569     /* checking requires both tables to be sorted */
    570     ucm_sortTable(base);
    571     ucm_sortTable(ext);
    572 
    573     /* check */
    574     result=
    575         checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
    576         checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
    577 
    578     if(result&HAS_ERRORS) {
    579         return FALSE;
    580     }
    581 
    582     if(result&NEEDS_MOVE) {
    583         ucm_moveMappings(ext, NULL);
    584         ucm_moveMappings(base, moveTarget);
    585         ucm_sortTable(base);
    586         ucm_sortTable(ext);
    587         if(moveTarget!=NULL) {
    588             ucm_sortTable(moveTarget);
    589         }
    590     }
    591 
    592     return TRUE;
    593 }
    594 
    595 /* merge tables for rptp2ucm ------------------------------------------------ */
    596 
    597 U_CAPI void U_EXPORT2
    598 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
    599                 const uint8_t *subchar, int32_t subcharLength,
    600                 uint8_t subchar1) {
    601     UCMapping *fromUMapping, *toUMapping;
    602     int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
    603 
    604     ucm_sortTable(fromUTable);
    605     ucm_sortTable(toUTable);
    606 
    607     fromUMapping=fromUTable->mappings;
    608     toUMapping=toUTable->mappings;
    609 
    610     fromUTop=fromUTable->mappingsLength;
    611     toUTop=toUTable->mappingsLength;
    612 
    613     fromUIndex=toUIndex=0;
    614 
    615     while(fromUIndex<fromUTop && toUIndex<toUTop) {
    616         cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
    617         if(cmp==0) {
    618             /* equal: roundtrip, nothing to do (flags are initially 0) */
    619             ++fromUMapping;
    620             ++toUMapping;
    621 
    622             ++fromUIndex;
    623             ++toUIndex;
    624         } else if(cmp<0) {
    625             /*
    626              * the fromU mapping does not have a toU counterpart:
    627              * fallback Unicode->codepage
    628              */
    629             if( (fromUMapping->bLen==subcharLength &&
    630                  0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
    631                 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
    632             ) {
    633                 fromUMapping->f=2; /* SUB mapping */
    634             } else {
    635                 fromUMapping->f=1; /* normal fallback */
    636             }
    637 
    638             ++fromUMapping;
    639             ++fromUIndex;
    640         } else {
    641             /*
    642              * the toU mapping does not have a fromU counterpart:
    643              * (reverse) fallback codepage->Unicode, copy it to the fromU table
    644              */
    645 
    646             /* ignore reverse fallbacks to Unicode SUB */
    647             if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
    648                 toUMapping->f=3; /* reverse fallback */
    649                 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
    650 
    651                 /* the table may have been reallocated */
    652                 fromUMapping=fromUTable->mappings+fromUIndex;
    653             }
    654 
    655             ++toUMapping;
    656             ++toUIndex;
    657         }
    658     }
    659 
    660     /* either one or both tables are exhausted */
    661     while(fromUIndex<fromUTop) {
    662         /* leftover fromU mappings are fallbacks */
    663         if( (fromUMapping->bLen==subcharLength &&
    664              0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
    665             (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
    666         ) {
    667             fromUMapping->f=2; /* SUB mapping */
    668         } else {
    669             fromUMapping->f=1; /* normal fallback */
    670         }
    671 
    672         ++fromUMapping;
    673         ++fromUIndex;
    674     }
    675 
    676     while(toUIndex<toUTop) {
    677         /* leftover toU mappings are reverse fallbacks */
    678 
    679         /* ignore reverse fallbacks to Unicode SUB */
    680         if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
    681             toUMapping->f=3; /* reverse fallback */
    682             ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
    683         }
    684 
    685         ++toUMapping;
    686         ++toUIndex;
    687     }
    688 
    689     fromUTable->isSorted=FALSE;
    690 }
    691 
    692 /* separate extension mappings out of base table for rptp2ucm --------------- */
    693 
    694 U_CAPI UBool U_EXPORT2
    695 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
    696     UCMTable *table;
    697     UCMapping *m, *mLimit;
    698     int32_t type;
    699     UBool needsMove, isOK;
    700 
    701     table=ucm->base;
    702     m=table->mappings;
    703     mLimit=m+table->mappingsLength;
    704 
    705     needsMove=FALSE;
    706     isOK=TRUE;
    707 
    708     for(; m<mLimit; ++m) {
    709         if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
    710             fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
    711             ucm_printMapping(table, m, stderr);
    712             m->moveFlag|=UCM_REMOVE_MAPPING;
    713             needsMove=TRUE;
    714             continue;
    715         }
    716 
    717         type=ucm_mappingType(
    718                 &ucm->states, m,
    719                 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
    720         if(type<0) {
    721             /* illegal byte sequence */
    722             printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
    723             isOK=FALSE;
    724         } else if(type>0) {
    725             m->moveFlag|=UCM_MOVE_TO_EXT;
    726             needsMove=TRUE;
    727         }
    728     }
    729 
    730     if(!isOK) {
    731         return FALSE;
    732     }
    733     if(needsMove) {
    734         ucm_moveMappings(ucm->base, ucm->ext);
    735         return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
    736     } else {
    737         ucm_sortTable(ucm->base);
    738         return TRUE;
    739     }
    740 }
    741 
    742 /* ucm parser --------------------------------------------------------------- */
    743 
    744 U_CAPI int8_t U_EXPORT2
    745 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
    746     const char *s=*ps;
    747     char *end;
    748     uint8_t byte;
    749     int8_t bLen;
    750 
    751     bLen=0;
    752     for(;;) {
    753         /* skip an optional plus sign */
    754         if(bLen>0 && *s=='+') {
    755             ++s;
    756         }
    757         if(*s!='\\') {
    758             break;
    759         }
    760 
    761         if( s[1]!='x' ||
    762             (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
    763         ) {
    764             fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
    765             return -1;
    766         }
    767 
    768         if(bLen==UCNV_EXT_MAX_BYTES) {
    769             fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
    770             return -1;
    771         }
    772         bytes[bLen++]=byte;
    773         s=end;
    774     }
    775 
    776     *ps=s;
    777     return bLen;
    778 }
    779 
    780 /* parse a mapping line; must not be empty */
    781 U_CAPI UBool U_EXPORT2
    782 ucm_parseMappingLine(UCMapping *m,
    783                      UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    784                      uint8_t bytes[UCNV_EXT_MAX_BYTES],
    785                      const char *line) {
    786     const char *s;
    787     char *end;
    788     UChar32 cp;
    789     int32_t u16Length;
    790     int8_t uLen, bLen, f;
    791 
    792     s=line;
    793     uLen=bLen=0;
    794 
    795     /* parse code points */
    796     for(;;) {
    797         /* skip an optional plus sign */
    798         if(uLen>0 && *s=='+') {
    799             ++s;
    800         }
    801         if(*s!='<') {
    802             break;
    803         }
    804 
    805         if( s[1]!='U' ||
    806             (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
    807             *end!='>'
    808         ) {
    809             fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
    810             return FALSE;
    811         }
    812         if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
    813             fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
    814             return FALSE;
    815         }
    816 
    817         if(uLen==UCNV_EXT_MAX_UCHARS) {
    818             fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
    819             return FALSE;
    820         }
    821         codePoints[uLen++]=cp;
    822         s=end+1;
    823     }
    824 
    825     if(uLen==0) {
    826         fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
    827         return FALSE;
    828     } else if(uLen==1) {
    829         m->u=codePoints[0];
    830     } else {
    831         UErrorCode errorCode=U_ZERO_ERROR;
    832         u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
    833         if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
    834             u16Length>UCNV_EXT_MAX_UCHARS
    835         ) {
    836             fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
    837             return FALSE;
    838         }
    839     }
    840 
    841     s=u_skipWhitespace(s);
    842 
    843     /* parse bytes */
    844     bLen=ucm_parseBytes(bytes, line, &s);
    845 
    846     if(bLen<0) {
    847         return FALSE;
    848     } else if(bLen==0) {
    849         fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
    850         return FALSE;
    851     } else if(bLen<=4) {
    852         uprv_memcpy(m->b.bytes, bytes, bLen);
    853     }
    854 
    855     /* skip everything until the fallback indicator, even the start of a comment */
    856     for(;;) {
    857         if(*s==0) {
    858             f=-1; /* no fallback indicator */
    859             break;
    860         } else if(*s=='|') {
    861             f=(int8_t)(s[1]-'0');
    862             if((uint8_t)f>4) {
    863                 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
    864                 return FALSE;
    865             }
    866             break;
    867         }
    868         ++s;
    869     }
    870 
    871     m->uLen=uLen;
    872     m->bLen=bLen;
    873     m->f=f;
    874     return TRUE;
    875 }
    876 
    877 /* general APIs ------------------------------------------------------------- */
    878 
    879 U_CAPI UCMTable * U_EXPORT2
    880 ucm_openTable() {
    881     UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
    882     if(table==NULL) {
    883         fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
    884         exit(U_MEMORY_ALLOCATION_ERROR);
    885     }
    886 
    887     memset(table, 0, sizeof(UCMTable));
    888     return table;
    889 }
    890 
    891 U_CAPI void U_EXPORT2
    892 ucm_closeTable(UCMTable *table) {
    893     if(table!=NULL) {
    894         uprv_free(table->mappings);
    895         uprv_free(table->codePoints);
    896         uprv_free(table->bytes);
    897         uprv_free(table->reverseMap);
    898         uprv_free(table);
    899     }
    900 }
    901 
    902 U_CAPI void U_EXPORT2
    903 ucm_resetTable(UCMTable *table) {
    904     if(table!=NULL) {
    905         table->mappingsLength=0;
    906         table->flagsType=0;
    907         table->unicodeMask=0;
    908         table->bytesLength=table->codePointsLength=0;
    909         table->isSorted=FALSE;
    910     }
    911 }
    912 
    913 U_CAPI void U_EXPORT2
    914 ucm_addMapping(UCMTable *table,
    915                UCMapping *m,
    916                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    917                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    918     UCMapping *tm;
    919     UChar32 c;
    920     int32_t idx;
    921 
    922     if(table->mappingsLength>=table->mappingsCapacity) {
    923         /* make the mappings array larger */
    924         if(table->mappingsCapacity==0) {
    925             table->mappingsCapacity=1000;
    926         } else {
    927             table->mappingsCapacity*=10;
    928         }
    929         table->mappings=(UCMapping *)uprv_realloc(table->mappings,
    930                                              table->mappingsCapacity*sizeof(UCMapping));
    931         if(table->mappings==NULL) {
    932             fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
    933                             (int)table->mappingsCapacity);
    934             exit(U_MEMORY_ALLOCATION_ERROR);
    935         }
    936 
    937         if(table->reverseMap!=NULL) {
    938             /* the reverseMap must be reallocated in a new sort */
    939             uprv_free(table->reverseMap);
    940             table->reverseMap=NULL;
    941         }
    942     }
    943 
    944     if(m->uLen>1 && table->codePointsCapacity==0) {
    945         table->codePointsCapacity=10000;
    946         table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
    947         if(table->codePoints==NULL) {
    948             fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
    949                             (int)table->codePointsCapacity);
    950             exit(U_MEMORY_ALLOCATION_ERROR);
    951         }
    952     }
    953 
    954     if(m->bLen>4 && table->bytesCapacity==0) {
    955         table->bytesCapacity=10000;
    956         table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
    957         if(table->bytes==NULL) {
    958             fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
    959                             (int)table->bytesCapacity);
    960             exit(U_MEMORY_ALLOCATION_ERROR);
    961         }
    962     }
    963 
    964     if(m->uLen>1) {
    965         idx=table->codePointsLength;
    966         table->codePointsLength+=m->uLen;
    967         if(table->codePointsLength>table->codePointsCapacity) {
    968             fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
    969             exit(U_MEMORY_ALLOCATION_ERROR);
    970         }
    971 
    972         uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4);
    973         m->u=idx;
    974     }
    975 
    976     if(m->bLen>4) {
    977         idx=table->bytesLength;
    978         table->bytesLength+=m->bLen;
    979         if(table->bytesLength>table->bytesCapacity) {
    980             fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
    981             exit(U_MEMORY_ALLOCATION_ERROR);
    982         }
    983 
    984         uprv_memcpy(table->bytes+idx, bytes, m->bLen);
    985         m->b.idx=idx;
    986     }
    987 
    988     /* set unicodeMask */
    989     for(idx=0; idx<m->uLen; ++idx) {
    990         c=codePoints[idx];
    991         if(c>=0x10000) {
    992             table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
    993         } else if(U_IS_SURROGATE(c)) {
    994             table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
    995         }
    996     }
    997 
    998     /* set flagsType */
    999     if(m->f<0) {
   1000         table->flagsType|=UCM_FLAGS_IMPLICIT;
   1001     } else {
   1002         table->flagsType|=UCM_FLAGS_EXPLICIT;
   1003     }
   1004 
   1005     tm=table->mappings+table->mappingsLength++;
   1006     uprv_memcpy(tm, m, sizeof(UCMapping));
   1007 
   1008     table->isSorted=FALSE;
   1009 }
   1010 
   1011 U_CAPI UCMFile * U_EXPORT2
   1012 ucm_open() {
   1013     UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
   1014     if(ucm==NULL) {
   1015         fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
   1016         exit(U_MEMORY_ALLOCATION_ERROR);
   1017     }
   1018 
   1019     memset(ucm, 0, sizeof(UCMFile));
   1020 
   1021     ucm->base=ucm_openTable();
   1022     ucm->ext=ucm_openTable();
   1023 
   1024     ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
   1025     ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
   1026     ucm->states.outputType=-1;
   1027     ucm->states.minCharLength=ucm->states.maxCharLength=1;
   1028 
   1029     return ucm;
   1030 }
   1031 
   1032 U_CAPI void U_EXPORT2
   1033 ucm_close(UCMFile *ucm) {
   1034     if(ucm!=NULL) {
   1035         ucm_closeTable(ucm->base);
   1036         ucm_closeTable(ucm->ext);
   1037         uprv_free(ucm);
   1038     }
   1039 }
   1040 
   1041 U_CAPI int32_t U_EXPORT2
   1042 ucm_mappingType(UCMStates *baseStates,
   1043                 UCMapping *m,
   1044                 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   1045                 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
   1046     /* check validity of the bytes and count the characters in them */
   1047     int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
   1048     if(count<1) {
   1049         /* illegal byte sequence */
   1050         return -1;
   1051     }
   1052 
   1053     /*
   1054      * Suitable for an ICU conversion base table means:
   1055      * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
   1056      * - precision flag 0..3
   1057      * - SBCS: any 1:1 mapping
   1058      *         (the table stores additional bits to distinguish mapping types)
   1059      * - MBCS: not a |2 SUB mapping for <subchar1>
   1060      * - MBCS: not a |1 fallback to 0x00
   1061      * - MBCS: not a multi-byte mapping with leading 0x00 bytes
   1062      *
   1063      * Further restrictions for fromUnicode tables
   1064      * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
   1065      *
   1066      * All of the MBCS fromUnicode specific tests could be removed from here,
   1067      * but the ones above are for unusual mappings, and removing the tests
   1068      * from here would change canonucm output which seems gratuitous.
   1069      * (Markus Scherer 2006-nov-28)
   1070      *
   1071      * Exception: All implicit mappings (f<0) that need to be moved
   1072      * because of fromUnicode restrictions _must_ be moved here because
   1073      * makeconv uses a hack for moving mappings only for the fromUnicode table
   1074      * that only works with non-negative values of f.
   1075      */
   1076     if( m->uLen==1 && count==1 && m->f<=3 &&
   1077         (baseStates->maxCharLength==1 ||
   1078             !((m->f==2 && m->bLen==1) ||
   1079               (m->f==1 && bytes[0]==0) ||
   1080               (m->f<=1 && m->bLen>1 && bytes[0]==0)))
   1081     ) {
   1082         return 0; /* suitable for a base table */
   1083     } else {
   1084         return 1; /* needs to go into an extension table */
   1085     }
   1086 }
   1087 
   1088 U_CAPI UBool U_EXPORT2
   1089 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
   1090                    UCMapping *m,
   1091                    UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   1092                    uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
   1093     int32_t type;
   1094 
   1095     if(m->f==2 && m->uLen>1) {
   1096         fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
   1097         printMapping(m, codePoints, bytes, stderr);
   1098         return FALSE;
   1099     }
   1100 
   1101     if(baseStates!=NULL) {
   1102         /* check validity of the bytes and count the characters in them */
   1103         type=ucm_mappingType(baseStates, m, codePoints, bytes);
   1104         if(type<0) {
   1105             /* illegal byte sequence */
   1106             printMapping(m, codePoints, bytes, stderr);
   1107             return FALSE;
   1108         }
   1109     } else {
   1110         /* not used - adding a mapping for an extension-only table before its base table is read */
   1111         type=1;
   1112     }
   1113 
   1114     /*
   1115      * Add the mapping to the base table if this is requested and suitable.
   1116      * Otherwise, add it to the extension table.
   1117      */
   1118     if(forBase && type==0) {
   1119         ucm_addMapping(ucm->base, m, codePoints, bytes);
   1120     } else {
   1121         ucm_addMapping(ucm->ext, m, codePoints, bytes);
   1122     }
   1123 
   1124     return TRUE;
   1125 }
   1126 
   1127 U_CAPI UBool U_EXPORT2
   1128 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
   1129   UCMapping m={ 0, {0}, 0, 0, 0, 0 };
   1130     UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
   1131     uint8_t bytes[UCNV_EXT_MAX_BYTES];
   1132 
   1133     const char *s;
   1134 
   1135     /* ignore empty and comment lines */
   1136     if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
   1137         return TRUE;
   1138     }
   1139 
   1140     return
   1141         ucm_parseMappingLine(&m, codePoints, bytes, line) &&
   1142         ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
   1143 }
   1144 
   1145 U_CAPI void U_EXPORT2
   1146 ucm_readTable(UCMFile *ucm, FileStream* convFile,
   1147               UBool forBase, UCMStates *baseStates,
   1148               UErrorCode *pErrorCode) {
   1149     char line[500];
   1150     char *end;
   1151     UBool isOK;
   1152 
   1153     if(U_FAILURE(*pErrorCode)) {
   1154         return;
   1155     }
   1156 
   1157     isOK=TRUE;
   1158 
   1159     for(;;) {
   1160         /* read the next line */
   1161         if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
   1162             fprintf(stderr, "incomplete charmap section\n");
   1163             isOK=FALSE;
   1164             break;
   1165         }
   1166 
   1167         /* remove CR LF */
   1168         end=uprv_strchr(line, 0);
   1169         while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
   1170             --end;
   1171         }
   1172         *end=0;
   1173 
   1174         /* ignore empty and comment lines */
   1175         if(line[0]==0 || line[0]=='#') {
   1176             continue;
   1177         }
   1178 
   1179         /* stop at the end of the mapping table */
   1180         if(0==uprv_strcmp(line, "END CHARMAP")) {
   1181             break;
   1182         }
   1183 
   1184         isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
   1185     }
   1186 
   1187     if(!isOK) {
   1188         *pErrorCode=U_INVALID_TABLE_FORMAT;
   1189     }
   1190 }
   1191 #endif
   1192