Home | History | Annotate | Download | only in toolutil
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2003-2013, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucm.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2003jun20
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This file reads a .ucm file, stores its mappings and sorts them.
     17 *   It implements handling of Unicode conversion mappings from .ucm files
     18 *   for makeconv, canonucm, rptp2ucm, etc.
     19 *
     20 *   Unicode code point sequences with a length of more than 1,
     21 *   as well as byte sequences with more than 4 bytes or more than one complete
     22 *   character sequence are handled to support m:n mappings.
     23 */
     24 
     25 #include "unicode/utypes.h"
     26 #include "unicode/ustring.h"
     27 #include "cstring.h"
     28 #include "cmemory.h"
     29 #include "filestrm.h"
     30 #include "uarrsort.h"
     31 #include "ucnvmbcs.h"
     32 #include "ucnv_bld.h"
     33 #include "ucnv_ext.h"
     34 #include "uparse.h"
     35 #include "ucm.h"
     36 #include <stdio.h>
     37 
     38 #if !UCONFIG_NO_CONVERSION
     39 
     40 /* -------------------------------------------------------------------------- */
     41 
     42 static void
     43 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
     44     int32_t j;
     45 
     46     for(j=0; j<m->uLen; ++j) {
     47         fprintf(f, "<U%04lX>", (long)codePoints[j]);
     48     }
     49 
     50     fputc(' ', f);
     51 
     52     for(j=0; j<m->bLen; ++j) {
     53         fprintf(f, "\\x%02X", bytes[j]);
     54     }
     55 
     56     if(m->f>=0) {
     57         fprintf(f, " |%u\n", m->f);
     58     } else {
     59         fputs("\n", f);
     60     }
     61 }
     62 
     63 U_CAPI void U_EXPORT2
     64 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
     65     printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
     66 }
     67 
     68 U_CAPI void U_EXPORT2
     69 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
     70     UCMapping *m;
     71     int32_t i, length;
     72 
     73     m=table->mappings;
     74     length=table->mappingsLength;
     75     if(byUnicode) {
     76         for(i=0; i<length; ++m, ++i) {
     77             ucm_printMapping(table, m, f);
     78         }
     79     } else {
     80         const int32_t *map=table->reverseMap;
     81         for(i=0; i<length; ++i) {
     82             ucm_printMapping(table, m+map[i], f);
     83         }
     84     }
     85 }
     86 
     87 /* mapping comparisons ------------------------------------------------------ */
     88 
     89 static int32_t
     90 compareUnicode(UCMTable *lTable, const UCMapping *l,
     91                UCMTable *rTable, const UCMapping *r) {
     92     const UChar32 *lu, *ru;
     93     int32_t result, i, length;
     94 
     95     if(l->uLen==1 && r->uLen==1) {
     96         /* compare two single code points */
     97         return l->u-r->u;
     98     }
     99 
    100     /* get pointers to the code point sequences */
    101     lu=UCM_GET_CODE_POINTS(lTable, l);
    102     ru=UCM_GET_CODE_POINTS(rTable, r);
    103 
    104     /* get the minimum length */
    105     if(l->uLen<=r->uLen) {
    106         length=l->uLen;
    107     } else {
    108         length=r->uLen;
    109     }
    110 
    111     /* compare the code points */
    112     for(i=0; i<length; ++i) {
    113         result=lu[i]-ru[i];
    114         if(result!=0) {
    115             return result;
    116         }
    117     }
    118 
    119     /* compare the lengths */
    120     return l->uLen-r->uLen;
    121 }
    122 
    123 static int32_t
    124 compareBytes(UCMTable *lTable, const UCMapping *l,
    125              UCMTable *rTable, const UCMapping *r,
    126              UBool lexical) {
    127     const uint8_t *lb, *rb;
    128     int32_t result, i, length;
    129 
    130     /*
    131      * A lexical comparison is used for sorting in the builder, to allow
    132      * an efficient search for a byte sequence that could be a prefix
    133      * of a previously entered byte sequence.
    134      *
    135      * Comparing by lengths first is for compatibility with old .ucm tools
    136      * like canonucm and rptp2ucm.
    137      */
    138     if(lexical) {
    139         /* get the minimum length and continue */
    140         if(l->bLen<=r->bLen) {
    141             length=l->bLen;
    142         } else {
    143             length=r->bLen;
    144         }
    145     } else {
    146         /* compare lengths first */
    147         result=l->bLen-r->bLen;
    148         if(result!=0) {
    149             return result;
    150         } else {
    151             length=l->bLen;
    152         }
    153     }
    154 
    155     /* get pointers to the byte sequences */
    156     lb=UCM_GET_BYTES(lTable, l);
    157     rb=UCM_GET_BYTES(rTable, r);
    158 
    159     /* compare the bytes */
    160     for(i=0; i<length; ++i) {
    161         result=lb[i]-rb[i];
    162         if(result!=0) {
    163             return result;
    164         }
    165     }
    166 
    167     /* compare the lengths */
    168     return l->bLen-r->bLen;
    169 }
    170 
    171 /* compare UCMappings for sorting */
    172 static int32_t
    173 compareMappings(UCMTable *lTable, const UCMapping *l,
    174                 UCMTable *rTable, const UCMapping *r,
    175                 UBool uFirst) {
    176     int32_t result;
    177 
    178     /* choose which side to compare first */
    179     if(uFirst) {
    180         /* Unicode then bytes */
    181         result=compareUnicode(lTable, l, rTable, r);
    182         if(result==0) {
    183             result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
    184         }
    185     } else {
    186         /* bytes then Unicode */
    187         result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
    188         if(result==0) {
    189             result=compareUnicode(lTable, l, rTable, r);
    190         }
    191     }
    192 
    193     if(result!=0) {
    194         return result;
    195     }
    196 
    197     /* compare the flags */
    198     return l->f-r->f;
    199 }
    200 
    201 /* sorting by Unicode first sorts mappings directly */
    202 static int32_t
    203 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
    204     return compareMappings(
    205         (UCMTable *)context, (const UCMapping *)left,
    206         (UCMTable *)context, (const UCMapping *)right, TRUE);
    207 }
    208 
    209 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
    210 static int32_t
    211 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
    212     UCMTable *table=(UCMTable *)context;
    213     int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
    214     return compareMappings(
    215         table, table->mappings+l,
    216         table, table->mappings+r, FALSE);
    217 }
    218 
    219 U_CAPI void U_EXPORT2
    220 ucm_sortTable(UCMTable *t) {
    221     UErrorCode errorCode;
    222     int32_t i;
    223 
    224     if(t->isSorted) {
    225         return;
    226     }
    227 
    228     errorCode=U_ZERO_ERROR;
    229 
    230     /* 1. sort by Unicode first */
    231     uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
    232                    compareMappingsUnicodeFirst, t,
    233                    FALSE, &errorCode);
    234 
    235     /* build the reverseMap */
    236     if(t->reverseMap==NULL) {
    237         /*
    238          * allocate mappingsCapacity instead of mappingsLength so that
    239          * if mappings are added, the reverseMap need not be
    240          * reallocated each time
    241          * (see ucm_moveMappings() and ucm_addMapping())
    242          */
    243         t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
    244         if(t->reverseMap==NULL) {
    245             fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
    246             exit(U_MEMORY_ALLOCATION_ERROR);
    247         }
    248     }
    249     for(i=0; i<t->mappingsLength; ++i) {
    250         t->reverseMap[i]=i;
    251     }
    252 
    253     /* 2. sort reverseMap by mappings bytes first */
    254     uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
    255                    compareMappingsBytesFirst, t,
    256                    FALSE, &errorCode);
    257 
    258     if(U_FAILURE(errorCode)) {
    259         fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
    260                 u_errorName(errorCode));
    261         exit(errorCode);
    262     }
    263 
    264     t->isSorted=TRUE;
    265 }
    266 
    267 /*
    268  * remove mappings with their move flag set from the base table
    269  * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
    270  */
    271 U_CAPI void U_EXPORT2
    272 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
    273     UCMapping *mb, *mbLimit;
    274     int8_t flag;
    275 
    276     mb=base->mappings;
    277     mbLimit=mb+base->mappingsLength;
    278 
    279     while(mb<mbLimit) {
    280         flag=mb->moveFlag;
    281         if(flag!=0) {
    282             /* reset the move flag */
    283             mb->moveFlag=0;
    284 
    285             if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
    286                 /* add the mapping to the extension table */
    287                 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
    288             }
    289 
    290             /* remove this mapping: move the last base mapping down and overwrite the current one */
    291             if(mb<(mbLimit-1)) {
    292                 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
    293             }
    294             --mbLimit;
    295             --base->mappingsLength;
    296             base->isSorted=FALSE;
    297         } else {
    298             ++mb;
    299         }
    300     }
    301 }
    302 
/* result bits returned by checkBaseExtUnicode() and checkBaseExtBytes() */
enum {
    /* at least one mapping had its moveFlag set; ucm_checkBaseExt() then calls ucm_moveMappings() */
    NEEDS_MOVE=1,
    /* at least one conflict was reported; ucm_checkBaseExt() then returns FALSE */
    HAS_ERRORS=2
};
    307 
/*
 * Check the Unicode (input) side of the base mappings against the
 * extension mappings.
 *
 * Walks base->mappings and ext->mappings in parallel and uses
 * compareUnicode() to decide which side to advance, so both arrays must be
 * in Unicode order (ucm_checkBaseExt() sorts both tables before calling).
 * On both sides only mappings with a precision flag of 0, 1, 2 or 4 are
 * considered.
 *
 * moveToExt:     if true, a base mapping whose code points are a prefix of an
 *                extension mapping's is flagged UCM_MOVE_TO_EXT;
 *                otherwise that situation is reported as an error
 * intersectBase: if nonzero, base mappings without an equal extension input
 *                are flagged UCM_MOVE_TO_EXT; with the value 2 this applies
 *                only to base mappings with more than one byte
 *
 * baseStates is not used in this function.
 *
 * Returns a combination of NEEDS_MOVE and HAS_ERRORS, or 0.
 * The function only sets moveFlag bits; it does not move anything itself.
 */
static uint8_t
checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
                    UBool moveToExt, UBool intersectBase) {
    UCMapping *mb, *me, *mbLimit, *meLimit;
    int32_t cmp;
    uint8_t result;

    mb=base->mappings;
    mbLimit=mb+base->mappingsLength;

    me=ext->mappings;
    meLimit=me+ext->mappingsLength;

    result=0;

    for(;;) {
        /* skip irrelevant mappings on both sides */
        /* the whole check ends as soon as either table is exhausted */
        for(;;) {
            if(mb==mbLimit) {
                return result;
            }

            if((0<=mb->f && mb->f<=2) || mb->f==4) {
                break;
            }

            ++mb;
        }

        for(;;) {
            if(me==meLimit) {
                return result;
            }

            if((0<=me->f && me->f<=2) || me->f==4) {
                break;
            }

            ++me;
        }

        /* compare the base and extension mappings */
        cmp=compareUnicode(base, mb, ext, me);
        if(cmp<0) {
            /* the base input sorts before the extension input */
            if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
                /*
                 * mapping in base but not in ext, move it
                 *
                 * if ext is DBCS, move DBCS mappings here
                 * and check SBCS ones for Unicode prefix below
                 */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;

            /* does mb map from an input sequence that is a prefix of me's? */
            /* NOTE(review): the factor 4 is presumably sizeof(UChar32) - confirm */
            } else if( mb->uLen<me->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
            ) {
                if(moveToExt) {
                    /* mark this mapping to be moved to the extension table */
                    mb->moveFlag|=UCM_MOVE_TO_EXT;
                    result|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            "           is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, mb, stderr);
                    ucm_printMapping(ext, me, stderr);
                    result|=HAS_ERRORS;
                }
            }

            /* advance only the base side; me is compared again with the next base mapping */
            ++mb;
        } else if(cmp==0) {
            /*
             * same output: remove the extension mapping,
             * otherwise treat as an error
             */
            /* "same output" means: same flag, same byte length and same bytes */
            if( mb->f==me->f && mb->bLen==me->bLen &&
                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
            ) {
                me->moveFlag|=UCM_REMOVE_MAPPING;
                result|=NEEDS_MOVE;
            } else if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;
            } else {
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        "           is the same as the input sequence of an extension mapping\n"
                        "           but it maps differently\n");
                ucm_printMapping(base, mb, stderr);
                ucm_printMapping(ext, me, stderr);
                result|=HAS_ERRORS;
            }

            /* me stays in place and is compared with the next base mapping as well */
            ++mb;
        } else /* cmp>0 */ {
            /* the extension input sorts first: nothing to check for it, go on */
            ++me;
        }
    }
}
    411 
/*
 * Check the byte (codepage) side of the base mappings against the
 * extension mappings.
 *
 * Walks both tables in parallel through their reverseMap index arrays and
 * uses a lexical compareBytes() to decide which side to advance, so both
 * reverseMaps must be filled and sorted (ucm_checkBaseExt() sorts both
 * tables before calling).
 * On both sides only mappings with a precision flag of 0 or 3 are considered.
 *
 * moveToExt:     if true, a base mapping whose bytes are a prefix of an
 *                extension mapping's is flagged UCM_MOVE_TO_EXT;
 *                otherwise that situation is reported as an error
 * intersectBase: if nonzero, base mappings without an equal extension input
 *                are flagged UCM_MOVE_TO_EXT; with the value 2,
 *                single-byte base mappings are skipped altogether
 *
 * Returns a combination of NEEDS_MOVE and HAS_ERRORS, or 0.
 * The function only sets moveFlag bits; it does not move anything itself.
 */
static uint8_t
checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
                  UBool moveToExt, UBool intersectBase) {
    UCMapping *mb, *me;
    int32_t *baseMap, *extMap;
    int32_t b, e, bLimit, eLimit, cmp;
    uint8_t result;
    UBool isSISO;

    baseMap=base->reverseMap;
    extMap=ext->reverseMap;

    /* b and e are positions in the reverseMaps, not in the mappings arrays */
    b=e=0;
    bLimit=base->mappingsLength;
    eLimit=ext->mappingsLength;

    result=0;

    isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);

    for(;;) {
        /* skip irrelevant mappings on both sides */
        /* the whole check ends as soon as either table is exhausted */
        for(;; ++b) {
            if(b==bLimit) {
                return result;
            }
            mb=base->mappings+baseMap[b];

            if(intersectBase==2 && mb->bLen==1) {
                /*
                 * comparing a base against a DBCS extension:
                 * leave SBCS base mappings alone
                 */
                continue;
            }

            if(mb->f==0 || mb->f==3) {
                break;
            }
        }

        for(;;) {
            if(e==eLimit) {
                return result;
            }
            me=ext->mappings+extMap[e];

            if(me->f==0 || me->f==3) {
                break;
            }

            ++e;
        }

        /* compare the base and extension mappings */
        cmp=compareBytes(base, mb, ext, me, TRUE);
        if(cmp<0) {
            /* the base input sorts before the extension input */
            if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;

            /*
             * does mb map from an input sequence that is a prefix of me's?
             * for SI/SO tables, a single byte is never a prefix because it
             * occurs in a separate single-byte state
             */
            } else if( mb->bLen<me->bLen &&
                (!isSISO || mb->bLen>1) &&
                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
            ) {
                if(moveToExt) {
                    /* mark this mapping to be moved to the extension table */
                    mb->moveFlag|=UCM_MOVE_TO_EXT;
                    result|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            "           is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, mb, stderr);
                    ucm_printMapping(ext, me, stderr);
                    result|=HAS_ERRORS;
                }
            }

            /* advance only the base side; me is compared again with the next base mapping */
            ++b;
        } else if(cmp==0) {
            /*
             * same output: remove the extension mapping,
             * otherwise treat as an error
             */
            /*
             * "same output" means: same flag, same number of code points
             * and same code points
             * NOTE(review): the factor 4 is presumably sizeof(UChar32) - confirm
             */
            if( mb->f==me->f && mb->uLen==me->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
            ) {
                me->moveFlag|=UCM_REMOVE_MAPPING;
                result|=NEEDS_MOVE;
            } else if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;
            } else {
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        "           is the same as the input sequence of an extension mapping\n"
                        "           but it maps differently\n");
                ucm_printMapping(base, mb, stderr);
                ucm_printMapping(ext, me, stderr);
                result|=HAS_ERRORS;
            }

            /* e stays in place; me is compared with the next base mapping as well */
            ++b;
        } else /* cmp>0 */ {
            /* the extension input sorts first: nothing to check for it, go on */
            ++e;
        }
    }
}
    528 
    529 U_CAPI UBool U_EXPORT2
    530 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
    531     UCMapping *m, *mLimit;
    532     int32_t count;
    533     UBool isOK;
    534 
    535     m=table->mappings;
    536     mLimit=m+table->mappingsLength;
    537     isOK=TRUE;
    538 
    539     while(m<mLimit) {
    540         count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
    541         if(count<1) {
    542             ucm_printMapping(table, m, stderr);
    543             isOK=FALSE;
    544         }
    545         ++m;
    546     }
    547 
    548     return isOK;
    549 }
    550 
    551 U_CAPI UBool U_EXPORT2
    552 ucm_checkBaseExt(UCMStates *baseStates,
    553                  UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
    554                  UBool intersectBase) {
    555     uint8_t result;
    556 
    557     /* if we have an extension table, we must always use precision flags */
    558     if(base->flagsType&UCM_FLAGS_IMPLICIT) {
    559         fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
    560         return FALSE;
    561     }
    562     if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
    563         fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
    564         return FALSE;
    565     }
    566 
    567     /* checking requires both tables to be sorted */
    568     ucm_sortTable(base);
    569     ucm_sortTable(ext);
    570 
    571     /* check */
    572     result=
    573         checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
    574         checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
    575 
    576     if(result&HAS_ERRORS) {
    577         return FALSE;
    578     }
    579 
    580     if(result&NEEDS_MOVE) {
    581         ucm_moveMappings(ext, NULL);
    582         ucm_moveMappings(base, moveTarget);
    583         ucm_sortTable(base);
    584         ucm_sortTable(ext);
    585         if(moveTarget!=NULL) {
    586             ucm_sortTable(moveTarget);
    587         }
    588     }
    589 
    590     return TRUE;
    591 }
    592 
    593 /* merge tables for rptp2ucm ------------------------------------------------ */
    594 
    595 U_CAPI void U_EXPORT2
    596 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
    597                 const uint8_t *subchar, int32_t subcharLength,
    598                 uint8_t subchar1) {
    599     UCMapping *fromUMapping, *toUMapping;
    600     int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
    601 
    602     ucm_sortTable(fromUTable);
    603     ucm_sortTable(toUTable);
    604 
    605     fromUMapping=fromUTable->mappings;
    606     toUMapping=toUTable->mappings;
    607 
    608     fromUTop=fromUTable->mappingsLength;
    609     toUTop=toUTable->mappingsLength;
    610 
    611     fromUIndex=toUIndex=0;
    612 
    613     while(fromUIndex<fromUTop && toUIndex<toUTop) {
    614         cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
    615         if(cmp==0) {
    616             /* equal: roundtrip, nothing to do (flags are initially 0) */
    617             ++fromUMapping;
    618             ++toUMapping;
    619 
    620             ++fromUIndex;
    621             ++toUIndex;
    622         } else if(cmp<0) {
    623             /*
    624              * the fromU mapping does not have a toU counterpart:
    625              * fallback Unicode->codepage
    626              */
    627             if( (fromUMapping->bLen==subcharLength &&
    628                  0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
    629                 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
    630             ) {
    631                 fromUMapping->f=2; /* SUB mapping */
    632             } else {
    633                 fromUMapping->f=1; /* normal fallback */
    634             }
    635 
    636             ++fromUMapping;
    637             ++fromUIndex;
    638         } else {
    639             /*
    640              * the toU mapping does not have a fromU counterpart:
    641              * (reverse) fallback codepage->Unicode, copy it to the fromU table
    642              */
    643 
    644             /* ignore reverse fallbacks to Unicode SUB */
    645             if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
    646                 toUMapping->f=3; /* reverse fallback */
    647                 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
    648 
    649                 /* the table may have been reallocated */
    650                 fromUMapping=fromUTable->mappings+fromUIndex;
    651             }
    652 
    653             ++toUMapping;
    654             ++toUIndex;
    655         }
    656     }
    657 
    658     /* either one or both tables are exhausted */
    659     while(fromUIndex<fromUTop) {
    660         /* leftover fromU mappings are fallbacks */
    661         if( (fromUMapping->bLen==subcharLength &&
    662              0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
    663             (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
    664         ) {
    665             fromUMapping->f=2; /* SUB mapping */
    666         } else {
    667             fromUMapping->f=1; /* normal fallback */
    668         }
    669 
    670         ++fromUMapping;
    671         ++fromUIndex;
    672     }
    673 
    674     while(toUIndex<toUTop) {
    675         /* leftover toU mappings are reverse fallbacks */
    676 
    677         /* ignore reverse fallbacks to Unicode SUB */
    678         if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
    679             toUMapping->f=3; /* reverse fallback */
    680             ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
    681         }
    682 
    683         ++toUMapping;
    684         ++toUIndex;
    685     }
    686 
    687     fromUTable->isSorted=FALSE;
    688 }
    689 
    690 /* separate extension mappings out of base table for rptp2ucm --------------- */
    691 
    692 U_CAPI UBool U_EXPORT2
    693 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
    694     UCMTable *table;
    695     UCMapping *m, *mLimit;
    696     int32_t type;
    697     UBool needsMove, isOK;
    698 
    699     table=ucm->base;
    700     m=table->mappings;
    701     mLimit=m+table->mappingsLength;
    702 
    703     needsMove=FALSE;
    704     isOK=TRUE;
    705 
    706     for(; m<mLimit; ++m) {
    707         if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
    708             fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
    709             ucm_printMapping(table, m, stderr);
    710             m->moveFlag|=UCM_REMOVE_MAPPING;
    711             needsMove=TRUE;
    712             continue;
    713         }
    714 
    715         type=ucm_mappingType(
    716                 &ucm->states, m,
    717                 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
    718         if(type<0) {
    719             /* illegal byte sequence */
    720             printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
    721             isOK=FALSE;
    722         } else if(type>0) {
    723             m->moveFlag|=UCM_MOVE_TO_EXT;
    724             needsMove=TRUE;
    725         }
    726     }
    727 
    728     if(!isOK) {
    729         return FALSE;
    730     }
    731     if(needsMove) {
    732         ucm_moveMappings(ucm->base, ucm->ext);
    733         return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
    734     } else {
    735         ucm_sortTable(ucm->base);
    736         return TRUE;
    737     }
    738 }
    739 
    740 /* ucm parser --------------------------------------------------------------- */
    741 
    742 U_CAPI int8_t U_EXPORT2
    743 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
    744     const char *s=*ps;
    745     char *end;
    746     uint8_t byte;
    747     int8_t bLen;
    748 
    749     bLen=0;
    750     for(;;) {
    751         /* skip an optional plus sign */
    752         if(bLen>0 && *s=='+') {
    753             ++s;
    754         }
    755         if(*s!='\\') {
    756             break;
    757         }
    758 
    759         if( s[1]!='x' ||
    760             (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
    761         ) {
    762             fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
    763             return -1;
    764         }
    765 
    766         if(bLen==UCNV_EXT_MAX_BYTES) {
    767             fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
    768             return -1;
    769         }
    770         bytes[bLen++]=byte;
    771         s=end;
    772     }
    773 
    774     *ps=s;
    775     return bLen;
    776 }
    777 
    778 /* parse a mapping line; must not be empty */
    779 U_CAPI UBool U_EXPORT2
    780 ucm_parseMappingLine(UCMapping *m,
    781                      UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    782                      uint8_t bytes[UCNV_EXT_MAX_BYTES],
    783                      const char *line) {
    784     const char *s;
    785     char *end;
    786     UChar32 cp;
    787     int32_t u16Length;
    788     int8_t uLen, bLen, f;
    789 
    790     s=line;
    791     uLen=bLen=0;
    792 
    793     /* parse code points */
    794     for(;;) {
    795         /* skip an optional plus sign */
    796         if(uLen>0 && *s=='+') {
    797             ++s;
    798         }
    799         if(*s!='<') {
    800             break;
    801         }
    802 
    803         if( s[1]!='U' ||
    804             (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
    805             *end!='>'
    806         ) {
    807             fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
    808             return FALSE;
    809         }
    810         if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
    811             fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
    812             return FALSE;
    813         }
    814 
    815         if(uLen==UCNV_EXT_MAX_UCHARS) {
    816             fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
    817             return FALSE;
    818         }
    819         codePoints[uLen++]=cp;
    820         s=end+1;
    821     }
    822 
    823     if(uLen==0) {
    824         fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
    825         return FALSE;
    826     } else if(uLen==1) {
    827         m->u=codePoints[0];
    828     } else {
    829         UErrorCode errorCode=U_ZERO_ERROR;
    830         u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
    831         if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
    832             u16Length>UCNV_EXT_MAX_UCHARS
    833         ) {
    834             fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
    835             return FALSE;
    836         }
    837     }
    838 
    839     s=u_skipWhitespace(s);
    840 
    841     /* parse bytes */
    842     bLen=ucm_parseBytes(bytes, line, &s);
    843 
    844     if(bLen<0) {
    845         return FALSE;
    846     } else if(bLen==0) {
    847         fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
    848         return FALSE;
    849     } else if(bLen<=4) {
    850         uprv_memcpy(m->b.bytes, bytes, bLen);
    851     }
    852 
    853     /* skip everything until the fallback indicator, even the start of a comment */
    854     for(;;) {
    855         if(*s==0) {
    856             f=-1; /* no fallback indicator */
    857             break;
    858         } else if(*s=='|') {
    859             f=(int8_t)(s[1]-'0');
    860             if((uint8_t)f>4) {
    861                 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
    862                 return FALSE;
    863             }
    864             break;
    865         }
    866         ++s;
    867     }
    868 
    869     m->uLen=uLen;
    870     m->bLen=bLen;
    871     m->f=f;
    872     return TRUE;
    873 }
    874 
    875 /* general APIs ------------------------------------------------------------- */
    876 
    877 U_CAPI UCMTable * U_EXPORT2
    878 ucm_openTable() {
    879     UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
    880     if(table==NULL) {
    881         fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
    882         exit(U_MEMORY_ALLOCATION_ERROR);
    883     }
    884 
    885     memset(table, 0, sizeof(UCMTable));
    886     return table;
    887 }
    888 
    889 U_CAPI void U_EXPORT2
    890 ucm_closeTable(UCMTable *table) {
    891     if(table!=NULL) {
    892         uprv_free(table->mappings);
    893         uprv_free(table->codePoints);
    894         uprv_free(table->bytes);
    895         uprv_free(table->reverseMap);
    896         uprv_free(table);
    897     }
    898 }
    899 
    900 U_CAPI void U_EXPORT2
    901 ucm_resetTable(UCMTable *table) {
    902     if(table!=NULL) {
    903         table->mappingsLength=0;
    904         table->flagsType=0;
    905         table->unicodeMask=0;
    906         table->bytesLength=table->codePointsLength=0;
    907         table->isSorted=FALSE;
    908     }
    909 }
    910 
    911 U_CAPI void U_EXPORT2
    912 ucm_addMapping(UCMTable *table,
    913                UCMapping *m,
    914                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    915                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    916     UCMapping *tm;
    917     UChar32 c;
    918     int32_t idx;
    919 
    920     if(table->mappingsLength>=table->mappingsCapacity) {
    921         /* make the mappings array larger */
    922         if(table->mappingsCapacity==0) {
    923             table->mappingsCapacity=1000;
    924         } else {
    925             table->mappingsCapacity*=10;
    926         }
    927         table->mappings=(UCMapping *)uprv_realloc(table->mappings,
    928                                              table->mappingsCapacity*sizeof(UCMapping));
    929         if(table->mappings==NULL) {
    930             fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
    931                             (int)table->mappingsCapacity);
    932             exit(U_MEMORY_ALLOCATION_ERROR);
    933         }
    934 
    935         if(table->reverseMap!=NULL) {
    936             /* the reverseMap must be reallocated in a new sort */
    937             uprv_free(table->reverseMap);
    938             table->reverseMap=NULL;
    939         }
    940     }
    941 
    942     if(m->uLen>1 && table->codePointsCapacity==0) {
    943         table->codePointsCapacity=10000;
    944         table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
    945         if(table->codePoints==NULL) {
    946             fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
    947                             (int)table->codePointsCapacity);
    948             exit(U_MEMORY_ALLOCATION_ERROR);
    949         }
    950     }
    951 
    952     if(m->bLen>4 && table->bytesCapacity==0) {
    953         table->bytesCapacity=10000;
    954         table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
    955         if(table->bytes==NULL) {
    956             fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
    957                             (int)table->bytesCapacity);
    958             exit(U_MEMORY_ALLOCATION_ERROR);
    959         }
    960     }
    961 
    962     if(m->uLen>1) {
    963         idx=table->codePointsLength;
    964         table->codePointsLength+=m->uLen;
    965         if(table->codePointsLength>table->codePointsCapacity) {
    966             fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
    967             exit(U_MEMORY_ALLOCATION_ERROR);
    968         }
    969 
    970         uprv_memcpy(table->codePoints+idx, codePoints, m->uLen*4);
    971         m->u=idx;
    972     }
    973 
    974     if(m->bLen>4) {
    975         idx=table->bytesLength;
    976         table->bytesLength+=m->bLen;
    977         if(table->bytesLength>table->bytesCapacity) {
    978             fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
    979             exit(U_MEMORY_ALLOCATION_ERROR);
    980         }
    981 
    982         uprv_memcpy(table->bytes+idx, bytes, m->bLen);
    983         m->b.idx=idx;
    984     }
    985 
    986     /* set unicodeMask */
    987     for(idx=0; idx<m->uLen; ++idx) {
    988         c=codePoints[idx];
    989         if(c>=0x10000) {
    990             table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
    991         } else if(U_IS_SURROGATE(c)) {
    992             table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
    993         }
    994     }
    995 
    996     /* set flagsType */
    997     if(m->f<0) {
    998         table->flagsType|=UCM_FLAGS_IMPLICIT;
    999     } else {
   1000         table->flagsType|=UCM_FLAGS_EXPLICIT;
   1001     }
   1002 
   1003     tm=table->mappings+table->mappingsLength++;
   1004     uprv_memcpy(tm, m, sizeof(UCMapping));
   1005 
   1006     table->isSorted=FALSE;
   1007 }
   1008 
   1009 U_CAPI UCMFile * U_EXPORT2
   1010 ucm_open() {
   1011     UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
   1012     if(ucm==NULL) {
   1013         fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
   1014         exit(U_MEMORY_ALLOCATION_ERROR);
   1015     }
   1016 
   1017     memset(ucm, 0, sizeof(UCMFile));
   1018 
   1019     ucm->base=ucm_openTable();
   1020     ucm->ext=ucm_openTable();
   1021 
   1022     ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
   1023     ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
   1024     ucm->states.outputType=-1;
   1025     ucm->states.minCharLength=ucm->states.maxCharLength=1;
   1026 
   1027     return ucm;
   1028 }
   1029 
   1030 U_CAPI void U_EXPORT2
   1031 ucm_close(UCMFile *ucm) {
   1032     if(ucm!=NULL) {
   1033         ucm_closeTable(ucm->base);
   1034         ucm_closeTable(ucm->ext);
   1035         uprv_free(ucm);
   1036     }
   1037 }
   1038 
   1039 U_CAPI int32_t U_EXPORT2
   1040 ucm_mappingType(UCMStates *baseStates,
   1041                 UCMapping *m,
   1042                 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   1043                 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
   1044     /* check validity of the bytes and count the characters in them */
   1045     int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
   1046     if(count<1) {
   1047         /* illegal byte sequence */
   1048         return -1;
   1049     }
   1050 
   1051     /*
   1052      * Suitable for an ICU conversion base table means:
   1053      * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
   1054      * - precision flag 0..3
   1055      * - SBCS: any 1:1 mapping
   1056      *         (the table stores additional bits to distinguish mapping types)
   1057      * - MBCS: not a |2 SUB mapping for <subchar1>
   1058      * - MBCS: not a |1 fallback to 0x00
   1059      * - MBCS: not a multi-byte mapping with leading 0x00 bytes
   1060      *
   1061      * Further restrictions for fromUnicode tables
   1062      * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
   1063      *
   1064      * All of the MBCS fromUnicode specific tests could be removed from here,
   1065      * but the ones above are for unusual mappings, and removing the tests
   1066      * from here would change canonucm output which seems gratuitous.
   1067      * (Markus Scherer 2006-nov-28)
   1068      *
   1069      * Exception: All implicit mappings (f<0) that need to be moved
   1070      * because of fromUnicode restrictions _must_ be moved here because
   1071      * makeconv uses a hack for moving mappings only for the fromUnicode table
   1072      * that only works with non-negative values of f.
   1073      */
   1074     if( m->uLen==1 && count==1 && m->f<=3 &&
   1075         (baseStates->maxCharLength==1 ||
   1076             !((m->f==2 && m->bLen==1) ||
   1077               (m->f==1 && bytes[0]==0) ||
   1078               (m->f<=1 && m->bLen>1 && bytes[0]==0)))
   1079     ) {
   1080         return 0; /* suitable for a base table */
   1081     } else {
   1082         return 1; /* needs to go into an extension table */
   1083     }
   1084 }
   1085 
   1086 U_CAPI UBool U_EXPORT2
   1087 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
   1088                    UCMapping *m,
   1089                    UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   1090                    uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
   1091     int32_t type;
   1092 
   1093     if(m->f==2 && m->uLen>1) {
   1094         fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
   1095         printMapping(m, codePoints, bytes, stderr);
   1096         return FALSE;
   1097     }
   1098 
   1099     if(baseStates!=NULL) {
   1100         /* check validity of the bytes and count the characters in them */
   1101         type=ucm_mappingType(baseStates, m, codePoints, bytes);
   1102         if(type<0) {
   1103             /* illegal byte sequence */
   1104             printMapping(m, codePoints, bytes, stderr);
   1105             return FALSE;
   1106         }
   1107     } else {
   1108         /* not used - adding a mapping for an extension-only table before its base table is read */
   1109         type=1;
   1110     }
   1111 
   1112     /*
   1113      * Add the mapping to the base table if this is requested and suitable.
   1114      * Otherwise, add it to the extension table.
   1115      */
   1116     if(forBase && type==0) {
   1117         ucm_addMapping(ucm->base, m, codePoints, bytes);
   1118     } else {
   1119         ucm_addMapping(ucm->ext, m, codePoints, bytes);
   1120     }
   1121 
   1122     return TRUE;
   1123 }
   1124 
   1125 U_CAPI UBool U_EXPORT2
   1126 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
   1127     UCMapping m={ 0 };
   1128     UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
   1129     uint8_t bytes[UCNV_EXT_MAX_BYTES];
   1130 
   1131     const char *s;
   1132 
   1133     /* ignore empty and comment lines */
   1134     if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
   1135         return TRUE;
   1136     }
   1137 
   1138     return
   1139         ucm_parseMappingLine(&m, codePoints, bytes, line) &&
   1140         ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
   1141 }
   1142 
   1143 U_CAPI void U_EXPORT2
   1144 ucm_readTable(UCMFile *ucm, FileStream* convFile,
   1145               UBool forBase, UCMStates *baseStates,
   1146               UErrorCode *pErrorCode) {
   1147     char line[500];
   1148     char *end;
   1149     UBool isOK;
   1150 
   1151     if(U_FAILURE(*pErrorCode)) {
   1152         return;
   1153     }
   1154 
   1155     isOK=TRUE;
   1156 
   1157     for(;;) {
   1158         /* read the next line */
   1159         if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
   1160             fprintf(stderr, "incomplete charmap section\n");
   1161             isOK=FALSE;
   1162             break;
   1163         }
   1164 
   1165         /* remove CR LF */
   1166         end=uprv_strchr(line, 0);
   1167         while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
   1168             --end;
   1169         }
   1170         *end=0;
   1171 
   1172         /* ignore empty and comment lines */
   1173         if(line[0]==0 || line[0]=='#') {
   1174             continue;
   1175         }
   1176 
   1177         /* stop at the end of the mapping table */
   1178         if(0==uprv_strcmp(line, "END CHARMAP")) {
   1179             break;
   1180         }
   1181 
   1182         isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
   1183     }
   1184 
   1185     if(!isOK) {
   1186         *pErrorCode=U_INVALID_TABLE_FORMAT;
   1187     }
   1188 }
   1189 #endif
   1190