Home | History | Annotate | Download | only in toolutil
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  *******************************************************************************
      5  *   Copyright (C) 2003-2013, International Business Machines
      6  *   Corporation and others.  All Rights Reserved.
      7  *******************************************************************************
      8  *   file name:  ucm.h
      9  *   encoding:   UTF-8
     10  *   tab size:   8 (not used)
     11  *   indentation:4
     12  *
     13  *   created on: 2003jun20
     14  *   created by: Markus W. Scherer
     15  *
     16  *   Definitions for the .ucm file parser and handler module ucm.c.
     17  */
     18 
     19 #ifndef __UCM_H__
     20 #define __UCM_H__
     21 
     22 #include "unicode/utypes.h"
     23 #include "ucnvmbcs.h"
     24 #include "ucnv_ext.h"
     25 #include "filestrm.h"
     26 #include <stdio.h>
     27 
     28 #if !UCONFIG_NO_CONVERSION
     29 
     30 U_CDECL_BEGIN
     31 
     32 /* constants for UCMapping.moveFlag; ucm_moveMappings() is documented as acting on mappings whose move flag is set */
     33 enum {
     34     UCM_MOVE_TO_EXT=1, /* per the ucm_moveMappings() comment: such mappings are moved to the extension table */
     35     UCM_REMOVE_MAPPING=2 /* presumably dropped from the base table without being moved -- TODO confirm in ucm.c */
     36 };
     37 
     38 /*
     39  * Per-mapping data structure: one Unicode sequence paired with one byte sequence.
     40  *
     41  * u if uLen==1: Unicode code point
     42  *   else index to uLen code points (UCM_GET_CODE_POINTS adds it to UCMTable.codePoints)
     43  * b if bLen<=4: up to 4 bytes, stored inline in b.bytes
     44  *   else b.idx is an index to bLen bytes (UCM_GET_BYTES adds it to UCMTable.bytes)
     45  * uLen number of code points
     46  * bLen number of words containing left-justified bytes; NOTE(review): the bLen<=4 test above and in UCM_GET_BYTES reads like a plain byte count -- TODO confirm
     47  * bIsMultipleChars indicates that the bytes contain more than one sequence
     48  *                  according to the state table; NOTE(review): no such member is declared in the struct below
     49  * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
     50  *   or "good one-way" mapping (4).
     51  *   Same values as in the source file after |
     52  */
     53 typedef struct UCMapping {
     54     UChar32 u; /* the code point itself when uLen==1, otherwise an index into UCMTable.codePoints */
     55     union {
     56         uint32_t idx; /* selected by UCM_GET_BYTES when bLen>4: offset into UCMTable.bytes */
     57         uint8_t bytes[4]; /* selected by UCM_GET_BYTES when bLen<=4: the bytes themselves */
     58     } b;
     59     int8_t uLen, bLen, f, moveFlag; /* moveFlag takes the UCM_MOVE_TO_EXT / UCM_REMOVE_MAPPING constants; presumably 0 when unset -- TODO confirm */
     60 } UCMapping;
     61 
     62 /* constants for UCMTable.flagsType (no explicit values: the enumerators are numbered 0..3 in order; stored in an int8_t field) */
     63 enum {
     64     UCM_FLAGS_INITIAL,  /* 0: no mappings parsed yet */
     65     UCM_FLAGS_EXPLICIT, /* 1: .ucm file has mappings with | fallback indicators */
     66     UCM_FLAGS_IMPLICIT, /* 2: .ucm file has mappings without | fallback indicators, later wins */
     67     UCM_FLAGS_MIXED     /* 3: both implicit and explicit */
     68 };
     69 
     70 typedef struct UCMTable { /* a set of UCMapping entries plus the side arrays that longer mappings index into */
     71     UCMapping *mappings; /* array of mappings */
     72     int32_t mappingsCapacity, mappingsLength; /* presumably allocated vs. used element counts -- TODO confirm in ucm.c */
     73 
     74     UChar32 *codePoints; /* UCM_GET_CODE_POINTS reads from here, at offset m->u, when uLen!=1 */
     75     int32_t codePointsCapacity, codePointsLength;
     76 
     77     uint8_t *bytes; /* UCM_GET_BYTES reads from here, at offset m->b.idx, when bLen>4 */
     78     int32_t bytesCapacity, bytesLength;
     79 
     80     /* index map for mapping by bytes first */
     81     int32_t *reverseMap; /* presumably indexes into mappings[] in byte-sequence order -- TODO confirm */
     82 
     83     uint8_t unicodeMask; /* NOTE(review): meaning of the mask bits is not visible in this header */
     84     int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */
     85     UBool isSorted; /* presumably maintained by ucm_sortTable() -- TODO confirm */
     86 } UCMTable;
     87 
     88 enum { /* presumably the values stored in UCMStates.stateFlags -- TODO confirm in ucmstate.c */
     89     MBCS_STATE_FLAG_DIRECT=1,
     90     MBCS_STATE_FLAG_SURROGATES, /* no explicit value: follows DIRECT, so it is 2 */
     91 
     92     MBCS_STATE_FLAG_READY=16 /* explicit value, well apart from the two constants above; NOTE(review): whether it is OR-ed with them is not visible here */
     93 };
     94 
     95 typedef struct UCMStates { /* state table plus per-state arrays and summary fields; large, since all arrays are embedded at full MBCS_MAX_STATE_COUNT size */
     96     int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; /* one row of 256 entries per state; presumably indexed by input byte value -- TODO confirm */
     97     uint32_t stateFlags[MBCS_MAX_STATE_COUNT], /* one entry per state; presumably MBCS_STATE_FLAG_* values -- TODO confirm */
     98              stateOffsetSum[MBCS_MAX_STATE_COUNT]; /* one entry per state; NOTE(review): semantics not visible in this header */
     99 
    100     int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; /* presumably countStates is the number of stateTable rows in use -- TODO confirm */
    101     int8_t conversionType, outputType; /* NOTE(review): value sets are not defined in this header */
    102 } UCMStates;
    103 
    104 typedef struct UCMFile { /* presumably the parsed contents of one .ucm file; returned by ucm_open() */
    105     UCMTable *base, *ext; /* base and extension mapping tables, held by pointer */
    106     UCMStates states; /* embedded by value (not a pointer), so a UCMFile is at least as large as a UCMStates */
    107 
    108     char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; /* fixed-size buffer; presumably the base converter name for extension-only files -- TODO confirm */
    109 } UCMFile;
    110 
    111 /* simple accesses ---------------------------------------------------------- */
    112 
    113 #define UCM_GET_CODE_POINTS(t, m) \
    114     (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) /* yields a pointer to the code points of mapping m in table t: &m->u itself when uLen==1, else t->codePoints+m->u; m is evaluated more than once, so pass an expression without side effects */
    115 
    116 #define UCM_GET_BYTES(t, m) \
    117     (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx) /* yields a pointer to the bytes of mapping m in table t: the inline m->b.bytes when bLen<=4, else t->bytes+m->b.idx; m is evaluated more than once, so pass an expression without side effects */
    118 
    119 /* APIs --------------------------------------------------------------------- */
    120 
    121 U_CAPI UCMFile * U_EXPORT2
    122 ucm_open(void); /* takes no arguments and returns a UCMFile pointer; presumably newly allocated and to be released with ucm_close() -- TODO confirm in ucm.c, including the failure return */
    123 
    124 U_CAPI void U_EXPORT2
    125 ucm_close(UCMFile *ucm); /* counterpart to ucm_open(); presumably releases ucm and what it owns -- TODO confirm in ucm.c whether NULL is accepted */
    126 
    127 U_CAPI UBool U_EXPORT2
    128 ucm_parseHeaderLine(UCMFile *ucm,
    129                     char *line, char **pKey, char **pValue); /* line is non-const, so it may be modified in place; pKey/pValue are output pointers, presumably set to point into line -- TODO confirm; the meaning of the UBool result is not visible in this header */
    130 
    131 /* Classifies mapping m. @return -1: illegal bytes; 0: suitable for base table; 1: needs to go into extension table */
    132 U_CAPI int32_t U_EXPORT2
    133 ucm_mappingType(UCMStates *baseStates,
    134                 UCMapping *m,
    135                 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    136                 uint8_t bytes[UCNV_EXT_MAX_BYTES]); /* the array bounds are documentation only: in a parameter list these are plain pointers, so the compiler does not check the caller's buffer sizes */
    137 
    138 /* add a mapping to the base or extension table as appropriate; presumably the choice follows ucm_mappingType() -- TODO confirm in ucm.c */
    139 U_CAPI UBool U_EXPORT2
    140 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
    141                    UCMapping *m,
    142                    UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    143                    uint8_t bytes[UCNV_EXT_MAX_BYTES]); /* the meaning of the UBool result is not stated in this header; presumably FALSE on error -- TODO confirm */
    144 
    145 U_CAPI UBool U_EXPORT2
    146 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); /* line is const, so it is not modified; presumably parses one mapping line and adds it as ucm_addMappingAuto() would -- TODO confirm in ucm.c */
    147 
    148 
    149 U_CAPI UCMTable * U_EXPORT2
    150 ucm_openTable(void); /* takes no arguments and returns a UCMTable pointer; presumably a new empty table to be released with ucm_closeTable() -- TODO confirm in ucm.c */
    151 
    152 U_CAPI void U_EXPORT2
    153 ucm_closeTable(UCMTable *table); /* counterpart to ucm_openTable(); presumably frees the table and its arrays -- TODO confirm in ucm.c whether NULL is accepted */
    154 
    155 U_CAPI void U_EXPORT2
    156 ucm_resetTable(UCMTable *table); /* presumably empties the table for reuse without freeing it -- TODO confirm in ucm.c which fields are reset */
    157 
    158 U_CAPI void U_EXPORT2
    159 ucm_sortTable(UCMTable *t); /* presumably sorts t->mappings, builds t->reverseMap and sets t->isSorted -- TODO confirm in ucm.c */
    160 
    161 /*
    162  * Remove the mappings whose moveFlag is set from the base table;
    163  * those flagged with UCM_MOVE_TO_EXT are moved to the extension table instead of being dropped.
    164  */
    165 U_CAPI void U_EXPORT2
    166 ucm_moveMappings(UCMTable *base, UCMTable *ext);
    167 
    168 /**
    169  * Read a table from a .ucm file, starting after the CHARMAP line and
    170  * continuing through the END CHARMAP line (inclusive).
    171  */
    172 U_CAPI void U_EXPORT2
    173 ucm_readTable(UCMFile *ucm, FileStream* convFile,
    174               UBool forBase, UCMStates *baseStates,
    175               UErrorCode *pErrorCode); /* no return value: failures are presumably reported through *pErrorCode -- TODO confirm in ucm.c */
    176 
    177 /**
    178  * Check the validity of an extension table's mappings against a base table's states;
    179  * necessary for extension-only tables that were read before their base tables.
    180  */
    181 U_CAPI UBool U_EXPORT2
    182 ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); /* the meaning of the UBool result is not stated in this header; presumably TRUE when all mappings are valid -- TODO confirm */
    183 
    184 /**
    185  * Check a base table against an extension table.
    186  * Pass moveTarget!=NULL if it is possible to move mappings out of the base table.
    187  * This is the case where base and extension tables are parsed from a single file
    188  * (moveTarget==ext)
    189  * or when delta file mappings are subtracted from a base table.
    190  *
    191  * When a base table cannot be modified because a delta file is parsed in makeconv,
    192  * then pass moveTarget=NULL.
    193  *
    194  * If intersectBase is nonzero, then mappings that exist in the base table but not in
    195  * the extension table are moved to moveTarget instead of showing an error.
    196  *
    197  * Special mode (note that the parameter is declared as UBool although the value 2 is meaningful):
    198  * If intersectBase==2 for a DBCS extension table, then SBCS mappings are
    199  * not moved out of the base unless their Unicode input requires it.
    200  * This helps ucmkbase generate base tables for DBCS-only extension .cnv files.
    201  *
    202  * For both tables in the same file, the extension table is automatically
    203  * built.
    204  * For separate files, the extension file can use a complete mapping table (.ucm file),
    205  * so that common mappings need not be stripped out manually.
    206  *
    207  *
    208  * Algorithm: sort both tables, and then for each mapping direction:
    209  *
    210  * If intersectBase is TRUE and the base table contains a mapping
    211  * that does not exist in the extension table, then this mapping is moved
    212  * to moveTarget.
    213  *
    214  * - otherwise -
    215  *
    216  * If the base table contains a mapping for which the input sequence is
    217  * the same as the extension input, then
    218  * - if the output is the same: remove the extension mapping
    219  * - else: error
    220  *
    221  * If the base table contains a mapping for which the input sequence is
    222  * a prefix of the extension input, then
    223  * - if moveTarget!=NULL: move the base mapping to the moveTarget table
    224  * - else: error
    225  *
    226  * @return FALSE in case of an irreparable error
    227  */
    228 U_CAPI UBool U_EXPORT2
    229 ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
    230                  UCMTable *moveTarget, UBool intersectBase);
    231 
    232 U_CAPI void U_EXPORT2
    233 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); /* presumably writes every mapping of table to f; byUnicode presumably selects Unicode order over byte order -- TODO confirm in ucm.c */
    234 
    235 U_CAPI void U_EXPORT2
    236 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); /* presumably writes the single mapping m to f; table is needed to resolve m's code points and bytes when they are stored as indexes (see UCM_GET_CODE_POINTS / UCM_GET_BYTES) -- TODO confirm output format in ucm.c */
    237 
    238 
    239 U_CAPI void U_EXPORT2
    240 ucm_addState(UCMStates *states, const char *s); /* s is const text, presumably one state-table row as written in a .ucm header -- TODO confirm syntax and error handling in ucmstate.c (there is no return value) */
    241 
    242 U_CAPI void U_EXPORT2
    243 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck); /* presumably validates and finalizes the rows added with ucm_addState(); ignoreSISOCheck presumably skips a check specific to SI/SO converters -- TODO confirm in ucmstate.c */
    244 
    245 U_CAPI int32_t U_EXPORT2
    246 ucm_countChars(UCMStates *states,
    247                const uint8_t *bytes, int32_t length); /* bytes is read-only; presumably returns how many characters the length bytes form according to the state table -- TODO confirm, including the return value for illegal sequences */
    248 
    249 
    250 U_CAPI int8_t U_EXPORT2
    251 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); /* bytes is an output buffer (the bound is documentation only); ps is an in/out position pointer, presumably into line; presumably returns the number of bytes parsed -- TODO confirm the error return in ucm.c */
    252 
    253 U_CAPI UBool U_EXPORT2
    254 ucm_parseMappingLine(UCMapping *m,
    255                      UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    256                      uint8_t bytes[UCNV_EXT_MAX_BYTES],
    257                      const char *line); /* line is read-only input; m, codePoints and bytes are presumably outputs filled from it -- TODO confirm; the meaning of the UBool result is not stated in this header */
    258 
    259 U_CAPI void U_EXPORT2
    260 ucm_addMapping(UCMTable *table,
    261                UCMapping *m,
    262                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    263                uint8_t bytes[UCNV_EXT_MAX_BYTES]); /* presumably appends m to table, copying codePoints/bytes into the table's side arrays when they do not fit inline in m -- TODO confirm in ucm.c (there is no return value) */
    264 
    265 /* very makeconv-specific functions ----------------------------------------- */
    266 
    267 /* finalize and optimize states after the toUnicode mappings are processed */
    268 U_CAPI void U_EXPORT2
    269 ucm_optimizeStates(UCMStates *states,
    270                    uint16_t **pUnicodeCodeUnits,
    271                    _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    272                    UBool verbose); /* pUnicodeCodeUnits is a pointer to the caller's array pointer, so the array may presumably be replaced -- TODO confirm ownership in ucmstate.c */
    273 
    274 /* moved here because it is used inside ucmstate.c */
    275 U_CAPI int32_t U_EXPORT2
    276 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    277                  uint32_t offset); /* presumably returns the index in toUFallbacks of the entry for offset, or a negative value if there is none -- TODO confirm in ucmstate.c */
    278 
    279 /* very rptp2ucm-specific functions ----------------------------------------- */
    280 
    281 /*
    282  * Input: Separate tables with mappings from Unicode (fromUTable) and to Unicode (toUTable),
    283  * plus subchar (subcharLength bytes) and subchar1 (0 if none).
    284  * All mappings must have flag 0.
    285  *
    286  * Output: fromUTable will contain the union of mappings with the correct
    287  * precision flags, and be sorted.
    288  */
    289 U_CAPI void U_EXPORT2
    290 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
    291                 const uint8_t *subchar, int32_t subcharLength,
    292                 uint8_t subchar1);
    293 
    294 U_CAPI UBool U_EXPORT2
    295 ucm_separateMappings(UCMFile *ucm, UBool isSISO); /* presumably redistributes ucm's mappings between its base and ext tables; isSISO presumably marks an SI/SO converter -- TODO confirm in ucm.c; the meaning of the UBool result is not stated in this header */
    296 
    297 U_CDECL_END
    298 
    299 #endif
    300 
    301 #endif
    302 
    303