Home | History | Annotate | Download | only in toolutil
      1 /*
      2  *******************************************************************************
      3  *   Copyright (C) 2003-2013, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  *******************************************************************************
      6  *   file name:  ucm.h
      7  *   encoding:   US-ASCII
      8  *   tab size:   8 (not used)
      9  *   indentation:4
     10  *
     11  *   created on: 2003jun20
     12  *   created by: Markus W. Scherer
     13  *
     14  *   Definitions for the .ucm file parser and handler module ucm.c.
     15  */
     16 
     17 #ifndef __UCM_H__
     18 #define __UCM_H__
     19 
     20 #include "unicode/utypes.h"
     21 #include "ucnvmbcs.h"
     22 #include "ucnv_ext.h"
     23 #include "filestrm.h"
     24 #include <stdio.h>
     25 
     26 #if !UCONFIG_NO_CONVERSION
     27 
     28 U_CDECL_BEGIN
     29 
     30 /* constants for UCMapping.moveFlag */
     31 enum {
     32     UCM_MOVE_TO_EXT=1,
     33     UCM_REMOVE_MAPPING=2
     34 };
     35 
     36 /*
     37  * Per-mapping data structure
     38  *
     39  * u if uLen==1: Unicode code point
     40  *   else index to uLen code points
     41  * b if bLen<=4: up to 4 bytes
     42  *   else index to bLen bytes
     43  * uLen number of code points
     44  * bLen number of words containing left-justified bytes
     45  * bIsMultipleChars indicates that the bytes contain more than one sequence
     46  *                  according to the state table
     47  * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
     48  *   or "good one-way" mapping (4).
     49  *   Same values as in the source file after |
     50  */
     51 typedef struct UCMapping {
     52     UChar32 u;
     53     union {
     54         uint32_t idx;
     55         uint8_t bytes[4];
     56     } b;
     57     int8_t uLen, bLen, f, moveFlag;
     58 } UCMapping;
     59 
     60 /* constants for UCMTable.flagsType */
     61 enum {
     62     UCM_FLAGS_INITIAL,  /* no mappings parsed yet */
     63     UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */
     64     UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */
     65     UCM_FLAGS_MIXED     /* both implicit and explicit */
     66 };
     67 
     68 typedef struct UCMTable {
     69     UCMapping *mappings;
     70     int32_t mappingsCapacity, mappingsLength;
     71 
     72     UChar32 *codePoints;
     73     int32_t codePointsCapacity, codePointsLength;
     74 
     75     uint8_t *bytes;
     76     int32_t bytesCapacity, bytesLength;
     77 
     78     /* index map for mapping by bytes first */
     79     int32_t *reverseMap;
     80 
     81     uint8_t unicodeMask;
     82     int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */
     83     UBool isSorted;
     84 } UCMTable;
     85 
     86 enum {
     87     MBCS_STATE_FLAG_DIRECT=1,
     88     MBCS_STATE_FLAG_SURROGATES,
     89 
     90     MBCS_STATE_FLAG_READY=16
     91 };
     92 
     93 typedef struct UCMStates {
     94     int32_t stateTable[MBCS_MAX_STATE_COUNT][256];
     95     uint32_t stateFlags[MBCS_MAX_STATE_COUNT],
     96              stateOffsetSum[MBCS_MAX_STATE_COUNT];
     97 
     98     int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits;
     99     int8_t conversionType, outputType;
    100 } UCMStates;
    101 
    102 typedef struct UCMFile {
    103     UCMTable *base, *ext;
    104     UCMStates states;
    105 
    106     char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH];
    107 } UCMFile;
    108 
    109 /* simple accesses ---------------------------------------------------------- */
    110 
    111 #define UCM_GET_CODE_POINTS(t, m) \
    112     (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u)
    113 
    114 #define UCM_GET_BYTES(t, m) \
    115     (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx)
    116 
    117 /* APIs --------------------------------------------------------------------- */
    118 
    119 U_CAPI UCMFile * U_EXPORT2
    120 ucm_open(void);
    121 
    122 U_CAPI void U_EXPORT2
    123 ucm_close(UCMFile *ucm);
    124 
    125 U_CAPI UBool U_EXPORT2
    126 ucm_parseHeaderLine(UCMFile *ucm,
    127                     char *line, char **pKey, char **pValue);
    128 
    129 /* @return -1 illegal bytes  0 suitable for base table  1 needs to go into extension table */
    130 U_CAPI int32_t U_EXPORT2
    131 ucm_mappingType(UCMStates *baseStates,
    132                 UCMapping *m,
    133                 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    134                 uint8_t bytes[UCNV_EXT_MAX_BYTES]);
    135 
    136 /* add a mapping to the base or extension table as appropriate */
    137 U_CAPI UBool U_EXPORT2
    138 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
    139                    UCMapping *m,
    140                    UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    141                    uint8_t bytes[UCNV_EXT_MAX_BYTES]);
    142 
    143 U_CAPI UBool U_EXPORT2
    144 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates);
    145 
    146 
    147 U_CAPI UCMTable * U_EXPORT2
    148 ucm_openTable(void);
    149 
    150 U_CAPI void U_EXPORT2
    151 ucm_closeTable(UCMTable *table);
    152 
    153 U_CAPI void U_EXPORT2
    154 ucm_resetTable(UCMTable *table);
    155 
    156 U_CAPI void U_EXPORT2
    157 ucm_sortTable(UCMTable *t);
    158 
    159 /*
    160  * Remove mappings with their move flag set from the base table
    161  * and move some of them (with UCM_MOVE_TO_EXT) to the extension table.
    162  */
    163 U_CAPI void U_EXPORT2
    164 ucm_moveMappings(UCMTable *base, UCMTable *ext);
    165 
    166 /**
    167  * Read a table from a .ucm file, from after the CHARMAP line to
    168  * including the END CHARMAP line.
    169  */
    170 U_CAPI void U_EXPORT2
    171 ucm_readTable(UCMFile *ucm, FileStream* convFile,
    172               UBool forBase, UCMStates *baseStates,
    173               UErrorCode *pErrorCode);
    174 
    175 /**
    176  * Check the validity of mappings against a base table's states;
    177  * necessary for extension-only tables that were read before their base tables.
    178  */
    179 U_CAPI UBool U_EXPORT2
    180 ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
    181 
    182 /**
    183  * Check a base table against an extension table.
    184  * Set the moveTarget!=NULL if it is possible to move mappings from the base.
    185  * This is the case where base and extension tables are parsed from a single file
    186  * (moveTarget==ext)
    187  * or when delta file mappings are subtracted from a base table.
    188  *
    189  * When a base table cannot be modified because a delta file is parsed in makeconv,
    190  * then set moveTarget=NULL.
    191  *
    192  * if(intersectBase) then mappings that exist in the base table but not in
    193  * the extension table are moved to moveTarget instead of showing an error.
    194  *
    195  * Special mode:
    196  * If intersectBase==2 for a DBCS extension table, then SBCS mappings are
    197  * not moved out of the base unless their Unicode input requires it.
    198  * This helps ucmkbase generate base tables for DBCS-only extension .cnv files.
    199  *
    200  * For both tables in the same file, the extension table is automatically
    201  * built.
    202  * For separate files, the extension file can use a complete mapping table (.ucm file),
    203  * so that common mappings need not be stripped out manually.
    204  *
    205  *
    206  * Sort both tables, and then for each mapping direction:
    207  *
    208  * If intersectBase is TRUE and the base table contains a mapping
    209  * that does not exist in the extension table, then this mapping is moved
    210  * to moveTarget.
    211  *
    212  * - otherwise -
    213  *
    214  * If the base table contains a mapping for which the input sequence is
    215  * the same as the extension input, then
    216  * - if the output is the same: remove the extension mapping
    217  * - else: error
    218  *
    219  * If the base table contains a mapping for which the input sequence is
    220  * a prefix of the extension input, then
    221  * - if moveTarget!=NULL: move the base mapping to the moveTarget table
    222  * - else: error
    223  *
    224  * @return FALSE in case of an irreparable error
    225  */
    226 U_CAPI UBool U_EXPORT2
    227 ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
    228                  UCMTable *moveTarget, UBool intersectBase);
    229 
    230 U_CAPI void U_EXPORT2
    231 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode);
    232 
    233 U_CAPI void U_EXPORT2
    234 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f);
    235 
    236 
    237 U_CAPI void U_EXPORT2
    238 ucm_addState(UCMStates *states, const char *s);
    239 
    240 U_CAPI void U_EXPORT2
    241 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck);
    242 
    243 U_CAPI int32_t U_EXPORT2
    244 ucm_countChars(UCMStates *states,
    245                const uint8_t *bytes, int32_t length);
    246 
    247 
    248 U_CAPI int8_t U_EXPORT2
    249 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps);
    250 
    251 U_CAPI UBool U_EXPORT2
    252 ucm_parseMappingLine(UCMapping *m,
    253                      UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    254                      uint8_t bytes[UCNV_EXT_MAX_BYTES],
    255                      const char *line);
    256 
    257 U_CAPI void U_EXPORT2
    258 ucm_addMapping(UCMTable *table,
    259                UCMapping *m,
    260                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    261                uint8_t bytes[UCNV_EXT_MAX_BYTES]);
    262 
    263 /* very makeconv-specific functions ----------------------------------------- */
    264 
    265 /* finalize and optimize states after the toUnicode mappings are processed */
    266 U_CAPI void U_EXPORT2
    267 ucm_optimizeStates(UCMStates *states,
    268                    uint16_t **pUnicodeCodeUnits,
    269                    _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    270                    UBool verbose);
    271 
    272 /* moved here because it is used inside ucmstate.c */
    273 U_CAPI int32_t U_EXPORT2
    274 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    275                  uint32_t offset);
    276 
    277 /* very rptp2ucm-specific functions ----------------------------------------- */
    278 
    279 /*
    280  * Input: Separate tables with mappings from/to Unicode,
    281  * subchar and subchar1 (0 if none).
    282  * All mappings must have flag 0.
    283  *
    284  * Output: fromUTable will contain the union of mappings with the correct
    285  * precision flags, and be sorted.
    286  */
    287 U_CAPI void U_EXPORT2
    288 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
    289                 const uint8_t *subchar, int32_t subcharLength,
    290                 uint8_t subchar1);
    291 
    292 U_CAPI UBool U_EXPORT2
    293 ucm_separateMappings(UCMFile *ucm, UBool isSISO);
    294 
    295 U_CDECL_END
    296 
    297 #endif
    298 
    299 #endif
    300 
    301