/*
 *******************************************************************************
 *   Copyright (C) 2003-2010, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 *   file name:  ucm.h
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2003jun20
 *   created by: Markus W. Scherer
 *
 *   Definitions for the .ucm file parser and handler module ucm.c.
 */
     16 
     17 #ifndef __UCM_H__
     18 #define __UCM_H__
     19 
     20 #include "unicode/utypes.h"
     21 #include "ucnvmbcs.h"
     22 #include "ucnv_ext.h"
     23 #include "filestrm.h"
     24 #include <stdio.h>
     25 
     26 #if !UCONFIG_NO_CONVERSION
     27 
     28 U_CDECL_BEGIN
     29 
     30 /* constants for UCMapping.moveFlag */
     31 enum {
     32     UCM_MOVE_TO_EXT=1,
     33     UCM_REMOVE_MAPPING=2
     34 };
     35 
     36 /*
     37  * Per-mapping data structure
     38  *
     39  * u if uLen==1: Unicode code point
     40  *   else index to uLen code points
     41  * b if bLen<=4: up to 4 bytes
     42  *   else index to bLen bytes
     43  * uLen number of code points
     44  * bLen number of words containing left-justified bytes
     45  * bIsMultipleChars indicates that the bytes contain more than one sequence
     46  *                  according to the state table
     47  * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
     48  *   same values as in the source file after |
     49  */
     50 typedef struct UCMapping {
     51     UChar32 u;
     52     union {
     53         uint32_t idx;
     54         uint8_t bytes[4];
     55     } b;
     56     int8_t uLen, bLen, f, moveFlag;
     57 } UCMapping;
     58 
     59 /* constants for UCMTable.flagsType */
     60 enum {
     61     UCM_FLAGS_INITIAL,  /* no mappings parsed yet */
     62     UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */
     63     UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */
     64     UCM_FLAGS_MIXED     /* both implicit and explicit */
     65 };
     66 
     67 typedef struct UCMTable {
     68     UCMapping *mappings;
     69     int32_t mappingsCapacity, mappingsLength;
     70 
     71     UChar32 *codePoints;
     72     int32_t codePointsCapacity, codePointsLength;
     73 
     74     uint8_t *bytes;
     75     int32_t bytesCapacity, bytesLength;
     76 
     77     /* index map for mapping by bytes first */
     78     int32_t *reverseMap;
     79 
     80     uint8_t unicodeMask;
     81     int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */
     82     UBool isSorted;
     83 } UCMTable;
     84 
     85 enum {
     86     MBCS_STATE_FLAG_DIRECT=1,
     87     MBCS_STATE_FLAG_SURROGATES,
     88 
     89     MBCS_STATE_FLAG_READY=16
     90 };
     91 
     92 typedef struct UCMStates {
     93     int32_t stateTable[MBCS_MAX_STATE_COUNT][256];
     94     uint32_t stateFlags[MBCS_MAX_STATE_COUNT],
     95              stateOffsetSum[MBCS_MAX_STATE_COUNT];
     96 
     97     int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits;
     98     int8_t conversionType, outputType;
     99 } UCMStates;
    100 
    101 typedef struct UCMFile {
    102     UCMTable *base, *ext;
    103     UCMStates states;
    104 
    105     char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH];
    106 } UCMFile;
    107 
    108 /* simple accesses ---------------------------------------------------------- */
    109 
    110 #define UCM_GET_CODE_POINTS(t, m) \
    111     (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u)
    112 
    113 #define UCM_GET_BYTES(t, m) \
    114     (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx)
    115 
    116 /* APIs --------------------------------------------------------------------- */
    117 
    118 U_CAPI UCMFile * U_EXPORT2
    119 ucm_open(void);
    120 
    121 U_CAPI void U_EXPORT2
    122 ucm_close(UCMFile *ucm);
    123 
    124 U_CAPI UBool U_EXPORT2
    125 ucm_parseHeaderLine(UCMFile *ucm,
    126                     char *line, char **pKey, char **pValue);
    127 
    128 /* @return -1 illegal bytes  0 suitable for base table  1 needs to go into extension table */
    129 U_CAPI int32_t U_EXPORT2
    130 ucm_mappingType(UCMStates *baseStates,
    131                 UCMapping *m,
    132                 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    133                 uint8_t bytes[UCNV_EXT_MAX_BYTES]);
    134 
    135 /* add a mapping to the base or extension table as appropriate */
    136 U_CAPI UBool U_EXPORT2
    137 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
    138                    UCMapping *m,
    139                    UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    140                    uint8_t bytes[UCNV_EXT_MAX_BYTES]);
    141 
    142 U_CAPI UBool U_EXPORT2
    143 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates);
    144 
    145 
    146 U_CAPI UCMTable * U_EXPORT2
    147 ucm_openTable(void);
    148 
    149 U_CAPI void U_EXPORT2
    150 ucm_closeTable(UCMTable *table);
    151 
    152 U_CAPI void U_EXPORT2
    153 ucm_resetTable(UCMTable *table);
    154 
    155 U_CAPI void U_EXPORT2
    156 ucm_sortTable(UCMTable *t);
    157 
    158 /*
    159  * Remove mappings with their move flag set from the base table
    160  * and move some of them (with UCM_MOVE_TO_EXT) to the extension table.
    161  */
    162 U_CAPI void U_EXPORT2
    163 ucm_moveMappings(UCMTable *base, UCMTable *ext);
    164 
    165 /**
    166  * Read a table from a .ucm file, from after the CHARMAP line to
    167  * including the END CHARMAP line.
    168  */
    169 U_CAPI void U_EXPORT2
    170 ucm_readTable(UCMFile *ucm, FileStream* convFile,
    171               UBool forBase, UCMStates *baseStates,
    172               UErrorCode *pErrorCode);
    173 
    174 /**
    175  * Check the validity of mappings against a base table's states;
    176  * necessary for extension-only tables that were read before their base tables.
    177  */
    178 U_CAPI UBool U_EXPORT2
    179 ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
    180 
    181 /**
    182  * Check a base table against an extension table.
    183  * Set the moveTarget!=NULL if it is possible to move mappings from the base.
    184  * This is the case where base and extension tables are parsed from a single file
    185  * (moveTarget==ext)
    186  * or when delta file mappings are subtracted from a base table.
    187  *
    188  * When a base table cannot be modified because a delta file is parsed in makeconv,
    189  * then set moveTarget=NULL.
    190  *
    191  * if(intersectBase) then mappings that exist in the base table but not in
    192  * the extension table are moved to moveTarget instead of showing an error.
    193  *
    194  * Special mode:
    195  * If intersectBase==2 for a DBCS extension table, then SBCS mappings are
    196  * not moved out of the base unless their Unicode input requires it.
    197  * This helps ucmkbase generate base tables for DBCS-only extension .cnv files.
    198  *
    199  * For both tables in the same file, the extension table is automatically
    200  * built.
    201  * For separate files, the extension file can use a complete mapping table (.ucm file),
    202  * so that common mappings need not be stripped out manually.
    203  *
    204  *
    205  * Sort both tables, and then for each mapping direction:
    206  *
    207  * If intersectBase is TRUE and the base table contains a mapping
    208  * that does not exist in the extension table, then this mapping is moved
    209  * to moveTarget.
    210  *
    211  * - otherwise -
    212  *
    213  * If the base table contains a mapping for which the input sequence is
    214  * the same as the extension input, then
    215  * - if the output is the same: remove the extension mapping
    216  * - else: error
    217  *
    218  * If the base table contains a mapping for which the input sequence is
    219  * a prefix of the extension input, then
    220  * - if moveTarget!=NULL: move the base mapping to the moveTarget table
    221  * - else: error
    222  *
    223  * @return FALSE in case of an irreparable error
    224  */
    225 U_CAPI UBool U_EXPORT2
    226 ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
    227                  UCMTable *moveTarget, UBool intersectBase);
    228 
    229 U_CAPI void U_EXPORT2
    230 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode);
    231 
    232 U_CAPI void U_EXPORT2
    233 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f);
    234 
    235 
    236 U_CAPI void U_EXPORT2
    237 ucm_addState(UCMStates *states, const char *s);
    238 
    239 U_CAPI void U_EXPORT2
    240 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck);
    241 
    242 U_CAPI int32_t U_EXPORT2
    243 ucm_countChars(UCMStates *states,
    244                const uint8_t *bytes, int32_t length);
    245 
    246 
    247 U_CAPI int8_t U_EXPORT2
    248 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps);
    249 
    250 U_CAPI UBool U_EXPORT2
    251 ucm_parseMappingLine(UCMapping *m,
    252                      UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    253                      uint8_t bytes[UCNV_EXT_MAX_BYTES],
    254                      const char *line);
    255 
    256 U_CAPI void U_EXPORT2
    257 ucm_addMapping(UCMTable *table,
    258                UCMapping *m,
    259                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    260                uint8_t bytes[UCNV_EXT_MAX_BYTES]);
    261 
    262 /* very makeconv-specific functions ----------------------------------------- */
    263 
    264 /* finalize and optimize states after the toUnicode mappings are processed */
    265 U_CAPI void U_EXPORT2
    266 ucm_optimizeStates(UCMStates *states,
    267                    uint16_t **pUnicodeCodeUnits,
    268                    _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    269                    UBool verbose);
    270 
    271 /* moved here because it is used inside ucmstate.c */
    272 U_CAPI int32_t U_EXPORT2
    273 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    274                  uint32_t offset);
    275 
    276 /* very rptp2ucm-specific functions ----------------------------------------- */
    277 
    278 /*
    279  * Input: Separate tables with mappings from/to Unicode,
    280  * subchar and subchar1 (0 if none).
    281  * All mappings must have flag 0.
    282  *
    283  * Output: fromUTable will contain the union of mappings with the correct
    284  * precision flags, and be sorted.
    285  */
    286 U_CAPI void U_EXPORT2
    287 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
    288                 const uint8_t *subchar, int32_t subcharLength,
    289                 uint8_t subchar1);
    290 
    291 U_CAPI UBool U_EXPORT2
    292 ucm_separateMappings(UCMFile *ucm, UBool isSISO);
    293 
    294 U_CDECL_END
    295 
    296 #endif
    297 
    298 #endif
    299 
    300