1 /* 2 ******************************************************************************* 3 * Copyright (C) 2003-2013, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ucm.h 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2003jun20 12 * created by: Markus W. Scherer 13 * 14 * Definitions for the .ucm file parser and handler module ucm.c. 15 */ 16 17 #ifndef __UCM_H__ 18 #define __UCM_H__ 19 20 #include "unicode/utypes.h" 21 #include "ucnvmbcs.h" 22 #include "ucnv_ext.h" 23 #include "filestrm.h" 24 #include <stdio.h> 25 26 #if !UCONFIG_NO_CONVERSION 27 28 U_CDECL_BEGIN 29 30 /* constants for UCMapping.moveFlag */ 31 enum { 32 UCM_MOVE_TO_EXT=1, 33 UCM_REMOVE_MAPPING=2 34 }; 35 36 /* 37 * Per-mapping data structure 38 * 39 * u if uLen==1: Unicode code point 40 * else index to uLen code points 41 * b if bLen<=4: up to 4 bytes 42 * else index to bLen bytes 43 * uLen number of code points 44 * bLen number of words containing left-justified bytes 45 * bIsMultipleChars indicates that the bytes contain more than one sequence 46 * according to the state table 47 * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) 48 * or "good one-way" mapping (4). 49 * Same values as in the source file after | 50 */ 51 typedef struct UCMapping { 52 UChar32 u; 53 union { 54 uint32_t idx; 55 uint8_t bytes[4]; 56 } b; 57 int8_t uLen, bLen, f, moveFlag; 58 } UCMapping; 59 60 /* constants for UCMTable.flagsType */ 61 enum { 62 UCM_FLAGS_INITIAL, /* no mappings parsed yet */ 63 UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */ 64 UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */ 65 UCM_FLAGS_MIXED /* both implicit and explicit */ 66 }; 67 68 typedef struct UCMTable { 69 UCMapping *mappings; 70 int32_t mappingsCapacity, mappingsLength; 71 72 UChar32 *codePoints; 73 int32_t codePointsCapacity, codePointsLength; 74 75 uint8_t *bytes; 76 int32_t bytesCapacity, bytesLength; 77 78 /* index map for mapping by bytes first */ 79 int32_t *reverseMap; 80 81 uint8_t unicodeMask; 82 int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */ 83 UBool isSorted; 84 } UCMTable; 85 86 enum { 87 MBCS_STATE_FLAG_DIRECT=1, 88 MBCS_STATE_FLAG_SURROGATES, 89 90 MBCS_STATE_FLAG_READY=16 91 }; 92 93 typedef struct UCMStates { 94 int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; 95 uint32_t stateFlags[MBCS_MAX_STATE_COUNT], 96 stateOffsetSum[MBCS_MAX_STATE_COUNT]; 97 98 int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; 99 int8_t conversionType, outputType; 100 } UCMStates; 101 102 typedef struct UCMFile { 103 UCMTable *base, *ext; 104 UCMStates states; 105 106 char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; 107 } UCMFile; 108 109 /* simple accesses ---------------------------------------------------------- */ 110 111 #define UCM_GET_CODE_POINTS(t, m) \ 112 (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) 113 114 #define UCM_GET_BYTES(t, m) \ 115 (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx) 116 117 /* APIs --------------------------------------------------------------------- */ 118 119 U_CAPI UCMFile * U_EXPORT2 120 ucm_open(void); 121 122 U_CAPI void U_EXPORT2 123 ucm_close(UCMFile *ucm); 124 125 U_CAPI UBool U_EXPORT2 126 ucm_parseHeaderLine(UCMFile *ucm, 127 char *line, char **pKey, char **pValue); 128 129 /* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */ 130 U_CAPI int32_t U_EXPORT2 131 ucm_mappingType(UCMStates *baseStates, 132 UCMapping *m, 133 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 134 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 135 136 /* add a mapping to the base or extension table as appropriate */ 137 U_CAPI UBool U_EXPORT2 138 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, 139 UCMapping *m, 140 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 141 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 142 143 U_CAPI UBool U_EXPORT2 144 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); 145 146 147 U_CAPI UCMTable * U_EXPORT2 148 ucm_openTable(void); 149 150 U_CAPI void U_EXPORT2 151 ucm_closeTable(UCMTable *table); 152 153 U_CAPI void U_EXPORT2 154 ucm_resetTable(UCMTable *table); 155 156 U_CAPI void U_EXPORT2 157 ucm_sortTable(UCMTable *t); 158 159 /* 160 * Remove mappings with their move flag set from the base table 161 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table. 162 */ 163 U_CAPI void U_EXPORT2 164 ucm_moveMappings(UCMTable *base, UCMTable *ext); 165 166 /** 167 * Read a table from a .ucm file, from after the CHARMAP line to 168 * including the END CHARMAP line. 169 */ 170 U_CAPI void U_EXPORT2 171 ucm_readTable(UCMFile *ucm, FileStream* convFile, 172 UBool forBase, UCMStates *baseStates, 173 UErrorCode *pErrorCode); 174 175 /** 176 * Check the validity of mappings against a base table's states; 177 * necessary for extension-only tables that were read before their base tables. 178 */ 179 U_CAPI UBool U_EXPORT2 180 ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); 181 182 /** 183 * Check a base table against an extension table. 184 * Set the moveTarget!=NULL if it is possible to move mappings from the base. 185 * This is the case where base and extension tables are parsed from a single file 186 * (moveTarget==ext) 187 * or when delta file mappings are subtracted from a base table. 188 * 189 * When a base table cannot be modified because a delta file is parsed in makeconv, 190 * then set moveTarget=NULL. 191 * 192 * if(intersectBase) then mappings that exist in the base table but not in 193 * the extension table are moved to moveTarget instead of showing an error. 194 * 195 * Special mode: 196 * If intersectBase==2 for a DBCS extension table, then SBCS mappings are 197 * not moved out of the base unless their Unicode input requires it. 198 * This helps ucmkbase generate base tables for DBCS-only extension .cnv files. 199 * 200 * For both tables in the same file, the extension table is automatically 201 * built. 202 * For separate files, the extension file can use a complete mapping table (.ucm file), 203 * so that common mappings need not be stripped out manually. 204 * 205 * 206 * Sort both tables, and then for each mapping direction: 207 * 208 * If intersectBase is TRUE and the base table contains a mapping 209 * that does not exist in the extension table, then this mapping is moved 210 * to moveTarget. 211 * 212 * - otherwise - 213 * 214 * If the base table contains a mapping for which the input sequence is 215 * the same as the extension input, then 216 * - if the output is the same: remove the extension mapping 217 * - else: error 218 * 219 * If the base table contains a mapping for which the input sequence is 220 * a prefix of the extension input, then 221 * - if moveTarget!=NULL: move the base mapping to the moveTarget table 222 * - else: error 223 * 224 * @return FALSE in case of an irreparable error 225 */ 226 U_CAPI UBool U_EXPORT2 227 ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, 228 UCMTable *moveTarget, UBool intersectBase); 229 230 U_CAPI void U_EXPORT2 231 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); 232 233 U_CAPI void U_EXPORT2 234 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); 235 236 237 U_CAPI void U_EXPORT2 238 ucm_addState(UCMStates *states, const char *s); 239 240 U_CAPI void U_EXPORT2 241 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck); 242 243 U_CAPI int32_t U_EXPORT2 244 ucm_countChars(UCMStates *states, 245 const uint8_t *bytes, int32_t length); 246 247 248 U_CAPI int8_t U_EXPORT2 249 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); 250 251 U_CAPI UBool U_EXPORT2 252 ucm_parseMappingLine(UCMapping *m, 253 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 254 uint8_t bytes[UCNV_EXT_MAX_BYTES], 255 const char *line); 256 257 U_CAPI void U_EXPORT2 258 ucm_addMapping(UCMTable *table, 259 UCMapping *m, 260 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 261 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 262 263 /* very makeconv-specific functions ----------------------------------------- */ 264 265 /* finalize and optimize states after the toUnicode mappings are processed */ 266 U_CAPI void U_EXPORT2 267 ucm_optimizeStates(UCMStates *states, 268 uint16_t **pUnicodeCodeUnits, 269 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 270 UBool verbose); 271 272 /* moved here because it is used inside ucmstate.c */ 273 U_CAPI int32_t U_EXPORT2 274 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 275 uint32_t offset); 276 277 /* very rptp2ucm-specific functions ----------------------------------------- */ 278 279 /* 280 * Input: Separate tables with mappings from/to Unicode, 281 * subchar and subchar1 (0 if none). 282 * All mappings must have flag 0. 283 * 284 * Output: fromUTable will contain the union of mappings with the correct 285 * precision flags, and be sorted. 286 */ 287 U_CAPI void U_EXPORT2 288 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, 289 const uint8_t *subchar, int32_t subcharLength, 290 uint8_t subchar1); 291 292 U_CAPI UBool U_EXPORT2 293 ucm_separateMappings(UCMFile *ucm, UBool isSISO); 294 295 U_CDECL_END 296 297 #endif 298 299 #endif 300 301