1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2000-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: genmbcs.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2000jul10 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __GENMBCS_H__ 18 #define __GENMBCS_H__ 19 20 #include "makeconv.h" 21 22 enum { 23 /* 24 * TODO: Consider using ucnvmbcs.h constants. 25 * However, not all values need to be exactly the same, for example 26 * the xxx_UTF8_MAX values may be different. (Especially SBCS_UTF8_MAX 27 * may be higher in makeconv than in the runtime code because that 28 * affects only a small number of .cnv files [if any] but all 29 * runtime UConverterSharedData objects. 30 */ 31 MBCS_STAGE_2_SHIFT=4, 32 MBCS_STAGE_2_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits in stage 2 */ 33 MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */ 34 MBCS_STAGE_2_BLOCK_MASK=0x3f, /* for after shifting by MBCS_STAGE_2_SHIFT */ 35 MBCS_STAGE_1_SHIFT=10, 36 MBCS_STAGE_1_BMP_SIZE=0x40, /* 0x10000>>MBCS_STAGE_1_SHIFT, or 16 for one entry per 1k code points on the BMP */ 37 MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>MBCS_STAGE_1_SHIFT, or 17*64 for one entry per 1k code points */ 38 MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE: stages 1 & 2 share a 16-bit-indexed array */ 39 MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE, 40 MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT, 41 42 MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */ 43 MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */ 44 45 MBCS_STAGE_3_BLOCK_SIZE=16, /* =16=1<<4 for 4 bits in stage 3 */ 46 MBCS_STAGE_3_BLOCK_MASK=0xf, 47 MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */ 48 49 MBCS_STAGE_3_GRANULARITY=16, /* =1<<4: MBCS stage 2 indexes are shifted left 4 */ 50 MBCS_STAGE_3_SBCS_SIZE=0x10000, /* max 64k mappings for SBCS */ 51 MBCS_STAGE_3_MBCS_SIZE=0x10000*MBCS_STAGE_3_GRANULARITY, /* max mappings for MBCS */ 52 53 /* 54 * SBCS_UTF8_MAX: Maximum code point with UTF-8-friendly SBCS data structures. 55 * Possible values are 0x01ff..0xffff, in steps of 0x100. 56 * 57 * Unlike for MBCS, this constant only affects the stage 3 block allocation size; 58 * there is no additional stage 1/2 table stored in the .cnv file. 59 * The max value should be at least 0x7ff to cover 2-byte UTF-8. 60 * 0xfff also covers a number other small scripts which have legacy charsets 61 * (like Thai). 62 * Higher values up to 0x1fff are harmless and potentially useful because 63 * that covers small-script blocks which usually have either dense mappings 64 * or no mappings at all. 65 * Starting at U+2000, there are mostly symbols and format characters 66 * with a low density of SBCS mappings, which would result in more wasted 67 * stage 3 entries with the larger block size. 68 */ 69 SBCS_UTF8_MAX=0x1fff, 70 71 /* 72 * MBCS_UTF8_MAX: Maximum code point with UTF-8-friendly MBCS data structures. 73 * Possible values are 0x01ff..0xffff, in steps of 0x100. 74 * 75 * Note that with 0xffff, MBCSAddFromUnicode() may overflow the additional UTF-8 stage table 76 * with extreme input data. The function checks for this overflow. 77 * 78 * 0xd7ff is chosen for the majority of common characters including Unihan and Hangul. 79 * At U+d800 there are mostly surrogates, private use codes, compatibility characters, etc. 80 * Larger values cause slightly larger MBCS .cnv files. 81 */ 82 MBCS_UTF8_MAX=0xd7ff, 83 MBCS_UTF8_LIMIT=MBCS_UTF8_MAX+1, /* =0xd800 */ 84 85 MBCS_UTF8_STAGE_SHIFT=6, 86 MBCS_UTF8_STAGE_3_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits from last trail byte */ 87 MBCS_UTF8_STAGE_3_BLOCK_MASK=0x3f, 88 89 /* size of the single-stage table for up to U+d7ff (used instead of stage1/2) */ 90 MBCS_UTF8_STAGE_SIZE=MBCS_UTF8_LIMIT>>MBCS_UTF8_STAGE_SHIFT, /* =0x360 */ 91 92 MBCS_FROM_U_EXT_FLAG=0x10, /* UCMapping.f bit for base table mappings that fit into the base toU table */ 93 MBCS_FROM_U_EXT_MASK=0x0f, /* but need to go into the extension fromU table */ 94 95 /* =4 number of regular stage 3 blocks for final UTF-8 trail byte */ 96 MBCS_UTF8_STAGE_3_BLOCKS=MBCS_UTF8_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_BLOCK_SIZE, 97 98 MBCS_MAX_FALLBACK_COUNT=8192 99 }; 100 101 U_CFUNC NewConverter * 102 MBCSOpen(UCMFile *ucm); 103 104 struct MBCSData; 105 typedef struct MBCSData MBCSData; 106 107 /* 108 * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode() 109 * for creating an extension-only file. 110 * Assume maxCharLength>1. 111 */ 112 U_CFUNC const MBCSData * 113 MBCSGetDummy(void); 114 115 /* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */ 116 U_CFUNC UBool 117 MBCSOkForBaseFromUnicode(const MBCSData *mbcsData, 118 const uint8_t *bytes, int32_t length, 119 UChar32 c, int8_t flag); 120 121 U_CFUNC NewConverter * 122 CnvExtOpen(UCMFile *ucm); 123 124 #endif /* __GENMBCS_H__ */ 125