1 /* 2 ******************************************************************************* 3 * Copyright (C) 2013-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationdatareader.h 7 * 8 * created on: 2013feb07 9 * created by: Markus W. Scherer 10 */ 11 12 #ifndef __COLLATIONDATAREADER_H__ 13 #define __COLLATIONDATAREADER_H__ 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_COLLATION 18 19 #include "unicode/udata.h" 20 21 struct UDataMemory; 22 23 U_NAMESPACE_BEGIN 24 25 struct CollationTailoring; 26 27 /** 28 * Collation binary data reader. 29 */ 30 struct U_I18N_API CollationDataReader /* all static */ { 31 // The following constants are also copied into source/common/ucol_swp.cpp. 32 // Keep them in sync! 33 enum { 34 /** 35 * Number of int32_t indexes. 36 * 37 * Can be 2 if there are only options. 38 * Can be 7 or 8 if there are only options and a script reordering. 39 * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. 40 */ 41 IX_INDEXES_LENGTH, // 0 42 /** 43 * Bits 31..24: numericPrimary, for numeric collation 44 * 23..16: fast Latin format version (0 = no fast Latin table) 45 * 15.. 0: options bit set 46 */ 47 IX_OPTIONS, 48 IX_RESERVED2, 49 IX_RESERVED3, 50 51 /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ 52 IX_JAMO_CE32S_START, // 4 53 54 // Byte offsets from the start of the data, after the generic header. 55 // The indexes[] are at byte offset 0, other data follows. 56 // Each data item is aligned properly. 57 // The data items should be in descending order of unit size, 58 // to minimize the need for padding. 59 // Each item's byte length is given by the difference between its offset and 60 // the next index/offset value. 61 /** Byte offset to int32_t reorderCodes[]. */ 62 IX_REORDER_CODES_OFFSET, 63 /** 64 * Byte offset to uint8_t reorderTable[]. 65 * Empty table if <256 bytes (padding only). 66 * Otherwise 256 bytes or more (with padding). 67 */ 68 IX_REORDER_TABLE_OFFSET, 69 /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ 70 IX_TRIE_OFFSET, 71 72 IX_RESERVED8_OFFSET, // 8 73 /** Byte offset to int64_t ces[]. */ 74 IX_CES_OFFSET, 75 IX_RESERVED10_OFFSET, 76 /** Byte offset to uint32_t ce32s[]. */ 77 IX_CE32S_OFFSET, 78 79 /** Byte offset to uint32_t rootElements[]. */ 80 IX_ROOT_ELEMENTS_OFFSET, // 12 81 /** Byte offset to UChar *contexts[]. */ 82 IX_CONTEXTS_OFFSET, 83 /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */ 84 IX_UNSAFE_BWD_OFFSET, 85 /** Byte offset to uint16_t fastLatinTable[]. */ 86 IX_FAST_LATIN_TABLE_OFFSET, 87 88 /** Byte offset to uint16_t scripts[]. */ 89 IX_SCRIPTS_OFFSET, // 16 90 /** 91 * Byte offset to UBool compressibleBytes[]. 92 * Empty table if <256 bytes (padding only). 93 * Otherwise 256 bytes or more (with padding). 94 */ 95 IX_COMPRESSIBLE_BYTES_OFFSET, 96 IX_RESERVED18_OFFSET, 97 IX_TOTAL_SIZE 98 }; 99 100 static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength, 101 CollationTailoring &tailoring, UErrorCode &errorCode); 102 103 static UBool U_CALLCONV 104 isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); 105 106 private: 107 CollationDataReader(); // no constructor 108 }; 109 110 /* 111 * Format of collation data (ucadata.icu, binary data in coll/ *.res files). 112 * Format version 5. 113 * 114 * The root collation data is stored in the ucadata.icu file. 115 * Tailorings are stored inside .res resource bundle files, with a complete file header. 116 * 117 * Collation data begins with a standard ICU data file header 118 * (DataHeader, see ucmndata.h and unicode/udata.h). 119 * The UDataInfo.dataVersion field contains the UCA and other version numbers, 120 * see the comments for CollationTailoring.version. 121 * 122 * After the header, the file contains the following parts. 123 * Constants are defined as enum values of the CollationDataReader class. 124 * See also the Collation class. 125 * 126 * int32_t indexes[indexesLength]; 127 * The indexes array has variable length. 128 * Some tailorings only need the length and the options, 129 * others only add reorderCodes and the reorderTable, 130 * some need to store mappings. 131 * Only as many indexes are stored as needed to read all of the data. 132 * 133 * Index 0: indexesLength 134 * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS 135 * Index 2..3: Unused/reserved/0. 136 * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo 137 * are stored in a short, contiguous part of the ce32s array. 138 * 139 * Indexes 5..19 are byte offsets in ascending order. 140 * Each byte offset marks the start of the next part in the data file, 141 * and the end of the previous one. 142 * When two consecutive byte offsets are the same (or too short), 143 * then the corresponding part is empty. 144 * Byte offsets are offsets from after the header, 145 * that is, from the beginning of the indexes[]. 146 * Each part starts at an offset with proper alignment for its data. 147 * If necessary, the previous part may include padding bytes to achieve this alignment. 148 * The last byte offset that is stored in the indexes indicates the total size of the data 149 * (starting with the indexes). 150 * 151 * int32_t reorderCodes[]; -- empty in root 152 * The list of script and reordering codes. 153 * 154 * Beginning with format version 5, this array may optionally 155 * have trailing entries with a full list of reorder ranges 156 * as described for CollationSettings::reorderRanges. 157 * 158 * Script or reorder codes are first and do not exceed 16-bit values. 159 * Range limits are stored in the upper 16 bits, and are never 0. 160 * Split this array into reorder codes and ranges at the first entry 161 * with non-zero upper 16 bits. 162 * 163 * If the ranges are missing but needed for split-reordered primary lead bytes, 164 * then they are regenerated at load time. 165 * 166 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes 167 * Primary-weight lead byte permutation table. 168 * Normally present when the reorderCodes are, but can be built at load time. 169 * 170 * Beginning with format version 5, a 0 entry at a non-zero index 171 * (which is otherwise an illegal value) 172 * means that the primary lead byte is "split" 173 * (there are different offsets for primaries that share that lead byte) 174 * and the reordering offset must be determined via the reorder ranges 175 * that are either stored as part of the reorderCodes array 176 * or regenerated at load time. 177 * 178 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h 179 * The trie holds the main collation data. Each code point is mapped to a 32-bit value. 180 * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set, 181 * in which case it is a special CE32 and contains a 4-bit tag and further data. 182 * See the Collation class for details. 183 * 184 * The trie has a value for each lead surrogate code unit with some bits encoding 185 * collective properties of the 1024 supplementary characters whose UTF-16 form starts with 186 * the lead surrogate. See Collation::LEAD_SURROGATE_TAG.. 187 * 188 * int64_t ces[]; 189 * 64-bit CEs and expansions that cannot be stored in a more compact form. 190 * 191 * uint32_t ce32s[]; 192 * CE32s for expansions in compact form, and for characters whose trie values 193 * contain special data. 194 * 195 * uint32_t rootElements[]; -- empty in all tailorings 196 * Compact storage for all of the CEs that occur in the root collation. 197 * See the CollationRootElements class. 198 * 199 * UChar *contexts[]; 200 * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings. 201 * 202 * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize() 203 * Serialized form of characters that are unsafe when iterating backwards, 204 * and at the end of an identical string prefix. 205 * Back up to a safe character. 206 * Lead surrogates are "unsafe" when any of their corresponding supplementary 207 * code points are unsafe. 208 * Does not include [:^lccc=0:][:^tccc=0:]. 209 * For each tailoring, the root unsafeBackwardSet is subtracted. 210 * (As a result, in many tailorings no set needs to be stored.) 211 * 212 * uint16_t fastLatinTable[]; 213 * Optional optimization for Latin text. 214 * See the CollationFastLatin class. 215 * 216 * uint16_t scripts[]; -- empty in all tailorings 217 * Format version 5: 218 * uint16_t numScripts; 219 * uint16_t scriptsIndex[numScripts+16]; 220 * uint16_t scriptStarts[]; 221 * See CollationData::numScripts etc. 222 * 223 * Format version 4: 224 * Table of the reordering groups with their first and last lead bytes, 225 * and their script and reordering codes. 226 * See CollationData::scripts. 227 * 228 * UBool compressibleBytes[]; -- empty in all tailorings 229 * Flag for getSortKey(), indicating primary weight lead bytes that are compressible. 230 * 231 * ----------------- 232 * Changes for formatVersion 5 (ICU 55) 233 * 234 * Reordering moves single scripts, not groups of scripts. 235 * Reorder ranges are optionally appended to the reorderCodes, 236 * and a 0 entry in the reorderTable indicates a split lead byte. 237 * The scripts data has a new format. 238 * 239 * The rootElements may contain secondary and tertiary weights below common=05. 240 * (Used for small Hiragana letters.) 241 * Where is occurs, there is also an explicit unit with common secondary & tertiary weights. 242 * There are no other data structure changes, but builder code needs to be able to handle such data. 243 * 244 * The collation element for the merge separator code point U+FFFE 245 * does not necessarily have special, unique secondary/tertiary weights any more. 246 */ 247 248 U_NAMESPACE_END 249 250 #endif // !UCONFIG_NO_COLLATION 251 #endif // __COLLATIONDATAREADER_H__ 252