1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2001-2011, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: utrie.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2001nov08 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __UTRIE_H__ 20 #define __UTRIE_H__ 21 22 #include "unicode/utypes.h" 23 #include "unicode/utf16.h" 24 25 U_CDECL_BEGIN 26 27 /** 28 * \file 29 * 30 * This is a common implementation of a "folded" trie. 31 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with 32 * Unicode code points (0..0x10ffff). 33 * 34 * This implementation is optimized for getting values while walking forward 35 * through a UTF-16 string. 36 * Therefore, the simplest and fastest access macros are the 37 * _FROM_LEAD() and _FROM_OFFSET_TRAIL() macros. 38 * 39 * The _FROM_BMP() macros are a little more complicated; they get values 40 * even for lead surrogate code _points_, while the _FROM_LEAD() macros 41 * get special "folded" values for lead surrogate code _units_ if 42 * there is relevant data associated with them. 43 * From such a folded value, an offset needs to be extracted to supply 44 * to the _FROM_OFFSET_TRAIL() macros. 45 * 46 * Most of the more complex (and more convenient) functions/macros call a callback function 47 * to get that offset from the folded value for a lead surrogate unit. 48 */ 49 50 /** 51 * Trie constants, defining shift widths, index array lengths, etc. 52 */ 53 enum { 54 /** Shift size for shifting right the input index. 1..9 */ 55 UTRIE_SHIFT=5, 56 57 /** Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 0x200 */ 58 UTRIE_DATA_BLOCK_LENGTH=1<<UTRIE_SHIFT, 59 60 /** Mask for getting the lower bits from the input index. */ 61 UTRIE_MASK=UTRIE_DATA_BLOCK_LENGTH-1, 62 63 /** 64 * Lead surrogate code points' index displacement in the index array. 65 * 0x10000-0xd800=0x2800 66 */ 67 UTRIE_LEAD_INDEX_DISP=0x2800>>UTRIE_SHIFT, 68 69 /** 70 * Shift size for shifting left the index array values. 71 * Increases possible data size with 16-bit index values at the cost 72 * of compactability. 73 * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY. 74 * 0..UTRIE_SHIFT 75 */ 76 UTRIE_INDEX_SHIFT=2, 77 78 /** The alignment size of a stage 2 data block. Also the granularity for compaction. */ 79 UTRIE_DATA_GRANULARITY=1<<UTRIE_INDEX_SHIFT, 80 81 /** Number of bits of a trail surrogate that are used in index table lookups. */ 82 UTRIE_SURROGATE_BLOCK_BITS=10-UTRIE_SHIFT, 83 84 /** 85 * Number of index (stage 1) entries per lead surrogate. 86 * Same as number of index entries for 1024 trail surrogates, 87 * ==0x400>>UTRIE_SHIFT 88 */ 89 UTRIE_SURROGATE_BLOCK_COUNT=(1<<UTRIE_SURROGATE_BLOCK_BITS), 90 91 /** Length of the BMP portion of the index (stage 1) array. */ 92 UTRIE_BMP_INDEX_LENGTH=0x10000>>UTRIE_SHIFT 93 }; 94 95 /** 96 * Length of the index (stage 1) array before folding. 97 * Maximum number of Unicode code points (0x110000) shifted right by UTRIE_SHIFT. 98 */ 99 #define UTRIE_MAX_INDEX_LENGTH (0x110000>>UTRIE_SHIFT) 100 101 /** 102 * Maximum length of the runtime data (stage 2) array. 103 * Limited by 16-bit index values that are left-shifted by UTRIE_INDEX_SHIFT. 104 */ 105 #define UTRIE_MAX_DATA_LENGTH (0x10000<<UTRIE_INDEX_SHIFT) 106 107 /** 108 * Maximum length of the build-time data (stage 2) array. 109 * The maximum length is 0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400. 110 * (Number of Unicode code points + one all-initial-value block + 111 * possible duplicate entries for 1024 lead surrogates.) 112 */ 113 #define UTRIE_MAX_BUILD_TIME_DATA_LENGTH (0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400) 114 115 /** 116 * Number of bytes for a dummy trie. 117 * A dummy trie is an empty runtime trie, used when a real data trie cannot 118 * be loaded. 119 * The number of bytes works for Latin-1-linear tries with 32-bit data 120 * (worst case). 121 * 122 * Calculation: 123 * BMP index + 1 index block for lead surrogate code points + 124 * Latin-1-linear array + 1 data block for lead surrogate code points 125 * 126 * Latin-1: if(UTRIE_SHIFT<=8) { 256 } else { included in first data block } 127 * 128 * @see utrie_unserializeDummy 129 */ 130 #define UTRIE_DUMMY_SIZE ((UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT)*2+(UTRIE_SHIFT<=8?256:UTRIE_DATA_BLOCK_LENGTH)*4+UTRIE_DATA_BLOCK_LENGTH*4) 131 132 /** 133 * Runtime UTrie callback function. 134 * Extract from a lead surrogate's data the 135 * index array offset of the indexes for that lead surrogate. 136 * 137 * @param data data value for a surrogate from the trie, including the folding offset 138 * @return offset>=UTRIE_BMP_INDEX_LENGTH, or 0 if there is no data for the lead surrogate 139 */ 140 typedef int32_t U_CALLCONV 141 UTrieGetFoldingOffset(uint32_t data); 142 143 /** 144 * Run-time Trie structure. 145 * 146 * Either the data table is 16 bits wide and accessed via the index 147 * pointer, with each index item increased by indexLength; 148 * in this case, data32==NULL. 149 * 150 * Or the data table is 32 bits wide and accessed via the data32 pointer. 151 */ 152 struct UTrie { 153 const uint16_t *index; 154 const uint32_t *data32; /* NULL if 16b data is used via index */ 155 156 /** 157 * This function is not used in _FROM_LEAD, _FROM_BMP, and _FROM_OFFSET_TRAIL macros. 158 * If convenience macros like _GET16 or _NEXT32 are used, this function must be set. 159 * 160 * utrie_unserialize() sets a default function which simply returns 161 * the lead surrogate's value itself - which is the inverse of the default 162 * folding function used by utrie_serialize(). 163 * 164 * @see UTrieGetFoldingOffset 165 */ 166 UTrieGetFoldingOffset *getFoldingOffset; 167 168 int32_t indexLength, dataLength; 169 uint32_t initialValue; 170 UBool isLatin1Linear; 171 }; 172 173 #ifndef __UTRIE2_H__ 174 typedef struct UTrie UTrie; 175 #endif 176 177 /** Internal trie getter from an offset (0 if c16 is a BMP/lead units) and a 16-bit unit */ 178 #define _UTRIE_GET_RAW(trie, data, offset, c16) \ 179 (trie)->data[ \ 180 ((int32_t)((trie)->index[(offset)+((c16)>>UTRIE_SHIFT)])<<UTRIE_INDEX_SHIFT)+ \ 181 ((c16)&UTRIE_MASK) \ 182 ] 183 184 /** Internal trie getter from a pair of surrogates */ 185 #define _UTRIE_GET_FROM_PAIR(trie, data, c, c2, result, resultType) { \ 186 int32_t __offset; \ 187 \ 188 /* get data for lead surrogate */ \ 189 (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \ 190 __offset=(trie)->getFoldingOffset(result); \ 191 \ 192 /* get the real data from the folded lead/trail units */ \ 193 if(__offset>0) { \ 194 (result)=_UTRIE_GET_RAW((trie), data, __offset, (c2)&0x3ff); \ 195 } else { \ 196 (result)=(resultType)((trie)->initialValue); \ 197 } \ 198 } 199 200 /** Internal trie getter from a BMP code point, treating a lead surrogate as a normal code point */ 201 #define _UTRIE_GET_FROM_BMP(trie, data, c16) \ 202 _UTRIE_GET_RAW(trie, data, 0xd800<=(c16) && (c16)<=0xdbff ? UTRIE_LEAD_INDEX_DISP : 0, c16); 203 204 /** 205 * Internal trie getter from a code point. 206 * Could be faster(?) but longer with 207 * if((c32)<=0xd7ff) { (result)=_UTRIE_GET_RAW(trie, data, 0, c32); } 208 */ 209 #define _UTRIE_GET(trie, data, c32, result, resultType) \ 210 if((uint32_t)(c32)<=0xffff) { \ 211 /* BMP code points */ \ 212 (result)=_UTRIE_GET_FROM_BMP(trie, data, c32); \ 213 } else if((uint32_t)(c32)<=0x10ffff) { \ 214 /* supplementary code point */ \ 215 UChar __lead16=U16_LEAD(c32); \ 216 _UTRIE_GET_FROM_PAIR(trie, data, __lead16, c32, result, resultType); \ 217 } else { \ 218 /* out of range */ \ 219 (result)=(resultType)((trie)->initialValue); \ 220 } 221 222 /** Internal next-post-increment: get the next code point (c, c2) and its data */ 223 #define _UTRIE_NEXT(trie, data, src, limit, c, c2, result, resultType) { \ 224 (c)=*(src)++; \ 225 if(!U16_IS_LEAD(c)) { \ 226 (c2)=0; \ 227 (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \ 228 } else if((src)!=(limit) && U16_IS_TRAIL((c2)=*(src))) { \ 229 ++(src); \ 230 _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \ 231 } else { \ 232 /* unpaired lead surrogate code point */ \ 233 (c2)=0; \ 234 (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \ 235 } \ 236 } 237 238 /** Internal previous: get the previous code point (c, c2) and its data */ 239 #define _UTRIE_PREVIOUS(trie, data, start, src, c, c2, result, resultType) { \ 240 (c)=*--(src); \ 241 if(!U16_IS_SURROGATE(c)) { \ 242 (c2)=0; \ 243 (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \ 244 } else if(!U16_IS_SURROGATE_LEAD(c)) { \ 245 /* trail surrogate */ \ 246 if((start)!=(src) && U16_IS_LEAD((c2)=*((src)-1))) { \ 247 --(src); \ 248 (result)=(c); (c)=(c2); (c2)=(UChar)(result); /* swap c, c2 */ \ 249 _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \ 250 } else { \ 251 /* unpaired trail surrogate code point */ \ 252 (c2)=0; \ 253 (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \ 254 } \ 255 } else { \ 256 /* unpaired lead surrogate code point */ \ 257 (c2)=0; \ 258 (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \ 259 } \ 260 } 261 262 /* Public UTrie API ---------------------------------------------------------*/ 263 264 /** 265 * Get a pointer to the contiguous part of the data array 266 * for the Latin-1 range (U+0000..U+00ff). 267 * Must be used only if the Latin-1 range is in fact linear 268 * (trie->isLatin1Linear). 269 * 270 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 271 * @return (const uint16_t *) pointer to values for Latin-1 code points 272 */ 273 #define UTRIE_GET16_LATIN1(trie) ((trie)->index+(trie)->indexLength+UTRIE_DATA_BLOCK_LENGTH) 274 275 /** 276 * Get a pointer to the contiguous part of the data array 277 * for the Latin-1 range (U+0000..U+00ff). 278 * Must be used only if the Latin-1 range is in fact linear 279 * (trie->isLatin1Linear). 280 * 281 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 282 * @return (const uint32_t *) pointer to values for Latin-1 code points 283 */ 284 #define UTRIE_GET32_LATIN1(trie) ((trie)->data32+UTRIE_DATA_BLOCK_LENGTH) 285 286 /** 287 * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff). 288 * c16 may be a lead surrogate, which may have a value including a folding offset. 289 * 290 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 291 * @param c16 (UChar, in) the input BMP code point 292 * @return (uint16_t) trie lookup result 293 */ 294 #define UTRIE_GET16_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, index, 0, c16) 295 296 /** 297 * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff). 298 * c16 may be a lead surrogate, which may have a value including a folding offset. 299 * 300 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 301 * @param c16 (UChar, in) the input BMP code point 302 * @return (uint32_t) trie lookup result 303 */ 304 #define UTRIE_GET32_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, data32, 0, c16) 305 306 /** 307 * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff). 308 * Even lead surrogate code points are treated as normal code points, 309 * with unfolded values that may differ from _FROM_LEAD() macro results for them. 310 * 311 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 312 * @param c16 (UChar, in) the input BMP code point 313 * @return (uint16_t) trie lookup result 314 */ 315 #define UTRIE_GET16_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, index, c16) 316 317 /** 318 * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff). 319 * Even lead surrogate code points are treated as normal code points, 320 * with unfolded values that may differ from _FROM_LEAD() macro results for them. 321 * 322 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 323 * @param c16 (UChar, in) the input BMP code point 324 * @return (uint32_t) trie lookup result 325 */ 326 #define UTRIE_GET32_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, data32, c16) 327 328 /** 329 * Get a 16-bit trie value from a code point. 330 * Even lead surrogate code points are treated as normal code points, 331 * with unfolded values that may differ from _FROM_LEAD() macro results for them. 332 * 333 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 334 * @param c32 (UChar32, in) the input code point 335 * @param result (uint16_t, out) uint16_t variable for the trie lookup result 336 */ 337 #define UTRIE_GET16(trie, c32, result) _UTRIE_GET(trie, index, c32, result, uint16_t) 338 339 /** 340 * Get a 32-bit trie value from a code point. 341 * Even lead surrogate code points are treated as normal code points, 342 * with unfolded values that may differ from _FROM_LEAD() macro results for them. 343 * 344 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 345 * @param c32 (UChar32, in) the input code point 346 * @param result (uint32_t, out) uint32_t variable for the trie lookup result 347 */ 348 #define UTRIE_GET32(trie, c32, result) _UTRIE_GET(trie, data32, c32, result, uint32_t) 349 350 /** 351 * Get the next code point (c, c2), post-increment src, 352 * and get a 16-bit value from the trie. 353 * 354 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 355 * @param src (const UChar *, in/out) the source text pointer 356 * @param limit (const UChar *, in) the limit pointer for the text, or NULL 357 * @param c (UChar, out) variable for the BMP or lead code unit 358 * @param c2 (UChar, out) variable for 0 or the trail code unit 359 * @param result (uint16_t, out) uint16_t variable for the trie lookup result 360 */ 361 #define UTRIE_NEXT16(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, index, src, limit, c, c2, result, uint16_t) 362 363 /** 364 * Get the next code point (c, c2), post-increment src, 365 * and get a 32-bit value from the trie. 366 * 367 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 368 * @param src (const UChar *, in/out) the source text pointer 369 * @param limit (const UChar *, in) the limit pointer for the text, or NULL 370 * @param c (UChar, out) variable for the BMP or lead code unit 371 * @param c2 (UChar, out) variable for 0 or the trail code unit 372 * @param result (uint32_t, out) uint32_t variable for the trie lookup result 373 */ 374 #define UTRIE_NEXT32(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, data32, src, limit, c, c2, result, uint32_t) 375 376 /** 377 * Get the previous code point (c, c2), pre-decrement src, 378 * and get a 16-bit value from the trie. 379 * 380 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 381 * @param start (const UChar *, in) the start pointer for the text, or NULL 382 * @param src (const UChar *, in/out) the source text pointer 383 * @param c (UChar, out) variable for the BMP or lead code unit 384 * @param c2 (UChar, out) variable for 0 or the trail code unit 385 * @param result (uint16_t, out) uint16_t variable for the trie lookup result 386 */ 387 #define UTRIE_PREVIOUS16(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, index, start, src, c, c2, result, uint16_t) 388 389 /** 390 * Get the previous code point (c, c2), pre-decrement src, 391 * and get a 32-bit value from the trie. 392 * 393 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 394 * @param start (const UChar *, in) the start pointer for the text, or NULL 395 * @param src (const UChar *, in/out) the source text pointer 396 * @param c (UChar, out) variable for the BMP or lead code unit 397 * @param c2 (UChar, out) variable for 0 or the trail code unit 398 * @param result (uint32_t, out) uint32_t variable for the trie lookup result 399 */ 400 #define UTRIE_PREVIOUS32(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, data32, start, src, c, c2, result, uint32_t) 401 402 /** 403 * Get a 16-bit trie value from a pair of surrogates. 404 * 405 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 406 * @param c (UChar, in) a lead surrogate 407 * @param c2 (UChar, in) a trail surrogate 408 * @param result (uint16_t, out) uint16_t variable for the trie lookup result 409 */ 410 #define UTRIE_GET16_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, index, c, c2, result, uint16_t) 411 412 /** 413 * Get a 32-bit trie value from a pair of surrogates. 414 * 415 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 416 * @param c (UChar, in) a lead surrogate 417 * @param c2 (UChar, in) a trail surrogate 418 * @param result (uint32_t, out) uint32_t variable for the trie lookup result 419 */ 420 #define UTRIE_GET32_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, data32, c, c2, result, uint32_t) 421 422 /** 423 * Get a 16-bit trie value from a folding offset (from the value of a lead surrogate) 424 * and a trail surrogate. 425 * 426 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 427 * @param offset (int32_t, in) the folding offset from the value of a lead surrogate 428 * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant) 429 * @return (uint16_t) trie lookup result 430 */ 431 #define UTRIE_GET16_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, index, offset, (c2)&0x3ff) 432 433 /** 434 * Get a 32-bit trie value from a folding offset (from the value of a lead surrogate) 435 * and a trail surrogate. 436 * 437 * @param trie (const UTrie *, in) a pointer to the runtime trie structure 438 * @param offset (int32_t, in) the folding offset from the value of a lead surrogate 439 * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant) 440 * @return (uint32_t) trie lookup result 441 */ 442 #define UTRIE_GET32_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, data32, offset, (c2)&0x3ff) 443 444 /* enumeration callback types */ 445 446 /** 447 * Callback from utrie_enum(), extracts a uint32_t value from a 448 * trie value. This value will be passed on to the UTrieEnumRange function. 449 * 450 * @param context an opaque pointer, as passed into utrie_enum() 451 * @param value a value from the trie 452 * @return the value that is to be passed on to the UTrieEnumRange function 453 */ 454 typedef uint32_t U_CALLCONV 455 UTrieEnumValue(const void *context, uint32_t value); 456 457 /** 458 * Callback from utrie_enum(), is called for each contiguous range 459 * of code points with the same value as retrieved from the trie and 460 * transformed by the UTrieEnumValue function. 461 * 462 * The callback function can stop the enumeration by returning FALSE. 463 * 464 * @param context an opaque pointer, as passed into utrie_enum() 465 * @param start the first code point in a contiguous range with value 466 * @param limit one past the last code point in a contiguous range with value 467 * @param value the value that is set for all code points in [start..limit[ 468 * @return FALSE to stop the enumeration 469 */ 470 typedef UBool U_CALLCONV 471 UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value); 472 473 /** 474 * Enumerate efficiently all values in a trie. 475 * For each entry in the trie, the value to be delivered is passed through 476 * the UTrieEnumValue function. 477 * The value is unchanged if that function pointer is NULL. 478 * 479 * For each contiguous range of code points with a given value, 480 * the UTrieEnumRange function is called. 481 * 482 * @param trie a pointer to the runtime trie structure 483 * @param enumValue a pointer to a function that may transform the trie entry value, 484 * or NULL if the values from the trie are to be used directly 485 * @param enumRange a pointer to a function that is called for each contiguous range 486 * of code points with the same value 487 * @param context an opaque pointer that is passed on to the callback functions 488 */ 489 U_CAPI void U_EXPORT2 490 utrie_enum(const UTrie *trie, 491 UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context); 492 493 /** 494 * Unserialize a trie from 32-bit-aligned memory. 495 * Inverse of utrie_serialize(). 496 * Fills the UTrie runtime trie structure with the settings for the trie data. 497 * 498 * @param trie a pointer to the runtime trie structure 499 * @param data a pointer to 32-bit-aligned memory containing trie data 500 * @param length the number of bytes available at data 501 * @param pErrorCode an in/out ICU UErrorCode 502 * @return the number of bytes at data taken up by the trie data 503 */ 504 U_CAPI int32_t U_EXPORT2 505 utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode); 506 507 /** 508 * "Unserialize" a dummy trie. 509 * A dummy trie is an empty runtime trie, used when a real data trie cannot 510 * be loaded. 511 * 512 * The input memory is filled so that the trie always returns the initialValue, 513 * or the leadUnitValue for lead surrogate code points. 514 * The Latin-1 part is always set up to be linear. 515 * 516 * @param trie a pointer to the runtime trie structure 517 * @param data a pointer to 32-bit-aligned memory to be filled with the dummy trie data 518 * @param length the number of bytes available at data (recommended to use UTRIE_DUMMY_SIZE) 519 * @param initialValue the initial value that is set for all code points 520 * @param leadUnitValue the value for lead surrogate code _units_ that do not 521 * have associated supplementary data 522 * @param pErrorCode an in/out ICU UErrorCode 523 * 524 * @see UTRIE_DUMMY_SIZE 525 * @see utrie_open 526 */ 527 U_CAPI int32_t U_EXPORT2 528 utrie_unserializeDummy(UTrie *trie, 529 void *data, int32_t length, 530 uint32_t initialValue, uint32_t leadUnitValue, 531 UBool make16BitTrie, 532 UErrorCode *pErrorCode); 533 534 /** 535 * Default implementation for UTrie.getFoldingOffset, set automatically by 536 * utrie_unserialize(). 537 * Simply returns the lead surrogate's value itself - which is the inverse 538 * of the default folding function used by utrie_serialize(). 539 * Exported for static const UTrie structures. 540 * 541 * @see UTrieGetFoldingOffset 542 */ 543 U_CAPI int32_t U_EXPORT2 544 utrie_defaultGetFoldingOffset(uint32_t data); 545 546 /* Building a trie ----------------------------------------------------------*/ 547 548 /** 549 * Build-time trie structure. 550 * Opaque definition, here only to make fillIn parameters possible 551 * for utrie_open() and utrie_clone(). 552 */ 553 struct UNewTrie { 554 /** 555 * Index values at build-time are 32 bits wide for easier processing. 556 * Bit 31 is set if the data block is used by multiple index values (from utrie_setRange()). 557 */ 558 int32_t index[UTRIE_MAX_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT]; 559 uint32_t *data; 560 561 uint32_t leadUnitValue; 562 int32_t indexLength, dataCapacity, dataLength; 563 UBool isAllocated, isDataAllocated; 564 UBool isLatin1Linear, isCompacted; 565 566 /** 567 * Map of adjusted indexes, used in utrie_compact(). 568 * Maps from original indexes to new ones. 569 */ 570 int32_t map[UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT]; 571 }; 572 573 typedef struct UNewTrie UNewTrie; 574 575 /** 576 * Build-time trie callback function, used with utrie_serialize(). 577 * This function calculates a lead surrogate's value including a folding offset 578 * from the 1024 supplementary code points [start..start+1024[ . 579 * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0. 580 * 581 * The folding offset is provided by the caller. 582 * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023. 583 * Instead of the offset itself, n can be stored in 10 bits - 584 * or fewer if it can be assumed that few lead surrogates have associated data. 585 * 586 * The returned value must be 587 * - not zero if and only if there is relevant data 588 * for the corresponding 1024 supplementary code points 589 * - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., offset))==offset 590 * 591 * @return a folded value, or 0 if there is no relevant data for the lead surrogate. 592 */ 593 typedef uint32_t U_CALLCONV 594 UNewTrieGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset); 595 596 /** 597 * Open a build-time trie structure. 598 * The size of the build-time data array is specified to avoid allocating a large 599 * array in all cases. The array itself can also be passed in. 600 * 601 * Although the trie is never fully expanded to a linear array, especially when 602 * utrie_setRange32() is used, the data array could be large during build time. 603 * The maximum length is 604 * UTRIE_MAX_BUILD_TIME_DATA_LENGTH=0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400. 605 * (Number of Unicode code points + one all-initial-value block + 606 * possible duplicate entries for 1024 lead surrogates.) 607 * (UTRIE_DATA_BLOCK_LENGTH<=0x200 in all cases.) 608 * 609 * @param fillIn a pointer to a UNewTrie structure to be initialized (will not be released), or 610 * NULL if one is to be allocated 611 * @param aliasData a pointer to a data array to be used (will not be released), or 612 * NULL if one is to be allocated 613 * @param maxDataLength the capacity of aliasData (if not NULL) or 614 * the length of the data array to be allocated 615 * @param initialValue the initial value that is set for all code points 616 * @param leadUnitValue the value for lead surrogate code _units_ that do not 617 * have associated supplementary data 618 * @param latin1Linear a flag indicating whether the Latin-1 range is to be allocated and 619 * kept in a linear, contiguous part of the data array 620 * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie 621 */ 622 U_CAPI UNewTrie * U_EXPORT2 623 utrie_open(UNewTrie *fillIn, 624 uint32_t *aliasData, int32_t maxDataLength, 625 uint32_t initialValue, uint32_t leadUnitValue, 626 UBool latin1Linear); 627 628 /** 629 * Clone a build-time trie structure with all entries. 630 * 631 * @param fillIn like in utrie_open() 632 * @param other the build-time trie structure to clone 633 * @param aliasData like in utrie_open(), 634 * used if aliasDataLength>=(capacity of other's data array) 635 * @param aliasDataLength the length of aliasData 636 * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie 637 */ 638 U_CAPI UNewTrie * U_EXPORT2 639 utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataLength); 640 641 /** 642 * Close a build-time trie structure, and release memory 643 * that was allocated by utrie_open() or utrie_clone(). 644 * 645 * @param trie the build-time trie 646 */ 647 U_CAPI void U_EXPORT2 648 utrie_close(UNewTrie *trie); 649 650 /** 651 * Get the data array of a build-time trie. 652 * The data may be modified, but entries that are equal before 653 * must still be equal after modification. 654 * 655 * @param trie the build-time trie 656 * @param pLength (out) a pointer to a variable that receives the number 657 * of entries in the data array 658 * @return the data array 659 */ 660 U_CAPI uint32_t * U_EXPORT2 661 utrie_getData(UNewTrie *trie, int32_t *pLength); 662 663 /** 664 * Set a value for a code point. 665 * 666 * @param trie the build-time trie 667 * @param c the code point 668 * @param value the value 669 * @return FALSE if a failure occurred (illegal argument or data array overrun) 670 */ 671 U_CAPI UBool U_EXPORT2 672 utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value); 673 674 /** 675 * Get a value from a code point as stored in the build-time trie. 676 * 677 * @param trie the build-time trie 678 * @param c the code point 679 * @param pInBlockZero if not NULL, then *pInBlockZero is set to TRUE 680 * iff the value is retrieved from block 0; 681 * block 0 is the all-initial-value initial block 682 * @return the value 683 */ 684 U_CAPI uint32_t U_EXPORT2 685 utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero); 686 687 /** 688 * Set a value in a range of code points [start..limit[. 689 * All code points c with start<=c<limit will get the value if 690 * overwrite is TRUE or if the old value is 0. 691 * 692 * @param trie the build-time trie 693 * @param start the first code point to get the value 694 * @param limit one past the last code point to get the value 695 * @param value the value 696 * @param overwrite flag for whether old non-initial values are to be overwritten 697 * @return FALSE if a failure occurred (illegal argument or data array overrun) 698 */ 699 U_CAPI UBool U_EXPORT2 700 utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite); 701 702 /** 703 * Compact the build-time trie after all values are set, and then 704 * serialize it into 32-bit aligned memory. 705 * 706 * After this, the trie can only be serizalized again and/or closed; 707 * no further values can be added. 708 * 709 * @see utrie_unserialize() 710 * 711 * @param trie the build-time trie 712 * @param data a pointer to 32-bit-aligned memory for the trie data 713 * @param capacity the number of bytes available at data 714 * @param getFoldedValue a callback function that calculates the value for 715 * a lead surrogate from all of its supplementary code points 716 * and the folding offset; 717 * if NULL, then a default function is used which returns just 718 * the input offset when there are any non-initial-value entries 719 * @param reduceTo16Bits flag for whether the values are to be reduced to a 720 * width of 16 bits for serialization and runtime 721 * @param pErrorCode a UErrorCode argument; among other possible error codes: 722 * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization 723 * - U_MEMORY_ALLOCATION_ERROR if the trie data array is too small 724 * - U_INDEX_OUTOFBOUNDS_ERROR if the index or data arrays are too long after compaction for serialization 725 * 726 * @return the number of bytes written for the trie 727 */ 728 U_CAPI int32_t U_EXPORT2 729 utrie_serialize(UNewTrie *trie, void *data, int32_t capacity, 730 UNewTrieGetFoldedValue *getFoldedValue, 731 UBool reduceTo16Bits, 732 UErrorCode *pErrorCode); 733 734 /* serialization ------------------------------------------------------------ */ 735 736 // UTrie signature values, in platform endianness and opposite endianness. 737 // The UTrie signature ASCII byte values spell "Trie". 738 #define UTRIE_SIG 0x54726965 739 #define UTRIE_OE_SIG 0x65697254 740 741 /** 742 * Trie data structure in serialized form: 743 * 744 * UTrieHeader header; 745 * uint16_t index[header.indexLength]; 746 * uint16_t data[header.dataLength]; 747 * @internal 748 */ 749 typedef struct UTrieHeader { 750 /** "Trie" in big-endian US-ASCII (0x54726965) */ 751 uint32_t signature; 752 753 /** 754 * options bit field: 755 * 9 1=Latin-1 data is stored linearly at data+UTRIE_DATA_BLOCK_LENGTH 756 * 8 0=16-bit data, 1=32-bit data 757 * 7..4 UTRIE_INDEX_SHIFT // 0..UTRIE_SHIFT 758 * 3..0 UTRIE_SHIFT // 1..9 759 */ 760 uint32_t options; 761 762 /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */ 763 int32_t indexLength; 764 765 /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */ 766 int32_t dataLength; 767 } UTrieHeader; 768 769 /** 770 * Constants for use with UTrieHeader.options. 771 * @internal 772 */ 773 enum { 774 /** Mask to get the UTRIE_SHIFT value from options. */ 775 UTRIE_OPTIONS_SHIFT_MASK=0xf, 776 777 /** Shift options right this much to get the UTRIE_INDEX_SHIFT value. */ 778 UTRIE_OPTIONS_INDEX_SHIFT=4, 779 780 /** If set, then the data (stage 2) array is 32 bits wide. */ 781 UTRIE_OPTIONS_DATA_IS_32_BIT=0x100, 782 783 /** 784 * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data (stage 2) array 785 * as a simple, linear array at data+UTRIE_DATA_BLOCK_LENGTH. 786 */ 787 UTRIE_OPTIONS_LATIN1_IS_LINEAR=0x200 788 }; 789 790 U_CDECL_END 791 792 #endif 793