Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2001-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  utrie2.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2008aug16 (starting from a copy of utrie.h)
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #ifndef __UTRIE2_H__
     20 #define __UTRIE2_H__
     21 
     22 #include "unicode/utypes.h"
     23 #include "unicode/utf8.h"
     24 #include "putilimp.h"
     25 
     26 U_CDECL_BEGIN
     27 
     28 struct UTrie;  /* forward declaration */
     29 #ifndef __UTRIE_H__
     30 typedef struct UTrie UTrie;
     31 #endif
     32 
     33 /**
     34  * \file
     35  *
     36  * This is a common implementation of a Unicode trie.
     37  * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
     38  * Unicode code points (0..0x10ffff). (A map from code points to integers.)
     39  *
     40  * This is the second common version of a Unicode trie (hence the name UTrie2).
     41  * Compared with UTrie version 1:
     42  * - Still splitting BMP code points 11:5 bits for index and data table lookups.
     43  * - Still separate data for lead surrogate code _units_ vs. code _points_,
     44  *   but the lead surrogate code unit values are not required any more
     45  *   for data lookup for supplementary code points.
     46  * - The "folding" mechanism is removed. In UTrie version 1, this somewhat
     47  *   hard-to-explain mechanism was meant to be used for optimized UTF-16
     48  *   processing, with application-specific encoding of indexing bits
     49  *   in the lead surrogate data for the associated supplementary code points.
     50  * - For the last single-value code point range (ending with U+10ffff),
     51  *   the starting code point ("highStart") and the value are stored.
     52  * - For supplementary code points U+10000..highStart-1 a three-table lookup
     53  *   (two index tables and one data table) is used. The first index
     54  *   is truncated, omitting both the BMP portion and the high range.
     55  * - There is a special small index for 2-byte UTF-8, and the initial data
     56  *   entries are designed for fast 1/2-byte UTF-8 lookup.
     57  *   Starting with ICU 60, C0 and C1 are not recognized as UTF-8 lead bytes any more at all,
     58  *   and the associated 2-byte indexes are unused.
     59  */
     60 
     61 /**
     62  * Trie structure.
     63  * Use only with public API macros and functions.
     64  */
     65 struct UTrie2;
     66 typedef struct UTrie2 UTrie2;
     67 
     68 /* Public UTrie2 API functions: read-only access ---------------------------- */
     69 
     70 /**
     71  * Selectors for the width of a UTrie2 data value.
     72  */
     73 enum UTrie2ValueBits {
     74     /** 16 bits per UTrie2 data value. */
     75     UTRIE2_16_VALUE_BITS,
     76     /** 32 bits per UTrie2 data value. */
     77     UTRIE2_32_VALUE_BITS,
     78     /** Number of selectors for the width of UTrie2 data values. */
     79     UTRIE2_COUNT_VALUE_BITS
     80 };
     81 typedef enum UTrie2ValueBits UTrie2ValueBits;
     82 
     83 /**
     84  * Open a frozen trie from its serialized from, stored in 32-bit-aligned memory.
     85  * Inverse of utrie2_serialize().
     86  * The memory must remain valid and unchanged as long as the trie is used.
     87  * You must utrie2_close() the trie once you are done using it.
     88  *
     89  * @param valueBits selects the data entry size; results in an
     90  *                  U_INVALID_FORMAT_ERROR if it does not match the serialized form
     91  * @param data a pointer to 32-bit-aligned memory containing the serialized form of a UTrie2
     92  * @param length the number of bytes available at data;
     93  *               can be more than necessary
     94  * @param pActualLength receives the actual number of bytes at data taken up by the trie data;
     95  *                      can be NULL
     96  * @param pErrorCode an in/out ICU UErrorCode
     97  * @return the unserialized trie
     98  *
     99  * @see utrie2_open
    100  * @see utrie2_serialize
    101  */
    102 U_CAPI UTrie2 * U_EXPORT2
    103 utrie2_openFromSerialized(UTrie2ValueBits valueBits,
    104                           const void *data, int32_t length, int32_t *pActualLength,
    105                           UErrorCode *pErrorCode);
    106 
    107 /**
    108  * Open a frozen, empty "dummy" trie.
    109  * A dummy trie is an empty trie, used when a real data trie cannot
    110  * be loaded. Equivalent to calling utrie2_open() and utrie2_freeze(),
    111  * but without internally creating and compacting/serializing the
    112  * builder data structure.
    113  *
    114  * The trie always returns the initialValue,
    115  * or the errorValue for out-of-range code points and illegal UTF-8.
    116  *
    117  * You must utrie2_close() the trie once you are done using it.
    118  *
    119  * @param valueBits selects the data entry size
    120  * @param initialValue the initial value that is set for all code points
    121  * @param errorValue the value for out-of-range code points and illegal UTF-8
    122  * @param pErrorCode an in/out ICU UErrorCode
    123  * @return the dummy trie
    124  *
    125  * @see utrie2_openFromSerialized
    126  * @see utrie2_open
    127  */
    128 U_CAPI UTrie2 * U_EXPORT2
    129 utrie2_openDummy(UTrie2ValueBits valueBits,
    130                  uint32_t initialValue, uint32_t errorValue,
    131                  UErrorCode *pErrorCode);
    132 
    133 /**
    134  * Get a value from a code point as stored in the trie.
    135  * Easier to use than UTRIE2_GET16() and UTRIE2_GET32() but slower.
    136  * Easier to use because, unlike the macros, this function works on all UTrie2
    137  * objects, frozen or not, holding 16-bit or 32-bit data values.
    138  *
    139  * @param trie the trie
    140  * @param c the code point
    141  * @return the value
    142  */
    143 U_CAPI uint32_t U_EXPORT2
    144 utrie2_get32(const UTrie2 *trie, UChar32 c);
    145 
    146 /* enumeration callback types */
    147 
    148 /**
    149  * Callback from utrie2_enum(), extracts a uint32_t value from a
    150  * trie value. This value will be passed on to the UTrie2EnumRange function.
    151  *
    152  * @param context an opaque pointer, as passed into utrie2_enum()
    153  * @param value a value from the trie
    154  * @return the value that is to be passed on to the UTrie2EnumRange function
    155  */
    156 typedef uint32_t U_CALLCONV
    157 UTrie2EnumValue(const void *context, uint32_t value);
    158 
    159 /**
    160  * Callback from utrie2_enum(), is called for each contiguous range
    161  * of code points with the same value as retrieved from the trie and
    162  * transformed by the UTrie2EnumValue function.
    163  *
    164  * The callback function can stop the enumeration by returning FALSE.
    165  *
    166  * @param context an opaque pointer, as passed into utrie2_enum()
    167  * @param start the first code point in a contiguous range with value
    168  * @param end the last code point in a contiguous range with value (inclusive)
    169  * @param value the value that is set for all code points in [start..end]
    170  * @return FALSE to stop the enumeration
    171  */
    172 typedef UBool U_CALLCONV
    173 UTrie2EnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value);
    174 
    175 /**
    176  * Enumerate efficiently all values in a trie.
    177  * Do not modify the trie during the enumeration.
    178  *
    179  * For each entry in the trie, the value to be delivered is passed through
    180  * the UTrie2EnumValue function.
    181  * The value is unchanged if that function pointer is NULL.
    182  *
    183  * For each contiguous range of code points with a given (transformed) value,
    184  * the UTrie2EnumRange function is called.
    185  *
    186  * @param trie a pointer to the trie
    187  * @param enumValue a pointer to a function that may transform the trie entry value,
    188  *                  or NULL if the values from the trie are to be used directly
    189  * @param enumRange a pointer to a function that is called for each contiguous range
    190  *                  of code points with the same (transformed) value
    191  * @param context an opaque pointer that is passed on to the callback functions
    192  */
    193 U_CAPI void U_EXPORT2
    194 utrie2_enum(const UTrie2 *trie,
    195             UTrie2EnumValue *enumValue, UTrie2EnumRange *enumRange, const void *context);
    196 
    197 /* Building a trie ---------------------------------------------------------- */
    198 
    199 /**
    200  * Open an empty, writable trie. At build time, 32-bit data values are used.
    201  * utrie2_freeze() takes a valueBits parameter
    202  * which determines the data value width in the serialized and frozen forms.
    203  * You must utrie2_close() the trie once you are done using it.
    204  *
    205  * @param initialValue the initial value that is set for all code points
    206  * @param errorValue the value for out-of-range code points and illegal UTF-8
    207  * @param pErrorCode an in/out ICU UErrorCode
    208  * @return a pointer to the allocated and initialized new trie
    209  */
    210 U_CAPI UTrie2 * U_EXPORT2
    211 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode);
    212 
    213 /**
    214  * Clone a trie.
    215  * You must utrie2_close() the clone once you are done using it.
    216  *
    217  * @param other the trie to clone
    218  * @param pErrorCode an in/out ICU UErrorCode
    219  * @return a pointer to the new trie clone
    220  */
    221 U_CAPI UTrie2 * U_EXPORT2
    222 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode);
    223 
    224 /**
    225  * Clone a trie. The clone will be mutable/writable even if the other trie
    226  * is frozen. (See utrie2_freeze().)
    227  * You must utrie2_close() the clone once you are done using it.
    228  *
    229  * @param other the trie to clone
    230  * @param pErrorCode an in/out ICU UErrorCode
    231  * @return a pointer to the new trie clone
    232  */
    233 U_CAPI UTrie2 * U_EXPORT2
    234 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode);
    235 
    236 /**
    237  * Close a trie and release associated memory.
    238  *
    239  * @param trie the trie
    240  */
    241 U_CAPI void U_EXPORT2
    242 utrie2_close(UTrie2 *trie);
    243 
    244 /**
    245  * Set a value for a code point.
    246  *
    247  * @param trie the unfrozen trie
    248  * @param c the code point
    249  * @param value the value
    250  * @param pErrorCode an in/out ICU UErrorCode; among other possible error codes:
    251  * - U_NO_WRITE_PERMISSION if the trie is frozen
    252  */
    253 U_CAPI void U_EXPORT2
    254 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode);
    255 
    256 /**
    257  * Set a value in a range of code points [start..end].
    258  * All code points c with start<=c<=end will get the value if
    259  * overwrite is TRUE or if the old value is the initial value.
    260  *
    261  * @param trie the unfrozen trie
    262  * @param start the first code point to get the value
    263  * @param end the last code point to get the value (inclusive)
    264  * @param value the value
    265  * @param overwrite flag for whether old non-initial values are to be overwritten
    266  * @param pErrorCode an in/out ICU UErrorCode; among other possible error codes:
    267  * - U_NO_WRITE_PERMISSION if the trie is frozen
    268  */
    269 U_CAPI void U_EXPORT2
    270 utrie2_setRange32(UTrie2 *trie,
    271                   UChar32 start, UChar32 end,
    272                   uint32_t value, UBool overwrite,
    273                   UErrorCode *pErrorCode);
    274 
    275 /**
    276  * Freeze a trie. Make it immutable (read-only) and compact it,
    277  * ready for serialization and for use with fast macros.
    278  * Functions to set values will fail after serializing.
    279  *
    280  * A trie can be frozen only once. If this function is called again with different
    281  * valueBits then it will set a U_ILLEGAL_ARGUMENT_ERROR.
    282  *
    283  * @param trie the trie
    284  * @param valueBits selects the data entry size; if smaller than 32 bits, then
    285  *                  the values stored in the trie will be truncated
    286  * @param pErrorCode an in/out ICU UErrorCode; among other possible error codes:
    287  * - U_INDEX_OUTOFBOUNDS_ERROR if the compacted index or data arrays are too long
    288  *                             for serialization
    289  *                             (the trie will be immutable and usable,
    290  *                             but not frozen and not usable with the fast macros)
    291  *
    292  * @see utrie2_cloneAsThawed
    293  */
    294 U_CAPI void U_EXPORT2
    295 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode);
    296 
    297 /**
    298  * Test if the trie is frozen. (See utrie2_freeze().)
    299  *
    300  * @param trie the trie
    301  * @return TRUE if the trie is frozen, that is, immutable, ready for serialization
    302  *         and for use with fast macros
    303  */
    304 U_CAPI UBool U_EXPORT2
    305 utrie2_isFrozen(const UTrie2 *trie);
    306 
    307 /**
    308  * Serialize a frozen trie into 32-bit aligned memory.
    309  * If the trie is not frozen, then the function returns with a U_ILLEGAL_ARGUMENT_ERROR.
    310  * A trie can be serialized multiple times.
    311  *
    312  * @param trie the frozen trie
    313  * @param data a pointer to 32-bit-aligned memory to be filled with the trie data,
    314  *             can be NULL if capacity==0
    315  * @param capacity the number of bytes available at data,
    316  *                 or 0 for preflighting
    317  * @param pErrorCode an in/out ICU UErrorCode; among other possible error codes:
    318  * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
    319  * - U_ILLEGAL_ARGUMENT_ERROR if the trie is not frozen or the data and capacity
    320  *                            parameters are bad
    321  * @return the number of bytes written or needed for the trie
    322  *
    323  * @see utrie2_openFromSerialized()
    324  */
    325 U_CAPI int32_t U_EXPORT2
    326 utrie2_serialize(const UTrie2 *trie,
    327                  void *data, int32_t capacity,
    328                  UErrorCode *pErrorCode);
    329 
    330 /* Public UTrie2 API: miscellaneous functions ------------------------------- */
    331 
    332 /**
    333  * Build a UTrie2 (version 2) from a UTrie (version 1).
    334  * Enumerates all values in the UTrie and builds a UTrie2 with the same values.
    335  * The resulting UTrie2 will be frozen.
    336  *
    337  * @param trie1 the runtime UTrie structure to be enumerated
    338  * @param errorValue the value for out-of-range code points and illegal UTF-8
    339  * @param pErrorCode an in/out ICU UErrorCode
    340  * @return The frozen UTrie2 with the same values as the UTrie.
    341  */
    342 U_CAPI UTrie2 * U_EXPORT2
    343 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode);
    344 
    345 /* Public UTrie2 API macros ------------------------------------------------- */
    346 
    347 /*
    348  * These macros provide fast data lookup from a frozen trie.
    349  * They will crash when used on an unfrozen trie.
    350  */
    351 
    352 /**
    353  * Return a 16-bit trie value from a code point, with range checking.
    354  * Returns trie->errorValue if c is not in the range 0..U+10ffff.
    355  *
    356  * @param trie (const UTrie2 *, in) a frozen trie
    357  * @param c (UChar32, in) the input code point
    358  * @return (uint16_t) The code point's trie value.
    359  */
    360 #define UTRIE2_GET16(trie, c) _UTRIE2_GET((trie), index, (trie)->indexLength, (c))
    361 
    362 /**
    363  * Return a 32-bit trie value from a code point, with range checking.
    364  * Returns trie->errorValue if c is not in the range 0..U+10ffff.
    365  *
    366  * @param trie (const UTrie2 *, in) a frozen trie
    367  * @param c (UChar32, in) the input code point
    368  * @return (uint32_t) The code point's trie value.
    369  */
    370 #define UTRIE2_GET32(trie, c) _UTRIE2_GET((trie), data32, 0, (c))
    371 
    372 /**
    373  * UTF-16: Get the next code point (UChar32 c, out), post-increment src,
    374  * and get a 16-bit value from the trie.
    375  *
    376  * @param trie (const UTrie2 *, in) a frozen trie
    377  * @param src (const UChar *, in/out) the source text pointer
    378  * @param limit (const UChar *, in) the limit pointer for the text, or NULL if NUL-terminated
    379  * @param c (UChar32, out) variable for the code point
    380  * @param result (uint16_t, out) uint16_t variable for the trie lookup result
    381  */
    382 #define UTRIE2_U16_NEXT16(trie, src, limit, c, result) _UTRIE2_U16_NEXT(trie, index, src, limit, c, result)
    383 
    384 /**
    385  * UTF-16: Get the next code point (UChar32 c, out), post-increment src,
    386  * and get a 32-bit value from the trie.
    387  *
    388  * @param trie (const UTrie2 *, in) a frozen trie
    389  * @param src (const UChar *, in/out) the source text pointer
    390  * @param limit (const UChar *, in) the limit pointer for the text, or NULL if NUL-terminated
    391  * @param c (UChar32, out) variable for the code point
    392  * @param result (uint32_t, out) uint32_t variable for the trie lookup result
    393  */
    394 #define UTRIE2_U16_NEXT32(trie, src, limit, c, result) _UTRIE2_U16_NEXT(trie, data32, src, limit, c, result)
    395 
    396 /**
    397  * UTF-16: Get the previous code point (UChar32 c, out), pre-decrement src,
    398  * and get a 16-bit value from the trie.
    399  *
    400  * @param trie (const UTrie2 *, in) a frozen trie
    401  * @param start (const UChar *, in) the start pointer for the text
    402  * @param src (const UChar *, in/out) the source text pointer
    403  * @param c (UChar32, out) variable for the code point
    404  * @param result (uint16_t, out) uint16_t variable for the trie lookup result
    405  */
    406 #define UTRIE2_U16_PREV16(trie, start, src, c, result) _UTRIE2_U16_PREV(trie, index, start, src, c, result)
    407 
    408 /**
    409  * UTF-16: Get the previous code point (UChar32 c, out), pre-decrement src,
    410  * and get a 32-bit value from the trie.
    411  *
    412  * @param trie (const UTrie2 *, in) a frozen trie
    413  * @param start (const UChar *, in) the start pointer for the text
    414  * @param src (const UChar *, in/out) the source text pointer
    415  * @param c (UChar32, out) variable for the code point
    416  * @param result (uint32_t, out) uint32_t variable for the trie lookup result
    417  */
    418 #define UTRIE2_U16_PREV32(trie, start, src, c, result) _UTRIE2_U16_PREV(trie, data32, start, src, c, result)
    419 
    420 /**
    421  * UTF-8: Post-increment src and get a 16-bit value from the trie.
    422  *
    423  * @param trie (const UTrie2 *, in) a frozen trie
    424  * @param src (const char *, in/out) the source text pointer
    425  * @param limit (const char *, in) the limit pointer for the text (must not be NULL)
    426  * @param result (uint16_t, out) uint16_t variable for the trie lookup result
    427  */
    428 #define UTRIE2_U8_NEXT16(trie, src, limit, result)\
    429     _UTRIE2_U8_NEXT(trie, data16, index, src, limit, result)
    430 
    431 /**
    432  * UTF-8: Post-increment src and get a 32-bit value from the trie.
    433  *
    434  * @param trie (const UTrie2 *, in) a frozen trie
    435  * @param src (const char *, in/out) the source text pointer
    436  * @param limit (const char *, in) the limit pointer for the text (must not be NULL)
    437  * @param result (uint16_t, out) uint32_t variable for the trie lookup result
    438  */
    439 #define UTRIE2_U8_NEXT32(trie, src, limit, result) \
    440     _UTRIE2_U8_NEXT(trie, data32, data32, src, limit, result)
    441 
    442 /**
    443  * UTF-8: Pre-decrement src and get a 16-bit value from the trie.
    444  *
    445  * @param trie (const UTrie2 *, in) a frozen trie
    446  * @param start (const char *, in) the start pointer for the text
    447  * @param src (const char *, in/out) the source text pointer
    448  * @param result (uint16_t, out) uint16_t variable for the trie lookup result
    449  */
    450 #define UTRIE2_U8_PREV16(trie, start, src, result) \
    451     _UTRIE2_U8_PREV(trie, data16, index, start, src, result)
    452 
    453 /**
    454  * UTF-8: Pre-decrement src and get a 32-bit value from the trie.
    455  *
    456  * @param trie (const UTrie2 *, in) a frozen trie
    457  * @param start (const char *, in) the start pointer for the text
    458  * @param src (const char *, in/out) the source text pointer
    459  * @param result (uint16_t, out) uint32_t variable for the trie lookup result
    460  */
    461 #define UTRIE2_U8_PREV32(trie, start, src, result) \
    462     _UTRIE2_U8_PREV(trie, data32, data32, start, src, result)
    463 
    464 /* Public UTrie2 API: optimized UTF-16 access ------------------------------- */
    465 
    466 /*
    467  * The following functions and macros are used for highly optimized UTF-16
    468  * text processing. The UTRIE2_U16_NEXTxy() macros do not depend on these.
    469  *
    470  * A UTrie2 stores separate values for lead surrogate code _units_ vs. code _points_.
    471  * UTF-16 text processing can be optimized by detecting surrogate pairs and
    472  * assembling supplementary code points only when there is non-trivial data
    473  * available.
    474  *
    475  * At build-time, use utrie2_enumForLeadSurrogate() to see if there
    476  * is non-trivial (non-initialValue) data for any of the supplementary
    477  * code points associated with a lead surrogate.
    478  * If so, then set a special (application-specific) value for the
    479  * lead surrogate code _unit_, with utrie2_set32ForLeadSurrogateCodeUnit().
    480  *
    481  * At runtime, use UTRIE2_GET16_FROM_U16_SINGLE_LEAD() or
    482  * UTRIE2_GET32_FROM_U16_SINGLE_LEAD() per code unit. If there is non-trivial
    483  * data and the code unit is a lead surrogate, then check if a trail surrogate
    484  * follows. If so, assemble the supplementary code point with
    485  * U16_GET_SUPPLEMENTARY() and look up its value with UTRIE2_GET16_FROM_SUPP()
    486  * or UTRIE2_GET32_FROM_SUPP(); otherwise reset the lead
    487  * surrogate's value or do a code point lookup for it.
    488  *
    489  * If there is only trivial data for lead and trail surrogates, then processing
    490  * can often skip them. For example, in normalization or case mapping
    491  * all characters that do not have any mappings are simply copied as is.
    492  */
    493 
    494 /**
    495  * Get a value from a lead surrogate code unit as stored in the trie.
    496  *
    497  * @param trie the trie
    498  * @param c the code unit (U+D800..U+DBFF)
    499  * @return the value
    500  */
    501 U_CAPI uint32_t U_EXPORT2
    502 utrie2_get32FromLeadSurrogateCodeUnit(const UTrie2 *trie, UChar32 c);
    503 
    504 /**
    505  * Enumerate the trie values for the 1024=0x400 code points
    506  * corresponding to a given lead surrogate.
    507  * For example, for the lead surrogate U+D87E it will enumerate the values
    508  * for [U+2F800..U+2FC00[.
    509  * Used by data builder code that sets special lead surrogate code unit values
    510  * for optimized UTF-16 string processing.
    511  *
    512  * Do not modify the trie during the enumeration.
    513  *
    514  * Except for the limited code point range, this functions just like utrie2_enum():
    515  * For each entry in the trie, the value to be delivered is passed through
    516  * the UTrie2EnumValue function.
    517  * The value is unchanged if that function pointer is NULL.
    518  *
    519  * For each contiguous range of code points with a given (transformed) value,
    520  * the UTrie2EnumRange function is called.
    521  *
    522  * @param trie a pointer to the trie
    523  * @param enumValue a pointer to a function that may transform the trie entry value,
    524  *                  or NULL if the values from the trie are to be used directly
    525  * @param enumRange a pointer to a function that is called for each contiguous range
    526  *                  of code points with the same (transformed) value
    527  * @param context an opaque pointer that is passed on to the callback functions
    528  */
    529 U_CAPI void U_EXPORT2
    530 utrie2_enumForLeadSurrogate(const UTrie2 *trie, UChar32 lead,
    531                             UTrie2EnumValue *enumValue, UTrie2EnumRange *enumRange,
    532                             const void *context);
    533 
    534 /**
    535  * Set a value for a lead surrogate code unit.
    536  *
    537  * @param trie the unfrozen trie
    538  * @param lead the lead surrogate code unit (U+D800..U+DBFF)
    539  * @param value the value
    540  * @param pErrorCode an in/out ICU UErrorCode; among other possible error codes:
    541  * - U_NO_WRITE_PERMISSION if the trie is frozen
    542  */
    543 U_CAPI void U_EXPORT2
    544 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
    545                                      UChar32 lead, uint32_t value,
    546                                      UErrorCode *pErrorCode);
    547 
    548 /**
    549  * Return a 16-bit trie value from a UTF-16 single/lead code unit (<=U+ffff).
    550  * Same as UTRIE2_GET16() if c is a BMP code point except for lead surrogates,
    551  * but smaller and faster.
    552  *
    553  * @param trie (const UTrie2 *, in) a frozen trie
    554  * @param c (UChar32, in) the input code unit, must be 0<=c<=U+ffff
    555  * @return (uint16_t) The code unit's trie value.
    556  */
    557 #define UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c) _UTRIE2_GET_FROM_U16_SINGLE_LEAD((trie), index, c)
    558 
    559 /**
    560  * Return a 32-bit trie value from a UTF-16 single/lead code unit (<=U+ffff).
    561  * Same as UTRIE2_GET32() if c is a BMP code point except for lead surrogates,
    562  * but smaller and faster.
    563  *
    564  * @param trie (const UTrie2 *, in) a frozen trie
    565  * @param c (UChar32, in) the input code unit, must be 0<=c<=U+ffff
    566  * @return (uint32_t) The code unit's trie value.
    567  */
    568 #define UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c) _UTRIE2_GET_FROM_U16_SINGLE_LEAD((trie), data32, c)
    569 
    570 /**
    571  * Return a 16-bit trie value from a supplementary code point (U+10000..U+10ffff).
    572  *
    573  * @param trie (const UTrie2 *, in) a frozen trie
    574  * @param c (UChar32, in) the input code point, must be U+10000<=c<=U+10ffff
    575  * @return (uint16_t) The code point's trie value.
    576  */
    577 #define UTRIE2_GET16_FROM_SUPP(trie, c) _UTRIE2_GET_FROM_SUPP((trie), index, c)
    578 
    579 /**
    580  * Return a 32-bit trie value from a supplementary code point (U+10000..U+10ffff).
    581  *
    582  * @param trie (const UTrie2 *, in) a frozen trie
    583  * @param c (UChar32, in) the input code point, must be U+10000<=c<=U+10ffff
    584  * @return (uint32_t) The code point's trie value.
    585  */
    586 #define UTRIE2_GET32_FROM_SUPP(trie, c) _UTRIE2_GET_FROM_SUPP((trie), data32, c)
    587 
    588 U_CDECL_END
    589 
    590 /* C++ convenience wrappers ------------------------------------------------- */
    591 
    592 #ifdef __cplusplus
    593 
    594 #include "unicode/utf.h"
    595 #include "mutex.h"
    596 
    597 U_NAMESPACE_BEGIN
    598 
    599 // Use the Forward/Backward subclasses below.
    600 class UTrie2StringIterator : public UMemory {
    601 public:
    602     UTrie2StringIterator(const UTrie2 *t, const UChar *p) :
    603         trie(t), codePointStart(p), codePointLimit(p), codePoint(U_SENTINEL) {}
    604 
    605     const UTrie2 *trie;
    606     const UChar *codePointStart, *codePointLimit;
    607     UChar32 codePoint;
    608 };
    609 
    610 class BackwardUTrie2StringIterator : public UTrie2StringIterator {
    611 public:
    612     BackwardUTrie2StringIterator(const UTrie2 *t, const UChar *s, const UChar *p) :
    613         UTrie2StringIterator(t, p), start(s) {}
    614 
    615     uint16_t previous16();
    616 
    617     const UChar *start;
    618 };
    619 
    620 class ForwardUTrie2StringIterator : public UTrie2StringIterator {
    621 public:
    622     // Iteration limit l can be NULL.
    623     // In that case, the caller must detect c==0 and stop.
    624     ForwardUTrie2StringIterator(const UTrie2 *t, const UChar *p, const UChar *l) :
    625         UTrie2StringIterator(t, p), limit(l) {}
    626 
    627     uint16_t next16();
    628 
    629     const UChar *limit;
    630 };
    631 
    632 U_NAMESPACE_END
    633 
    634 #endif
    635 
    636 /* Internal definitions ----------------------------------------------------- */
    637 
    638 U_CDECL_BEGIN
    639 
    640 /** Build-time trie structure. */
    641 struct UNewTrie2;
    642 typedef struct UNewTrie2 UNewTrie2;
    643 
    644 /*
    645  * Trie structure definition.
    646  *
    647  * Either the data table is 16 bits wide and accessed via the index
    648  * pointer, with each index item increased by indexLength;
    649  * in this case, data32==NULL, and data16 is used for direct ASCII access.
    650  *
    651  * Or the data table is 32 bits wide and accessed via the data32 pointer.
    652  */
    653 struct UTrie2 {
    654     /* protected: used by macros and functions for reading values */
    655     const uint16_t *index;
    656     const uint16_t *data16;     /* for fast UTF-8 ASCII access, if 16b data */
    657     const uint32_t *data32;     /* NULL if 16b data is used via index */
    658 
    659     int32_t indexLength, dataLength;
    660     uint16_t index2NullOffset;  /* 0xffff if there is no dedicated index-2 null block */
    661     uint16_t dataNullOffset;
    662     uint32_t initialValue;
    663     /** Value returned for out-of-range code points and illegal UTF-8. */
    664     uint32_t errorValue;
    665 
    666     /* Start of the last range which ends at U+10ffff, and its value. */
    667     UChar32 highStart;
    668     int32_t highValueIndex;
    669 
    670     /* private: used by builder and unserialization functions */
    671     void *memory;           /* serialized bytes; NULL if not frozen yet */
    672     int32_t length;         /* number of serialized bytes at memory; 0 if not frozen yet */
    673     UBool isMemoryOwned;    /* TRUE if the trie owns the memory */
    674     UBool padding1;
    675     int16_t padding2;
    676     UNewTrie2 *newTrie;     /* builder object; NULL when frozen */
    677 
    678 #ifdef UTRIE2_DEBUG
    679     const char *name;
    680 #endif
    681 };
    682 
    683 /**
    684  * Trie constants, defining shift widths, index array lengths, etc.
    685  *
    686  * These are needed for the runtime macros but users can treat these as
    687  * implementation details and skip to the actual public API further below.
    688  */
    689 enum {
    690     /** Shift size for getting the index-1 table offset. */
    691     UTRIE2_SHIFT_1=6+5,
    692 
    693     /** Shift size for getting the index-2 table offset. */
    694     UTRIE2_SHIFT_2=5,
    695 
    696     /**
    697      * Difference between the two shift sizes,
    698      * for getting an index-1 offset from an index-2 offset. 6=11-5
    699      */
    700     UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2,
    701 
    702     /**
    703      * Number of index-1 entries for the BMP. 32=0x20
    704      * This part of the index-1 table is omitted from the serialized form.
    705      */
    706     UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1,
    707 
    708     /** Number of code points per index-1 table entry. 2048=0x800 */
    709     UTRIE2_CP_PER_INDEX_1_ENTRY=1<<UTRIE2_SHIFT_1,
    710 
    711     /** Number of entries in an index-2 block. 64=0x40 */
    712     UTRIE2_INDEX_2_BLOCK_LENGTH=1<<UTRIE2_SHIFT_1_2,
    713 
    714     /** Mask for getting the lower bits for the in-index-2-block offset. */
    715     UTRIE2_INDEX_2_MASK=UTRIE2_INDEX_2_BLOCK_LENGTH-1,
    716 
    717     /** Number of entries in a data block. 32=0x20 */
    718     UTRIE2_DATA_BLOCK_LENGTH=1<<UTRIE2_SHIFT_2,
    719 
    720     /** Mask for getting the lower bits for the in-data-block offset. */
    721     UTRIE2_DATA_MASK=UTRIE2_DATA_BLOCK_LENGTH-1,
    722 
    723     /**
    724      * Shift size for shifting left the index array values.
    725      * Increases possible data size with 16-bit index values at the cost
    726      * of compactability.
    727      * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY.
    728      */
    729     UTRIE2_INDEX_SHIFT=2,
    730 
    731     /** The alignment size of a data block. Also the granularity for compaction. */
    732     UTRIE2_DATA_GRANULARITY=1<<UTRIE2_INDEX_SHIFT,
    733 
    734     /* Fixed layout of the first part of the index array. ------------------- */
    735 
    736     /**
    737      * The BMP part of the index-2 table is fixed and linear and starts at offset 0.
    738      * Length=2048=0x800=0x10000>>UTRIE2_SHIFT_2.
    739      */
    740     UTRIE2_INDEX_2_OFFSET=0,
    741 
    742     /**
    743      * The part of the index-2 table for U+D800..U+DBFF stores values for
    744      * lead surrogate code _units_ not code _points_.
    745      * Values for lead surrogate code _points_ are indexed with this portion of the table.
    746      * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.)
    747      */
    748     UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2,
    749     UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2,
    750 
    751     /** Count the lengths of both BMP pieces. 2080=0x820 */
    752     UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH,
    753 
    754     /**
    755      * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
    756      * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2.
    757      */
    758     UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH,
    759     UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6,  /* U+0800 is the first code point after 2-byte UTF-8 */
    760 
    761     /**
    762      * The index-1 table, only used for supplementary code points, at offset 2112=0x840.
    763      * Variable length, for code points up to highStart, where the last single-value range starts.
    764      * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1.
    765      * (For 0x100000 supplementary code points U+10000..U+10ffff.)
    766      *
    767      * The part of the index-2 table for supplementary code points starts
    768      * after this index-1 table.
    769      *
    770      * Both the index-1 table and the following part of the index-2 table
    771      * are omitted completely if there is only BMP data.
    772      */
    773     UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH,
    774     UTRIE2_MAX_INDEX_1_LENGTH=0x100000>>UTRIE2_SHIFT_1,
    775 
    776     /*
    777      * Fixed layout of the first part of the data array. -----------------------
    778      * Starts with 4 blocks (128=0x80 entries) for ASCII.
    779      */
    780 
    781     /**
    782      * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80.
    783      * Used with linear access for single bytes 0..0xbf for simple error handling.
    784      * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH.
    785      */
    786     UTRIE2_BAD_UTF8_DATA_OFFSET=0x80,
    787 
    788     /** The start of non-linear-ASCII data blocks, at offset 192=0xc0. */
    789     UTRIE2_DATA_START_OFFSET=0xc0
    790 };
    791 
    792 /* Internal functions and macros -------------------------------------------- */
    793 
    794 /**
    795  * Internal function for part of the UTRIE2_U8_NEXTxx() macro implementations.
    796  * Do not call directly.
    797  * @internal
    798  */
    799 U_INTERNAL int32_t U_EXPORT2
    800 utrie2_internalU8NextIndex(const UTrie2 *trie, UChar32 c,
    801                            const uint8_t *src, const uint8_t *limit);
    802 
    803 /**
    804  * Internal function for part of the UTRIE2_U8_PREVxx() macro implementations.
    805  * Do not call directly.
    806  * @internal
    807  */
    808 U_INTERNAL int32_t U_EXPORT2
    809 utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
    810                            const uint8_t *start, const uint8_t *src);
    811 
    812 
    813 /** Internal low-level trie getter. Returns a data index. */
    814 #define _UTRIE2_INDEX_RAW(offset, trieIndex, c) \
    815     (((int32_t)((trieIndex)[(offset)+((c)>>UTRIE2_SHIFT_2)]) \
    816     <<UTRIE2_INDEX_SHIFT)+ \
    817     ((c)&UTRIE2_DATA_MASK))
    818 
    819 /** Internal trie getter from a UTF-16 single/lead code unit. Returns the data index. */
    820 #define _UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(trieIndex, c) _UTRIE2_INDEX_RAW(0, trieIndex, c)
    821 
    822 /** Internal trie getter from a lead surrogate code point (D800..DBFF). Returns the data index. */
    823 #define _UTRIE2_INDEX_FROM_LSCP(trieIndex, c) \
    824     _UTRIE2_INDEX_RAW(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2), trieIndex, c)
    825 
    826 /** Internal trie getter from a BMP code point. Returns the data index. */
    827 #define _UTRIE2_INDEX_FROM_BMP(trieIndex, c) \
    828     _UTRIE2_INDEX_RAW(U_IS_LEAD(c) ? UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2) : 0, \
    829                       trieIndex, c)
    830 
    831 /** Internal trie getter from a supplementary code point below highStart. Returns the data index. */
    832 #define _UTRIE2_INDEX_FROM_SUPP(trieIndex, c) \
    833     (((int32_t)((trieIndex)[ \
    834         (trieIndex)[(UTRIE2_INDEX_1_OFFSET-UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)+ \
    835                       ((c)>>UTRIE2_SHIFT_1)]+ \
    836         (((c)>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK)]) \
    837     <<UTRIE2_INDEX_SHIFT)+ \
    838     ((c)&UTRIE2_DATA_MASK))
    839 
    840 /**
    841  * Internal trie getter from a code point, with checking that c is in 0..10FFFF.
    842  * Returns the data index.
    843  */
    844 #define _UTRIE2_INDEX_FROM_CP(trie, asciiOffset, c) \
    845     ((uint32_t)(c)<0xd800 ? \
    846         _UTRIE2_INDEX_RAW(0, (trie)->index, c) : \
    847         (uint32_t)(c)<=0xffff ? \
    848             _UTRIE2_INDEX_RAW( \
    849                 (c)<=0xdbff ? UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2) : 0, \
    850                 (trie)->index, c) : \
    851             (uint32_t)(c)>0x10ffff ? \
    852                 (asciiOffset)+UTRIE2_BAD_UTF8_DATA_OFFSET : \
    853                 (c)>=(trie)->highStart ? \
    854                     (trie)->highValueIndex : \
    855                     _UTRIE2_INDEX_FROM_SUPP((trie)->index, c))
    856 
    857 /** Internal trie getter from a UTF-16 single/lead code unit. Returns the data. */
    858 #define _UTRIE2_GET_FROM_U16_SINGLE_LEAD(trie, data, c) \
    859     (trie)->data[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD((trie)->index, c)]
    860 
    861 /** Internal trie getter from a supplementary code point. Returns the data. */
    862 #define _UTRIE2_GET_FROM_SUPP(trie, data, c) \
    863     (trie)->data[(c)>=(trie)->highStart ? (trie)->highValueIndex : \
    864                  _UTRIE2_INDEX_FROM_SUPP((trie)->index, c)]
    865 
    866 /**
    867  * Internal trie getter from a code point, with checking that c is in 0..10FFFF.
    868  * Returns the data.
    869  */
    870 #define _UTRIE2_GET(trie, data, asciiOffset, c) \
    871     (trie)->data[_UTRIE2_INDEX_FROM_CP(trie, asciiOffset, c)]
    872 
    873 /** Internal next-post-increment: get the next code point (c) and its data. */
    874 #define _UTRIE2_U16_NEXT(trie, data, src, limit, c, result) { \
    875     { \
    876         uint16_t __c2; \
    877         (c)=*(src)++; \
    878         if(!U16_IS_LEAD(c)) { \
    879             (result)=_UTRIE2_GET_FROM_U16_SINGLE_LEAD(trie, data, c); \
    880         } else if((src)==(limit) || !U16_IS_TRAIL(__c2=*(src))) { \
    881             (result)=(trie)->data[_UTRIE2_INDEX_FROM_LSCP((trie)->index, c)]; \
    882         } else { \
    883             ++(src); \
    884             (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
    885             (result)=_UTRIE2_GET_FROM_SUPP((trie), data, (c)); \
    886         } \
    887     } \
    888 }
    889 
    890 /** Internal pre-decrement-previous: get the previous code point (c) and its data */
    891 #define _UTRIE2_U16_PREV(trie, data, start, src, c, result) { \
    892     { \
    893         uint16_t __c2; \
    894         (c)=*--(src); \
    895         if(!U16_IS_TRAIL(c) || (src)==(start) || !U16_IS_LEAD(__c2=*((src)-1))) { \
    896             (result)=(trie)->data[_UTRIE2_INDEX_FROM_BMP((trie)->index, c)]; \
    897         } else { \
    898             --(src); \
    899             (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
    900             (result)=_UTRIE2_GET_FROM_SUPP((trie), data, (c)); \
    901         } \
    902     } \
    903 }
    904 
    905 /** Internal UTF-8 next-post-increment: get the next code point's data. */
    906 #define _UTRIE2_U8_NEXT(trie, ascii, data, src, limit, result) { \
    907     uint8_t __lead=(uint8_t)*(src)++; \
    908     if(U8_IS_SINGLE(__lead)) { \
    909         (result)=(trie)->ascii[__lead]; \
    910     } else { \
    911         uint8_t __t1, __t2; \
    912         if( /* handle U+0800..U+FFFF inline */ \
    913             0xe0<=__lead && __lead<0xf0 && ((src)+1)<(limit) && \
    914             U8_IS_VALID_LEAD3_AND_T1(__lead, __t1=(uint8_t)*(src)) && \
    915             (__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
    916         ) { \
    917             (src)+=2; \
    918             (result)=(trie)->data[ \
    919                 ((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
    920                                          ((__t1&0x3f)<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
    921                 <<UTRIE2_INDEX_SHIFT)+ \
    922                 (__t2&UTRIE2_DATA_MASK)]; \
    923         } else if( /* handle U+0080..U+07FF inline */ \
    924             __lead<0xe0 && __lead>=0xc2 && (src)<(limit) && \
    925             (__t1=(uint8_t)(*(src)-0x80))<=0x3f \
    926         ) { \
    927             ++(src); \
    928             (result)=(trie)->data[ \
    929                 (trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \
    930                 __t1]; \
    931         } else { \
    932             int32_t __index=utrie2_internalU8NextIndex((trie), __lead, (const uint8_t *)(src), \
    933                                                                        (const uint8_t *)(limit)); \
    934             (src)+=__index&7; \
    935             (result)=(trie)->data[__index>>3]; \
    936         } \
    937     } \
    938 }
    939 
    940 /** Internal UTF-8 pre-decrement-previous: get the previous code point's data. */
    941 #define _UTRIE2_U8_PREV(trie, ascii, data, start, src, result) { \
    942     uint8_t __b=(uint8_t)*--(src); \
    943     if(U8_IS_SINGLE(__b)) { \
    944         (result)=(trie)->ascii[__b]; \
    945     } else { \
    946         int32_t __index=utrie2_internalU8PrevIndex((trie), __b, (const uint8_t *)(start), \
    947                                                                 (const uint8_t *)(src)); \
    948         (src)-=__index&7; \
    949         (result)=(trie)->data[__index>>3]; \
    950     } \
    951 }
    952 
    953 U_CDECL_END
    954 
    955 #endif
    956