Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2013-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationdatareader.h
      7 *
      8 * created on: 2013feb07
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #ifndef __COLLATIONDATAREADER_H__
     13 #define __COLLATIONDATAREADER_H__
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_COLLATION
     18 
     19 #include "unicode/udata.h"
     20 
     21 struct UDataMemory;
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 struct CollationTailoring;
     26 
     27 /**
     28  * Collation binary data reader: an all-static struct with the indexes[] slot constants and reader functions.
     29  */
     30 struct U_I18N_API CollationDataReader /* all static */ {
     31     // The following constants are also copied into source/common/ucol_swp.cpp.
     32     // Keep them in sync!
     33     enum {  // Slot numbers in the int32_t indexes[] array at the start of the data.
     34         /**
     35          * Number of int32_t indexes.
     36          *
     37          * Can be 2 if there are only options.
     38          * Can be 7 or 8 if there are only options and a script reordering.
     39          * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
     40          */
     41         IX_INDEXES_LENGTH,  // 0
     42         /**
     43          * Bits 31..24: numericPrimary, for numeric collation
     44          *      23..16: fast Latin format version (0 = no fast Latin table)
     45          *      15.. 0: options bit set
     46          */
     47         IX_OPTIONS,  // 1
     48         IX_RESERVED2,  // 2: unused/reserved, 0
     49         IX_RESERVED3,  // 3: unused/reserved, 0
     50 
     51         /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
     52         IX_JAMO_CE32S_START,  // 4
     53 
     54         // Byte offsets from the start of the data, after the generic header.
     55         // The indexes[] are at byte offset 0, other data follows.
     56         // Each data item is aligned properly.
     57         // The data items should be in descending order of unit size,
     58         // to minimize the need for padding.
     59         // Each item's byte length is given by the difference between its offset and
     60         // the next index/offset value.
     61         /** Byte offset to int32_t reorderCodes[]. */
     62         IX_REORDER_CODES_OFFSET,  // 5
     63         /**
     64          * Byte offset to uint8_t reorderTable[].
     65          * Empty table if <256 bytes (padding only).
     66          * Otherwise 256 bytes or more (with padding).
     67          */
     68         IX_REORDER_TABLE_OFFSET,  // 6
     69         /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
     70         IX_TRIE_OFFSET,  // 7
     71 
     72         IX_RESERVED8_OFFSET,  // 8
     73         /** Byte offset to int64_t ces[]. */
     74         IX_CES_OFFSET,  // 9
     75         IX_RESERVED10_OFFSET,  // 10
     76         /** Byte offset to uint32_t ce32s[]. */
     77         IX_CE32S_OFFSET,  // 11
     78 
     79         /** Byte offset to uint32_t rootElements[]. */
     80         IX_ROOT_ELEMENTS_OFFSET,  // 12
     81         /** Byte offset to UChar *contexts[]. */
     82         IX_CONTEXTS_OFFSET,  // 13
     83         /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */
     84         IX_UNSAFE_BWD_OFFSET,  // 14
     85         /** Byte offset to uint16_t fastLatinTable[]. */
     86         IX_FAST_LATIN_TABLE_OFFSET,  // 15
     87 
     88         /** Byte offset to uint16_t scripts[]. */
     89         IX_SCRIPTS_OFFSET,  // 16
     90         /**
     91          * Byte offset to UBool compressibleBytes[].
     92          * Empty table if <256 bytes (padding only).
     93          * Otherwise 256 bytes or more (with padding).
     94          */
     95         IX_COMPRESSIBLE_BYTES_OFFSET,  // 17
     96         IX_RESERVED18_OFFSET,  // 18
     97         IX_TOTAL_SIZE  // 19: byte offset past the last part (total size of the data)
     98     };
     99     // Presumably parses inBytes[0..inLength) into tailoring, relative to base (TODO confirm in the .cpp).
    100     static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
    101                      CollationTailoring &tailoring, UErrorCode &errorCode);
    102     // Checks a data item's UDataInfo; signature looks like a udata_openChoice() callback (TODO confirm).
    103     static UBool U_CALLCONV
    104     isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
    105 
    106 private:
    107     CollationDataReader();  // private constructor: this all-static struct is not meant to be instantiated
    108 };
    109 
    110 /*
    111  * Format of collation data (ucadata.icu, binary data in coll/ *.res files).
    112  * Format version 4.0.
    113  *
    114  * The root collation data is stored in the ucadata.icu file.
    115  * Tailorings are stored inside .res resource bundle files, with a complete file header.
    116  *
    117  * Collation data begins with a standard ICU data file header
    118  * (DataHeader, see ucmndata.h and unicode/udata.h).
    119  * The UDataInfo.dataVersion field contains the UCA and other version numbers,
    120  * see the comments for CollationTailoring.version.
    121  *
    122  * After the header, the file contains the following parts.
    123  * Constants are defined as enum values of the CollationDataReader class.
    124  * See also the Collation class.
    125  *
    126  * int32_t indexes[indexesLength];
    127  *      The indexes array has variable length.
    128  *      Some tailorings only need the length and the options,
    129  *      others only add reorderCodes and the reorderTable,
    130  *      some need to store mappings.
    131  *      Only as many indexes are stored as needed to read all of the data.
    132  *
    133  *      Index 0: indexesLength
    134  *      Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
    135  *      Index 2..3: Unused/reserved/0.
    136  *      Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
    137  *               are stored in a short, contiguous part of the ce32s array.
    138  *
    139  *      Indexes 5..19 are byte offsets in ascending order.
    140  *      Each byte offset marks the start of the next part in the data file,
    141  *      and the end of the previous one.
    142  *      When two consecutive byte offsets are the same (or differ by too few bytes
    143  *      to hold the part's data, leaving only padding), then the corresponding part is empty.
    144  *      Byte offsets are offsets from after the header,
    145  *      that is, from the beginning of the indexes[].
    146  *      Each part starts at an offset with proper alignment for its data.
    147  *      If necessary, the previous part may include padding bytes to achieve this alignment.
    148  *      The last byte offset that is stored in the indexes indicates the total size of the data
    149  *      (starting with the indexes).
    150  *
    151  * int32_t reorderCodes[]; -- empty in root
    152  *      The list of script and reordering codes.
    153  *
    154  * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
    155  *      Primary-weight lead byte permutation table.
    156  *      Normally present when the reorderCodes are, but can be built at load time.
    157  *
    158  * UTrie2 trie; -- see utrie2_impl.h and utrie2.h
    159  *      The trie holds the main collation data. Each code point is mapped to a 32-bit value.
    160  *      It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
    161  *      in which case it is a special CE32 and contains a 4-bit tag and further data.
    162  *      See the Collation class for details.
    163  *
    164  *      The trie has a value for each lead surrogate code unit with some bits encoding
    165  *      collective properties of the 1024 supplementary characters whose UTF-16 form starts with
    166  *      the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
    167  *
    168  * int64_t ces[];
    169  *      64-bit CEs and expansions that cannot be stored in a more compact form.
    170  *
    171  * uint32_t ce32s[];
    172  *      CE32s for expansions in compact form, and for characters whose trie values
    173  *      contain special data.
    174  *
    175  * uint32_t rootElements[]; -- empty in all tailorings
    176  *      Compact storage for all of the CEs that occur in the root collation.
    177  *      See the CollationRootElements class.
    178  *
    179  * UChar *contexts[];
    180  *      Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
    181  *
    182  * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
    183  *      Serialized form of characters that are unsafe when iterating backwards,
    184  *      and at the end of an identical string prefix.
    185  *      Back up to a safe character.
    186  *      Lead surrogates are "unsafe" when any of their corresponding supplementary
    187  *      code points are unsafe.
    188  *      Does not include [:^lccc=0:][:^tccc=0:].
    189  *      For each tailoring, the root unsafeBackwardSet is subtracted.
    190  *      (As a result, in many tailorings no set needs to be stored.)
    191  *
    192  * uint16_t fastLatinTable[];
    193  *      Optional optimization for Latin text.
    194  *      See the CollationFastLatin class.
    195  *
    196  * uint16_t scripts[]; -- empty in all tailorings
    197  *      Table of the reordering groups with their first and last lead bytes,
    198  *      and their script and reordering codes.
    199  *      See CollationData::scripts.
    200  *
    201  * UBool compressibleBytes[]; -- empty in all tailorings
    202  *      Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
    203  */
    204 
    205 U_NAMESPACE_END
    206 
    207 #endif  // !UCONFIG_NO_COLLATION
    208 #endif  // __COLLATIONDATAREADER_H__
    209