// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdata.h
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/

     14 #ifndef __COLLATIONDATA_H__
     15 #define __COLLATIONDATA_H__
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "unicode/ucol.h"
     22 #include "unicode/uniset.h"
     23 #include "collation.h"
     24 #include "normalizer2impl.h"
     25 #include "utrie2.h"
     26 
     27 struct UDataMemory;
     28 
     29 U_NAMESPACE_BEGIN
     30 
     31 class UVector32;
     32 
     33 /**
     34  * Collation data container.
     35  * Immutable data created by a CollationDataBuilder, or loaded from a file,
     36  * or deserialized from API-provided binary data.
     37  *
     38  * Includes data for the collation base (root/default), aliased if this is not the base.
     39  */
     40 struct U_I18N_API CollationData : public UMemory {
     41     // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
     42     // parallel with the ranges, and resetting ranges that are indexed.
     43     // The reordering builder code could clone the resulting template array.
     44     enum {
     45         REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14,
     46         REORDER_RESERVED_AFTER_LATIN
     47     };
     48 
     49     enum {
     50         MAX_NUM_SPECIAL_REORDER_CODES = 8,
     51         /** C++ only, data reader check scriptStartsLength. */
     52         MAX_NUM_SCRIPT_RANGES = 256
     53     };
     54 
     55     CollationData(const Normalizer2Impl &nfc)
     56             : trie(NULL),
     57               ce32s(NULL), ces(NULL), contexts(NULL), base(NULL),
     58               jamoCE32s(NULL),
     59               nfcImpl(nfc),
     60               numericPrimary(0x12000000),
     61               ce32sLength(0), cesLength(0), contextsLength(0),
     62               compressibleBytes(NULL),
     63               unsafeBackwardSet(NULL),
     64               fastLatinTable(NULL), fastLatinTableLength(0),
     65               numScripts(0), scriptsIndex(NULL), scriptStarts(NULL), scriptStartsLength(0),
     66               rootElements(NULL), rootElementsLength(0) {}
     67 
     68     uint32_t getCE32(UChar32 c) const {
     69         return UTRIE2_GET32(trie, c);
     70     }
     71 
     72     uint32_t getCE32FromSupplementary(UChar32 c) const {
     73         return UTRIE2_GET32_FROM_SUPP(trie, c);
     74     }
     75 
     76     UBool isDigit(UChar32 c) const {
     77         return c < 0x660 ? c <= 0x39 && 0x30 <= c :
     78                 Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG);
     79     }
     80 
     81     UBool isUnsafeBackward(UChar32 c, UBool numeric) const {
     82         return unsafeBackwardSet->contains(c) || (numeric && isDigit(c));
     83     }
     84 
     85     UBool isCompressibleLeadByte(uint32_t b) const {
     86         return compressibleBytes[b];
     87     }
     88 
     89     inline UBool isCompressiblePrimary(uint32_t p) const {
     90         return isCompressibleLeadByte(p >> 24);
     91     }
     92 
     93     /**
     94      * Returns the CE32 from two contexts words.
     95      * Access to the defaultCE32 for contraction and prefix matching.
     96      */
     97     static uint32_t readCE32(const UChar *p) {
     98         return ((uint32_t)p[0] << 16) | p[1];
     99     }
    100 
    101     /**
    102      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
    103      * Requires that ce32 is special.
    104      */
    105     uint32_t getIndirectCE32(uint32_t ce32) const;
    106     /**
    107      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
    108      * if ce32 is special.
    109      */
    110     uint32_t getFinalCE32(uint32_t ce32) const;
    111 
    112     /**
    113      * Computes a CE from c's ce32 which has the OFFSET_TAG.
    114      */
    115     int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const {
    116         int64_t dataCE = ces[Collation::indexFromCE32(ce32)];
    117         return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE));
    118     }
    119 
    120     /**
    121      * Returns the single CE that c maps to.
    122      * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
    123      */
    124     int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
    125 
    126     /**
    127      * Returns the FCD16 value for code point c. c must be >= 0.
    128      */
    129     uint16_t getFCD16(UChar32 c) const {
    130         return nfcImpl.getFCD16(c);
    131     }
    132 
    133     /**
    134      * Returns the first primary for the script's reordering group.
    135      * @return the primary with only the first primary lead byte of the group
    136      *         (not necessarily an actual root collator primary weight),
    137      *         or 0 if the script is unknown
    138      */
    139     uint32_t getFirstPrimaryForGroup(int32_t script) const;
    140 
    141     /**
    142      * Returns the last primary for the script's reordering group.
    143      * @return the last primary of the group
    144      *         (not an actual root collator primary weight),
    145      *         or 0 if the script is unknown
    146      */
    147     uint32_t getLastPrimaryForGroup(int32_t script) const;
    148 
    149     /**
    150      * Finds the reordering group which contains the primary weight.
    151      * @return the first script of the group, or -1 if the weight is beyond the last group
    152      */
    153     int32_t getGroupForPrimary(uint32_t p) const;
    154 
    155     int32_t getEquivalentScripts(int32_t script,
    156                                  int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
    157 
    158     /**
    159      * Writes the permutation of primary-weight ranges
    160      * for the given reordering of scripts and groups.
    161      * The caller checks for illegal arguments and
    162      * takes care of [DEFAULT] and memory allocation.
    163      *
    164      * Each list element will be a (limit, offset) pair as described
    165      * for the CollationSettings::reorderRanges.
    166      * The list will be empty if no ranges are reordered.
    167      */
    168     void makeReorderRanges(const int32_t *reorder, int32_t length,
    169                            UVector32 &ranges, UErrorCode &errorCode) const;
    170 
    171     /** @see jamoCE32s */
    172     static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
    173 
    174     /** Main lookup trie. */
    175     const UTrie2 *trie;
    176     /**
    177      * Array of CE32 values.
    178      * At index 0 there must be CE32(U+0000)
    179      * to support U+0000's special-tag for NUL-termination handling.
    180      */
    181     const uint32_t *ce32s;
    182     /** Array of CE values for expansions and OFFSET_TAG. */
    183     const int64_t *ces;
    184     /** Array of prefix and contraction-suffix matching data. */
    185     const UChar *contexts;
    186     /** Base collation data, or NULL if this data itself is a base. */
    187     const CollationData *base;
    188     /**
    189      * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
    190      * They are normally simple CE32s, rarely expansions.
    191      * For fast handling of HANGUL_TAG.
    192      */
    193     const uint32_t *jamoCE32s;
    194     const Normalizer2Impl &nfcImpl;
    195     /** The single-byte primary weight (xx000000) for numeric collation. */
    196     uint32_t numericPrimary;
    197 
    198     int32_t ce32sLength;
    199     int32_t cesLength;
    200     int32_t contextsLength;
    201 
    202     /** 256 flags for which primary-weight lead bytes are compressible. */
    203     const UBool *compressibleBytes;
    204     /**
    205      * Set of code points that are unsafe for starting string comparison after an identical prefix,
    206      * or in backwards CE iteration.
    207      */
    208     const UnicodeSet *unsafeBackwardSet;
    209 
    210     /**
    211      * Fast Latin table for common-Latin-text string comparisons.
    212      * Data structure see class CollationFastLatin.
    213      */
    214     const uint16_t *fastLatinTable;
    215     int32_t fastLatinTableLength;
    216 
    217     /**
    218      * Data for scripts and reordering groups.
    219      * Uses include building a reordering permutation table and
    220      * providing script boundaries to AlphabeticIndex.
    221      */
    222     int32_t numScripts;
    223     /**
    224      * The length of scriptsIndex is numScripts+16.
    225      * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
    226      * 16 special reorder codes (not all used) are mapped starting at numScripts.
    227      * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
    228      * There are special codes at the end for reorder-reserved primary ranges.
    229      *
    230      * Multiple scripts may share a range and index, for example Hira & Kana.
    231      */
    232     const uint16_t *scriptsIndex;
    233     /**
    234      * Start primary weight (top 16 bits only) for a group/script/reserved range
    235      * indexed by scriptsIndex.
    236      * The first range (separators & terminators) and the last range (trailing weights)
    237      * are not reorderable, and no scriptsIndex entry points to them.
    238      */
    239     const uint16_t *scriptStarts;
    240     int32_t scriptStartsLength;
    241 
    242     /**
    243      * Collation elements in the root collator.
    244      * Used by the CollationRootElements class. The data structure is described there.
    245      * NULL in a tailoring.
    246      */
    247     const uint32_t *rootElements;
    248     int32_t rootElementsLength;
    249 
    250 private:
    251     int32_t getScriptIndex(int32_t script) const;
    252     void makeReorderRanges(const int32_t *reorder, int32_t length,
    253                            UBool latinMustMove,
    254                            UVector32 &ranges, UErrorCode &errorCode) const;
    255     int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;
    256     int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;
    257 };
    258 
    259 U_NAMESPACE_END
    260 
    261 #endif  // !UCONFIG_NO_COLLATION
    262 #endif  // __COLLATIONDATA_H__
    263