Home | History | Annotate | Download | only in i18n
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationrootelements.h
      9 *
     10 * created on: 2013mar01
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #ifndef __COLLATIONROOTELEMENTS_H__
     15 #define __COLLATIONROOTELEMENTS_H__
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "unicode/uobject.h"
     22 #include "collation.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 /**
     27  * Container and access methods for collation elements and weights
     28  * that occur in the root collator.
     29  * Needed for finding boundaries for building a tailoring.
     30  *
     31  * This class takes and returns 16-bit secondary and tertiary weights.
     32  */
     33 class U_I18N_API CollationRootElements : public UMemory {
     34 public:
     35     CollationRootElements(const uint32_t *rootElements, int32_t rootElementsLength)
     36             : elements(rootElements), length(rootElementsLength) {}
     37 
     38     /**
     39      * Higher than any root primary.
     40      */
     41     static const uint32_t PRIMARY_SENTINEL = 0xffffff00;
     42 
     43     /**
     44      * Flag in a root element, set if the element contains secondary & tertiary weights,
     45      * rather than a primary.
     46      */
     47     static const uint32_t SEC_TER_DELTA_FLAG = 0x80;
     48     /**
     49      * Mask for getting the primary range step value from a primary-range-end element.
     50      */
     51     static const uint8_t PRIMARY_STEP_MASK = 0x7f;
     52 
     53     enum {
     54         /**
     55          * Index of the first CE with a non-zero tertiary weight.
     56          * Same as the start of the compact root elements table.
     57          */
     58         IX_FIRST_TERTIARY_INDEX,
     59         /**
     60          * Index of the first CE with a non-zero secondary weight.
     61          */
     62         IX_FIRST_SECONDARY_INDEX,
     63         /**
     64          * Index of the first CE with a non-zero primary weight.
     65          */
     66         IX_FIRST_PRIMARY_INDEX,
     67         /**
     68          * Must match Collation::COMMON_SEC_AND_TER_CE.
     69          */
     70         IX_COMMON_SEC_AND_TER_CE,
     71         /**
     72          * Secondary & tertiary boundaries.
     73          * Bits 31..24: [fixed last secondary common byte 45]
     74          * Bits 23..16: [fixed first ignorable secondary byte 80]
     75          * Bits 15.. 8: reserved, 0
     76          * Bits  7.. 0: [fixed first ignorable tertiary byte 3C]
     77          */
     78         IX_SEC_TER_BOUNDARIES,
     79         /**
     80          * The current number of indexes.
     81          * Currently the same as elements[IX_FIRST_TERTIARY_INDEX].
     82          */
     83         IX_COUNT
     84     };
     85 
     86     /**
     87      * Returns the boundary between tertiary weights of primary/secondary CEs
     88      * and those of tertiary CEs.
     89      * This is the upper limit for tertiaries of primary/secondary CEs.
     90      * This minus one is the lower limit for tertiaries of tertiary CEs.
     91      */
     92     uint32_t getTertiaryBoundary() const {
     93         return (elements[IX_SEC_TER_BOUNDARIES] << 8) & 0xff00;
     94     }
     95 
     96     /**
     97      * Returns the first assigned tertiary CE.
     98      */
     99     uint32_t getFirstTertiaryCE() const {
    100         return elements[elements[IX_FIRST_TERTIARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
    101     }
    102 
    103     /**
    104      * Returns the last assigned tertiary CE.
    105      */
    106     uint32_t getLastTertiaryCE() const {
    107         return elements[elements[IX_FIRST_SECONDARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
    108     }
    109 
    110     /**
    111      * Returns the last common secondary weight.
    112      * This is the lower limit for secondaries of primary CEs.
    113      */
    114     uint32_t getLastCommonSecondary() const {
    115         return (elements[IX_SEC_TER_BOUNDARIES] >> 16) & 0xff00;
    116     }
    117 
    118     /**
    119      * Returns the boundary between secondary weights of primary CEs
    120      * and those of secondary CEs.
    121      * This is the upper limit for secondaries of primary CEs.
    122      * This minus one is the lower limit for secondaries of secondary CEs.
    123      */
    124     uint32_t getSecondaryBoundary() const {
    125         return (elements[IX_SEC_TER_BOUNDARIES] >> 8) & 0xff00;
    126     }
    127 
    128     /**
    129      * Returns the first assigned secondary CE.
    130      */
    131     uint32_t getFirstSecondaryCE() const {
    132         return elements[elements[IX_FIRST_SECONDARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
    133     }
    134 
    135     /**
    136      * Returns the last assigned secondary CE.
    137      */
    138     uint32_t getLastSecondaryCE() const {
    139         return elements[elements[IX_FIRST_PRIMARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
    140     }
    141 
    142     /**
    143      * Returns the first assigned primary weight.
    144      */
    145     uint32_t getFirstPrimary() const {
    146         return elements[elements[IX_FIRST_PRIMARY_INDEX]];  // step=0: cannot be a range end
    147     }
    148 
    149     /**
    150      * Returns the first assigned primary CE.
    151      */
    152     int64_t getFirstPrimaryCE() const {
    153         return Collation::makeCE(getFirstPrimary());
    154     }
    155 
    156     /**
    157      * Returns the last root CE with a primary weight before p.
    158      * Intended only for reordering group boundaries.
    159      */
    160     int64_t lastCEWithPrimaryBefore(uint32_t p) const;
    161 
    162     /**
    163      * Returns the first root CE with a primary weight of at least p.
    164      * Intended only for reordering group boundaries.
    165      */
    166     int64_t firstCEWithPrimaryAtLeast(uint32_t p) const;
    167 
    168     /**
    169      * Returns the primary weight before p.
    170      * p must be greater than the first root primary.
    171      */
    172     uint32_t getPrimaryBefore(uint32_t p, UBool isCompressible) const;
    173 
    174     /** Returns the secondary weight before [p, s]. */
    175     uint32_t getSecondaryBefore(uint32_t p, uint32_t s) const;
    176 
    177     /** Returns the tertiary weight before [p, s, t]. */
    178     uint32_t getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) const;
    179 
    180     /**
    181      * Finds the index of the input primary.
    182      * p must occur as a root primary, and must not be 0.
    183      */
    184     int32_t findPrimary(uint32_t p) const;
    185 
    186     /**
    187      * Returns the primary weight after p where index=findPrimary(p).
    188      * p must be at least the first root primary.
    189      */
    190     uint32_t getPrimaryAfter(uint32_t p, int32_t index, UBool isCompressible) const;
    191     /**
    192      * Returns the secondary weight after [p, s] where index=findPrimary(p)
    193      * except use index=0 for p=0.
    194      *
    195      * Must return a weight for every root [p, s] as well as for every weight
    196      * returned by getSecondaryBefore(). If p!=0 then s can be BEFORE_WEIGHT16.
    197      *
    198      * Exception: [0, 0] is handled by the CollationBuilder:
    199      * Both its lower and upper boundaries are special.
    200      */
    201     uint32_t getSecondaryAfter(int32_t index, uint32_t s) const;
    202     /**
    203      * Returns the tertiary weight after [p, s, t] where index=findPrimary(p)
    204      * except use index=0 for p=0.
    205      *
    206      * Must return a weight for every root [p, s, t] as well as for every weight
    207      * returned by getTertiaryBefore(). If s!=0 then t can be BEFORE_WEIGHT16.
    208      *
    209      * Exception: [0, 0, 0] is handled by the CollationBuilder:
    210      * Both its lower and upper boundaries are special.
    211      */
    212     uint32_t getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const;
    213 
    214 private:
    215     /**
    216      * Returns the first secondary & tertiary weights for p where index=findPrimary(p)+1.
    217      */
    218     uint32_t getFirstSecTerForPrimary(int32_t index) const;
    219 
    220     /**
    221      * Finds the largest index i where elements[i]<=p.
    222      * Requires first primary<=p<0xffffff00 (PRIMARY_SENTINEL).
    223      * Does not require that p is a root collator primary.
    224      */
    225     int32_t findP(uint32_t p) const;
    226 
    227     static inline UBool isEndOfPrimaryRange(uint32_t q) {
    228         return (q & SEC_TER_DELTA_FLAG) == 0 && (q & PRIMARY_STEP_MASK) != 0;
    229     }
    230 
    231     /**
    232      * Data structure:
    233      *
    234      * The first few entries are indexes, up to elements[IX_FIRST_TERTIARY_INDEX].
    235      * See the comments on the IX_ constants.
    236      *
    237      * All other elements are a compact form of the root collator CEs
    238      * in mostly collation order.
    239      *
    240      * A sequence of one or more root CEs with the same primary weight is stored as
    241      * one element with the primary weight, with the SEC_TER_DELTA_FLAG flag not set,
    242      * followed by elements with only the secondary/tertiary weights,
    243      * each with that flag set.
    244      * If the lowest secondary/tertiary combination is Collation::COMMON_SEC_AND_TER_CE,
    245      * then the element for that combination is omitted.
    246      *
    247      * Note: If the first actual secondary/tertiary combination is higher than
    248      * Collation::COMMON_SEC_AND_TER_CE (which is unusual),
    249      * the runtime code will assume anyway that Collation::COMMON_SEC_AND_TER_CE is present.
    250      *
    251      * A range of only-primary CEs with a consistent "step" increment
    252      * from each primary to the next may be stored as a range.
    253      * Only the first and last primary are stored, and the last has the step
    254      * value in the low bits (PRIMARY_STEP_MASK).
    255      *
    256      * An range-end element may also either start a new range or be followed by
    257      * elements with secondary/tertiary deltas.
    258      *
    259      * A primary element that is not a range end has zero step bits.
    260      *
    261      * There is no element for the completely ignorable CE (all weights 0).
    262      *
    263      * Before elements[IX_FIRST_PRIMARY_INDEX], all elements are secondary/tertiary deltas,
    264      * for all of the ignorable root CEs.
    265      *
    266      * There are no elements for unassigned-implicit primary CEs.
    267      * All primaries stored here are at most 3 bytes long.
    268      */
    269     const uint32_t *elements;
    270     int32_t length;
    271 };
    272 
    273 U_NAMESPACE_END
    274 
    275 #endif  // !UCONFIG_NO_COLLATION
    276 #endif  // __COLLATIONROOTELEMENTS_H__
    277