Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2012-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationdatabuilder.h
      7 *
      8 * created on: 2012apr01
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #ifndef __COLLATIONDATABUILDER_H__
     13 #define __COLLATIONDATABUILDER_H__
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_COLLATION
     18 
     19 #include "unicode/uniset.h"
     20 #include "unicode/unistr.h"
     21 #include "unicode/uversion.h"
     22 #include "collation.h"
     23 #include "collationdata.h"
     24 #include "collationsettings.h"
     25 #include "normalizer2impl.h"
     26 #include "utrie2.h"
     27 #include "uvectr32.h"
     28 #include "uvectr64.h"
     29 #include "uvector.h"
     30 
     31 U_NAMESPACE_BEGIN
     32 
     33 struct ConditionalCE32;
     34 
     35 class CollationFastLatinBuilder;
     36 class CopyHelper;
     37 class DataBuilderCollationIterator;
     38 class UCharsTrieBuilder;
     39 
     40 /**
     41  * Low-level CollationData builder.
     42  * Takes (character, CE) pairs and builds them into runtime data structures.
     43  * Supports characters with context prefixes and contraction suffixes.
     44  */
     45 class U_I18N_API CollationDataBuilder : public UObject {
     46 public:
     47     /**
     48      * Collation element modifier. Interface class for a modifier
     49      * that changes a tailoring builder's temporary CEs to final CEs.
     50      * Called for every non-special CE32 and every expansion CE.
     51      */
     52     class CEModifier : public UObject {
     53     public:
     54         virtual ~CEModifier();
     55         /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */
     56         virtual int64_t modifyCE32(uint32_t ce32) const = 0;
     57         /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */
     58         virtual int64_t modifyCE(int64_t ce) const = 0;
     59     };
     60 
     61     CollationDataBuilder(UErrorCode &errorCode);
     62 
     63     virtual ~CollationDataBuilder();
     64 
     65     void initForTailoring(const CollationData *b, UErrorCode &errorCode);
     66 
     67     virtual UBool isCompressibleLeadByte(uint32_t b) const;
     68 
     69     inline UBool isCompressiblePrimary(uint32_t p) const {
     70         return isCompressibleLeadByte(p >> 24);
     71     }
     72 
     73     /**
     74      * @return TRUE if this builder has mappings (e.g., add() has been called)
     75      */
     76     UBool hasMappings() const { return modified; }
     77 
     78     /**
     79      * @return TRUE if c has CEs in this builder
     80      */
     81     UBool isAssigned(UChar32 c) const;
     82 
     83     /**
     84      * @return the three-byte primary if c maps to a single such CE and has no context data,
     85      * otherwise returns 0.
     86      */
     87     uint32_t getLongPrimaryIfSingleCE(UChar32 c) const;
     88 
     89     /**
     90      * @return the single CE for c.
     91      * Sets an error code if c does not have a single CE.
     92      */
     93     int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
     94 
     95     void add(const UnicodeString &prefix, const UnicodeString &s,
     96              const int64_t ces[], int32_t cesLength,
     97              UErrorCode &errorCode);
     98 
     99     /**
    100      * Encodes the ces as either the returned ce32 by itself,
    101      * or by storing an expansion, with the returned ce32 referring to that.
    102      *
    103      * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
    104      */
    105     virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
    106     void addCE32(const UnicodeString &prefix, const UnicodeString &s,
    107                  uint32_t ce32, UErrorCode &errorCode);
    108 
    109     /**
    110      * Sets three-byte-primary CEs for a range of code points in code point order,
    111      * if it is worth doing; otherwise no change is made.
    112      * None of the code points in the range should have complex mappings so far
    113      * (expansions/contractions/prefixes).
    114      * @param start first code point
    115      * @param end last code point (inclusive)
    116      * @param primary primary weight for 'start'
    117      * @param step per-code point primary-weight increment
    118      * @param errorCode ICU in/out error code
    119      * @return TRUE if an OFFSET_TAG range was used for start..end
    120      */
    121     UBool maybeSetPrimaryRange(UChar32 start, UChar32 end,
    122                                uint32_t primary, int32_t step,
    123                                UErrorCode &errorCode);
    124 
    125     /**
    126      * Sets three-byte-primary CEs for a range of code points in code point order.
    127      * Sets range values if that is worth doing, or else individual values.
    128      * None of the code points in the range should have complex mappings so far
    129      * (expansions/contractions/prefixes).
    130      * @param start first code point
    131      * @param end last code point (inclusive)
    132      * @param primary primary weight for 'start'
    133      * @param step per-code point primary-weight increment
    134      * @param errorCode ICU in/out error code
    135      * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step
    136      */
    137     uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
    138                                           uint32_t primary, int32_t step,
    139                                           UErrorCode &errorCode);
    140 
    141     /**
    142      * Copies all mappings from the src builder, with modifications.
    143      * This builder here must not be built yet, and should be empty.
    144      */
    145     void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
    146                   UErrorCode &errorCode);
    147 
    148     void optimize(const UnicodeSet &set, UErrorCode &errorCode);
    149     void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode);
    150 
    151     void enableFastLatin() { fastLatinEnabled = TRUE; }
    152     virtual void build(CollationData &data, UErrorCode &errorCode);
    153 
    154     /**
    155      * Looks up CEs for s and appends them to the ces array.
    156      * Does not handle normalization: s should be in FCD form.
    157      *
    158      * Does not write completely ignorable CEs.
    159      * Does not write beyond Collation::MAX_EXPANSION_LENGTH.
    160      *
    161      * @return incremented cesLength
    162      */
    163     int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength);
    164     int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s,
    165                    int64_t ces[], int32_t cesLength);
    166 
    167 protected:
    168     friend class CopyHelper;
    169     friend class DataBuilderCollationIterator;
    170 
    171     uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const;
    172 
    173     int32_t addCE(int64_t ce, UErrorCode &errorCode);
    174     int32_t addCE32(uint32_t ce32, UErrorCode &errorCode);
    175     int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode);
    176 
    177     inline ConditionalCE32 *getConditionalCE32(int32_t index) const {
    178         return static_cast<ConditionalCE32 *>(conditionalCE32s[index]);
    179     }
    180     inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const {
    181         return getConditionalCE32(Collation::indexFromCE32(ce32));
    182     }
    183 
    184     static uint32_t makeBuilderContextCE32(int32_t index) {
    185         return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index);
    186     }
    187     static inline UBool isBuilderContextCE32(uint32_t ce32) {
    188         return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG);
    189     }
    190 
    191     static uint32_t encodeOneCEAsCE32(int64_t ce);
    192     uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode);
    193     uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode);
    194     uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode);
    195 
    196     uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode);
    197     /**
    198      * Copies base contractions to a list of ConditionalCE32.
    199      * Sets cond->next to the index of the first new item
    200      * and returns the index of the last new item.
    201      */
    202     int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
    203                                          ConditionalCE32 *cond, UErrorCode &errorCode);
    204 
    205     UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode);
    206     void setDigitTags(UErrorCode &errorCode);
    207     void setLeadSurrogates(UErrorCode &errorCode);
    208 
    209     void buildMappings(CollationData &data, UErrorCode &errorCode);
    210 
    211     void clearContexts();
    212     void buildContexts(UErrorCode &errorCode);
    213     uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode);
    214     int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
    215                            UErrorCode &errorCode);
    216 
    217     void buildFastLatinTable(CollationData &data, UErrorCode &errorCode);
    218 
    219     int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength);
    220 
    221     static UChar32 jamoCpFromIndex(int32_t i) {
    222         // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27
    223         if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; }
    224         i -= Hangul::JAMO_L_COUNT;
    225         if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; }
    226         i -= Hangul::JAMO_V_COUNT;
    227         // i < 27
    228         return Hangul::JAMO_T_BASE + 1 + i;
    229     }
    230 
    231     /** @see Collation::BUILDER_DATA_TAG */
    232     static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;
    233 
    234     const Normalizer2Impl &nfcImpl;
    235     const CollationData *base;
    236     const CollationSettings *baseSettings;
    237     UTrie2 *trie;
    238     UVector32 ce32s;
    239     UVector64 ce64s;
    240     UVector conditionalCE32s;  // vector of ConditionalCE32
    241     // Characters that have context (prefixes or contraction suffixes).
    242     UnicodeSet contextChars;
    243     // Serialized UCharsTrie structures for finalized contexts.
    244     UnicodeString contexts;
    245     UnicodeSet unsafeBackwardSet;
    246     UBool modified;
    247 
    248     UBool fastLatinEnabled;
    249     CollationFastLatinBuilder *fastLatinBuilder;
    250 
    251     DataBuilderCollationIterator *collIter;
    252 };
    253 
    254 U_NAMESPACE_END
    255 
    256 #endif  // !UCONFIG_NO_COLLATION
    257 #endif  // __COLLATIONDATABUILDER_H__
    258