1 /* 2 ******************************************************************************* 3 * Copyright (C) 2012-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationdatabuilder.h 7 * 8 * created on: 2012apr01 9 * created by: Markus W. Scherer 10 */ 11 12 #ifndef __COLLATIONDATABUILDER_H__ 13 #define __COLLATIONDATABUILDER_H__ 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_COLLATION 18 19 #include "unicode/uniset.h" 20 #include "unicode/unistr.h" 21 #include "unicode/uversion.h" 22 #include "collation.h" 23 #include "collationdata.h" 24 #include "collationsettings.h" 25 #include "normalizer2impl.h" 26 #include "utrie2.h" 27 #include "uvectr32.h" 28 #include "uvectr64.h" 29 #include "uvector.h" 30 31 U_NAMESPACE_BEGIN 32 33 struct ConditionalCE32; 34 35 class CollationFastLatinBuilder; 36 class CopyHelper; 37 class DataBuilderCollationIterator; 38 class UCharsTrieBuilder; 39 40 /** 41 * Low-level CollationData builder. 42 * Takes (character, CE) pairs and builds them into runtime data structures. 43 * Supports characters with context prefixes and contraction suffixes. 44 */ 45 class U_I18N_API CollationDataBuilder : public UObject { 46 public: 47 /** 48 * Collation element modifier. Interface class for a modifier 49 * that changes a tailoring builder's temporary CEs to final CEs. 50 * Called for every non-special CE32 and every expansion CE. 51 */ 52 class CEModifier : public UObject { 53 public: 54 virtual ~CEModifier(); 55 /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */ 56 virtual int64_t modifyCE32(uint32_t ce32) const = 0; 57 /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */ 58 virtual int64_t modifyCE(int64_t ce) const = 0; 59 }; 60 61 CollationDataBuilder(UErrorCode &errorCode); 62 63 virtual ~CollationDataBuilder(); 64 65 void initForTailoring(const CollationData *b, UErrorCode &errorCode); 66 67 virtual UBool isCompressibleLeadByte(uint32_t b) const; 68 69 inline UBool isCompressiblePrimary(uint32_t p) const { 70 return isCompressibleLeadByte(p >> 24); 71 } 72 73 /** 74 * @return TRUE if this builder has mappings (e.g., add() has been called) 75 */ 76 UBool hasMappings() const { return modified; } 77 78 /** 79 * @return TRUE if c has CEs in this builder 80 */ 81 UBool isAssigned(UChar32 c) const; 82 83 /** 84 * @return the three-byte primary if c maps to a single such CE and has no context data, 85 * otherwise returns 0. 86 */ 87 uint32_t getLongPrimaryIfSingleCE(UChar32 c) const; 88 89 /** 90 * @return the single CE for c. 91 * Sets an error code if c does not have a single CE. 92 */ 93 int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const; 94 95 void add(const UnicodeString &prefix, const UnicodeString &s, 96 const int64_t ces[], int32_t cesLength, 97 UErrorCode &errorCode); 98 99 /** 100 * Encodes the ces as either the returned ce32 by itself, 101 * or by storing an expansion, with the returned ce32 referring to that. 102 * 103 * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) 104 */ 105 virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); 106 void addCE32(const UnicodeString &prefix, const UnicodeString &s, 107 uint32_t ce32, UErrorCode &errorCode); 108 109 /** 110 * Sets three-byte-primary CEs for a range of code points in code point order, 111 * if it is worth doing; otherwise no change is made. 112 * None of the code points in the range should have complex mappings so far 113 * (expansions/contractions/prefixes). 114 * @param start first code point 115 * @param end last code point (inclusive) 116 * @param primary primary weight for 'start' 117 * @param step per-code point primary-weight increment 118 * @param errorCode ICU in/out error code 119 * @return TRUE if an OFFSET_TAG range was used for start..end 120 */ 121 UBool maybeSetPrimaryRange(UChar32 start, UChar32 end, 122 uint32_t primary, int32_t step, 123 UErrorCode &errorCode); 124 125 /** 126 * Sets three-byte-primary CEs for a range of code points in code point order. 127 * Sets range values if that is worth doing, or else individual values. 128 * None of the code points in the range should have complex mappings so far 129 * (expansions/contractions/prefixes). 130 * @param start first code point 131 * @param end last code point (inclusive) 132 * @param primary primary weight for 'start' 133 * @param step per-code point primary-weight increment 134 * @param errorCode ICU in/out error code 135 * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step 136 */ 137 uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, 138 uint32_t primary, int32_t step, 139 UErrorCode &errorCode); 140 141 /** 142 * Copies all mappings from the src builder, with modifications. 143 * This builder here must not be built yet, and should be empty. 144 */ 145 void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, 146 UErrorCode &errorCode); 147 148 void optimize(const UnicodeSet &set, UErrorCode &errorCode); 149 void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode); 150 151 void enableFastLatin() { fastLatinEnabled = TRUE; } 152 virtual void build(CollationData &data, UErrorCode &errorCode); 153 154 /** 155 * Looks up CEs for s and appends them to the ces array. 156 * Does not handle normalization: s should be in FCD form. 157 * 158 * Does not write completely ignorable CEs. 159 * Does not write beyond Collation::MAX_EXPANSION_LENGTH. 160 * 161 * @return incremented cesLength 162 */ 163 int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength); 164 int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s, 165 int64_t ces[], int32_t cesLength); 166 167 protected: 168 friend class CopyHelper; 169 friend class DataBuilderCollationIterator; 170 171 uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const; 172 173 int32_t addCE(int64_t ce, UErrorCode &errorCode); 174 int32_t addCE32(uint32_t ce32, UErrorCode &errorCode); 175 int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode); 176 177 inline ConditionalCE32 *getConditionalCE32(int32_t index) const { 178 return static_cast<ConditionalCE32 *>(conditionalCE32s[index]); 179 } 180 inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const { 181 return getConditionalCE32(Collation::indexFromCE32(ce32)); 182 } 183 184 static uint32_t makeBuilderContextCE32(int32_t index) { 185 return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index); 186 } 187 static inline UBool isBuilderContextCE32(uint32_t ce32) { 188 return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG); 189 } 190 191 static uint32_t encodeOneCEAsCE32(int64_t ce); 192 uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode); 193 uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode); 194 uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode); 195 196 uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode); 197 /** 198 * Copies base contractions to a list of ConditionalCE32. 199 * Sets cond->next to the index of the first new item 200 * and returns the index of the last new item. 201 */ 202 int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, 203 ConditionalCE32 *cond, UErrorCode &errorCode); 204 205 UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode); 206 void setDigitTags(UErrorCode &errorCode); 207 void setLeadSurrogates(UErrorCode &errorCode); 208 209 void buildMappings(CollationData &data, UErrorCode &errorCode); 210 211 void clearContexts(); 212 void buildContexts(UErrorCode &errorCode); 213 uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode); 214 int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, 215 UErrorCode &errorCode); 216 217 void buildFastLatinTable(CollationData &data, UErrorCode &errorCode); 218 219 int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength); 220 221 static UChar32 jamoCpFromIndex(int32_t i) { 222 // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27 223 if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; } 224 i -= Hangul::JAMO_L_COUNT; 225 if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; } 226 i -= Hangul::JAMO_V_COUNT; 227 // i < 27 228 return Hangul::JAMO_T_BASE + 1 + i; 229 } 230 231 /** @see Collation::BUILDER_DATA_TAG */ 232 static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100; 233 234 const Normalizer2Impl &nfcImpl; 235 const CollationData *base; 236 const CollationSettings *baseSettings; 237 UTrie2 *trie; 238 UVector32 ce32s; 239 UVector64 ce64s; 240 UVector conditionalCE32s; // vector of ConditionalCE32 241 // Characters that have context (prefixes or contraction suffixes). 242 UnicodeSet contextChars; 243 // Serialized UCharsTrie structures for finalized contexts. 244 UnicodeString contexts; 245 UnicodeSet unsafeBackwardSet; 246 UBool modified; 247 248 UBool fastLatinEnabled; 249 CollationFastLatinBuilder *fastLatinBuilder; 250 251 DataBuilderCollationIterator *collIter; 252 }; 253 254 U_NAMESPACE_END 255 256 #endif // !UCONFIG_NO_COLLATION 257 #endif // __COLLATIONDATABUILDER_H__ 258