/*
**********************************************************************
*   Copyright (C) 2001-2015 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/
#ifndef USRCHIMP_H
#define USRCHIMP_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/normalizer2.h"
#include "unicode/ucol.h"
#include "unicode/ucoleitr.h"
#include "unicode/ubrk.h"

/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

#define UCOL_IGNORABLE 0

/*
 * get weights from a CE
 * Each macro masks and/or shifts a 32-bit collation element so that only the
 * requested weight remains, right-aligned:
 *   primary   - upper 16 bits (shift by 16, mask 0xffff)
 *   secondary - bits selected by UCOL_SECONDARYORDERMASK, shifted down by
 *               UCOL_SECONDARYORDERSHIFT
 *   tertiary  - bits selected by UCOL_TERTIARYORDERMASK (no shift needed)
 */
#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

#define UCOL_CONTINUATION_MARKER 0xC0

/* Evaluates to true when every bit of UCOL_CONTINUATION_MARKER is set in CE. */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

/**
 * This indicates an error has occurred during processing or there are no more CEs
 * to be returned.
 */
#define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)

U_NAMESPACE_BEGIN

class CollationElementIterator;
class Collator;

/**
 * One entry of a PCEBuffer: a 64-bit collation element plus two int32_t
 * index values.
 * NOTE(review): low/high presumably carry the ixLow/ixHigh arguments given to
 * PCEBuffer::put() - confirm against the implementation file.
 */
struct PCEI
{
    uint64_t ce;
    int32_t  low;
    int32_t  high;
};

/**
 * Buffer of PCEI entries with room for 16 entries stored inline
 * (defaultBuffer).
 * NOTE(review): the member functions are only declared here. Presumably
 * `buffer` refers to defaultBuffer until more space is required and
 * bufferIndex/bufferSize track the fill level and capacity - verify in the
 * implementation file before relying on this.
 */
struct PCEBuffer
{
    PCEI    defaultBuffer[16];
    PCEI   *buffer;
    int32_t bufferIndex;
    int32_t bufferSize;

    PCEBuffer();
    ~PCEBuffer();

    void  reset();
    UBool isEmpty() const;
    void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
    const PCEI *get();
};

/**
 * Yields "processed" 64-bit collation elements while walking a collation
 * element iterator forwards (nextProcessed) or backwards (previousProcessed).
 * It can be set up from either a C UCollationElements handle or a C++
 * CollationElementIterator.
 * NOTE(review): ownership of `cei` and the exact role of strength, toShift,
 * isShifted and variableTop are defined in the implementation file, which is
 * not visible from this header.
 */
class UCollationPCE : public UMemory {
private:
    PCEBuffer          pceBuffer;
    CollationElementIterator *cei;
    UCollationStrength strength;
    UBool              toShift;
    UBool              isShifted;
    uint32_t           variableTop;

public:
    UCollationPCE(UCollationElements *elems);
    UCollationPCE(CollationElementIterator *iter);
    ~UCollationPCE();

    void init(UCollationElements *elems);
    void init(CollationElementIterator *iter);

    /**
     * Get the processed ordering priority of the next collation element in the text.
     * A single character may contain more than one collation element.
     *
     * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
     * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
     * @param status A pointer to an UErrorCode to receive any errors.
     * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
     *         if an error has occurred or if the end of string has been reached
     */
    int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
    /**
     * Get the processed ordering priority of the previous collation element in the text.
     * A single character may contain more than one collation element.
     *
     * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
     * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE
     * @param status A pointer to an UErrorCode to receive any errors. Notably
     *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
     *               buffer has been exhausted.
     * @return The previous collation elements ordering, otherwise returns
     *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
     *         string has been reached.
     */
    int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);

private:
    void init(const Collator &coll);
    uint64_t processCE(uint32_t ce);
};

U_NAMESPACE_END

/*
 * Array sizes used by the structs below: INITIAL_ARRAY_SIZE_ sizes the inline
 * CE buffers of UPattern and the accent buffers of UStringSearch;
 * MAX_TABLE_SIZE_ sizes the shift/backShift tables of UPattern.
 */
#define INITIAL_ARRAY_SIZE_       256
#define MAX_TABLE_SIZE_           257

/* State describing the text being searched and the most recent match. */
struct USearch {
    // required since collation element iterator does not have a getText API
    const UChar              *text;
          int32_t             textLength; // exact length
          UBool               isOverlap;
          UBool               isCanonicalMatch;
          int16_t             elementComparisonType;
          UBreakIterator     *internalBreakIter;  //internal character breakiterator
          UBreakIterator     *breakIter;
    // value USEARCH_DONE is the default value
    // if we are not at the start of the text or the end of the text,
    // depending on the iteration direction and matchedIndex is USEARCH_DONE
    // it means that we can't find any more matches in that particular direction
          int32_t             matchedIndex;
          int32_t             matchedLength;
          UBool               isForwardSearching;
          UBool               reset;
};

/*
 * The search pattern together with its collation elements.
 * NOTE(review): ces/pces presumably point at cesBuffer/pcesBuffer unless the
 * pattern needs more than INITIAL_ARRAY_SIZE_ elements - confirm in the
 * implementation file.
 */
struct UPattern {
    const UChar              *text;
          int32_t             textLength; // exact length
          // length required for backwards ce comparison
          int32_t             cesLength;
          int32_t            *ces;
          int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];
          int32_t             pcesLength;
          int64_t            *pces;
          int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_];
          UBool               hasPrefixAccents;
          UBool               hasSuffixAccents;
          int16_t             defaultShiftSize;
          int16_t             shift[MAX_TABLE_SIZE_];
          int16_t             backShift[MAX_TABLE_SIZE_];
};

/*
 * Complete state of one string search: the text-side state (search), the
 * pattern, the collator and the iterators used to walk collation elements.
 */
struct UStringSearch {
    struct USearch            *search;
    struct UPattern            pattern;
    const  UCollator          *collator;
    const  icu::Normalizer2   *nfd;
    // positions within the collation element iterator are used to determine
    // if we are at the start of the text.
           UCollationElements *textIter;
           icu::UCollationPCE *textProcessedIter;
    // utility collation element, used throughout program for temporary
    // iteration.
           UCollationElements *utilIter;
           UBool               ownCollator;
           UCollationStrength  strength;
           uint32_t            ceMask;
           uint32_t            variableTop;
           UBool               toShift;
           UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
           UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
};

/**
* Exact matches without checking for the ends for extra accents.
* The match after the position within the collation element iterator is to be
* found.
* After a match is found the offset in the collation element iterator will be
* shifted to the start of the match.
* Implementation note:
* For tertiary we can't use the collator->tertiaryMask, that is a
* preprocessed mask that takes into account case options. Since we are only
* concerned with exact matches, we don't need that.
* Alternate handling - since only the 16 most significant bits are used,
* we can safely do a compare without masking. If the ce is a variable, we mask
* and get only the primary values; no shifting to quaternary is required since
* all primary values less than variabletop will need to be masked off anyway.
* If the end character is composite and the pattern ce does not match the text
* ce, we skip it until we find a match in the end composite character or when
* it has passed the character. This is so that we can match pattern "a" with
* the text "\u00e6"
* @param strsrch string search data
* @param status error status if any
* @return TRUE if an exact match is found, FALSE otherwise
*/
U_CFUNC
UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);

/**
* Canonical matches.
* According to the definition, matches found here will include the whole span
* of beginning and ending accents if it overlaps that region.
* @param strsrch string search data
* @param status error status if any
* @return TRUE if a canonical match is found, FALSE otherwise
*/
U_CFUNC
UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);

/**
* Gets the previous match.
* Comments follow from usearch_handleNextExact
* @param strsrch string search data
* @param status error status if any
* @return TRUE if an exact match is found, FALSE otherwise.
*/
U_CFUNC
UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);

/**
* Canonical matches.
* According to the definition, matches found here will include the whole span
* of beginning and ending accents if it overlaps that region.
* @param strsrch string search data
* @param status error status if any
* @return TRUE if a canonical match is found, FALSE otherwise
*/
U_CFUNC
UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
                                      UErrorCode    *status);

#endif /* #if !UCONFIG_NO_COLLATION */

#endif