1 /* 2 ******************************************************************************* 3 * Copyright (C) 2012-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * utf8collationiterator.h 7 * 8 * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h) 9 * created by: Markus W. Scherer 10 */ 11 12 #ifndef __UTF8COLLATIONITERATOR_H__ 13 #define __UTF8COLLATIONITERATOR_H__ 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_COLLATION 18 19 #include "cmemory.h" 20 #include "collation.h" 21 #include "collationdata.h" 22 #include "normalizer2impl.h" 23 24 U_NAMESPACE_BEGIN 25 26 /** 27 * UTF-8 collation element and character iterator. 28 * Handles normalized UTF-8 text inline, with length or NUL-terminated. 29 * Unnormalized text is handled by a subclass. 30 */ 31 class U_I18N_API UTF8CollationIterator : public CollationIterator { 32 public: 33 UTF8CollationIterator(const CollationData *d, UBool numeric, 34 const uint8_t *s, int32_t p, int32_t len) 35 : CollationIterator(d, numeric), 36 u8(s), pos(p), length(len) {} 37 38 virtual ~UTF8CollationIterator(); 39 40 virtual void resetToOffset(int32_t newOffset); 41 42 virtual int32_t getOffset() const; 43 44 virtual UChar32 nextCodePoint(UErrorCode &errorCode); 45 46 virtual UChar32 previousCodePoint(UErrorCode &errorCode); 47 48 protected: 49 /** 50 * For byte sequences that are illegal in UTF-8, an error value may be returned 51 * together with a bogus code point. The caller will ignore that code point. 52 * 53 * Special values may be returned for surrogate code points, which are also illegal in UTF-8, 54 * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE. 55 * 56 * Valid lead surrogates are returned from inside a normalized text segment, 57 * where handleGetTrailSurrogate() will return the matching trail surrogate. 58 */ 59 virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode); 60 61 virtual UBool foundNULTerminator(); 62 63 virtual UBool forbidSurrogateCodePoints() const; 64 65 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode); 66 67 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode); 68 69 const uint8_t *u8; 70 int32_t pos; 71 int32_t length; // <0 for NUL-terminated strings 72 }; 73 74 /** 75 * Incrementally checks the input text for FCD and normalizes where necessary. 76 */ 77 class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator { 78 public: 79 FCDUTF8CollationIterator(const CollationData *data, UBool numeric, 80 const uint8_t *s, int32_t p, int32_t len) 81 : UTF8CollationIterator(data, numeric, s, p, len), 82 state(CHECK_FWD), start(p), 83 nfcImpl(data->nfcImpl) {} 84 85 virtual ~FCDUTF8CollationIterator(); 86 87 virtual void resetToOffset(int32_t newOffset); 88 89 virtual int32_t getOffset() const; 90 91 virtual UChar32 nextCodePoint(UErrorCode &errorCode); 92 93 virtual UChar32 previousCodePoint(UErrorCode &errorCode); 94 95 protected: 96 virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode); 97 98 virtual UChar handleGetTrailSurrogate(); 99 100 virtual UBool foundNULTerminator(); 101 102 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode); 103 104 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode); 105 106 private: 107 UBool nextHasLccc() const; 108 UBool previousHasTccc() const; 109 110 /** 111 * Switches to forward checking if possible. 112 */ 113 void switchToForward(); 114 115 /** 116 * Extends the FCD text segment forward or normalizes around pos. 117 * @return TRUE if success 118 */ 119 UBool nextSegment(UErrorCode &errorCode); 120 121 /** 122 * Switches to backward checking. 123 */ 124 void switchToBackward(); 125 126 /** 127 * Extends the FCD text segment backward or normalizes around pos. 128 * @return TRUE if success 129 */ 130 UBool previousSegment(UErrorCode &errorCode); 131 132 UBool normalize(const UnicodeString &s, UErrorCode &errorCode); 133 134 enum State { 135 /** 136 * The input text [start..pos[ passes the FCD check. 137 * Moving forward checks incrementally. 138 * limit is undefined. 139 */ 140 CHECK_FWD, 141 /** 142 * The input text [pos..limit[ passes the FCD check. 143 * Moving backward checks incrementally. 144 * start is undefined. 145 */ 146 CHECK_BWD, 147 /** 148 * The input text [start..limit[ passes the FCD check. 149 * pos tracks the current text index. 150 */ 151 IN_FCD_SEGMENT, 152 /** 153 * The input text [start..limit[ failed the FCD check and was normalized. 154 * pos tracks the current index in the normalized string. 155 */ 156 IN_NORMALIZED 157 }; 158 159 State state; 160 161 int32_t start; 162 int32_t limit; 163 164 const Normalizer2Impl &nfcImpl; 165 UnicodeString normalized; 166 }; 167 168 U_NAMESPACE_END 169 170 #endif // !UCONFIG_NO_COLLATION 171 #endif // __UTF8COLLATIONITERATOR_H__ 172