1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (c) 2001-2012, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 07/23/01 aliu Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "strmatch.h" 18 #include "rbt_data.h" 19 #include "util.h" 20 #include "unicode/uniset.h" 21 #include "unicode/utf16.h" 22 23 U_NAMESPACE_BEGIN 24 25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher) 26 27 StringMatcher::StringMatcher(const UnicodeString& theString, 28 int32_t start, 29 int32_t limit, 30 int32_t segmentNum, 31 const TransliterationRuleData& theData) : 32 data(&theData), 33 segmentNumber(segmentNum), 34 matchStart(-1), 35 matchLimit(-1) 36 { 37 theString.extractBetween(start, limit, pattern); 38 } 39 40 StringMatcher::StringMatcher(const StringMatcher& o) : 41 UnicodeFunctor(o), 42 UnicodeMatcher(o), 43 UnicodeReplacer(o), 44 pattern(o.pattern), 45 data(o.data), 46 segmentNumber(o.segmentNumber), 47 matchStart(o.matchStart), 48 matchLimit(o.matchLimit) 49 { 50 } 51 52 /** 53 * Destructor 54 */ 55 StringMatcher::~StringMatcher() { 56 } 57 58 /** 59 * Implement UnicodeFunctor 60 */ 61 UnicodeFunctor* StringMatcher::clone() const { 62 return new StringMatcher(*this); 63 } 64 65 /** 66 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer 67 * and return the pointer. 68 */ 69 UnicodeMatcher* StringMatcher::toMatcher() const { 70 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this); 71 UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this); 72 73 return nonconst_base; 74 } 75 76 /** 77 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer 78 * and return the pointer. 79 */ 80 UnicodeReplacer* StringMatcher::toReplacer() const { 81 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this); 82 UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this); 83 84 return nonconst_base; 85 } 86 87 /** 88 * Implement UnicodeMatcher 89 */ 90 UMatchDegree StringMatcher::matches(const Replaceable& text, 91 int32_t& offset, 92 int32_t limit, 93 UBool incremental) { 94 int32_t i; 95 int32_t cursor = offset; 96 if (limit < cursor) { 97 // Match in the reverse direction 98 for (i=pattern.length()-1; i>=0; --i) { 99 UChar keyChar = pattern.charAt(i); 100 UnicodeMatcher* subm = data->lookupMatcher(keyChar); 101 if (subm == 0) { 102 if (cursor > limit && 103 keyChar == text.charAt(cursor)) { 104 --cursor; 105 } else { 106 return U_MISMATCH; 107 } 108 } else { 109 UMatchDegree m = 110 subm->matches(text, cursor, limit, incremental); 111 if (m != U_MATCH) { 112 return m; 113 } 114 } 115 } 116 // Record the match position, but adjust for a normal 117 // forward start, limit, and only if a prior match does not 118 // exist -- we want the rightmost match. 119 if (matchStart < 0) { 120 matchStart = cursor+1; 121 matchLimit = offset+1; 122 } 123 } else { 124 for (i=0; i<pattern.length(); ++i) { 125 if (incremental && cursor == limit) { 126 // We've reached the context limit without a mismatch and 127 // without completing our match. 128 return U_PARTIAL_MATCH; 129 } 130 UChar keyChar = pattern.charAt(i); 131 UnicodeMatcher* subm = data->lookupMatcher(keyChar); 132 if (subm == 0) { 133 // Don't need the cursor < limit check if 134 // incremental is TRUE (because it's done above); do need 135 // it otherwise. 136 if (cursor < limit && 137 keyChar == text.charAt(cursor)) { 138 ++cursor; 139 } else { 140 return U_MISMATCH; 141 } 142 } else { 143 UMatchDegree m = 144 subm->matches(text, cursor, limit, incremental); 145 if (m != U_MATCH) { 146 return m; 147 } 148 } 149 } 150 // Record the match position 151 matchStart = offset; 152 matchLimit = cursor; 153 } 154 155 offset = cursor; 156 return U_MATCH; 157 } 158 159 /** 160 * Implement UnicodeMatcher 161 */ 162 UnicodeString& StringMatcher::toPattern(UnicodeString& result, 163 UBool escapeUnprintable) const 164 { 165 result.truncate(0); 166 UnicodeString str, quoteBuf; 167 if (segmentNumber > 0) { 168 result.append((UChar)40); /*(*/ 169 } 170 for (int32_t i=0; i<pattern.length(); ++i) { 171 UChar keyChar = pattern.charAt(i); 172 const UnicodeMatcher* m = data->lookupMatcher(keyChar); 173 if (m == 0) { 174 ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf); 175 } else { 176 ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable), 177 TRUE, escapeUnprintable, quoteBuf); 178 } 179 } 180 if (segmentNumber > 0) { 181 result.append((UChar)41); /*)*/ 182 } 183 // Flush quoteBuf out to result 184 ICU_Utility::appendToRule(result, -1, 185 TRUE, escapeUnprintable, quoteBuf); 186 return result; 187 } 188 189 /** 190 * Implement UnicodeMatcher 191 */ 192 UBool StringMatcher::matchesIndexValue(uint8_t v) const { 193 if (pattern.length() == 0) { 194 return TRUE; 195 } 196 UChar32 c = pattern.char32At(0); 197 const UnicodeMatcher *m = data->lookupMatcher(c); 198 return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v); 199 } 200 201 /** 202 * Implement UnicodeMatcher 203 */ 204 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const { 205 UChar32 ch; 206 for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) { 207 ch = pattern.char32At(i); 208 const UnicodeMatcher* matcher = data->lookupMatcher(ch); 209 if (matcher == NULL) { 210 toUnionTo.add(ch); 211 } else { 212 matcher->addMatchSetTo(toUnionTo); 213 } 214 } 215 } 216 217 /** 218 * UnicodeReplacer API 219 */ 220 int32_t StringMatcher::replace(Replaceable& text, 221 int32_t start, 222 int32_t limit, 223 int32_t& /*cursor*/) { 224 225 int32_t outLen = 0; 226 227 // Copy segment with out-of-band data 228 int32_t dest = limit; 229 // If there was no match, that means that a quantifier 230 // matched zero-length. E.g., x (a)* y matched "xy". 231 if (matchStart >= 0) { 232 if (matchStart != matchLimit) { 233 text.copy(matchStart, matchLimit, dest); 234 outLen = matchLimit - matchStart; 235 } 236 } 237 238 text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text 239 240 return outLen; 241 } 242 243 /** 244 * UnicodeReplacer API 245 */ 246 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule, 247 UBool /*escapeUnprintable*/) const { 248 // assert(segmentNumber > 0); 249 rule.truncate(0); 250 rule.append((UChar)0x0024 /*$*/); 251 ICU_Utility::appendNumber(rule, segmentNumber, 10, 1); 252 return rule; 253 } 254 255 /** 256 * Remove any match info. This must be called before performing a 257 * set of matches with this segment. 258 */ 259 void StringMatcher::resetMatch() { 260 matchStart = matchLimit = -1; 261 } 262 263 /** 264 * Union the set of all characters that may output by this object 265 * into the given set. 266 * @param toUnionTo the set into which to union the output characters 267 */ 268 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const { 269 // The output of this replacer varies; it is the source text between 270 // matchStart and matchLimit. Since this varies depending on the 271 // input text, we can't compute it here. We can either do nothing 272 // or we can add ALL characters to the set. It's probably more useful 273 // to do nothing. 274 } 275 276 /** 277 * Implement UnicodeFunctor 278 */ 279 void StringMatcher::setData(const TransliterationRuleData* d) { 280 data = d; 281 int32_t i = 0; 282 while (i<pattern.length()) { 283 UChar32 c = pattern.char32At(i); 284 UnicodeFunctor* f = data->lookup(c); 285 if (f != NULL) { 286 f->setData(data); 287 } 288 i += U16_LENGTH(c); 289 } 290 } 291 292 U_NAMESPACE_END 293 294 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 295 296 //eof 297