1 /* 2 ********************************************************************** 3 * Copyright (c) 2001-2012, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 07/23/01 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "strmatch.h" 16 #include "rbt_data.h" 17 #include "util.h" 18 #include "unicode/uniset.h" 19 #include "unicode/utf16.h" 20 21 U_NAMESPACE_BEGIN 22 23 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher) 24 25 StringMatcher::StringMatcher(const UnicodeString& theString, 26 int32_t start, 27 int32_t limit, 28 int32_t segmentNum, 29 const TransliterationRuleData& theData) : 30 data(&theData), 31 segmentNumber(segmentNum), 32 matchStart(-1), 33 matchLimit(-1) 34 { 35 theString.extractBetween(start, limit, pattern); 36 } 37 38 StringMatcher::StringMatcher(const StringMatcher& o) : 39 UnicodeFunctor(o), 40 UnicodeMatcher(o), 41 UnicodeReplacer(o), 42 pattern(o.pattern), 43 data(o.data), 44 segmentNumber(o.segmentNumber), 45 matchStart(o.matchStart), 46 matchLimit(o.matchLimit) 47 { 48 } 49 50 /** 51 * Destructor 52 */ 53 StringMatcher::~StringMatcher() { 54 } 55 56 /** 57 * Implement UnicodeFunctor 58 */ 59 UnicodeFunctor* StringMatcher::clone() const { 60 return new StringMatcher(*this); 61 } 62 63 /** 64 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer 65 * and return the pointer. 66 */ 67 UnicodeMatcher* StringMatcher::toMatcher() const { 68 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this); 69 UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this); 70 71 return nonconst_base; 72 } 73 74 /** 75 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer 76 * and return the pointer. 77 */ 78 UnicodeReplacer* StringMatcher::toReplacer() const { 79 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this); 80 UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this); 81 82 return nonconst_base; 83 } 84 85 /** 86 * Implement UnicodeMatcher 87 */ 88 UMatchDegree StringMatcher::matches(const Replaceable& text, 89 int32_t& offset, 90 int32_t limit, 91 UBool incremental) { 92 int32_t i; 93 int32_t cursor = offset; 94 if (limit < cursor) { 95 // Match in the reverse direction 96 for (i=pattern.length()-1; i>=0; --i) { 97 UChar keyChar = pattern.charAt(i); 98 UnicodeMatcher* subm = data->lookupMatcher(keyChar); 99 if (subm == 0) { 100 if (cursor > limit && 101 keyChar == text.charAt(cursor)) { 102 --cursor; 103 } else { 104 return U_MISMATCH; 105 } 106 } else { 107 UMatchDegree m = 108 subm->matches(text, cursor, limit, incremental); 109 if (m != U_MATCH) { 110 return m; 111 } 112 } 113 } 114 // Record the match position, but adjust for a normal 115 // forward start, limit, and only if a prior match does not 116 // exist -- we want the rightmost match. 117 if (matchStart < 0) { 118 matchStart = cursor+1; 119 matchLimit = offset+1; 120 } 121 } else { 122 for (i=0; i<pattern.length(); ++i) { 123 if (incremental && cursor == limit) { 124 // We've reached the context limit without a mismatch and 125 // without completing our match. 126 return U_PARTIAL_MATCH; 127 } 128 UChar keyChar = pattern.charAt(i); 129 UnicodeMatcher* subm = data->lookupMatcher(keyChar); 130 if (subm == 0) { 131 // Don't need the cursor < limit check if 132 // incremental is TRUE (because it's done above); do need 133 // it otherwise. 134 if (cursor < limit && 135 keyChar == text.charAt(cursor)) { 136 ++cursor; 137 } else { 138 return U_MISMATCH; 139 } 140 } else { 141 UMatchDegree m = 142 subm->matches(text, cursor, limit, incremental); 143 if (m != U_MATCH) { 144 return m; 145 } 146 } 147 } 148 // Record the match position 149 matchStart = offset; 150 matchLimit = cursor; 151 } 152 153 offset = cursor; 154 return U_MATCH; 155 } 156 157 /** 158 * Implement UnicodeMatcher 159 */ 160 UnicodeString& StringMatcher::toPattern(UnicodeString& result, 161 UBool escapeUnprintable) const 162 { 163 result.truncate(0); 164 UnicodeString str, quoteBuf; 165 if (segmentNumber > 0) { 166 result.append((UChar)40); /*(*/ 167 } 168 for (int32_t i=0; i<pattern.length(); ++i) { 169 UChar keyChar = pattern.charAt(i); 170 const UnicodeMatcher* m = data->lookupMatcher(keyChar); 171 if (m == 0) { 172 ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf); 173 } else { 174 ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable), 175 TRUE, escapeUnprintable, quoteBuf); 176 } 177 } 178 if (segmentNumber > 0) { 179 result.append((UChar)41); /*)*/ 180 } 181 // Flush quoteBuf out to result 182 ICU_Utility::appendToRule(result, -1, 183 TRUE, escapeUnprintable, quoteBuf); 184 return result; 185 } 186 187 /** 188 * Implement UnicodeMatcher 189 */ 190 UBool StringMatcher::matchesIndexValue(uint8_t v) const { 191 if (pattern.length() == 0) { 192 return TRUE; 193 } 194 UChar32 c = pattern.char32At(0); 195 const UnicodeMatcher *m = data->lookupMatcher(c); 196 return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v); 197 } 198 199 /** 200 * Implement UnicodeMatcher 201 */ 202 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const { 203 UChar32 ch; 204 for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) { 205 ch = pattern.char32At(i); 206 const UnicodeMatcher* matcher = data->lookupMatcher(ch); 207 if (matcher == NULL) { 208 toUnionTo.add(ch); 209 } else { 210 matcher->addMatchSetTo(toUnionTo); 211 } 212 } 213 } 214 215 /** 216 * UnicodeReplacer API 217 */ 218 int32_t StringMatcher::replace(Replaceable& text, 219 int32_t start, 220 int32_t limit, 221 int32_t& /*cursor*/) { 222 223 int32_t outLen = 0; 224 225 // Copy segment with out-of-band data 226 int32_t dest = limit; 227 // If there was no match, that means that a quantifier 228 // matched zero-length. E.g., x (a)* y matched "xy". 229 if (matchStart >= 0) { 230 if (matchStart != matchLimit) { 231 text.copy(matchStart, matchLimit, dest); 232 outLen = matchLimit - matchStart; 233 } 234 } 235 236 text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text 237 238 return outLen; 239 } 240 241 /** 242 * UnicodeReplacer API 243 */ 244 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule, 245 UBool /*escapeUnprintable*/) const { 246 // assert(segmentNumber > 0); 247 rule.truncate(0); 248 rule.append((UChar)0x0024 /*$*/); 249 ICU_Utility::appendNumber(rule, segmentNumber, 10, 1); 250 return rule; 251 } 252 253 /** 254 * Remove any match info. This must be called before performing a 255 * set of matches with this segment. 256 */ 257 void StringMatcher::resetMatch() { 258 matchStart = matchLimit = -1; 259 } 260 261 /** 262 * Union the set of all characters that may output by this object 263 * into the given set. 264 * @param toUnionTo the set into which to union the output characters 265 */ 266 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const { 267 // The output of this replacer varies; it is the source text between 268 // matchStart and matchLimit. Since this varies depending on the 269 // input text, we can't compute it here. We can either do nothing 270 // or we can add ALL characters to the set. It's probably more useful 271 // to do nothing. 272 } 273 274 /** 275 * Implement UnicodeFunctor 276 */ 277 void StringMatcher::setData(const TransliterationRuleData* d) { 278 data = d; 279 int32_t i = 0; 280 while (i<pattern.length()) { 281 UChar32 c = pattern.char32At(i); 282 UnicodeFunctor* f = data->lookup(c); 283 if (f != NULL) { 284 f->setData(data); 285 } 286 i += U16_LENGTH(c); 287 } 288 } 289 290 U_NAMESPACE_END 291 292 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 293 294 //eof 295