1 /* 2 ********************************************************************** 3 * Copyright (c) 2001-2004, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 07/23/01 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "strmatch.h" 16 #include "rbt_data.h" 17 #include "util.h" 18 #include "unicode/uniset.h" 19 20 U_NAMESPACE_BEGIN 21 22 static const UChar EMPTY[] = { 0 }; // empty string: "" 23 24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher) 25 26 StringMatcher::StringMatcher(const UnicodeString& theString, 27 int32_t start, 28 int32_t limit, 29 int32_t segmentNum, 30 const TransliterationRuleData& theData) : 31 data(&theData), 32 segmentNumber(segmentNum), 33 matchStart(-1), 34 matchLimit(-1) 35 { 36 theString.extractBetween(start, limit, pattern); 37 } 38 39 StringMatcher::StringMatcher(const StringMatcher& o) : 40 UnicodeFunctor(o), 41 UnicodeMatcher(o), 42 UnicodeReplacer(o), 43 pattern(o.pattern), 44 data(o.data), 45 segmentNumber(o.segmentNumber), 46 matchStart(o.matchStart), 47 matchLimit(o.matchLimit) 48 { 49 } 50 51 /** 52 * Destructor 53 */ 54 StringMatcher::~StringMatcher() { 55 } 56 57 /** 58 * Implement UnicodeFunctor 59 */ 60 UnicodeFunctor* StringMatcher::clone() const { 61 return new StringMatcher(*this); 62 } 63 64 /** 65 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer 66 * and return the pointer. 67 */ 68 UnicodeMatcher* StringMatcher::toMatcher() const { 69 return (UnicodeMatcher*) this; 70 } 71 72 /** 73 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer 74 * and return the pointer. 75 */ 76 UnicodeReplacer* StringMatcher::toReplacer() const { 77 return (UnicodeReplacer*) this; 78 } 79 80 /** 81 * Implement UnicodeMatcher 82 */ 83 UMatchDegree StringMatcher::matches(const Replaceable& text, 84 int32_t& offset, 85 int32_t limit, 86 UBool incremental) { 87 int32_t i; 88 int32_t cursor = offset; 89 if (limit < cursor) { 90 // Match in the reverse direction 91 for (i=pattern.length()-1; i>=0; --i) { 92 UChar keyChar = pattern.charAt(i); 93 UnicodeMatcher* subm = data->lookupMatcher(keyChar); 94 if (subm == 0) { 95 if (cursor > limit && 96 keyChar == text.charAt(cursor)) { 97 --cursor; 98 } else { 99 return U_MISMATCH; 100 } 101 } else { 102 UMatchDegree m = 103 subm->matches(text, cursor, limit, incremental); 104 if (m != U_MATCH) { 105 return m; 106 } 107 } 108 } 109 // Record the match position, but adjust for a normal 110 // forward start, limit, and only if a prior match does not 111 // exist -- we want the rightmost match. 112 if (matchStart < 0) { 113 matchStart = cursor+1; 114 matchLimit = offset+1; 115 } 116 } else { 117 for (i=0; i<pattern.length(); ++i) { 118 if (incremental && cursor == limit) { 119 // We've reached the context limit without a mismatch and 120 // without completing our match. 121 return U_PARTIAL_MATCH; 122 } 123 UChar keyChar = pattern.charAt(i); 124 UnicodeMatcher* subm = data->lookupMatcher(keyChar); 125 if (subm == 0) { 126 // Don't need the cursor < limit check if 127 // incremental is TRUE (because it's done above); do need 128 // it otherwise. 129 if (cursor < limit && 130 keyChar == text.charAt(cursor)) { 131 ++cursor; 132 } else { 133 return U_MISMATCH; 134 } 135 } else { 136 UMatchDegree m = 137 subm->matches(text, cursor, limit, incremental); 138 if (m != U_MATCH) { 139 return m; 140 } 141 } 142 } 143 // Record the match position 144 matchStart = offset; 145 matchLimit = cursor; 146 } 147 148 offset = cursor; 149 return U_MATCH; 150 } 151 152 /** 153 * Implement UnicodeMatcher 154 */ 155 UnicodeString& StringMatcher::toPattern(UnicodeString& result, 156 UBool escapeUnprintable) const 157 { 158 result.truncate(0); 159 UnicodeString str, quoteBuf; 160 if (segmentNumber > 0) { 161 result.append((UChar)40); /*(*/ 162 } 163 for (int32_t i=0; i<pattern.length(); ++i) { 164 UChar keyChar = pattern.charAt(i); 165 const UnicodeMatcher* m = data->lookupMatcher(keyChar); 166 if (m == 0) { 167 ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf); 168 } else { 169 ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable), 170 TRUE, escapeUnprintable, quoteBuf); 171 } 172 } 173 if (segmentNumber > 0) { 174 result.append((UChar)41); /*)*/ 175 } 176 // Flush quoteBuf out to result 177 ICU_Utility::appendToRule(result, -1, 178 TRUE, escapeUnprintable, quoteBuf); 179 return result; 180 } 181 182 /** 183 * Implement UnicodeMatcher 184 */ 185 UBool StringMatcher::matchesIndexValue(uint8_t v) const { 186 if (pattern.length() == 0) { 187 return TRUE; 188 } 189 UChar32 c = pattern.char32At(0); 190 const UnicodeMatcher *m = data->lookupMatcher(c); 191 return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v); 192 } 193 194 /** 195 * Implement UnicodeMatcher 196 */ 197 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const { 198 UChar32 ch; 199 for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) { 200 ch = pattern.char32At(i); 201 const UnicodeMatcher* matcher = data->lookupMatcher(ch); 202 if (matcher == NULL) { 203 toUnionTo.add(ch); 204 } else { 205 matcher->addMatchSetTo(toUnionTo); 206 } 207 } 208 } 209 210 /** 211 * UnicodeReplacer API 212 */ 213 int32_t StringMatcher::replace(Replaceable& text, 214 int32_t start, 215 int32_t limit, 216 int32_t& /*cursor*/) { 217 218 int32_t outLen = 0; 219 220 // Copy segment with out-of-band data 221 int32_t dest = limit; 222 // If there was no match, that means that a quantifier 223 // matched zero-length. E.g., x (a)* y matched "xy". 224 if (matchStart >= 0) { 225 if (matchStart != matchLimit) { 226 text.copy(matchStart, matchLimit, dest); 227 outLen = matchLimit - matchStart; 228 } 229 } 230 231 text.handleReplaceBetween(start, limit, EMPTY); // delete original text 232 233 return outLen; 234 } 235 236 /** 237 * UnicodeReplacer API 238 */ 239 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule, 240 UBool /*escapeUnprintable*/) const { 241 // assert(segmentNumber > 0); 242 rule.truncate(0); 243 rule.append((UChar)0x0024 /*$*/); 244 ICU_Utility::appendNumber(rule, segmentNumber, 10, 1); 245 return rule; 246 } 247 248 /** 249 * Remove any match info. This must be called before performing a 250 * set of matches with this segment. 251 */ 252 void StringMatcher::resetMatch() { 253 matchStart = matchLimit = -1; 254 } 255 256 /** 257 * Union the set of all characters that may output by this object 258 * into the given set. 259 * @param toUnionTo the set into which to union the output characters 260 */ 261 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const { 262 // The output of this replacer varies; it is the source text between 263 // matchStart and matchLimit. Since this varies depending on the 264 // input text, we can't compute it here. We can either do nothing 265 // or we can add ALL characters to the set. It's probably more useful 266 // to do nothing. 267 } 268 269 /** 270 * Implement UnicodeFunctor 271 */ 272 void StringMatcher::setData(const TransliterationRuleData* d) { 273 data = d; 274 int32_t i = 0; 275 while (i<pattern.length()) { 276 UChar32 c = pattern.char32At(i); 277 UnicodeFunctor* f = data->lookup(c); 278 if (f != NULL) { 279 f->setData(data); 280 } 281 i += UTF_CHAR_LENGTH(c); 282 } 283 } 284 285 U_NAMESPACE_END 286 287 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 288 289 //eof 290