1 /* 2 ********************************************************************** 3 * Copyright (c) 2002-2004, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 01/21/2002 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "strrepl.h" 16 #include "rbt_data.h" 17 #include "util.h" 18 #include "unicode/uniset.h" 19 20 U_NAMESPACE_BEGIN 21 22 static const UChar EMPTY[] = { 0 }; // empty string: "" 23 24 UnicodeReplacer::~UnicodeReplacer() {} 25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) 26 27 /** 28 * Construct a StringReplacer that sets the emits the given output 29 * text and sets the cursor to the given position. 30 * @param theOutput text that will replace input text when the 31 * replace() method is called. May contain stand-in characters 32 * that represent nested replacers. 33 * @param theCursorPos cursor position that will be returned by 34 * the replace() method 35 * @param theData transliterator context object that translates 36 * stand-in characters to UnicodeReplacer objects 37 */ 38 StringReplacer::StringReplacer(const UnicodeString& theOutput, 39 int32_t theCursorPos, 40 const TransliterationRuleData* theData) { 41 output = theOutput; 42 cursorPos = theCursorPos; 43 hasCursor = TRUE; 44 data = theData; 45 isComplex = TRUE; 46 } 47 48 /** 49 * Construct a StringReplacer that sets the emits the given output 50 * text and does not modify the cursor. 51 * @param theOutput text that will replace input text when the 52 * replace() method is called. May contain stand-in characters 53 * that represent nested replacers. 54 * @param theData transliterator context object that translates 55 * stand-in characters to UnicodeReplacer objects 56 */ 57 StringReplacer::StringReplacer(const UnicodeString& theOutput, 58 const TransliterationRuleData* theData) { 59 output = theOutput; 60 cursorPos = 0; 61 hasCursor = FALSE; 62 data = theData; 63 isComplex = TRUE; 64 } 65 66 /** 67 * Copy constructor. 68 */ 69 StringReplacer::StringReplacer(const StringReplacer& other) : 70 UnicodeFunctor(other), 71 UnicodeReplacer(other) 72 { 73 output = other.output; 74 cursorPos = other.cursorPos; 75 hasCursor = other.hasCursor; 76 data = other.data; 77 isComplex = other.isComplex; 78 } 79 80 /** 81 * Destructor 82 */ 83 StringReplacer::~StringReplacer() { 84 } 85 86 /** 87 * Implement UnicodeFunctor 88 */ 89 UnicodeFunctor* StringReplacer::clone() const { 90 return new StringReplacer(*this); 91 } 92 93 /** 94 * Implement UnicodeFunctor 95 */ 96 UnicodeReplacer* StringReplacer::toReplacer() const { 97 return (UnicodeReplacer*) this; 98 } 99 100 /** 101 * UnicodeReplacer API 102 */ 103 int32_t StringReplacer::replace(Replaceable& text, 104 int32_t start, 105 int32_t limit, 106 int32_t& cursor) { 107 int32_t outLen; 108 int32_t newStart = 0; 109 110 // NOTE: It should be possible to _always_ run the complex 111 // processing code; just slower. If not, then there is a bug 112 // in the complex processing code. 113 114 // Simple (no nested replacers) Processing Code : 115 if (!isComplex) { 116 text.handleReplaceBetween(start, limit, output); 117 outLen = output.length(); 118 119 // Setup default cursor position (for cursorPos within output) 120 newStart = cursorPos; 121 } 122 123 // Complex (nested replacers) Processing Code : 124 else { 125 /* When there are segments to be copied, use the Replaceable.copy() 126 * API in order to retain out-of-band data. Copy everything to the 127 * end of the string, then copy them back over the key. This preserves 128 * the integrity of indices into the key and surrounding context while 129 * generating the output text. 130 */ 131 UnicodeString buf; 132 int32_t oOutput; // offset into 'output' 133 isComplex = FALSE; 134 135 // The temporary buffer starts at tempStart, and extends 136 // to destLimit. The start of the buffer has a single 137 // character from before the key. This provides style 138 // data when addition characters are filled into the 139 // temporary buffer. If there is nothing to the left, use 140 // the non-character U+FFFF, which Replaceable subclasses 141 // should treat specially as a "no-style character." 142 // destStart points to the point after the style context 143 // character, so it is tempStart+1 or tempStart+2. 144 int32_t tempStart = text.length(); // start of temp buffer 145 int32_t destStart = tempStart; // copy new text to here 146 if (start > 0) { 147 int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1)); 148 text.copy(start-len, start, tempStart); 149 destStart += len; 150 } else { 151 UnicodeString str((UChar) 0xFFFF); 152 text.handleReplaceBetween(tempStart, tempStart, str); 153 destStart++; 154 } 155 int32_t destLimit = destStart; 156 157 for (oOutput=0; oOutput<output.length(); ) { 158 if (oOutput == cursorPos) { 159 // Record the position of the cursor 160 newStart = destLimit - destStart; // relative to start 161 } 162 UChar32 c = output.char32At(oOutput); 163 UnicodeReplacer* r = data->lookupReplacer(c); 164 if (r == NULL) { 165 // Accumulate straight (non-segment) text. 166 buf.append(c); 167 } else { 168 isComplex = TRUE; 169 170 // Insert any accumulated straight text. 171 if (buf.length() > 0) { 172 text.handleReplaceBetween(destLimit, destLimit, buf); 173 destLimit += buf.length(); 174 buf.truncate(0); 175 } 176 177 // Delegate output generation to replacer object 178 int32_t len = r->replace(text, destLimit, destLimit, cursor); 179 destLimit += len; 180 } 181 oOutput += UTF_CHAR_LENGTH(c); 182 } 183 // Insert any accumulated straight text. 184 if (buf.length() > 0) { 185 text.handleReplaceBetween(destLimit, destLimit, buf); 186 destLimit += buf.length(); 187 } 188 if (oOutput == cursorPos) { 189 // Record the position of the cursor 190 newStart = destLimit - destStart; // relative to start 191 } 192 193 outLen = destLimit - destStart; 194 195 // Copy new text to start, and delete it 196 text.copy(destStart, destLimit, start); 197 text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY); 198 199 // Delete the old text (the key) 200 text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY); 201 } 202 203 if (hasCursor) { 204 // Adjust the cursor for positions outside the key. These 205 // refer to code points rather than code units. If cursorPos 206 // is within the output string, then use newStart, which has 207 // already been set above. 208 if (cursorPos < 0) { 209 newStart = start; 210 int32_t n = cursorPos; 211 // Outside the output string, cursorPos counts code points 212 while (n < 0 && newStart > 0) { 213 newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1)); 214 ++n; 215 } 216 newStart += n; 217 } else if (cursorPos > output.length()) { 218 newStart = start + outLen; 219 int32_t n = cursorPos - output.length(); 220 // Outside the output string, cursorPos counts code points 221 while (n > 0 && newStart < text.length()) { 222 newStart += UTF_CHAR_LENGTH(text.char32At(newStart)); 223 --n; 224 } 225 newStart += n; 226 } else { 227 // Cursor is within output string. It has been set up above 228 // to be relative to start. 229 newStart += start; 230 } 231 232 cursor = newStart; 233 } 234 235 return outLen; 236 } 237 238 /** 239 * UnicodeReplacer API 240 */ 241 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, 242 UBool escapeUnprintable) const { 243 rule.truncate(0); 244 UnicodeString quoteBuf; 245 246 int32_t cursor = cursorPos; 247 248 // Handle a cursor preceding the output 249 if (hasCursor && cursor < 0) { 250 while (cursor++ < 0) { 251 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); 252 } 253 // Fall through and append '|' below 254 } 255 256 for (int32_t i=0; i<output.length(); ++i) { 257 if (hasCursor && i == cursor) { 258 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); 259 } 260 UChar c = output.charAt(i); // Ok to use 16-bits here 261 262 UnicodeReplacer* r = data->lookupReplacer(c); 263 if (r == NULL) { 264 ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); 265 } else { 266 UnicodeString buf; 267 r->toReplacerPattern(buf, escapeUnprintable); 268 buf.insert(0, (UChar)0x20); 269 buf.append((UChar)0x20); 270 ICU_Utility::appendToRule(rule, buf, 271 TRUE, escapeUnprintable, quoteBuf); 272 } 273 } 274 275 // Handle a cursor after the output. Use > rather than >= because 276 // if cursor == output.length() it is at the end of the output, 277 // which is the default position, so we need not emit it. 278 if (hasCursor && cursor > output.length()) { 279 cursor -= output.length(); 280 while (cursor-- > 0) { 281 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); 282 } 283 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); 284 } 285 // Flush quoteBuf out to result 286 ICU_Utility::appendToRule(rule, -1, 287 TRUE, escapeUnprintable, quoteBuf); 288 289 return rule; 290 } 291 292 /** 293 * Implement UnicodeReplacer 294 */ 295 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { 296 UChar32 ch; 297 for (int32_t i=0; i<output.length(); i+=UTF_CHAR_LENGTH(ch)) { 298 ch = output.char32At(i); 299 UnicodeReplacer* r = data->lookupReplacer(ch); 300 if (r == NULL) { 301 toUnionTo.add(ch); 302 } else { 303 r->addReplacementSetTo(toUnionTo); 304 } 305 } 306 } 307 308 /** 309 * UnicodeFunctor API 310 */ 311 void StringReplacer::setData(const TransliterationRuleData* d) { 312 data = d; 313 int32_t i = 0; 314 while (i<output.length()) { 315 UChar32 c = output.char32At(i); 316 UnicodeFunctor* f = data->lookup(c); 317 if (f != NULL) { 318 f->setData(data); 319 } 320 i += UTF_CHAR_LENGTH(c); 321 } 322 } 323 324 U_NAMESPACE_END 325 326 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 327 328 //eof 329