1 /* 2 ********************************************************************** 3 * Copyright (c) 2002-2012, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 01/21/2002 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "unicode/uniset.h" 16 #include "unicode/utf16.h" 17 #include "strrepl.h" 18 #include "rbt_data.h" 19 #include "util.h" 20 21 U_NAMESPACE_BEGIN 22 23 UnicodeReplacer::~UnicodeReplacer() {} 24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) 25 26 /** 27 * Construct a StringReplacer that sets the emits the given output 28 * text and sets the cursor to the given position. 29 * @param theOutput text that will replace input text when the 30 * replace() method is called. May contain stand-in characters 31 * that represent nested replacers. 32 * @param theCursorPos cursor position that will be returned by 33 * the replace() method 34 * @param theData transliterator context object that translates 35 * stand-in characters to UnicodeReplacer objects 36 */ 37 StringReplacer::StringReplacer(const UnicodeString& theOutput, 38 int32_t theCursorPos, 39 const TransliterationRuleData* theData) { 40 output = theOutput; 41 cursorPos = theCursorPos; 42 hasCursor = TRUE; 43 data = theData; 44 isComplex = TRUE; 45 } 46 47 /** 48 * Construct a StringReplacer that sets the emits the given output 49 * text and does not modify the cursor. 50 * @param theOutput text that will replace input text when the 51 * replace() method is called. May contain stand-in characters 52 * that represent nested replacers. 53 * @param theData transliterator context object that translates 54 * stand-in characters to UnicodeReplacer objects 55 */ 56 StringReplacer::StringReplacer(const UnicodeString& theOutput, 57 const TransliterationRuleData* theData) { 58 output = theOutput; 59 cursorPos = 0; 60 hasCursor = FALSE; 61 data = theData; 62 isComplex = TRUE; 63 } 64 65 /** 66 * Copy constructor. 67 */ 68 StringReplacer::StringReplacer(const StringReplacer& other) : 69 UnicodeFunctor(other), 70 UnicodeReplacer(other) 71 { 72 output = other.output; 73 cursorPos = other.cursorPos; 74 hasCursor = other.hasCursor; 75 data = other.data; 76 isComplex = other.isComplex; 77 } 78 79 /** 80 * Destructor 81 */ 82 StringReplacer::~StringReplacer() { 83 } 84 85 /** 86 * Implement UnicodeFunctor 87 */ 88 UnicodeFunctor* StringReplacer::clone() const { 89 return new StringReplacer(*this); 90 } 91 92 /** 93 * Implement UnicodeFunctor 94 */ 95 UnicodeReplacer* StringReplacer::toReplacer() const { 96 return const_cast<StringReplacer *>(this); 97 } 98 99 /** 100 * UnicodeReplacer API 101 */ 102 int32_t StringReplacer::replace(Replaceable& text, 103 int32_t start, 104 int32_t limit, 105 int32_t& cursor) { 106 int32_t outLen; 107 int32_t newStart = 0; 108 109 // NOTE: It should be possible to _always_ run the complex 110 // processing code; just slower. If not, then there is a bug 111 // in the complex processing code. 112 113 // Simple (no nested replacers) Processing Code : 114 if (!isComplex) { 115 text.handleReplaceBetween(start, limit, output); 116 outLen = output.length(); 117 118 // Setup default cursor position (for cursorPos within output) 119 newStart = cursorPos; 120 } 121 122 // Complex (nested replacers) Processing Code : 123 else { 124 /* When there are segments to be copied, use the Replaceable.copy() 125 * API in order to retain out-of-band data. Copy everything to the 126 * end of the string, then copy them back over the key. This preserves 127 * the integrity of indices into the key and surrounding context while 128 * generating the output text. 129 */ 130 UnicodeString buf; 131 int32_t oOutput; // offset into 'output' 132 isComplex = FALSE; 133 134 // The temporary buffer starts at tempStart, and extends 135 // to destLimit. The start of the buffer has a single 136 // character from before the key. This provides style 137 // data when addition characters are filled into the 138 // temporary buffer. If there is nothing to the left, use 139 // the non-character U+FFFF, which Replaceable subclasses 140 // should treat specially as a "no-style character." 141 // destStart points to the point after the style context 142 // character, so it is tempStart+1 or tempStart+2. 143 int32_t tempStart = text.length(); // start of temp buffer 144 int32_t destStart = tempStart; // copy new text to here 145 if (start > 0) { 146 int32_t len = U16_LENGTH(text.char32At(start-1)); 147 text.copy(start-len, start, tempStart); 148 destStart += len; 149 } else { 150 UnicodeString str((UChar) 0xFFFF); 151 text.handleReplaceBetween(tempStart, tempStart, str); 152 destStart++; 153 } 154 int32_t destLimit = destStart; 155 156 for (oOutput=0; oOutput<output.length(); ) { 157 if (oOutput == cursorPos) { 158 // Record the position of the cursor 159 newStart = destLimit - destStart; // relative to start 160 } 161 UChar32 c = output.char32At(oOutput); 162 UnicodeReplacer* r = data->lookupReplacer(c); 163 if (r == NULL) { 164 // Accumulate straight (non-segment) text. 165 buf.append(c); 166 } else { 167 isComplex = TRUE; 168 169 // Insert any accumulated straight text. 170 if (buf.length() > 0) { 171 text.handleReplaceBetween(destLimit, destLimit, buf); 172 destLimit += buf.length(); 173 buf.truncate(0); 174 } 175 176 // Delegate output generation to replacer object 177 int32_t len = r->replace(text, destLimit, destLimit, cursor); 178 destLimit += len; 179 } 180 oOutput += U16_LENGTH(c); 181 } 182 // Insert any accumulated straight text. 183 if (buf.length() > 0) { 184 text.handleReplaceBetween(destLimit, destLimit, buf); 185 destLimit += buf.length(); 186 } 187 if (oOutput == cursorPos) { 188 // Record the position of the cursor 189 newStart = destLimit - destStart; // relative to start 190 } 191 192 outLen = destLimit - destStart; 193 194 // Copy new text to start, and delete it 195 text.copy(destStart, destLimit, start); 196 text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); 197 198 // Delete the old text (the key) 199 text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString()); 200 } 201 202 if (hasCursor) { 203 // Adjust the cursor for positions outside the key. These 204 // refer to code points rather than code units. If cursorPos 205 // is within the output string, then use newStart, which has 206 // already been set above. 207 if (cursorPos < 0) { 208 newStart = start; 209 int32_t n = cursorPos; 210 // Outside the output string, cursorPos counts code points 211 while (n < 0 && newStart > 0) { 212 newStart -= U16_LENGTH(text.char32At(newStart-1)); 213 ++n; 214 } 215 newStart += n; 216 } else if (cursorPos > output.length()) { 217 newStart = start + outLen; 218 int32_t n = cursorPos - output.length(); 219 // Outside the output string, cursorPos counts code points 220 while (n > 0 && newStart < text.length()) { 221 newStart += U16_LENGTH(text.char32At(newStart)); 222 --n; 223 } 224 newStart += n; 225 } else { 226 // Cursor is within output string. It has been set up above 227 // to be relative to start. 228 newStart += start; 229 } 230 231 cursor = newStart; 232 } 233 234 return outLen; 235 } 236 237 /** 238 * UnicodeReplacer API 239 */ 240 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, 241 UBool escapeUnprintable) const { 242 rule.truncate(0); 243 UnicodeString quoteBuf; 244 245 int32_t cursor = cursorPos; 246 247 // Handle a cursor preceding the output 248 if (hasCursor && cursor < 0) { 249 while (cursor++ < 0) { 250 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); 251 } 252 // Fall through and append '|' below 253 } 254 255 for (int32_t i=0; i<output.length(); ++i) { 256 if (hasCursor && i == cursor) { 257 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); 258 } 259 UChar c = output.charAt(i); // Ok to use 16-bits here 260 261 UnicodeReplacer* r = data->lookupReplacer(c); 262 if (r == NULL) { 263 ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); 264 } else { 265 UnicodeString buf; 266 r->toReplacerPattern(buf, escapeUnprintable); 267 buf.insert(0, (UChar)0x20); 268 buf.append((UChar)0x20); 269 ICU_Utility::appendToRule(rule, buf, 270 TRUE, escapeUnprintable, quoteBuf); 271 } 272 } 273 274 // Handle a cursor after the output. Use > rather than >= because 275 // if cursor == output.length() it is at the end of the output, 276 // which is the default position, so we need not emit it. 277 if (hasCursor && cursor > output.length()) { 278 cursor -= output.length(); 279 while (cursor-- > 0) { 280 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); 281 } 282 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); 283 } 284 // Flush quoteBuf out to result 285 ICU_Utility::appendToRule(rule, -1, 286 TRUE, escapeUnprintable, quoteBuf); 287 288 return rule; 289 } 290 291 /** 292 * Implement UnicodeReplacer 293 */ 294 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { 295 UChar32 ch; 296 for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) { 297 ch = output.char32At(i); 298 UnicodeReplacer* r = data->lookupReplacer(ch); 299 if (r == NULL) { 300 toUnionTo.add(ch); 301 } else { 302 r->addReplacementSetTo(toUnionTo); 303 } 304 } 305 } 306 307 /** 308 * UnicodeFunctor API 309 */ 310 void StringReplacer::setData(const TransliterationRuleData* d) { 311 data = d; 312 int32_t i = 0; 313 while (i<output.length()) { 314 UChar32 c = output.char32At(i); 315 UnicodeFunctor* f = data->lookup(c); 316 if (f != NULL) { 317 f->setData(data); 318 } 319 i += U16_LENGTH(c); 320 } 321 } 322 323 U_NAMESPACE_END 324 325 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 326 327 //eof 328