1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (c) 2002-2012, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 01/21/2002 aliu Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "unicode/uniset.h" 18 #include "unicode/utf16.h" 19 #include "strrepl.h" 20 #include "rbt_data.h" 21 #include "util.h" 22 23 U_NAMESPACE_BEGIN 24 25 UnicodeReplacer::~UnicodeReplacer() {} 26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) 27 28 /** 29 * Construct a StringReplacer that sets the emits the given output 30 * text and sets the cursor to the given position. 31 * @param theOutput text that will replace input text when the 32 * replace() method is called. May contain stand-in characters 33 * that represent nested replacers. 34 * @param theCursorPos cursor position that will be returned by 35 * the replace() method 36 * @param theData transliterator context object that translates 37 * stand-in characters to UnicodeReplacer objects 38 */ 39 StringReplacer::StringReplacer(const UnicodeString& theOutput, 40 int32_t theCursorPos, 41 const TransliterationRuleData* theData) { 42 output = theOutput; 43 cursorPos = theCursorPos; 44 hasCursor = TRUE; 45 data = theData; 46 isComplex = TRUE; 47 } 48 49 /** 50 * Construct a StringReplacer that sets the emits the given output 51 * text and does not modify the cursor. 52 * @param theOutput text that will replace input text when the 53 * replace() method is called. May contain stand-in characters 54 * that represent nested replacers. 55 * @param theData transliterator context object that translates 56 * stand-in characters to UnicodeReplacer objects 57 */ 58 StringReplacer::StringReplacer(const UnicodeString& theOutput, 59 const TransliterationRuleData* theData) { 60 output = theOutput; 61 cursorPos = 0; 62 hasCursor = FALSE; 63 data = theData; 64 isComplex = TRUE; 65 } 66 67 /** 68 * Copy constructor. 69 */ 70 StringReplacer::StringReplacer(const StringReplacer& other) : 71 UnicodeFunctor(other), 72 UnicodeReplacer(other) 73 { 74 output = other.output; 75 cursorPos = other.cursorPos; 76 hasCursor = other.hasCursor; 77 data = other.data; 78 isComplex = other.isComplex; 79 } 80 81 /** 82 * Destructor 83 */ 84 StringReplacer::~StringReplacer() { 85 } 86 87 /** 88 * Implement UnicodeFunctor 89 */ 90 UnicodeFunctor* StringReplacer::clone() const { 91 return new StringReplacer(*this); 92 } 93 94 /** 95 * Implement UnicodeFunctor 96 */ 97 UnicodeReplacer* StringReplacer::toReplacer() const { 98 return const_cast<StringReplacer *>(this); 99 } 100 101 /** 102 * UnicodeReplacer API 103 */ 104 int32_t StringReplacer::replace(Replaceable& text, 105 int32_t start, 106 int32_t limit, 107 int32_t& cursor) { 108 int32_t outLen; 109 int32_t newStart = 0; 110 111 // NOTE: It should be possible to _always_ run the complex 112 // processing code; just slower. If not, then there is a bug 113 // in the complex processing code. 114 115 // Simple (no nested replacers) Processing Code : 116 if (!isComplex) { 117 text.handleReplaceBetween(start, limit, output); 118 outLen = output.length(); 119 120 // Setup default cursor position (for cursorPos within output) 121 newStart = cursorPos; 122 } 123 124 // Complex (nested replacers) Processing Code : 125 else { 126 /* When there are segments to be copied, use the Replaceable.copy() 127 * API in order to retain out-of-band data. Copy everything to the 128 * end of the string, then copy them back over the key. This preserves 129 * the integrity of indices into the key and surrounding context while 130 * generating the output text. 131 */ 132 UnicodeString buf; 133 int32_t oOutput; // offset into 'output' 134 isComplex = FALSE; 135 136 // The temporary buffer starts at tempStart, and extends 137 // to destLimit. The start of the buffer has a single 138 // character from before the key. This provides style 139 // data when addition characters are filled into the 140 // temporary buffer. If there is nothing to the left, use 141 // the non-character U+FFFF, which Replaceable subclasses 142 // should treat specially as a "no-style character." 143 // destStart points to the point after the style context 144 // character, so it is tempStart+1 or tempStart+2. 145 int32_t tempStart = text.length(); // start of temp buffer 146 int32_t destStart = tempStart; // copy new text to here 147 if (start > 0) { 148 int32_t len = U16_LENGTH(text.char32At(start-1)); 149 text.copy(start-len, start, tempStart); 150 destStart += len; 151 } else { 152 UnicodeString str((UChar) 0xFFFF); 153 text.handleReplaceBetween(tempStart, tempStart, str); 154 destStart++; 155 } 156 int32_t destLimit = destStart; 157 158 for (oOutput=0; oOutput<output.length(); ) { 159 if (oOutput == cursorPos) { 160 // Record the position of the cursor 161 newStart = destLimit - destStart; // relative to start 162 } 163 UChar32 c = output.char32At(oOutput); 164 UnicodeReplacer* r = data->lookupReplacer(c); 165 if (r == NULL) { 166 // Accumulate straight (non-segment) text. 167 buf.append(c); 168 } else { 169 isComplex = TRUE; 170 171 // Insert any accumulated straight text. 172 if (buf.length() > 0) { 173 text.handleReplaceBetween(destLimit, destLimit, buf); 174 destLimit += buf.length(); 175 buf.truncate(0); 176 } 177 178 // Delegate output generation to replacer object 179 int32_t len = r->replace(text, destLimit, destLimit, cursor); 180 destLimit += len; 181 } 182 oOutput += U16_LENGTH(c); 183 } 184 // Insert any accumulated straight text. 185 if (buf.length() > 0) { 186 text.handleReplaceBetween(destLimit, destLimit, buf); 187 destLimit += buf.length(); 188 } 189 if (oOutput == cursorPos) { 190 // Record the position of the cursor 191 newStart = destLimit - destStart; // relative to start 192 } 193 194 outLen = destLimit - destStart; 195 196 // Copy new text to start, and delete it 197 text.copy(destStart, destLimit, start); 198 text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); 199 200 // Delete the old text (the key) 201 text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString()); 202 } 203 204 if (hasCursor) { 205 // Adjust the cursor for positions outside the key. These 206 // refer to code points rather than code units. If cursorPos 207 // is within the output string, then use newStart, which has 208 // already been set above. 209 if (cursorPos < 0) { 210 newStart = start; 211 int32_t n = cursorPos; 212 // Outside the output string, cursorPos counts code points 213 while (n < 0 && newStart > 0) { 214 newStart -= U16_LENGTH(text.char32At(newStart-1)); 215 ++n; 216 } 217 newStart += n; 218 } else if (cursorPos > output.length()) { 219 newStart = start + outLen; 220 int32_t n = cursorPos - output.length(); 221 // Outside the output string, cursorPos counts code points 222 while (n > 0 && newStart < text.length()) { 223 newStart += U16_LENGTH(text.char32At(newStart)); 224 --n; 225 } 226 newStart += n; 227 } else { 228 // Cursor is within output string. It has been set up above 229 // to be relative to start. 230 newStart += start; 231 } 232 233 cursor = newStart; 234 } 235 236 return outLen; 237 } 238 239 /** 240 * UnicodeReplacer API 241 */ 242 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, 243 UBool escapeUnprintable) const { 244 rule.truncate(0); 245 UnicodeString quoteBuf; 246 247 int32_t cursor = cursorPos; 248 249 // Handle a cursor preceding the output 250 if (hasCursor && cursor < 0) { 251 while (cursor++ < 0) { 252 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); 253 } 254 // Fall through and append '|' below 255 } 256 257 for (int32_t i=0; i<output.length(); ++i) { 258 if (hasCursor && i == cursor) { 259 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); 260 } 261 UChar c = output.charAt(i); // Ok to use 16-bits here 262 263 UnicodeReplacer* r = data->lookupReplacer(c); 264 if (r == NULL) { 265 ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); 266 } else { 267 UnicodeString buf; 268 r->toReplacerPattern(buf, escapeUnprintable); 269 buf.insert(0, (UChar)0x20); 270 buf.append((UChar)0x20); 271 ICU_Utility::appendToRule(rule, buf, 272 TRUE, escapeUnprintable, quoteBuf); 273 } 274 } 275 276 // Handle a cursor after the output. Use > rather than >= because 277 // if cursor == output.length() it is at the end of the output, 278 // which is the default position, so we need not emit it. 279 if (hasCursor && cursor > output.length()) { 280 cursor -= output.length(); 281 while (cursor-- > 0) { 282 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); 283 } 284 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); 285 } 286 // Flush quoteBuf out to result 287 ICU_Utility::appendToRule(rule, -1, 288 TRUE, escapeUnprintable, quoteBuf); 289 290 return rule; 291 } 292 293 /** 294 * Implement UnicodeReplacer 295 */ 296 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { 297 UChar32 ch; 298 for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) { 299 ch = output.char32At(i); 300 UnicodeReplacer* r = data->lookupReplacer(ch); 301 if (r == NULL) { 302 toUnionTo.add(ch); 303 } else { 304 r->addReplacementSetTo(toUnionTo); 305 } 306 } 307 } 308 309 /** 310 * UnicodeFunctor API 311 */ 312 void StringReplacer::setData(const TransliterationRuleData* d) { 313 data = d; 314 int32_t i = 0; 315 while (i<output.length()) { 316 UChar32 c = output.char32At(i); 317 UnicodeFunctor* f = data->lookup(c); 318 if (f != NULL) { 319 f->setData(data); 320 } 321 i += U16_LENGTH(c); 322 } 323 } 324 325 U_NAMESPACE_END 326 327 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 328 329 //eof 330