1 /* 2 ********************************************************************** 3 * Copyright (C) 1999-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 11/17/99 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "unicode/rep.h" 16 #include "unicode/unifilt.h" 17 #include "unicode/uniset.h" 18 #include "unicode/utf16.h" 19 #include "rbt_rule.h" 20 #include "rbt_data.h" 21 #include "cmemory.h" 22 #include "strmatch.h" 23 #include "strrepl.h" 24 #include "util.h" 25 #include "putilimp.h" 26 27 static const UChar FORWARD_OP[] = {32,62,32,0}; // " > " 28 29 U_NAMESPACE_BEGIN 30 31 /** 32 * Construct a new rule with the given input, output text, and other 33 * attributes. A cursor position may be specified for the output text. 34 * @param input input string, including key and optional ante and 35 * post context 36 * @param anteContextPos offset into input to end of ante context, or -1 if 37 * none. Must be <= input.length() if not -1. 38 * @param postContextPos offset into input to start of post context, or -1 39 * if none. Must be <= input.length() if not -1, and must be >= 40 * anteContextPos. 41 * @param output output string 42 * @param cursorPosition offset into output at which cursor is located, or -1 if 43 * none. If less than zero, then the cursor is placed after the 44 * <code>output</code>; that is, -1 is equivalent to 45 * <code>output.length()</code>. If greater than 46 * <code>output.length()</code> then an exception is thrown. 47 * @param segs array of UnicodeFunctors corresponding to input pattern 48 * segments, or null if there are none. The array itself is adopted, 49 * but the pointers within it are not. 50 * @param segsCount number of elements in segs[] 51 * @param anchorStart TRUE if the the rule is anchored on the left to 52 * the context start 53 * @param anchorEnd TRUE if the rule is anchored on the right to the 54 * context limit 55 */ 56 TransliterationRule::TransliterationRule(const UnicodeString& input, 57 int32_t anteContextPos, int32_t postContextPos, 58 const UnicodeString& outputStr, 59 int32_t cursorPosition, int32_t cursorOffset, 60 UnicodeFunctor** segs, 61 int32_t segsCount, 62 UBool anchorStart, UBool anchorEnd, 63 const TransliterationRuleData* theData, 64 UErrorCode& status) : 65 UMemory(), 66 segments(0), 67 data(theData) { 68 69 if (U_FAILURE(status)) { 70 return; 71 } 72 // Do range checks only when warranted to save time 73 if (anteContextPos < 0) { 74 anteContextLength = 0; 75 } else { 76 if (anteContextPos > input.length()) { 77 // throw new IllegalArgumentException("Invalid ante context"); 78 status = U_ILLEGAL_ARGUMENT_ERROR; 79 return; 80 } 81 anteContextLength = anteContextPos; 82 } 83 if (postContextPos < 0) { 84 keyLength = input.length() - anteContextLength; 85 } else { 86 if (postContextPos < anteContextLength || 87 postContextPos > input.length()) { 88 // throw new IllegalArgumentException("Invalid post context"); 89 status = U_ILLEGAL_ARGUMENT_ERROR; 90 return; 91 } 92 keyLength = postContextPos - anteContextLength; 93 } 94 if (cursorPosition < 0) { 95 cursorPosition = outputStr.length(); 96 } else if (cursorPosition > outputStr.length()) { 97 // throw new IllegalArgumentException("Invalid cursor position"); 98 status = U_ILLEGAL_ARGUMENT_ERROR; 99 return; 100 } 101 // We don't validate the segments array. The caller must 102 // guarantee that the segments are well-formed (that is, that 103 // all $n references in the output refer to indices of this 104 // array, and that no array elements are null). 105 this->segments = segs; 106 this->segmentsCount = segsCount; 107 108 pattern = input; 109 flags = 0; 110 if (anchorStart) { 111 flags |= ANCHOR_START; 112 } 113 if (anchorEnd) { 114 flags |= ANCHOR_END; 115 } 116 117 anteContext = NULL; 118 if (anteContextLength > 0) { 119 anteContext = new StringMatcher(pattern, 0, anteContextLength, 120 FALSE, *data); 121 /* test for NULL */ 122 if (anteContext == 0) { 123 status = U_MEMORY_ALLOCATION_ERROR; 124 return; 125 } 126 } 127 128 key = NULL; 129 if (keyLength > 0) { 130 key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength, 131 FALSE, *data); 132 /* test for NULL */ 133 if (key == 0) { 134 status = U_MEMORY_ALLOCATION_ERROR; 135 return; 136 } 137 } 138 139 int32_t postContextLength = pattern.length() - keyLength - anteContextLength; 140 postContext = NULL; 141 if (postContextLength > 0) { 142 postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), 143 FALSE, *data); 144 /* test for NULL */ 145 if (postContext == 0) { 146 status = U_MEMORY_ALLOCATION_ERROR; 147 return; 148 } 149 } 150 151 this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); 152 /* test for NULL */ 153 if (this->output == 0) { 154 status = U_MEMORY_ALLOCATION_ERROR; 155 return; 156 } 157 } 158 159 /** 160 * Copy constructor. 161 */ 162 TransliterationRule::TransliterationRule(TransliterationRule& other) : 163 UMemory(other), 164 anteContext(NULL), 165 key(NULL), 166 postContext(NULL), 167 pattern(other.pattern), 168 anteContextLength(other.anteContextLength), 169 keyLength(other.keyLength), 170 flags(other.flags), 171 data(other.data) { 172 173 segments = NULL; 174 segmentsCount = 0; 175 if (other.segmentsCount > 0) { 176 segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *)); 177 uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0])); 178 } 179 180 if (other.anteContext != NULL) { 181 anteContext = (StringMatcher*) other.anteContext->clone(); 182 } 183 if (other.key != NULL) { 184 key = (StringMatcher*) other.key->clone(); 185 } 186 if (other.postContext != NULL) { 187 postContext = (StringMatcher*) other.postContext->clone(); 188 } 189 output = other.output->clone(); 190 } 191 192 TransliterationRule::~TransliterationRule() { 193 uprv_free(segments); 194 delete anteContext; 195 delete key; 196 delete postContext; 197 delete output; 198 } 199 200 /** 201 * Return the preceding context length. This method is needed to 202 * support the <code>Transliterator</code> method 203 * <code>getMaximumContextLength()</code>. Internally, this is 204 * implemented as the anteContextLength, optionally plus one if 205 * there is a start anchor. The one character anchor gap is 206 * needed to make repeated incremental transliteration with 207 * anchors work. 208 */ 209 int32_t TransliterationRule::getContextLength(void) const { 210 return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0); 211 } 212 213 /** 214 * Internal method. Returns 8-bit index value for this rule. 215 * This is the low byte of the first character of the key, 216 * unless the first character of the key is a set. If it's a 217 * set, or otherwise can match multiple keys, the index value is -1. 218 */ 219 int16_t TransliterationRule::getIndexValue() const { 220 if (anteContextLength == pattern.length()) { 221 // A pattern with just ante context {such as foo)>bar} can 222 // match any key. 223 return -1; 224 } 225 UChar32 c = pattern.char32At(anteContextLength); 226 return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); 227 } 228 229 /** 230 * Internal method. Returns true if this rule matches the given 231 * index value. The index value is an 8-bit integer, 0..255, 232 * representing the low byte of the first character of the key. 233 * It matches this rule if it matches the first character of the 234 * key, or if the first character of the key is a set, and the set 235 * contains any character with a low byte equal to the index 236 * value. If the rule contains only ante context, as in foo)>bar, 237 * then it will match any key. 238 */ 239 UBool TransliterationRule::matchesIndexValue(uint8_t v) const { 240 // Delegate to the key, or if there is none, to the postContext. 241 // If there is neither then we match any key; return true. 242 UnicodeMatcher *m = (key != NULL) ? key : postContext; 243 return (m != NULL) ? m->matchesIndexValue(v) : TRUE; 244 } 245 246 /** 247 * Return true if this rule masks another rule. If r1 masks r2 then 248 * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks 249 * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". 250 * "[c]a>x" masks "[dc]a>y". 251 */ 252 UBool TransliterationRule::masks(const TransliterationRule& r2) const { 253 /* Rule r1 masks rule r2 if the string formed of the 254 * antecontext, key, and postcontext overlaps in the following 255 * way: 256 * 257 * r1: aakkkpppp 258 * r2: aaakkkkkpppp 259 * ^ 260 * 261 * The strings must be aligned at the first character of the 262 * key. The length of r1 to the left of the alignment point 263 * must be <= the length of r2 to the left; ditto for the 264 * right. The characters of r1 must equal (or be a superset 265 * of) the corresponding characters of r2. The superset 266 * operation should be performed to check for UnicodeSet 267 * masking. 268 * 269 * Anchors: Two patterns that differ only in anchors only 270 * mask one another if they are exactly equal, and r2 has 271 * all the anchors r1 has (optionally, plus some). Here Y 272 * means the row masks the column, N means it doesn't. 273 * 274 * ab ^ab ab$ ^ab$ 275 * ab Y Y Y Y 276 * ^ab N Y N Y 277 * ab$ N N Y Y 278 * ^ab$ N N N Y 279 * 280 * Post context: {a}b masks ab, but not vice versa, since {a}b 281 * matches everything ab matches, and {a}b matches {|a|}b but ab 282 * does not. Pre context is different (a{b} does not align with 283 * ab). 284 */ 285 286 /* LIMITATION of the current mask algorithm: Some rule 287 * maskings are currently not detected. For example, 288 * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO 289 */ 290 291 int32_t len = pattern.length(); 292 int32_t left = anteContextLength; 293 int32_t left2 = r2.anteContextLength; 294 int32_t right = len - left; 295 int32_t right2 = r2.pattern.length() - left2; 296 int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern); 297 298 // TODO Clean this up -- some logic might be combinable with the 299 // next statement. 300 301 // Test for anchor masking 302 if (left == left2 && right == right2 && 303 keyLength <= r2.keyLength && 304 0 == cachedCompare) { 305 // The following boolean logic implements the table above 306 return (flags == r2.flags) || 307 (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || 308 ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); 309 } 310 311 return left <= left2 && 312 (right < right2 || 313 (right == right2 && keyLength <= r2.keyLength)) && 314 (0 == cachedCompare); 315 } 316 317 static inline int32_t posBefore(const Replaceable& str, int32_t pos) { 318 return (pos > 0) ? 319 pos - U16_LENGTH(str.char32At(pos-1)) : 320 pos - 1; 321 } 322 323 static inline int32_t posAfter(const Replaceable& str, int32_t pos) { 324 return (pos >= 0 && pos < str.length()) ? 325 pos + U16_LENGTH(str.char32At(pos)) : 326 pos + 1; 327 } 328 329 /** 330 * Attempt a match and replacement at the given position. Return 331 * the degree of match between this rule and the given text. The 332 * degree of match may be mismatch, a partial match, or a full 333 * match. A mismatch means at least one character of the text 334 * does not match the context or key. A partial match means some 335 * context and key characters match, but the text is not long 336 * enough to match all of them. A full match means all context 337 * and key characters match. 338 * 339 * If a full match is obtained, perform a replacement, update pos, 340 * and return U_MATCH. Otherwise both text and pos are unchanged. 341 * 342 * @param text the text 343 * @param pos the position indices 344 * @param incremental if TRUE, test for partial matches that may 345 * be completed by additional text inserted at pos.limit. 346 * @return one of <code>U_MISMATCH</code>, 347 * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If 348 * incremental is FALSE then U_PARTIAL_MATCH will not be returned. 349 */ 350 UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, 351 UTransPosition& pos, 352 UBool incremental) const { 353 // Matching and replacing are done in one method because the 354 // replacement operation needs information obtained during the 355 // match. Another way to do this is to have the match method 356 // create a match result struct with relevant offsets, and to pass 357 // this into the replace method. 358 359 // ============================ MATCH =========================== 360 361 // Reset segment match data 362 if (segments != NULL) { 363 for (int32_t i=0; i<segmentsCount; ++i) { 364 ((StringMatcher*) segments[i])->resetMatch(); 365 } 366 } 367 368 // int32_t lenDelta, keyLimit; 369 int32_t keyLimit; 370 371 // ------------------------ Ante Context ------------------------ 372 373 // A mismatch in the ante context, or with the start anchor, 374 // is an outright U_MISMATCH regardless of whether we are 375 // incremental or not. 376 int32_t oText; // offset into 'text' 377 // int32_t newStart = 0; 378 int32_t minOText; 379 380 // Note (1): We process text in 16-bit code units, rather than 381 // 32-bit code points. This works because stand-ins are 382 // always in the BMP and because we are doing a literal match 383 // operation, which can be done 16-bits at a time. 384 385 int32_t anteLimit = posBefore(text, pos.contextStart); 386 387 UMatchDegree match; 388 389 // Start reverse match at char before pos.start 390 oText = posBefore(text, pos.start); 391 392 if (anteContext != NULL) { 393 match = anteContext->matches(text, oText, anteLimit, FALSE); 394 if (match != U_MATCH) { 395 return U_MISMATCH; 396 } 397 } 398 399 minOText = posAfter(text, oText); 400 401 // ------------------------ Start Anchor ------------------------ 402 403 if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { 404 return U_MISMATCH; 405 } 406 407 // -------------------- Key and Post Context -------------------- 408 409 oText = pos.start; 410 411 if (key != NULL) { 412 match = key->matches(text, oText, pos.limit, incremental); 413 if (match != U_MATCH) { 414 return match; 415 } 416 } 417 418 keyLimit = oText; 419 420 if (postContext != NULL) { 421 if (incremental && keyLimit == pos.limit) { 422 // The key matches just before pos.limit, and there is 423 // a postContext. Since we are in incremental mode, 424 // we must assume more characters may be inserted at 425 // pos.limit -- this is a partial match. 426 return U_PARTIAL_MATCH; 427 } 428 429 match = postContext->matches(text, oText, pos.contextLimit, incremental); 430 if (match != U_MATCH) { 431 return match; 432 } 433 } 434 435 // ------------------------- Stop Anchor ------------------------ 436 437 if (((flags & ANCHOR_END)) != 0) { 438 if (oText != pos.contextLimit) { 439 return U_MISMATCH; 440 } 441 if (incremental) { 442 return U_PARTIAL_MATCH; 443 } 444 } 445 446 // =========================== REPLACE ========================== 447 448 // We have a full match. The key is between pos.start and 449 // keyLimit. 450 451 int32_t newStart; 452 int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); 453 int32_t lenDelta = newLength - (keyLimit - pos.start); 454 455 oText += lenDelta; 456 pos.limit += lenDelta; 457 pos.contextLimit += lenDelta; 458 // Restrict new value of start to [minOText, min(oText, pos.limit)]. 459 pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); 460 return U_MATCH; 461 } 462 463 /** 464 * Create a source string that represents this rule. Append it to the 465 * given string. 466 */ 467 UnicodeString& TransliterationRule::toRule(UnicodeString& rule, 468 UBool escapeUnprintable) const { 469 470 // Accumulate special characters (and non-specials following them) 471 // into quoteBuf. Append quoteBuf, within single quotes, when 472 // a non-quoted element must be inserted. 473 UnicodeString str, quoteBuf; 474 475 // Do not emit the braces '{' '}' around the pattern if there 476 // is neither anteContext nor postContext. 477 UBool emitBraces = 478 (anteContext != NULL) || (postContext != NULL); 479 480 // Emit start anchor 481 if ((flags & ANCHOR_START) != 0) { 482 rule.append((UChar)94/*^*/); 483 } 484 485 // Emit the input pattern 486 ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); 487 488 if (emitBraces) { 489 ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); 490 } 491 492 ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); 493 494 if (emitBraces) { 495 ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); 496 } 497 498 ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); 499 500 // Emit end anchor 501 if ((flags & ANCHOR_END) != 0) { 502 rule.append((UChar)36/*$*/); 503 } 504 505 ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf); 506 507 // Emit the output pattern 508 509 ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), 510 TRUE, escapeUnprintable, quoteBuf); 511 512 ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); 513 514 return rule; 515 } 516 517 void TransliterationRule::setData(const TransliterationRuleData* d) { 518 data = d; 519 if (anteContext != NULL) anteContext->setData(d); 520 if (postContext != NULL) postContext->setData(d); 521 if (key != NULL) key->setData(d); 522 // assert(output != NULL); 523 output->setData(d); 524 // Don't have to do segments since they are in the context or key 525 } 526 527 /** 528 * Union the set of all characters that may be modified by this rule 529 * into the given set. 530 */ 531 void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const { 532 int32_t limit = anteContextLength + keyLength; 533 for (int32_t i=anteContextLength; i<limit; ) { 534 UChar32 ch = pattern.char32At(i); 535 i += U16_LENGTH(ch); 536 const UnicodeMatcher* matcher = data->lookupMatcher(ch); 537 if (matcher == NULL) { 538 toUnionTo.add(ch); 539 } else { 540 matcher->addMatchSetTo(toUnionTo); 541 } 542 } 543 } 544 545 /** 546 * Union the set of all characters that may be emitted by this rule 547 * into the given set. 548 */ 549 void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const { 550 output->toReplacer()->addReplacementSetTo(toUnionTo); 551 } 552 553 U_NAMESPACE_END 554 555 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 556 557 //eof 558