1 /* 2 ********************************************************************** 3 * Copyright (C) 1999-2008, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 11/17/99 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "unicode/rep.h" 16 #include "unicode/unifilt.h" 17 #include "unicode/uniset.h" 18 #include "rbt_rule.h" 19 #include "rbt_data.h" 20 #include "cmemory.h" 21 #include "strmatch.h" 22 #include "strrepl.h" 23 #include "util.h" 24 #include "putilimp.h" 25 26 static const UChar FORWARD_OP[] = {32,62,32,0}; // " > " 27 28 U_NAMESPACE_BEGIN 29 30 /** 31 * Construct a new rule with the given input, output text, and other 32 * attributes. A cursor position may be specified for the output text. 33 * @param input input string, including key and optional ante and 34 * post context 35 * @param anteContextPos offset into input to end of ante context, or -1 if 36 * none. Must be <= input.length() if not -1. 37 * @param postContextPos offset into input to start of post context, or -1 38 * if none. Must be <= input.length() if not -1, and must be >= 39 * anteContextPos. 40 * @param output output string 41 * @param cursorPosition offset into output at which cursor is located, or -1 if 42 * none. If less than zero, then the cursor is placed after the 43 * <code>output</code>; that is, -1 is equivalent to 44 * <code>output.length()</code>. If greater than 45 * <code>output.length()</code> then an exception is thrown. 46 * @param segs array of UnicodeFunctors corresponding to input pattern 47 * segments, or null if there are none. The array itself is adopted, 48 * but the pointers within it are not. 49 * @param segsCount number of elements in segs[] 50 * @param anchorStart TRUE if the the rule is anchored on the left to 51 * the context start 52 * @param anchorEnd TRUE if the rule is anchored on the right to the 53 * context limit 54 */ 55 TransliterationRule::TransliterationRule(const UnicodeString& input, 56 int32_t anteContextPos, int32_t postContextPos, 57 const UnicodeString& outputStr, 58 int32_t cursorPosition, int32_t cursorOffset, 59 UnicodeFunctor** segs, 60 int32_t segsCount, 61 UBool anchorStart, UBool anchorEnd, 62 const TransliterationRuleData* theData, 63 UErrorCode& status) : 64 UMemory(), 65 segments(0), 66 data(theData) { 67 68 if (U_FAILURE(status)) { 69 return; 70 } 71 // Do range checks only when warranted to save time 72 if (anteContextPos < 0) { 73 anteContextLength = 0; 74 } else { 75 if (anteContextPos > input.length()) { 76 // throw new IllegalArgumentException("Invalid ante context"); 77 status = U_ILLEGAL_ARGUMENT_ERROR; 78 return; 79 } 80 anteContextLength = anteContextPos; 81 } 82 if (postContextPos < 0) { 83 keyLength = input.length() - anteContextLength; 84 } else { 85 if (postContextPos < anteContextLength || 86 postContextPos > input.length()) { 87 // throw new IllegalArgumentException("Invalid post context"); 88 status = U_ILLEGAL_ARGUMENT_ERROR; 89 return; 90 } 91 keyLength = postContextPos - anteContextLength; 92 } 93 if (cursorPosition < 0) { 94 cursorPosition = outputStr.length(); 95 } else if (cursorPosition > outputStr.length()) { 96 // throw new IllegalArgumentException("Invalid cursor position"); 97 status = U_ILLEGAL_ARGUMENT_ERROR; 98 return; 99 } 100 // We don't validate the segments array. The caller must 101 // guarantee that the segments are well-formed (that is, that 102 // all $n references in the output refer to indices of this 103 // array, and that no array elements are null). 104 this->segments = segs; 105 this->segmentsCount = segsCount; 106 107 pattern = input; 108 flags = 0; 109 if (anchorStart) { 110 flags |= ANCHOR_START; 111 } 112 if (anchorEnd) { 113 flags |= ANCHOR_END; 114 } 115 116 anteContext = NULL; 117 if (anteContextLength > 0) { 118 anteContext = new StringMatcher(pattern, 0, anteContextLength, 119 FALSE, *data); 120 /* test for NULL */ 121 if (anteContext == 0) { 122 status = U_MEMORY_ALLOCATION_ERROR; 123 return; 124 } 125 } 126 127 key = NULL; 128 if (keyLength > 0) { 129 key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength, 130 FALSE, *data); 131 /* test for NULL */ 132 if (key == 0) { 133 status = U_MEMORY_ALLOCATION_ERROR; 134 return; 135 } 136 } 137 138 int32_t postContextLength = pattern.length() - keyLength - anteContextLength; 139 postContext = NULL; 140 if (postContextLength > 0) { 141 postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), 142 FALSE, *data); 143 /* test for NULL */ 144 if (postContext == 0) { 145 status = U_MEMORY_ALLOCATION_ERROR; 146 return; 147 } 148 } 149 150 this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); 151 /* test for NULL */ 152 if (this->output == 0) { 153 status = U_MEMORY_ALLOCATION_ERROR; 154 return; 155 } 156 } 157 158 /** 159 * Copy constructor. 160 */ 161 TransliterationRule::TransliterationRule(TransliterationRule& other) : 162 UMemory(other), 163 anteContext(NULL), 164 key(NULL), 165 postContext(NULL), 166 pattern(other.pattern), 167 anteContextLength(other.anteContextLength), 168 keyLength(other.keyLength), 169 flags(other.flags), 170 data(other.data) { 171 172 segments = NULL; 173 segmentsCount = 0; 174 if (other.segmentsCount > 0) { 175 segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *)); 176 uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0])); 177 } 178 179 if (other.anteContext != NULL) { 180 anteContext = (StringMatcher*) other.anteContext->clone(); 181 } 182 if (other.key != NULL) { 183 key = (StringMatcher*) other.key->clone(); 184 } 185 if (other.postContext != NULL) { 186 postContext = (StringMatcher*) other.postContext->clone(); 187 } 188 output = other.output->clone(); 189 } 190 191 TransliterationRule::~TransliterationRule() { 192 uprv_free(segments); 193 delete anteContext; 194 delete key; 195 delete postContext; 196 delete output; 197 } 198 199 /** 200 * Return the preceding context length. This method is needed to 201 * support the <code>Transliterator</code> method 202 * <code>getMaximumContextLength()</code>. Internally, this is 203 * implemented as the anteContextLength, optionally plus one if 204 * there is a start anchor. The one character anchor gap is 205 * needed to make repeated incremental transliteration with 206 * anchors work. 207 */ 208 int32_t TransliterationRule::getContextLength(void) const { 209 return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0); 210 } 211 212 /** 213 * Internal method. Returns 8-bit index value for this rule. 214 * This is the low byte of the first character of the key, 215 * unless the first character of the key is a set. If it's a 216 * set, or otherwise can match multiple keys, the index value is -1. 217 */ 218 int16_t TransliterationRule::getIndexValue() const { 219 if (anteContextLength == pattern.length()) { 220 // A pattern with just ante context {such as foo)>bar} can 221 // match any key. 222 return -1; 223 } 224 UChar32 c = pattern.char32At(anteContextLength); 225 return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); 226 } 227 228 /** 229 * Internal method. Returns true if this rule matches the given 230 * index value. The index value is an 8-bit integer, 0..255, 231 * representing the low byte of the first character of the key. 232 * It matches this rule if it matches the first character of the 233 * key, or if the first character of the key is a set, and the set 234 * contains any character with a low byte equal to the index 235 * value. If the rule contains only ante context, as in foo)>bar, 236 * then it will match any key. 237 */ 238 UBool TransliterationRule::matchesIndexValue(uint8_t v) const { 239 // Delegate to the key, or if there is none, to the postContext. 240 // If there is neither then we match any key; return true. 241 UnicodeMatcher *m = (key != NULL) ? key : postContext; 242 return (m != NULL) ? m->matchesIndexValue(v) : TRUE; 243 } 244 245 /** 246 * Return true if this rule masks another rule. If r1 masks r2 then 247 * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks 248 * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". 249 * "[c]a>x" masks "[dc]a>y". 250 */ 251 UBool TransliterationRule::masks(const TransliterationRule& r2) const { 252 /* Rule r1 masks rule r2 if the string formed of the 253 * antecontext, key, and postcontext overlaps in the following 254 * way: 255 * 256 * r1: aakkkpppp 257 * r2: aaakkkkkpppp 258 * ^ 259 * 260 * The strings must be aligned at the first character of the 261 * key. The length of r1 to the left of the alignment point 262 * must be <= the length of r2 to the left; ditto for the 263 * right. The characters of r1 must equal (or be a superset 264 * of) the corresponding characters of r2. The superset 265 * operation should be performed to check for UnicodeSet 266 * masking. 267 * 268 * Anchors: Two patterns that differ only in anchors only 269 * mask one another if they are exactly equal, and r2 has 270 * all the anchors r1 has (optionally, plus some). Here Y 271 * means the row masks the column, N means it doesn't. 272 * 273 * ab ^ab ab$ ^ab$ 274 * ab Y Y Y Y 275 * ^ab N Y N Y 276 * ab$ N N Y Y 277 * ^ab$ N N N Y 278 * 279 * Post context: {a}b masks ab, but not vice versa, since {a}b 280 * matches everything ab matches, and {a}b matches {|a|}b but ab 281 * does not. Pre context is different (a{b} does not align with 282 * ab). 283 */ 284 285 /* LIMITATION of the current mask algorithm: Some rule 286 * maskings are currently not detected. For example, 287 * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO 288 */ 289 290 int32_t len = pattern.length(); 291 int32_t left = anteContextLength; 292 int32_t left2 = r2.anteContextLength; 293 int32_t right = len - left; 294 int32_t right2 = r2.pattern.length() - left2; 295 int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern); 296 297 // TODO Clean this up -- some logic might be combinable with the 298 // next statement. 299 300 // Test for anchor masking 301 if (left == left2 && right == right2 && 302 keyLength <= r2.keyLength && 303 0 == cachedCompare) { 304 // The following boolean logic implements the table above 305 return (flags == r2.flags) || 306 (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || 307 ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); 308 } 309 310 return left <= left2 && 311 (right < right2 || 312 (right == right2 && keyLength <= r2.keyLength)) && 313 (0 == cachedCompare); 314 } 315 316 static inline int32_t posBefore(const Replaceable& str, int32_t pos) { 317 return (pos > 0) ? 318 pos - UTF_CHAR_LENGTH(str.char32At(pos-1)) : 319 pos - 1; 320 } 321 322 static inline int32_t posAfter(const Replaceable& str, int32_t pos) { 323 return (pos >= 0 && pos < str.length()) ? 324 pos + UTF_CHAR_LENGTH(str.char32At(pos)) : 325 pos + 1; 326 } 327 328 /** 329 * Attempt a match and replacement at the given position. Return 330 * the degree of match between this rule and the given text. The 331 * degree of match may be mismatch, a partial match, or a full 332 * match. A mismatch means at least one character of the text 333 * does not match the context or key. A partial match means some 334 * context and key characters match, but the text is not long 335 * enough to match all of them. A full match means all context 336 * and key characters match. 337 * 338 * If a full match is obtained, perform a replacement, update pos, 339 * and return U_MATCH. Otherwise both text and pos are unchanged. 340 * 341 * @param text the text 342 * @param pos the position indices 343 * @param incremental if TRUE, test for partial matches that may 344 * be completed by additional text inserted at pos.limit. 345 * @return one of <code>U_MISMATCH</code>, 346 * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If 347 * incremental is FALSE then U_PARTIAL_MATCH will not be returned. 348 */ 349 UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, 350 UTransPosition& pos, 351 UBool incremental) const { 352 // Matching and replacing are done in one method because the 353 // replacement operation needs information obtained during the 354 // match. Another way to do this is to have the match method 355 // create a match result struct with relevant offsets, and to pass 356 // this into the replace method. 357 358 // ============================ MATCH =========================== 359 360 // Reset segment match data 361 if (segments != NULL) { 362 for (int32_t i=0; i<segmentsCount; ++i) { 363 ((StringMatcher*) segments[i])->resetMatch(); 364 } 365 } 366 367 // int32_t lenDelta, keyLimit; 368 int32_t keyLimit; 369 370 // ------------------------ Ante Context ------------------------ 371 372 // A mismatch in the ante context, or with the start anchor, 373 // is an outright U_MISMATCH regardless of whether we are 374 // incremental or not. 375 int32_t oText; // offset into 'text' 376 // int32_t newStart = 0; 377 int32_t minOText; 378 379 // Note (1): We process text in 16-bit code units, rather than 380 // 32-bit code points. This works because stand-ins are 381 // always in the BMP and because we are doing a literal match 382 // operation, which can be done 16-bits at a time. 383 384 int32_t anteLimit = posBefore(text, pos.contextStart); 385 386 UMatchDegree match; 387 388 // Start reverse match at char before pos.start 389 oText = posBefore(text, pos.start); 390 391 if (anteContext != NULL) { 392 match = anteContext->matches(text, oText, anteLimit, FALSE); 393 if (match != U_MATCH) { 394 return U_MISMATCH; 395 } 396 } 397 398 minOText = posAfter(text, oText); 399 400 // ------------------------ Start Anchor ------------------------ 401 402 if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { 403 return U_MISMATCH; 404 } 405 406 // -------------------- Key and Post Context -------------------- 407 408 oText = pos.start; 409 410 if (key != NULL) { 411 match = key->matches(text, oText, pos.limit, incremental); 412 if (match != U_MATCH) { 413 return match; 414 } 415 } 416 417 keyLimit = oText; 418 419 if (postContext != NULL) { 420 if (incremental && keyLimit == pos.limit) { 421 // The key matches just before pos.limit, and there is 422 // a postContext. Since we are in incremental mode, 423 // we must assume more characters may be inserted at 424 // pos.limit -- this is a partial match. 425 return U_PARTIAL_MATCH; 426 } 427 428 match = postContext->matches(text, oText, pos.contextLimit, incremental); 429 if (match != U_MATCH) { 430 return match; 431 } 432 } 433 434 // ------------------------- Stop Anchor ------------------------ 435 436 if (((flags & ANCHOR_END)) != 0) { 437 if (oText != pos.contextLimit) { 438 return U_MISMATCH; 439 } 440 if (incremental) { 441 return U_PARTIAL_MATCH; 442 } 443 } 444 445 // =========================== REPLACE ========================== 446 447 // We have a full match. The key is between pos.start and 448 // keyLimit. 449 450 int32_t newStart; 451 int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); 452 int32_t lenDelta = newLength - (keyLimit - pos.start); 453 454 oText += lenDelta; 455 pos.limit += lenDelta; 456 pos.contextLimit += lenDelta; 457 // Restrict new value of start to [minOText, min(oText, pos.limit)]. 458 pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); 459 return U_MATCH; 460 } 461 462 /** 463 * Create a source string that represents this rule. Append it to the 464 * given string. 465 */ 466 UnicodeString& TransliterationRule::toRule(UnicodeString& rule, 467 UBool escapeUnprintable) const { 468 469 // Accumulate special characters (and non-specials following them) 470 // into quoteBuf. Append quoteBuf, within single quotes, when 471 // a non-quoted element must be inserted. 472 UnicodeString str, quoteBuf; 473 474 // Do not emit the braces '{' '}' around the pattern if there 475 // is neither anteContext nor postContext. 476 UBool emitBraces = 477 (anteContext != NULL) || (postContext != NULL); 478 479 // Emit start anchor 480 if ((flags & ANCHOR_START) != 0) { 481 rule.append((UChar)94/*^*/); 482 } 483 484 // Emit the input pattern 485 ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); 486 487 if (emitBraces) { 488 ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); 489 } 490 491 ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); 492 493 if (emitBraces) { 494 ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); 495 } 496 497 ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); 498 499 // Emit end anchor 500 if ((flags & ANCHOR_END) != 0) { 501 rule.append((UChar)36/*$*/); 502 } 503 504 ICU_Utility::appendToRule(rule, FORWARD_OP, TRUE, escapeUnprintable, quoteBuf); 505 506 // Emit the output pattern 507 508 ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), 509 TRUE, escapeUnprintable, quoteBuf); 510 511 ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); 512 513 return rule; 514 } 515 516 void TransliterationRule::setData(const TransliterationRuleData* d) { 517 data = d; 518 if (anteContext != NULL) anteContext->setData(d); 519 if (postContext != NULL) postContext->setData(d); 520 if (key != NULL) key->setData(d); 521 // assert(output != NULL); 522 output->setData(d); 523 // Don't have to do segments since they are in the context or key 524 } 525 526 /** 527 * Union the set of all characters that may be modified by this rule 528 * into the given set. 529 */ 530 void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const { 531 int32_t limit = anteContextLength + keyLength; 532 for (int32_t i=anteContextLength; i<limit; ) { 533 UChar32 ch = pattern.char32At(i); 534 i += UTF_CHAR_LENGTH(ch); 535 const UnicodeMatcher* matcher = data->lookupMatcher(ch); 536 if (matcher == NULL) { 537 toUnionTo.add(ch); 538 } else { 539 matcher->addMatchSetTo(toUnionTo); 540 } 541 } 542 } 543 544 /** 545 * Union the set of all characters that may be emitted by this rule 546 * into the given set. 547 */ 548 void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const { 549 output->toReplacer()->addReplacementSetTo(toUnionTo); 550 } 551 552 U_NAMESPACE_END 553 554 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 555 556 //eof 557