1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 1999-2011, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 11/17/99 aliu Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "unicode/rep.h" 18 #include "unicode/unifilt.h" 19 #include "unicode/uniset.h" 20 #include "unicode/utf16.h" 21 #include "rbt_rule.h" 22 #include "rbt_data.h" 23 #include "cmemory.h" 24 #include "strmatch.h" 25 #include "strrepl.h" 26 #include "util.h" 27 #include "putilimp.h" 28 29 static const UChar FORWARD_OP[] = {32,62,32,0}; // " > " 30 31 U_NAMESPACE_BEGIN 32 33 /** 34 * Construct a new rule with the given input, output text, and other 35 * attributes. A cursor position may be specified for the output text. 36 * @param input input string, including key and optional ante and 37 * post context 38 * @param anteContextPos offset into input to end of ante context, or -1 if 39 * none. Must be <= input.length() if not -1. 40 * @param postContextPos offset into input to start of post context, or -1 41 * if none. Must be <= input.length() if not -1, and must be >= 42 * anteContextPos. 43 * @param output output string 44 * @param cursorPosition offset into output at which cursor is located, or -1 if 45 * none. If less than zero, then the cursor is placed after the 46 * <code>output</code>; that is, -1 is equivalent to 47 * <code>output.length()</code>. If greater than 48 * <code>output.length()</code> then an exception is thrown. 49 * @param segs array of UnicodeFunctors corresponding to input pattern 50 * segments, or null if there are none. The array itself is adopted, 51 * but the pointers within it are not. 52 * @param segsCount number of elements in segs[] 53 * @param anchorStart TRUE if the the rule is anchored on the left to 54 * the context start 55 * @param anchorEnd TRUE if the rule is anchored on the right to the 56 * context limit 57 */ 58 TransliterationRule::TransliterationRule(const UnicodeString& input, 59 int32_t anteContextPos, int32_t postContextPos, 60 const UnicodeString& outputStr, 61 int32_t cursorPosition, int32_t cursorOffset, 62 UnicodeFunctor** segs, 63 int32_t segsCount, 64 UBool anchorStart, UBool anchorEnd, 65 const TransliterationRuleData* theData, 66 UErrorCode& status) : 67 UMemory(), 68 segments(0), 69 data(theData) { 70 71 if (U_FAILURE(status)) { 72 return; 73 } 74 // Do range checks only when warranted to save time 75 if (anteContextPos < 0) { 76 anteContextLength = 0; 77 } else { 78 if (anteContextPos > input.length()) { 79 // throw new IllegalArgumentException("Invalid ante context"); 80 status = U_ILLEGAL_ARGUMENT_ERROR; 81 return; 82 } 83 anteContextLength = anteContextPos; 84 } 85 if (postContextPos < 0) { 86 keyLength = input.length() - anteContextLength; 87 } else { 88 if (postContextPos < anteContextLength || 89 postContextPos > input.length()) { 90 // throw new IllegalArgumentException("Invalid post context"); 91 status = U_ILLEGAL_ARGUMENT_ERROR; 92 return; 93 } 94 keyLength = postContextPos - anteContextLength; 95 } 96 if (cursorPosition < 0) { 97 cursorPosition = outputStr.length(); 98 } else if (cursorPosition > outputStr.length()) { 99 // throw new IllegalArgumentException("Invalid cursor position"); 100 status = U_ILLEGAL_ARGUMENT_ERROR; 101 return; 102 } 103 // We don't validate the segments array. The caller must 104 // guarantee that the segments are well-formed (that is, that 105 // all $n references in the output refer to indices of this 106 // array, and that no array elements are null). 107 this->segments = segs; 108 this->segmentsCount = segsCount; 109 110 pattern = input; 111 flags = 0; 112 if (anchorStart) { 113 flags |= ANCHOR_START; 114 } 115 if (anchorEnd) { 116 flags |= ANCHOR_END; 117 } 118 119 anteContext = NULL; 120 if (anteContextLength > 0) { 121 anteContext = new StringMatcher(pattern, 0, anteContextLength, 122 FALSE, *data); 123 /* test for NULL */ 124 if (anteContext == 0) { 125 status = U_MEMORY_ALLOCATION_ERROR; 126 return; 127 } 128 } 129 130 key = NULL; 131 if (keyLength > 0) { 132 key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength, 133 FALSE, *data); 134 /* test for NULL */ 135 if (key == 0) { 136 status = U_MEMORY_ALLOCATION_ERROR; 137 return; 138 } 139 } 140 141 int32_t postContextLength = pattern.length() - keyLength - anteContextLength; 142 postContext = NULL; 143 if (postContextLength > 0) { 144 postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), 145 FALSE, *data); 146 /* test for NULL */ 147 if (postContext == 0) { 148 status = U_MEMORY_ALLOCATION_ERROR; 149 return; 150 } 151 } 152 153 this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); 154 /* test for NULL */ 155 if (this->output == 0) { 156 status = U_MEMORY_ALLOCATION_ERROR; 157 return; 158 } 159 } 160 161 /** 162 * Copy constructor. 163 */ 164 TransliterationRule::TransliterationRule(TransliterationRule& other) : 165 UMemory(other), 166 anteContext(NULL), 167 key(NULL), 168 postContext(NULL), 169 pattern(other.pattern), 170 anteContextLength(other.anteContextLength), 171 keyLength(other.keyLength), 172 flags(other.flags), 173 data(other.data) { 174 175 segments = NULL; 176 segmentsCount = 0; 177 if (other.segmentsCount > 0) { 178 segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *)); 179 uprv_memcpy(segments, other.segments, (size_t)other.segmentsCount*sizeof(segments[0])); 180 } 181 182 if (other.anteContext != NULL) { 183 anteContext = (StringMatcher*) other.anteContext->clone(); 184 } 185 if (other.key != NULL) { 186 key = (StringMatcher*) other.key->clone(); 187 } 188 if (other.postContext != NULL) { 189 postContext = (StringMatcher*) other.postContext->clone(); 190 } 191 output = other.output->clone(); 192 } 193 194 TransliterationRule::~TransliterationRule() { 195 uprv_free(segments); 196 delete anteContext; 197 delete key; 198 delete postContext; 199 delete output; 200 } 201 202 /** 203 * Return the preceding context length. This method is needed to 204 * support the <code>Transliterator</code> method 205 * <code>getMaximumContextLength()</code>. Internally, this is 206 * implemented as the anteContextLength, optionally plus one if 207 * there is a start anchor. The one character anchor gap is 208 * needed to make repeated incremental transliteration with 209 * anchors work. 210 */ 211 int32_t TransliterationRule::getContextLength(void) const { 212 return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0); 213 } 214 215 /** 216 * Internal method. Returns 8-bit index value for this rule. 217 * This is the low byte of the first character of the key, 218 * unless the first character of the key is a set. If it's a 219 * set, or otherwise can match multiple keys, the index value is -1. 220 */ 221 int16_t TransliterationRule::getIndexValue() const { 222 if (anteContextLength == pattern.length()) { 223 // A pattern with just ante context {such as foo)>bar} can 224 // match any key. 225 return -1; 226 } 227 UChar32 c = pattern.char32At(anteContextLength); 228 return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); 229 } 230 231 /** 232 * Internal method. Returns true if this rule matches the given 233 * index value. The index value is an 8-bit integer, 0..255, 234 * representing the low byte of the first character of the key. 235 * It matches this rule if it matches the first character of the 236 * key, or if the first character of the key is a set, and the set 237 * contains any character with a low byte equal to the index 238 * value. If the rule contains only ante context, as in foo)>bar, 239 * then it will match any key. 240 */ 241 UBool TransliterationRule::matchesIndexValue(uint8_t v) const { 242 // Delegate to the key, or if there is none, to the postContext. 243 // If there is neither then we match any key; return true. 244 UnicodeMatcher *m = (key != NULL) ? key : postContext; 245 return (m != NULL) ? m->matchesIndexValue(v) : TRUE; 246 } 247 248 /** 249 * Return true if this rule masks another rule. If r1 masks r2 then 250 * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks 251 * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". 252 * "[c]a>x" masks "[dc]a>y". 253 */ 254 UBool TransliterationRule::masks(const TransliterationRule& r2) const { 255 /* Rule r1 masks rule r2 if the string formed of the 256 * antecontext, key, and postcontext overlaps in the following 257 * way: 258 * 259 * r1: aakkkpppp 260 * r2: aaakkkkkpppp 261 * ^ 262 * 263 * The strings must be aligned at the first character of the 264 * key. The length of r1 to the left of the alignment point 265 * must be <= the length of r2 to the left; ditto for the 266 * right. The characters of r1 must equal (or be a superset 267 * of) the corresponding characters of r2. The superset 268 * operation should be performed to check for UnicodeSet 269 * masking. 270 * 271 * Anchors: Two patterns that differ only in anchors only 272 * mask one another if they are exactly equal, and r2 has 273 * all the anchors r1 has (optionally, plus some). Here Y 274 * means the row masks the column, N means it doesn't. 275 * 276 * ab ^ab ab$ ^ab$ 277 * ab Y Y Y Y 278 * ^ab N Y N Y 279 * ab$ N N Y Y 280 * ^ab$ N N N Y 281 * 282 * Post context: {a}b masks ab, but not vice versa, since {a}b 283 * matches everything ab matches, and {a}b matches {|a|}b but ab 284 * does not. Pre context is different (a{b} does not align with 285 * ab). 286 */ 287 288 /* LIMITATION of the current mask algorithm: Some rule 289 * maskings are currently not detected. For example, 290 * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO 291 */ 292 293 int32_t len = pattern.length(); 294 int32_t left = anteContextLength; 295 int32_t left2 = r2.anteContextLength; 296 int32_t right = len - left; 297 int32_t right2 = r2.pattern.length() - left2; 298 int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern); 299 300 // TODO Clean this up -- some logic might be combinable with the 301 // next statement. 302 303 // Test for anchor masking 304 if (left == left2 && right == right2 && 305 keyLength <= r2.keyLength && 306 0 == cachedCompare) { 307 // The following boolean logic implements the table above 308 return (flags == r2.flags) || 309 (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || 310 ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); 311 } 312 313 return left <= left2 && 314 (right < right2 || 315 (right == right2 && keyLength <= r2.keyLength)) && 316 (0 == cachedCompare); 317 } 318 319 static inline int32_t posBefore(const Replaceable& str, int32_t pos) { 320 return (pos > 0) ? 321 pos - U16_LENGTH(str.char32At(pos-1)) : 322 pos - 1; 323 } 324 325 static inline int32_t posAfter(const Replaceable& str, int32_t pos) { 326 return (pos >= 0 && pos < str.length()) ? 327 pos + U16_LENGTH(str.char32At(pos)) : 328 pos + 1; 329 } 330 331 /** 332 * Attempt a match and replacement at the given position. Return 333 * the degree of match between this rule and the given text. The 334 * degree of match may be mismatch, a partial match, or a full 335 * match. A mismatch means at least one character of the text 336 * does not match the context or key. A partial match means some 337 * context and key characters match, but the text is not long 338 * enough to match all of them. A full match means all context 339 * and key characters match. 340 * 341 * If a full match is obtained, perform a replacement, update pos, 342 * and return U_MATCH. Otherwise both text and pos are unchanged. 343 * 344 * @param text the text 345 * @param pos the position indices 346 * @param incremental if TRUE, test for partial matches that may 347 * be completed by additional text inserted at pos.limit. 348 * @return one of <code>U_MISMATCH</code>, 349 * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If 350 * incremental is FALSE then U_PARTIAL_MATCH will not be returned. 351 */ 352 UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, 353 UTransPosition& pos, 354 UBool incremental) const { 355 // Matching and replacing are done in one method because the 356 // replacement operation needs information obtained during the 357 // match. Another way to do this is to have the match method 358 // create a match result struct with relevant offsets, and to pass 359 // this into the replace method. 360 361 // ============================ MATCH =========================== 362 363 // Reset segment match data 364 if (segments != NULL) { 365 for (int32_t i=0; i<segmentsCount; ++i) { 366 ((StringMatcher*) segments[i])->resetMatch(); 367 } 368 } 369 370 // int32_t lenDelta, keyLimit; 371 int32_t keyLimit; 372 373 // ------------------------ Ante Context ------------------------ 374 375 // A mismatch in the ante context, or with the start anchor, 376 // is an outright U_MISMATCH regardless of whether we are 377 // incremental or not. 378 int32_t oText; // offset into 'text' 379 // int32_t newStart = 0; 380 int32_t minOText; 381 382 // Note (1): We process text in 16-bit code units, rather than 383 // 32-bit code points. This works because stand-ins are 384 // always in the BMP and because we are doing a literal match 385 // operation, which can be done 16-bits at a time. 386 387 int32_t anteLimit = posBefore(text, pos.contextStart); 388 389 UMatchDegree match; 390 391 // Start reverse match at char before pos.start 392 oText = posBefore(text, pos.start); 393 394 if (anteContext != NULL) { 395 match = anteContext->matches(text, oText, anteLimit, FALSE); 396 if (match != U_MATCH) { 397 return U_MISMATCH; 398 } 399 } 400 401 minOText = posAfter(text, oText); 402 403 // ------------------------ Start Anchor ------------------------ 404 405 if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { 406 return U_MISMATCH; 407 } 408 409 // -------------------- Key and Post Context -------------------- 410 411 oText = pos.start; 412 413 if (key != NULL) { 414 match = key->matches(text, oText, pos.limit, incremental); 415 if (match != U_MATCH) { 416 return match; 417 } 418 } 419 420 keyLimit = oText; 421 422 if (postContext != NULL) { 423 if (incremental && keyLimit == pos.limit) { 424 // The key matches just before pos.limit, and there is 425 // a postContext. Since we are in incremental mode, 426 // we must assume more characters may be inserted at 427 // pos.limit -- this is a partial match. 428 return U_PARTIAL_MATCH; 429 } 430 431 match = postContext->matches(text, oText, pos.contextLimit, incremental); 432 if (match != U_MATCH) { 433 return match; 434 } 435 } 436 437 // ------------------------- Stop Anchor ------------------------ 438 439 if (((flags & ANCHOR_END)) != 0) { 440 if (oText != pos.contextLimit) { 441 return U_MISMATCH; 442 } 443 if (incremental) { 444 return U_PARTIAL_MATCH; 445 } 446 } 447 448 // =========================== REPLACE ========================== 449 450 // We have a full match. The key is between pos.start and 451 // keyLimit. 452 453 int32_t newStart; 454 int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); 455 int32_t lenDelta = newLength - (keyLimit - pos.start); 456 457 oText += lenDelta; 458 pos.limit += lenDelta; 459 pos.contextLimit += lenDelta; 460 // Restrict new value of start to [minOText, min(oText, pos.limit)]. 461 pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); 462 return U_MATCH; 463 } 464 465 /** 466 * Create a source string that represents this rule. Append it to the 467 * given string. 468 */ 469 UnicodeString& TransliterationRule::toRule(UnicodeString& rule, 470 UBool escapeUnprintable) const { 471 472 // Accumulate special characters (and non-specials following them) 473 // into quoteBuf. Append quoteBuf, within single quotes, when 474 // a non-quoted element must be inserted. 475 UnicodeString str, quoteBuf; 476 477 // Do not emit the braces '{' '}' around the pattern if there 478 // is neither anteContext nor postContext. 479 UBool emitBraces = 480 (anteContext != NULL) || (postContext != NULL); 481 482 // Emit start anchor 483 if ((flags & ANCHOR_START) != 0) { 484 rule.append((UChar)94/*^*/); 485 } 486 487 // Emit the input pattern 488 ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); 489 490 if (emitBraces) { 491 ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); 492 } 493 494 ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); 495 496 if (emitBraces) { 497 ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); 498 } 499 500 ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); 501 502 // Emit end anchor 503 if ((flags & ANCHOR_END) != 0) { 504 rule.append((UChar)36/*$*/); 505 } 506 507 ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf); 508 509 // Emit the output pattern 510 511 ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), 512 TRUE, escapeUnprintable, quoteBuf); 513 514 ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); 515 516 return rule; 517 } 518 519 void TransliterationRule::setData(const TransliterationRuleData* d) { 520 data = d; 521 if (anteContext != NULL) anteContext->setData(d); 522 if (postContext != NULL) postContext->setData(d); 523 if (key != NULL) key->setData(d); 524 // assert(output != NULL); 525 output->setData(d); 526 // Don't have to do segments since they are in the context or key 527 } 528 529 /** 530 * Union the set of all characters that may be modified by this rule 531 * into the given set. 532 */ 533 void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const { 534 int32_t limit = anteContextLength + keyLength; 535 for (int32_t i=anteContextLength; i<limit; ) { 536 UChar32 ch = pattern.char32At(i); 537 i += U16_LENGTH(ch); 538 const UnicodeMatcher* matcher = data->lookupMatcher(ch); 539 if (matcher == NULL) { 540 toUnionTo.add(ch); 541 } else { 542 matcher->addMatchSetTo(toUnionTo); 543 } 544 } 545 } 546 547 /** 548 * Union the set of all characters that may be emitted by this rule 549 * into the given set. 550 */ 551 void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const { 552 output->toReplacer()->addReplacementSetTo(toUnionTo); 553 } 554 555 U_NAMESPACE_END 556 557 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 558 559 //eof 560