1 /* 2 ************************************************************************** 3 * Copyright (C) 2002-2008 International Business Machines Corporation * 4 * and others. All rights reserved. * 5 ************************************************************************** 6 */ 7 // 8 // file: rematch.cpp 9 // 10 // Contains the implementation of class RegexMatcher, 11 // which is one of the main API classes for the ICU regular expression package. 12 // 13 14 #include "unicode/utypes.h" 15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 16 17 #include "unicode/regex.h" 18 #include "unicode/uniset.h" 19 #include "unicode/uchar.h" 20 #include "unicode/ustring.h" 21 #include "unicode/rbbi.h" 22 #include "uassert.h" 23 #include "cmemory.h" 24 #include "uvector.h" 25 #include "uvectr32.h" 26 #include "regeximp.h" 27 #include "regexst.h" 28 29 // #include <malloc.h> // Needed for heapcheck testing 30 31 U_NAMESPACE_BEGIN 32 33 // Default limit for the size of the back track stack, to avoid system 34 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. 35 // This value puts ICU's limits higher than most other regexp implementations, 36 // which use recursion rather than the heap, and take more storage per 37 // backtrack point. 38 // 39 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 40 41 // Time limit counter constant. 42 // Time limits for expression evaluation are in terms of quanta of work by 43 // the engine, each of which is 10,000 state saves. 44 // This constant determines that state saves per tick number. 45 static const int32_t TIMER_INITIAL_VALUE = 10000; 46 47 //----------------------------------------------------------------------------- 48 // 49 // Constructor and Destructor 50 // 51 //----------------------------------------------------------------------------- 52 RegexMatcher::RegexMatcher(const RegexPattern *pat) { 53 fDeferredStatus = U_ZERO_ERROR; 54 init(fDeferredStatus); 55 if (U_FAILURE(fDeferredStatus)) { 56 return; 57 } 58 if (pat==NULL) { 59 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 60 return; 61 } 62 fPattern = pat; 63 init2(RegexStaticSets::gStaticSets->fEmptyString, fDeferredStatus); 64 } 65 66 67 68 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 69 uint32_t flags, UErrorCode &status) { 70 init(status); 71 if (U_FAILURE(status)) { 72 return; 73 } 74 UParseError pe; 75 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 76 fPattern = fPatternOwned; 77 init2(input, status); 78 } 79 80 81 RegexMatcher::RegexMatcher(const UnicodeString ®exp, 82 uint32_t flags, UErrorCode &status) { 83 init(status); 84 if (U_FAILURE(status)) { 85 return; 86 } 87 UParseError pe; 88 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 89 fPattern = fPatternOwned; 90 init2(RegexStaticSets::gStaticSets->fEmptyString, status); 91 } 92 93 94 95 96 RegexMatcher::~RegexMatcher() { 97 delete fStack; 98 if (fData != fSmallData) { 99 uprv_free(fData); 100 fData = NULL; 101 } 102 if (fPatternOwned) { 103 delete fPatternOwned; 104 fPatternOwned = NULL; 105 fPattern = NULL; 106 } 107 #if UCONFIG_NO_BREAK_ITERATION==0 108 delete fWordBreakItr; 109 #endif 110 } 111 112 // 113 // init() common initialization for use by all constructors. 114 // Initialize all fields, get the object into a consistent state. 115 // This must be done even when the initial status shows an error, 116 // so that the object is initialized sufficiently well for the destructor 117 // to run safely. 118 // 119 void RegexMatcher::init(UErrorCode &status) { 120 fPattern = NULL; 121 fPatternOwned = NULL; 122 fInput = NULL; 123 fFrameSize = 0; 124 fRegionStart = 0; 125 fRegionLimit = 0; 126 fAnchorStart = 0; 127 fAnchorLimit = 0; 128 fLookStart = 0; 129 fLookLimit = 0; 130 fActiveStart = 0; 131 fActiveLimit = 0; 132 fTransparentBounds = FALSE; 133 fAnchoringBounds = TRUE; 134 fMatch = FALSE; 135 fMatchStart = 0; 136 fMatchEnd = 0; 137 fLastMatchEnd = -1; 138 fAppendPosition = 0; 139 fHitEnd = FALSE; 140 fRequireEnd = FALSE; 141 fStack = NULL; 142 fFrame = NULL; 143 fTimeLimit = 0; 144 fTime = 0; 145 fTickCounter = 0; 146 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 147 fCallbackFn = NULL; 148 fCallbackContext = NULL; 149 fTraceDebug = FALSE; 150 fDeferredStatus = status; 151 fData = fSmallData; 152 fWordBreakItr = NULL; 153 154 fStack = new UVector32(status); 155 if (U_FAILURE(status)) { 156 fDeferredStatus = status; 157 } 158 } 159 160 // 161 // init2() Common initialization for use by RegexMatcher constructors, part 2. 162 // This handles the common setup to be done after the Pattern is available. 163 // 164 void RegexMatcher::init2(const UnicodeString &input, UErrorCode &status) { 165 if (U_FAILURE(status)) { 166 fDeferredStatus = status; 167 return; 168 } 169 170 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { 171 fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); 172 if (fData == NULL) { 173 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 174 return; 175 } 176 } 177 178 reset(input); 179 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 180 if (U_FAILURE(status)) { 181 fDeferredStatus = status; 182 return; 183 } 184 } 185 186 187 static const UChar BACKSLASH = 0x5c; 188 static const UChar DOLLARSIGN = 0x24; 189 //-------------------------------------------------------------------------------- 190 // 191 // appendReplacement 192 // 193 //-------------------------------------------------------------------------------- 194 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 195 const UnicodeString &replacement, 196 UErrorCode &status) { 197 if (U_FAILURE(status)) { 198 return *this; 199 } 200 if (U_FAILURE(fDeferredStatus)) { 201 status = fDeferredStatus; 202 return *this; 203 } 204 if (fMatch == FALSE) { 205 status = U_REGEX_INVALID_STATE; 206 return *this; 207 } 208 209 // Copy input string from the end of previous match to start of current match 210 int32_t len = fMatchStart-fAppendPosition; 211 if (len > 0) { 212 dest.append(*fInput, fAppendPosition, len); 213 } 214 fAppendPosition = fMatchEnd; 215 216 217 // scan the replacement text, looking for substitutions ($n) and \escapes. 218 // TODO: optimize this loop by efficiently scanning for '$' or '\', 219 // move entire ranges not containing substitutions. 220 int32_t replLen = replacement.length(); 221 int32_t replIdx = 0; 222 while (replIdx<replLen) { 223 UChar c = replacement.charAt(replIdx); 224 replIdx++; 225 if (c == BACKSLASH) { 226 // Backslash Escape. Copy the following char out without further checks. 227 // Note: Surrogate pairs don't need any special handling 228 // The second half wont be a '$' or a '\', and 229 // will move to the dest normally on the next 230 // loop iteration. 231 if (replIdx >= replLen) { 232 break; 233 } 234 c = replacement.charAt(replIdx); 235 236 if (c==0x55/*U*/ || c==0x75/*u*/) { 237 // We have a \udddd or \Udddddddd escape sequence. 238 UChar32 escapedChar = replacement.unescapeAt(replIdx); 239 if (escapedChar != (UChar32)0xFFFFFFFF) { 240 dest.append(escapedChar); 241 // TODO: Report errors for mal-formed \u escapes? 242 // As this is, the original sequence is output, which may be OK. 243 continue; 244 } 245 } 246 247 // Plain backslash escape. Just put out the escaped character. 248 dest.append(c); 249 replIdx++; 250 continue; 251 } 252 253 if (c != DOLLARSIGN) { 254 // Normal char, not a $. Copy it out without further checks. 255 dest.append(c); 256 continue; 257 } 258 259 // We've got a $. Pick up a capture group number if one follows. 260 // Consume at most the number of digits necessary for the largest capture 261 // number that is valid for this pattern. 262 263 int32_t numDigits = 0; 264 int32_t groupNum = 0; 265 UChar32 digitC; 266 for (;;) { 267 if (replIdx >= replLen) { 268 break; 269 } 270 digitC = replacement.char32At(replIdx); 271 if (u_isdigit(digitC) == FALSE) { 272 break; 273 } 274 replIdx = replacement.moveIndex32(replIdx, 1); 275 groupNum=groupNum*10 + u_charDigitValue(digitC); 276 numDigits++; 277 if (numDigits >= fPattern->fMaxCaptureDigits) { 278 break; 279 } 280 } 281 282 283 if (numDigits == 0) { 284 // The $ didn't introduce a group number at all. 285 // Treat it as just part of the substitution text. 286 dest.append(DOLLARSIGN); 287 continue; 288 } 289 290 // Finally, append the capture group data to the destination. 291 dest.append(group(groupNum, status)); 292 if (U_FAILURE(status)) { 293 // Can fail if group number is out of range. 294 break; 295 } 296 297 } 298 299 return *this; 300 } 301 302 303 304 //-------------------------------------------------------------------------------- 305 // 306 // appendTail Intended to be used in conjunction with appendReplacement() 307 // To the destination string, append everything following 308 // the last match position from the input string. 309 // 310 // Note: Match ranges do not affect appendTail or appendReplacement 311 // 312 //-------------------------------------------------------------------------------- 313 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 314 int32_t len = fInput->length() - fAppendPosition; 315 if (len > 0) { 316 dest.append(*fInput, fAppendPosition, len); 317 } 318 return dest; 319 } 320 321 322 323 //-------------------------------------------------------------------------------- 324 // 325 // end 326 // 327 //-------------------------------------------------------------------------------- 328 int32_t RegexMatcher::end(UErrorCode &err) const { 329 return end(0, err); 330 } 331 332 333 334 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 335 if (U_FAILURE(err)) { 336 return -1; 337 } 338 if (fMatch == FALSE) { 339 err = U_REGEX_INVALID_STATE; 340 return -1; 341 } 342 if (group < 0 || group > fPattern->fGroupMap->size()) { 343 err = U_INDEX_OUTOFBOUNDS_ERROR; 344 return -1; 345 } 346 int32_t e = -1; 347 if (group == 0) { 348 e = fMatchEnd; 349 } else { 350 // Get the position within the stack frame of the variables for 351 // this capture group. 352 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 353 U_ASSERT(groupOffset < fPattern->fFrameSize); 354 U_ASSERT(groupOffset >= 0); 355 e = fFrame->fExtra[groupOffset + 1]; 356 } 357 return e; 358 } 359 360 361 362 //-------------------------------------------------------------------------------- 363 // 364 // find() 365 // 366 //-------------------------------------------------------------------------------- 367 UBool RegexMatcher::find() { 368 // Start at the position of the last match end. (Will be zero if the 369 // matcher has been reset. 370 // 371 if (U_FAILURE(fDeferredStatus)) { 372 return FALSE; 373 } 374 375 int32_t startPos = fMatchEnd; 376 if (startPos==0) { 377 startPos = fActiveStart; 378 } 379 380 if (fMatch) { 381 // Save the position of any previous successful match. 382 fLastMatchEnd = fMatchEnd; 383 384 if (fMatchStart == fMatchEnd) { 385 // Previous match had zero length. Move start position up one position 386 // to avoid sending find() into a loop on zero-length matches. 387 if (startPos >= fActiveLimit) { 388 fMatch = FALSE; 389 fHitEnd = TRUE; 390 return FALSE; 391 } 392 startPos = fInput->moveIndex32(startPos, 1); 393 } 394 } else { 395 if (fLastMatchEnd >= 0) { 396 // A previous find() failed to match. Don't try again. 397 // (without this test, a pattern with a zero-length match 398 // could match again at the end of an input string.) 399 fHitEnd = TRUE; 400 return FALSE; 401 } 402 } 403 404 405 // Compute the position in the input string beyond which a match can not begin, because 406 // the minimum length match would extend past the end of the input. 407 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 408 // Be aware of possible overflows if making changes here. 409 int32_t testLen = fActiveLimit - fPattern->fMinMatchLen; 410 if (startPos > testLen) { 411 fMatch = FALSE; 412 fHitEnd = TRUE; 413 return FALSE; 414 } 415 416 const UChar *inputBuf = fInput->getBuffer(); 417 UChar32 c; 418 U_ASSERT(startPos >= 0); 419 420 switch (fPattern->fStartType) { 421 case START_NO_INFO: 422 // No optimization was found. 423 // Try a match at each input position. 424 for (;;) { 425 MatchAt(startPos, FALSE, fDeferredStatus); 426 if (U_FAILURE(fDeferredStatus)) { 427 return FALSE; 428 } 429 if (fMatch) { 430 return TRUE; 431 } 432 if (startPos >= testLen) { 433 fHitEnd = TRUE; 434 return FALSE; 435 } 436 U16_FWD_1(inputBuf, startPos, fActiveLimit); 437 // Note that it's perfectly OK for a pattern to have a zero-length 438 // match at the end of a string, so we must make sure that the loop 439 // runs with startPos == testLen the last time through. 440 } 441 U_ASSERT(FALSE); 442 443 case START_START: 444 // Matches are only possible at the start of the input string 445 // (pattern begins with ^ or \A) 446 if (startPos > fActiveStart) { 447 fMatch = FALSE; 448 return FALSE; 449 } 450 MatchAt(startPos, FALSE, fDeferredStatus); 451 if (U_FAILURE(fDeferredStatus)) { 452 return FALSE; 453 } 454 return fMatch; 455 456 457 case START_SET: 458 { 459 // Match may start on any char from a pre-computed set. 460 U_ASSERT(fPattern->fMinMatchLen > 0); 461 for (;;) { 462 int32_t pos = startPos; 463 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 464 if (c<256 && fPattern->fInitialChars8->contains(c) || 465 c>=256 && fPattern->fInitialChars->contains(c)) { 466 MatchAt(pos, FALSE, fDeferredStatus); 467 if (U_FAILURE(fDeferredStatus)) { 468 return FALSE; 469 } 470 if (fMatch) { 471 return TRUE; 472 } 473 } 474 if (pos >= testLen) { 475 fMatch = FALSE; 476 fHitEnd = TRUE; 477 return FALSE; 478 } 479 } 480 } 481 U_ASSERT(FALSE); 482 483 case START_STRING: 484 case START_CHAR: 485 { 486 // Match starts on exactly one char. 487 U_ASSERT(fPattern->fMinMatchLen > 0); 488 UChar32 theChar = fPattern->fInitialChar; 489 for (;;) { 490 int32_t pos = startPos; 491 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 492 if (c == theChar) { 493 MatchAt(pos, FALSE, fDeferredStatus); 494 if (U_FAILURE(fDeferredStatus)) { 495 return FALSE; 496 } 497 if (fMatch) { 498 return TRUE; 499 } 500 } 501 if (pos >= testLen) { 502 fMatch = FALSE; 503 fHitEnd = TRUE; 504 return FALSE; 505 } 506 } 507 } 508 U_ASSERT(FALSE); 509 510 case START_LINE: 511 { 512 UChar32 c; 513 if (startPos == fAnchorStart) { 514 MatchAt(startPos, FALSE, fDeferredStatus); 515 if (U_FAILURE(fDeferredStatus)) { 516 return FALSE; 517 } 518 if (fMatch) { 519 return TRUE; 520 } 521 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 522 } 523 524 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 525 for (;;) { 526 c = inputBuf[startPos-1]; 527 if (c == 0x0a) { 528 MatchAt(startPos, FALSE, fDeferredStatus); 529 if (U_FAILURE(fDeferredStatus)) { 530 return FALSE; 531 } 532 if (fMatch) { 533 return TRUE; 534 } 535 } 536 if (startPos >= testLen) { 537 fMatch = FALSE; 538 fHitEnd = TRUE; 539 return FALSE; 540 } 541 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 542 // Note that it's perfectly OK for a pattern to have a zero-length 543 // match at the end of a string, so we must make sure that the loop 544 // runs with startPos == testLen the last time through. 545 } 546 } else { 547 for (;;) { 548 c = inputBuf[startPos-1]; 549 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 550 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 551 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { 552 startPos++; 553 } 554 MatchAt(startPos, FALSE, fDeferredStatus); 555 if (U_FAILURE(fDeferredStatus)) { 556 return FALSE; 557 } 558 if (fMatch) { 559 return TRUE; 560 } 561 } 562 if (startPos >= testLen) { 563 fMatch = FALSE; 564 fHitEnd = TRUE; 565 return FALSE; 566 } 567 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 568 // Note that it's perfectly OK for a pattern to have a zero-length 569 // match at the end of a string, so we must make sure that the loop 570 // runs with startPos == testLen the last time through. 571 } 572 } 573 } 574 575 default: 576 U_ASSERT(FALSE); 577 } 578 579 U_ASSERT(FALSE); 580 return FALSE; 581 } 582 583 584 585 UBool RegexMatcher::find(int32_t start, UErrorCode &status) { 586 if (U_FAILURE(status)) { 587 return FALSE; 588 } 589 if (U_FAILURE(fDeferredStatus)) { 590 status = fDeferredStatus; 591 return FALSE; 592 } 593 this->reset(); // Note: Reset() is specified by Java Matcher documentation. 594 // This will reset the region to be the full input length. 595 if (start < fActiveStart || start > fActiveLimit) { 596 status = U_INDEX_OUTOFBOUNDS_ERROR; 597 return FALSE; 598 } 599 fMatchEnd = start; 600 return find(); 601 } 602 603 604 605 //-------------------------------------------------------------------------------- 606 // 607 // group() 608 // 609 //-------------------------------------------------------------------------------- 610 UnicodeString RegexMatcher::group(UErrorCode &status) const { 611 return group(0, status); 612 } 613 614 615 616 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 617 int32_t s = start(groupNum, status); 618 int32_t e = end(groupNum, status); 619 620 // Note: calling start() and end() above will do all necessary checking that 621 // the group number is OK and that a match exists. status will be set. 622 if (U_FAILURE(status)) { 623 return UnicodeString(); 624 } 625 if (U_FAILURE(fDeferredStatus)) { 626 status = fDeferredStatus; 627 return UnicodeString(); 628 } 629 630 if (s < 0) { 631 // A capture group wasn't part of the match 632 return UnicodeString(); 633 } 634 U_ASSERT(s <= e); 635 return UnicodeString(*fInput, s, e-s); 636 } 637 638 639 640 641 int32_t RegexMatcher::groupCount() const { 642 return fPattern->fGroupMap->size(); 643 } 644 645 646 647 const UnicodeString &RegexMatcher::input() const { 648 return *fInput; 649 } 650 651 652 //-------------------------------------------------------------------------------- 653 // 654 // hasAnchoringBounds() 655 // 656 //-------------------------------------------------------------------------------- 657 UBool RegexMatcher::hasAnchoringBounds() const { 658 return fAnchoringBounds; 659 } 660 661 662 //-------------------------------------------------------------------------------- 663 // 664 // hasTransparentBounds() 665 // 666 //-------------------------------------------------------------------------------- 667 UBool RegexMatcher::hasTransparentBounds() const { 668 return fTransparentBounds; 669 } 670 671 672 673 //-------------------------------------------------------------------------------- 674 // 675 // hitEnd() 676 // 677 //-------------------------------------------------------------------------------- 678 UBool RegexMatcher::hitEnd() const { 679 return fHitEnd; 680 } 681 682 //-------------------------------------------------------------------------------- 683 // 684 // lookingAt() 685 // 686 //-------------------------------------------------------------------------------- 687 UBool RegexMatcher::lookingAt(UErrorCode &status) { 688 if (U_FAILURE(status)) { 689 return FALSE; 690 } 691 if (U_FAILURE(fDeferredStatus)) { 692 status = fDeferredStatus; 693 return FALSE; 694 } 695 resetPreserveRegion(); 696 MatchAt(fActiveStart, FALSE, status); 697 return fMatch; 698 } 699 700 701 UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) { 702 if (U_FAILURE(status)) { 703 return FALSE; 704 } 705 if (U_FAILURE(fDeferredStatus)) { 706 status = fDeferredStatus; 707 return FALSE; 708 } 709 reset(); 710 if (start < fActiveStart || start > fActiveLimit) { 711 status = U_INDEX_OUTOFBOUNDS_ERROR; 712 return FALSE; 713 } 714 MatchAt(start, FALSE, status); 715 return fMatch; 716 } 717 718 719 720 //-------------------------------------------------------------------------------- 721 // 722 // matches() 723 // 724 //-------------------------------------------------------------------------------- 725 UBool RegexMatcher::matches(UErrorCode &status) { 726 if (U_FAILURE(status)) { 727 return FALSE; 728 } 729 if (U_FAILURE(fDeferredStatus)) { 730 status = fDeferredStatus; 731 return FALSE; 732 } 733 resetPreserveRegion(); 734 MatchAt(fActiveStart, TRUE, status); 735 return fMatch; 736 } 737 738 739 UBool RegexMatcher::matches(int32_t start, UErrorCode &status) { 740 if (U_FAILURE(status)) { 741 return FALSE; 742 } 743 if (U_FAILURE(fDeferredStatus)) { 744 status = fDeferredStatus; 745 return FALSE; 746 } 747 reset(); 748 if (start < fActiveStart || start > fActiveLimit) { 749 status = U_INDEX_OUTOFBOUNDS_ERROR; 750 return FALSE; 751 } 752 MatchAt(start, TRUE, status); 753 return fMatch; 754 } 755 756 757 758 //-------------------------------------------------------------------------------- 759 // 760 // pattern 761 // 762 //-------------------------------------------------------------------------------- 763 const RegexPattern &RegexMatcher::pattern() const { 764 return *fPattern; 765 } 766 767 768 769 //-------------------------------------------------------------------------------- 770 // 771 // region 772 // 773 //-------------------------------------------------------------------------------- 774 RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &status) { 775 if (U_FAILURE(status)) { 776 return *this; 777 } 778 if (start>limit || start<0 || limit<0 || limit>fInput->length()) { 779 status = U_ILLEGAL_ARGUMENT_ERROR; 780 } 781 this->reset(); 782 fRegionStart = start; 783 fRegionLimit = limit; 784 fActiveStart = start; 785 fActiveLimit = limit; 786 if (!fTransparentBounds) { 787 fLookStart = start; 788 fLookLimit = limit; 789 } 790 if (fAnchoringBounds) { 791 fAnchorStart = start; 792 fAnchorLimit = limit; 793 } 794 return *this; 795 } 796 797 798 799 //-------------------------------------------------------------------------------- 800 // 801 // regionEnd 802 // 803 //-------------------------------------------------------------------------------- 804 int32_t RegexMatcher::regionEnd() const { 805 return fRegionLimit; 806 } 807 808 809 //-------------------------------------------------------------------------------- 810 // 811 // regionStart 812 // 813 //-------------------------------------------------------------------------------- 814 int32_t RegexMatcher::regionStart() const { 815 return fRegionStart; 816 } 817 818 819 //-------------------------------------------------------------------------------- 820 // 821 // replaceAll 822 // 823 //-------------------------------------------------------------------------------- 824 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { 825 if (U_FAILURE(status)) { 826 return *fInput; 827 } 828 if (U_FAILURE(fDeferredStatus)) { 829 status = fDeferredStatus; 830 return *fInput; 831 } 832 UnicodeString destString; 833 reset(); 834 while (find()) { 835 appendReplacement(destString, replacement, status); 836 if (U_FAILURE(status)) { 837 break; 838 } 839 } 840 appendTail(destString); 841 return destString; 842 } 843 844 845 846 847 //-------------------------------------------------------------------------------- 848 // 849 // replaceFirst 850 // 851 //-------------------------------------------------------------------------------- 852 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { 853 if (U_FAILURE(status)) { 854 return *fInput; 855 } 856 if (U_FAILURE(fDeferredStatus)) { 857 status = fDeferredStatus; 858 return *fInput; 859 } 860 861 reset(); 862 if (!find()) { 863 return *fInput; 864 } 865 866 UnicodeString destString; 867 appendReplacement(destString, replacement, status); 868 appendTail(destString); 869 return destString; 870 } 871 872 873 //-------------------------------------------------------------------------------- 874 // 875 // requireEnd 876 // 877 //-------------------------------------------------------------------------------- 878 UBool RegexMatcher::requireEnd() const { 879 return fRequireEnd; 880 } 881 882 883 //-------------------------------------------------------------------------------- 884 // 885 // reset 886 // 887 //-------------------------------------------------------------------------------- 888 RegexMatcher &RegexMatcher::reset() { 889 fRegionStart = 0; 890 fRegionLimit = fInput->length(); 891 fActiveStart = 0; 892 fActiveLimit = fRegionLimit; 893 fAnchorStart = 0; 894 fAnchorLimit = fRegionLimit; 895 fLookStart = 0; 896 fLookLimit = fRegionLimit; 897 resetPreserveRegion(); 898 return *this; 899 } 900 901 902 903 void RegexMatcher::resetPreserveRegion() { 904 fMatchStart = 0; 905 fMatchEnd = 0; 906 fLastMatchEnd = -1; 907 fAppendPosition = 0; 908 fMatch = FALSE; 909 fHitEnd = FALSE; 910 fRequireEnd = FALSE; 911 fTime = 0; 912 fTickCounter = TIMER_INITIAL_VALUE; 913 resetStack(); 914 } 915 916 917 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 918 fInput = &input; 919 reset(); 920 if (fWordBreakItr != NULL) { 921 #if UCONFIG_NO_BREAK_ITERATION==0 922 fWordBreakItr->setText(input); 923 #endif 924 } 925 return *this; 926 } 927 928 /*RegexMatcher &RegexMatcher::reset(const UChar *) { 929 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 930 return *this; 931 }*/ 932 933 934 RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) { 935 if (U_FAILURE(status)) { 936 return *this; 937 } 938 reset(); // Reset also resets the region to be the entire string. 939 if (position < 0 || position >= fActiveLimit) { 940 status = U_INDEX_OUTOFBOUNDS_ERROR; 941 return *this; 942 } 943 fMatchEnd = position; 944 return *this; 945 } 946 947 948 949 950 951 //-------------------------------------------------------------------------------- 952 // 953 // setTrace 954 // 955 //-------------------------------------------------------------------------------- 956 void RegexMatcher::setTrace(UBool state) { 957 fTraceDebug = state; 958 } 959 960 961 962 //--------------------------------------------------------------------- 963 // 964 // split 965 // 966 //--------------------------------------------------------------------- 967 int32_t RegexMatcher::split(const UnicodeString &input, 968 UnicodeString dest[], 969 int32_t destCapacity, 970 UErrorCode &status) 971 { 972 // 973 // Check arguements for validity 974 // 975 if (U_FAILURE(status)) { 976 return 0; 977 }; 978 979 if (destCapacity < 1) { 980 status = U_ILLEGAL_ARGUMENT_ERROR; 981 return 0; 982 } 983 984 // 985 // Reset for the input text 986 // 987 reset(input); 988 int32_t nextOutputStringStart = 0; 989 if (fActiveLimit == 0) { 990 return 0; 991 } 992 993 // 994 // Loop through the input text, searching for the delimiter pattern 995 // 996 int32_t i; 997 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 998 for (i=0; ; i++) { 999 if (i>=destCapacity-1) { 1000 // There is one or zero output string left. 1001 // Fill the last output string with whatever is left from the input, then exit the loop. 1002 // ( i will be == destCapicity if we filled the output array while processing 1003 // capture groups of the delimiter expression, in which case we will discard the 1004 // last capture group saved in favor of the unprocessed remainder of the 1005 // input string.) 1006 i = destCapacity-1; 1007 int32_t remainingLength = fActiveLimit-nextOutputStringStart; 1008 if (remainingLength > 0) { 1009 dest[i].setTo(input, nextOutputStringStart, remainingLength); 1010 } 1011 break; 1012 } 1013 if (find()) { 1014 // We found another delimiter. Move everything from where we started looking 1015 // up until the start of the delimiter into the next output string. 1016 int32_t fieldLen = fMatchStart - nextOutputStringStart; 1017 dest[i].setTo(input, nextOutputStringStart, fieldLen); 1018 nextOutputStringStart = fMatchEnd; 1019 1020 // If the delimiter pattern has capturing parentheses, the captured 1021 // text goes out into the next n destination strings. 1022 int32_t groupNum; 1023 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 1024 if (i==destCapacity-1) { 1025 break; 1026 } 1027 i++; 1028 dest[i] = group(groupNum, status); 1029 } 1030 1031 if (nextOutputStringStart == fActiveLimit) { 1032 // The delimiter was at the end of the string. We're done. 1033 break; 1034 } 1035 1036 } 1037 else 1038 { 1039 // We ran off the end of the input while looking for the next delimiter. 1040 // All the remaining text goes into the current output string. 1041 dest[i].setTo(input, nextOutputStringStart, fActiveLimit-nextOutputStringStart); 1042 break; 1043 } 1044 } 1045 return i+1; 1046 } 1047 1048 1049 1050 //-------------------------------------------------------------------------------- 1051 // 1052 // start 1053 // 1054 //-------------------------------------------------------------------------------- 1055 int32_t RegexMatcher::start(UErrorCode &status) const { 1056 return start(0, status); 1057 } 1058 1059 1060 1061 1062 //-------------------------------------------------------------------------------- 1063 // 1064 // start(int32_t group, UErrorCode &status) 1065 // 1066 //-------------------------------------------------------------------------------- 1067 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { 1068 if (U_FAILURE(status)) { 1069 return -1; 1070 } 1071 if (U_FAILURE(fDeferredStatus)) { 1072 status = fDeferredStatus; 1073 return -1; 1074 } 1075 if (fMatch == FALSE) { 1076 status = U_REGEX_INVALID_STATE; 1077 return -1; 1078 } 1079 if (group < 0 || group > fPattern->fGroupMap->size()) { 1080 status = U_INDEX_OUTOFBOUNDS_ERROR; 1081 return -1; 1082 } 1083 int32_t s; 1084 if (group == 0) { 1085 s = fMatchStart; 1086 } else { 1087 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 1088 U_ASSERT(groupOffset < fPattern->fFrameSize); 1089 U_ASSERT(groupOffset >= 0); 1090 s = fFrame->fExtra[groupOffset]; 1091 } 1092 return s; 1093 } 1094 1095 1096 1097 //-------------------------------------------------------------------------------- 1098 // 1099 // useAnchoringBounds 1100 // 1101 //-------------------------------------------------------------------------------- 1102 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { 1103 fAnchoringBounds = b; 1104 UErrorCode status = U_ZERO_ERROR; 1105 region(fRegionStart, fRegionLimit, status); 1106 U_ASSERT(U_SUCCESS(status)); 1107 return *this; 1108 } 1109 1110 1111 //-------------------------------------------------------------------------------- 1112 // 1113 // useTransparentBounds 1114 // 1115 //-------------------------------------------------------------------------------- 1116 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { 1117 fTransparentBounds = b; 1118 UErrorCode status = U_ZERO_ERROR; 1119 region(fRegionStart, fRegionLimit, status); 1120 U_ASSERT(U_SUCCESS(status)); 1121 return *this; 1122 } 1123 1124 //-------------------------------------------------------------------------------- 1125 // 1126 // setTimeLimit 1127 // 1128 //-------------------------------------------------------------------------------- 1129 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { 1130 if (U_FAILURE(status)) { 1131 return; 1132 } 1133 if (U_FAILURE(fDeferredStatus)) { 1134 status = fDeferredStatus; 1135 return; 1136 } 1137 if (limit < 0) { 1138 status = U_ILLEGAL_ARGUMENT_ERROR; 1139 return; 1140 } 1141 fTimeLimit = limit; 1142 } 1143 1144 1145 //-------------------------------------------------------------------------------- 1146 // 1147 // getTimeLimit 1148 // 1149 //-------------------------------------------------------------------------------- 1150 int32_t RegexMatcher::getTimeLimit() const { 1151 return fTimeLimit; 1152 } 1153 1154 1155 //-------------------------------------------------------------------------------- 1156 // 1157 // setStackLimit 1158 // 1159 //-------------------------------------------------------------------------------- 1160 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { 1161 if (U_FAILURE(status)) { 1162 return; 1163 } 1164 if (U_FAILURE(fDeferredStatus)) { 1165 status = fDeferredStatus; 1166 return; 1167 } 1168 if (limit < 0) { 1169 status = U_ILLEGAL_ARGUMENT_ERROR; 1170 return; 1171 } 1172 1173 // Reset the matcher. This is needed here in case there is a current match 1174 // whose final stack frame (containing the match results, pointed to by fFrame) 1175 // would be lost by resizing to a smaller stack size. 1176 reset(); 1177 1178 if (limit == 0) { 1179 // Unlimited stack expansion 1180 fStack->setMaxCapacity(0); 1181 } else { 1182 // Change the units of the limit from bytes to ints, and bump the size up 1183 // to be big enough to hold at least one stack frame for the pattern, 1184 // if it isn't there already. 1185 int32_t adjustedLimit = limit / sizeof(int32_t); 1186 if (adjustedLimit < fPattern->fFrameSize) { 1187 adjustedLimit = fPattern->fFrameSize; 1188 } 1189 fStack->setMaxCapacity(adjustedLimit); 1190 } 1191 fStackLimit = limit; 1192 } 1193 1194 1195 //-------------------------------------------------------------------------------- 1196 // 1197 // getStackLimit 1198 // 1199 //-------------------------------------------------------------------------------- 1200 int32_t RegexMatcher::getStackLimit() const { 1201 return fStackLimit; 1202 } 1203 1204 1205 //-------------------------------------------------------------------------------- 1206 // 1207 // setMatchCallback 1208 // 1209 //-------------------------------------------------------------------------------- 1210 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, 1211 const void *context, 1212 UErrorCode &status) { 1213 if (U_FAILURE(status)) { 1214 return; 1215 } 1216 fCallbackFn = callback; 1217 fCallbackContext = context; 1218 } 1219 1220 1221 //-------------------------------------------------------------------------------- 1222 // 1223 // getMatchCallback 1224 // 1225 //-------------------------------------------------------------------------------- 1226 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, 1227 const void *&context, 1228 UErrorCode &status) { 1229 if (U_FAILURE(status)) { 1230 return; 1231 } 1232 callback = fCallbackFn; 1233 context = fCallbackContext; 1234 } 1235 1236 1237 //================================================================================ 1238 // 1239 // Code following this point in this file is the internal 1240 // Match Engine Implementation. 1241 // 1242 //================================================================================ 1243 1244 1245 //-------------------------------------------------------------------------------- 1246 // 1247 // resetStack 1248 // Discard any previous contents of the state save stack, and initialize a 1249 // new stack frame to all -1. The -1s are needed for capture group limits, 1250 // where they indicate that a group has not yet matched anything. 1251 //-------------------------------------------------------------------------------- 1252 REStackFrame *RegexMatcher::resetStack() { 1253 // Discard any previous contents of the state save stack, and initialize a 1254 // new stack frame to all -1. The -1s are needed for capture group limits, where 1255 // they indicate that a group has not yet matched anything. 1256 fStack->removeAllElements(); 1257 1258 int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); 1259 int32_t i; 1260 for (i=0; i<fPattern->fFrameSize; i++) { 1261 iFrame[i] = -1; 1262 } 1263 return (REStackFrame *)iFrame; 1264 } 1265 1266 1267 1268 //-------------------------------------------------------------------------------- 1269 // 1270 // isWordBoundary 1271 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 1272 // For us, 1273 // If the current char is a combining mark, 1274 // \b is FALSE. 1275 // Else Scan backwards to the first non-combining char. 1276 // We are at a boundary if the this char and the original chars are 1277 // opposite in membership in \w set 1278 // 1279 // parameters: pos - the current position in the input buffer 1280 // 1281 // TODO: double-check edge cases at region boundaries. 1282 // 1283 //-------------------------------------------------------------------------------- 1284 UBool RegexMatcher::isWordBoundary(int32_t pos) { 1285 UBool isBoundary = FALSE; 1286 UBool cIsWord = FALSE; 1287 1288 if (pos >= fLookLimit) { 1289 fHitEnd = TRUE; 1290 } else { 1291 // Determine whether char c at current position is a member of the word set of chars. 1292 // If we're off the end of the string, behave as though we're not at a word char. 1293 UChar32 c = fInput->char32At(pos); 1294 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 1295 // Current char is a combining one. Not a boundary. 1296 return FALSE; 1297 } 1298 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 1299 } 1300 1301 // Back up until we come to a non-combining char, determine whether 1302 // that char is a word char. 1303 UBool prevCIsWord = FALSE; 1304 int32_t prevPos = pos; 1305 for (;;) { 1306 if (prevPos <= fLookStart) { 1307 break; 1308 } 1309 prevPos = fInput->moveIndex32(prevPos, -1); 1310 UChar32 prevChar = fInput->char32At(prevPos); 1311 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 1312 || u_charType(prevChar) == U_FORMAT_CHAR)) { 1313 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 1314 break; 1315 } 1316 } 1317 isBoundary = cIsWord ^ prevCIsWord; 1318 return isBoundary; 1319 } 1320 1321 //-------------------------------------------------------------------------------- 1322 // 1323 // isUWordBoundary 1324 // 1325 // Test for a word boundary using RBBI word break. 1326 // 1327 // parameters: pos - the current position in the input buffer 1328 // 1329 //-------------------------------------------------------------------------------- 1330 UBool RegexMatcher::isUWordBoundary(int32_t pos) { 1331 UBool returnVal = FALSE; 1332 #if UCONFIG_NO_BREAK_ITERATION==0 1333 1334 // If we haven't yet created a break iterator for this matcher, do it now. 1335 if (fWordBreakItr == NULL) { 1336 fWordBreakItr = 1337 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); 1338 if (U_FAILURE(fDeferredStatus)) { 1339 return FALSE; 1340 } 1341 fWordBreakItr->setText(*fInput); 1342 } 1343 1344 if (pos >= fLookLimit) { 1345 fHitEnd = TRUE; 1346 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" 1347 // words are not boundaries. All non-word chars stand by themselves, 1348 // with word boundaries on both sides. 1349 } else { 1350 returnVal = fWordBreakItr->isBoundary(pos); 1351 } 1352 #endif 1353 return returnVal; 1354 } 1355 1356 //-------------------------------------------------------------------------------- 1357 // 1358 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state 1359 // saves. Increment the "time" counter, and call the 1360 // user callback function if there is one installed. 1361 // 1362 // If the match operation needs to be aborted, either for a time-out 1363 // or because the user callback asked for it, just set an error status. 1364 // The engine will pick that up and stop in its outer loop. 1365 // 1366 //-------------------------------------------------------------------------------- 1367 void RegexMatcher::IncrementTime(UErrorCode &status) { 1368 fTickCounter = TIMER_INITIAL_VALUE; 1369 fTime++; 1370 if (fCallbackFn != NULL) { 1371 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { 1372 status = U_REGEX_STOPPED_BY_CALLER; 1373 return; 1374 } 1375 } 1376 if (fTimeLimit > 0 && fTime >= fTimeLimit) { 1377 status = U_REGEX_TIME_OUT; 1378 } 1379 } 1380 1381 //-------------------------------------------------------------------------------- 1382 // 1383 // StateSave 1384 // Make a new stack frame, initialized as a copy of the current stack frame. 1385 // Set the pattern index in the original stack frame from the operand value 1386 // in the opcode. Execution of the engine continues with the state in 1387 // the newly created stack frame 1388 // 1389 // Note that reserveBlock() may grow the stack, resulting in the 1390 // whole thing being relocated in memory. 1391 // 1392 // Parameters: 1393 // fp The top frame pointer when called. At return, a new 1394 // fame will be present 1395 // savePatIdx An index into the compiled pattern. Goes into the original 1396 // (not new) frame. If execution ever back-tracks out of the 1397 // new frame, this will be where we continue from in the pattern. 1398 // Return 1399 // The new frame pointer. 1400 // 1401 //-------------------------------------------------------------------------------- 1402 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status) { 1403 // push storage for a new frame. 1404 int32_t *newFP = fStack->reserveBlock(fFrameSize, status); 1405 if (newFP == NULL) { 1406 // Failure on attempted stack expansion. 1407 // Stack function set some other error code, change it to a more 1408 // specific one for regular expressions. 1409 status = U_REGEX_STACK_OVERFLOW; 1410 // We need to return a writable stack frame, so just return the 1411 // previous frame. The match operation will stop quickly 1412 // because of the error status, after which the frame will never 1413 // be looked at again. 1414 return fp; 1415 } 1416 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. 1417 1418 // New stack frame = copy of old top frame. 1419 int32_t *source = (int32_t *)fp; 1420 int32_t *dest = newFP; 1421 for (;;) { 1422 *dest++ = *source++; 1423 if (source == newFP) { 1424 break; 1425 } 1426 } 1427 1428 fTickCounter--; 1429 if (fTickCounter <= 0) { 1430 IncrementTime(status); // Re-initializes fTickCounter 1431 } 1432 fp->fPatIdx = savePatIdx; 1433 return (REStackFrame *)newFP; 1434 } 1435 1436 1437 //-------------------------------------------------------------------------------- 1438 // 1439 // MatchAt This is the actual matching engine. 1440 // 1441 // startIdx: begin matching a this index. 1442 // toEnd: if true, match must extend to end of the input region 1443 // 1444 //-------------------------------------------------------------------------------- 1445 void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { 1446 UBool isMatch = FALSE; // True if the we have a match. 1447 1448 int32_t op; // Operation from the compiled pattern, split into 1449 int32_t opType; // the opcode 1450 int32_t opValue; // and the operand value. 1451 1452 #ifdef REGEX_RUN_DEBUG 1453 if (fTraceDebug) 1454 { 1455 printf("MatchAt(startIdx=%d)\n", startIdx); 1456 printf("Original Pattern: "); 1457 int32_t i; 1458 for (i=0; i<fPattern->fPattern.length(); i++) { 1459 printf("%c", fPattern->fPattern.charAt(i)); 1460 } 1461 printf("\n"); 1462 printf("Input String: "); 1463 for (i=0; i<fInput->length(); i++) { 1464 UChar c = fInput->charAt(i); 1465 if (c<32 || c>256) { 1466 c = '.'; 1467 } 1468 printf("%c", c); 1469 } 1470 printf("\n"); 1471 printf("\n"); 1472 } 1473 #endif 1474 1475 if (U_FAILURE(status)) { 1476 return; 1477 } 1478 1479 // Cache frequently referenced items from the compiled pattern 1480 // 1481 int32_t *pat = fPattern->fCompiledPat->getBuffer(); 1482 1483 const UChar *litText = fPattern->fLiteralText.getBuffer(); 1484 UVector *sets = fPattern->fSets; 1485 1486 const UChar *inputBuf = fInput->getBuffer(); 1487 1488 fFrameSize = fPattern->fFrameSize; 1489 REStackFrame *fp = resetStack(); 1490 1491 fp->fPatIdx = 0; 1492 fp->fInputIdx = startIdx; 1493 1494 // Zero out the pattern's static data 1495 int32_t i; 1496 for (i = 0; i<fPattern->fDataSize; i++) { 1497 fData[i] = 0; 1498 } 1499 1500 // 1501 // Main loop for interpreting the compiled pattern. 1502 // One iteration of the loop per pattern operation performed. 1503 // 1504 for (;;) { 1505 #if 0 1506 if (_heapchk() != _HEAPOK) { 1507 fprintf(stderr, "Heap Trouble\n"); 1508 } 1509 #endif 1510 op = pat[fp->fPatIdx]; 1511 opType = URX_TYPE(op); 1512 opValue = URX_VAL(op); 1513 #ifdef REGEX_RUN_DEBUG 1514 if (fTraceDebug) { 1515 printf("inputIdx=%d inputChar=%c sp=%3d ", fp->fInputIdx, 1516 fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer()); 1517 fPattern->dumpOp(fp->fPatIdx); 1518 } 1519 #endif 1520 fp->fPatIdx++; 1521 1522 switch (opType) { 1523 1524 1525 case URX_NOP: 1526 break; 1527 1528 1529 case URX_BACKTRACK: 1530 // Force a backtrack. In some circumstances, the pattern compiler 1531 // will notice that the pattern can't possibly match anything, and will 1532 // emit one of these at that point. 1533 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1534 break; 1535 1536 1537 case URX_ONECHAR: 1538 if (fp->fInputIdx < fActiveLimit) { 1539 UChar32 c; 1540 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1541 if (c == opValue) { 1542 break; 1543 } 1544 } else { 1545 fHitEnd = TRUE; 1546 } 1547 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1548 break; 1549 1550 1551 case URX_STRING: 1552 { 1553 // Test input against a literal string. 1554 // Strings require two slots in the compiled pattern, one for the 1555 // offset to the string text, and one for the length. 1556 int32_t stringStartIdx = opValue; 1557 int32_t stringLen; 1558 1559 op = pat[fp->fPatIdx]; // Fetch the second operand 1560 fp->fPatIdx++; 1561 opType = URX_TYPE(op); 1562 stringLen = URX_VAL(op); 1563 U_ASSERT(opType == URX_STRING_LEN); 1564 U_ASSERT(stringLen >= 2); 1565 1566 if (fp->fInputIdx + stringLen > fActiveLimit) { 1567 // No match. String is longer than the remaining input text. 1568 fHitEnd = TRUE; // TODO: See ticket 6074 1569 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1570 break; 1571 } 1572 1573 const UChar * pInp = inputBuf + fp->fInputIdx; 1574 const UChar * pPat = litText+stringStartIdx; 1575 const UChar * pEnd = pInp + stringLen; 1576 for(;;) { 1577 if (*pInp == *pPat) { 1578 pInp++; 1579 pPat++; 1580 if (pInp == pEnd) { 1581 // Successful Match. 1582 fp->fInputIdx += stringLen; 1583 break; 1584 } 1585 } else { 1586 // Match failed. 1587 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1588 break; 1589 } 1590 } 1591 } 1592 break; 1593 1594 1595 1596 case URX_STATE_SAVE: 1597 fp = StateSave(fp, opValue, status); 1598 break; 1599 1600 1601 case URX_END: 1602 // The match loop will exit via this path on a successful match, 1603 // when we reach the end of the pattern. 1604 if (toEnd && fp->fInputIdx != fActiveLimit) { 1605 // The pattern matched, but not to the end of input. Try some more. 1606 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1607 break; 1608 } 1609 isMatch = TRUE; 1610 goto breakFromLoop; 1611 1612 // Start and End Capture stack frame variables are layout out like this: 1613 // fp->fExtra[opValue] - The start of a completed capture group 1614 // opValue+1 - The end of a completed capture group 1615 // opValue+2 - the start of a capture group whose end 1616 // has not yet been reached (and might not ever be). 1617 case URX_START_CAPTURE: 1618 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 1619 fp->fExtra[opValue+2] = fp->fInputIdx; 1620 break; 1621 1622 1623 case URX_END_CAPTURE: 1624 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 1625 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 1626 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 1627 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 1628 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 1629 break; 1630 1631 1632 case URX_DOLLAR: // $, test for End of line 1633 // or for position before new line at end of input 1634 if (fp->fInputIdx < fAnchorLimit-2) { 1635 // We are no where near the end of input. Fail. 1636 // This is the common case. Keep it first. 1637 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1638 break; 1639 } 1640 if (fp->fInputIdx >= fAnchorLimit) { 1641 // We really are at the end of input. Success. 1642 fHitEnd = TRUE; 1643 fRequireEnd = TRUE; 1644 break; 1645 } 1646 // If we are positioned just before a new-line that is located at the 1647 // end of input, succeed. 1648 if (fp->fInputIdx == fAnchorLimit-1) { 1649 UChar32 c = fInput->char32At(fp->fInputIdx); 1650 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 1651 // If not in the middle of a CR/LF sequence 1652 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 1653 // At new-line at end of input. Success 1654 fHitEnd = TRUE; 1655 fRequireEnd = TRUE; 1656 break; 1657 } 1658 } 1659 } 1660 1661 if (fp->fInputIdx == fAnchorLimit-2 && 1662 fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) { 1663 fHitEnd = TRUE; 1664 fRequireEnd = TRUE; 1665 break; // At CR/LF at end of input. Success 1666 } 1667 1668 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1669 1670 break; 1671 1672 1673 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 1674 if (fp->fInputIdx >= fAnchorLimit-1) { 1675 // Either at the last character of input, or off the end. 1676 if (fp->fInputIdx == fAnchorLimit-1) { 1677 // At last char of input. Success if it's a new line. 1678 if (fInput->char32At(fp->fInputIdx) == 0x0a) { 1679 fHitEnd = TRUE; 1680 fRequireEnd = TRUE; 1681 break; 1682 } 1683 } else { 1684 // Off the end of input. Success. 1685 fHitEnd = TRUE; 1686 fRequireEnd = TRUE; 1687 break; 1688 } 1689 } 1690 1691 // Not at end of input. Back-track out. 1692 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1693 break; 1694 1695 1696 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 1697 { 1698 if (fp->fInputIdx >= fAnchorLimit) { 1699 // We really are at the end of input. Success. 1700 fHitEnd = TRUE; 1701 fRequireEnd = TRUE; 1702 break; 1703 } 1704 // If we are positioned just before a new-line, succeed. 1705 // It makes no difference where the new-line is within the input. 1706 UChar32 c = inputBuf[fp->fInputIdx]; 1707 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 1708 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 1709 // In multi-line mode, hitting a new-line just before the end of input does not 1710 // set the hitEnd or requireEnd flags 1711 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 1712 break; 1713 } 1714 } 1715 // not at a new line. Fail. 1716 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1717 } 1718 break; 1719 1720 1721 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 1722 { 1723 if (fp->fInputIdx >= fAnchorLimit) { 1724 // We really are at the end of input. Success. 1725 fHitEnd = TRUE; 1726 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 1727 break; // adding a new-line would not lose the match. 1728 } 1729 // If we are not positioned just before a new-line, the test fails; backtrack out. 1730 // It makes no difference where the new-line is within the input. 1731 if (inputBuf[fp->fInputIdx] != 0x0a) { 1732 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1733 } 1734 } 1735 break; 1736 1737 1738 case URX_CARET: // ^, test for start of line 1739 if (fp->fInputIdx != fAnchorStart) { 1740 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1741 } 1742 break; 1743 1744 1745 case URX_CARET_M: // ^, test for start of line in mulit-line mode 1746 { 1747 if (fp->fInputIdx == fAnchorStart) { 1748 // We are at the start input. Success. 1749 break; 1750 } 1751 // Check whether character just before the current pos is a new-line 1752 // unless we are at the end of input 1753 UChar c = inputBuf[fp->fInputIdx - 1]; 1754 if ((fp->fInputIdx < fAnchorLimit) && 1755 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 1756 // It's a new-line. ^ is true. Success. 1757 // TODO: what should be done with positions between a CR and LF? 1758 break; 1759 } 1760 // Not at the start of a line. Fail. 1761 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1762 } 1763 break; 1764 1765 1766 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 1767 { 1768 U_ASSERT(fp->fInputIdx >= fAnchorStart); 1769 if (fp->fInputIdx <= fAnchorStart) { 1770 // We are at the start input. Success. 1771 break; 1772 } 1773 // Check whether character just before the current pos is a new-line 1774 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 1775 UChar c = inputBuf[fp->fInputIdx - 1]; 1776 if (c != 0x0a) { 1777 // Not at the start of a line. Back-track out. 1778 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1779 } 1780 } 1781 break; 1782 1783 case URX_BACKSLASH_B: // Test for word boundaries 1784 { 1785 UBool success = isWordBoundary(fp->fInputIdx); 1786 success ^= (opValue != 0); // flip sense for \B 1787 if (!success) { 1788 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1789 } 1790 } 1791 break; 1792 1793 1794 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 1795 { 1796 UBool success = isUWordBoundary(fp->fInputIdx); 1797 success ^= (opValue != 0); // flip sense for \B 1798 if (!success) { 1799 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1800 } 1801 } 1802 break; 1803 1804 1805 case URX_BACKSLASH_D: // Test for decimal digit 1806 { 1807 if (fp->fInputIdx >= fActiveLimit) { 1808 fHitEnd = TRUE; 1809 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1810 break; 1811 } 1812 1813 UChar32 c = fInput->char32At(fp->fInputIdx); 1814 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 1815 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 1816 success ^= (opValue != 0); // flip sense for \D 1817 if (success) { 1818 fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1); 1819 } else { 1820 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1821 } 1822 } 1823 break; 1824 1825 1826 case URX_BACKSLASH_G: // Test for position at end of previous match 1827 if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) { 1828 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1829 } 1830 break; 1831 1832 1833 case URX_BACKSLASH_X: 1834 // Match a Grapheme, as defined by Unicode TR 29. 1835 // Differs slightly from Perl, which consumes combining marks independently 1836 // of context. 1837 { 1838 1839 // Fail if at end of input 1840 if (fp->fInputIdx >= fActiveLimit) { 1841 fHitEnd = TRUE; 1842 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1843 break; 1844 } 1845 1846 // Examine (and consume) the current char. 1847 // Dispatch into a little state machine, based on the char. 1848 UChar32 c; 1849 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1850 UnicodeSet **sets = fPattern->fStaticSets; 1851 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 1852 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 1853 if (sets[URX_GC_L]->contains(c)) goto GC_L; 1854 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 1855 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 1856 if (sets[URX_GC_V]->contains(c)) goto GC_V; 1857 if (sets[URX_GC_T]->contains(c)) goto GC_T; 1858 goto GC_Extend; 1859 1860 1861 1862 GC_L: 1863 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 1864 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1865 if (sets[URX_GC_L]->contains(c)) goto GC_L; 1866 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 1867 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 1868 if (sets[URX_GC_V]->contains(c)) goto GC_V; 1869 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 1870 goto GC_Extend; 1871 1872 GC_V: 1873 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 1874 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1875 if (sets[URX_GC_V]->contains(c)) goto GC_V; 1876 if (sets[URX_GC_T]->contains(c)) goto GC_T; 1877 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 1878 goto GC_Extend; 1879 1880 GC_T: 1881 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 1882 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1883 if (sets[URX_GC_T]->contains(c)) goto GC_T; 1884 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 1885 goto GC_Extend; 1886 1887 GC_Extend: 1888 // Combining characters are consumed here 1889 for (;;) { 1890 if (fp->fInputIdx >= fActiveLimit) { 1891 break; 1892 } 1893 U16_GET(inputBuf, 0, fp->fInputIdx, fActiveLimit, c); 1894 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 1895 break; 1896 } 1897 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 1898 } 1899 goto GC_Done; 1900 1901 GC_Control: 1902 // Most control chars stand alone (don't combine with combining chars), 1903 // except for that CR/LF sequence is a single grapheme cluster. 1904 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { 1905 fp->fInputIdx++; 1906 } 1907 1908 GC_Done: 1909 if (fp->fInputIdx >= fActiveLimit) { 1910 fHitEnd = TRUE; 1911 } 1912 break; 1913 } 1914 1915 1916 1917 1918 case URX_BACKSLASH_Z: // Test for end of Input 1919 if (fp->fInputIdx < fAnchorLimit) { 1920 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1921 } else { 1922 fHitEnd = TRUE; 1923 fRequireEnd = TRUE; 1924 } 1925 break; 1926 1927 1928 1929 case URX_STATIC_SETREF: 1930 { 1931 // Test input character against one of the predefined sets 1932 // (Word Characters, for example) 1933 // The high bit of the op value is a flag for the match polarity. 1934 // 0: success if input char is in set. 1935 // 1: success if input char is not in set. 1936 if (fp->fInputIdx >= fActiveLimit) { 1937 fHitEnd = TRUE; 1938 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1939 break; 1940 } 1941 1942 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 1943 opValue &= ~URX_NEG_SET; 1944 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 1945 UChar32 c; 1946 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1947 if (c < 256) { 1948 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 1949 if (s8->contains(c)) { 1950 success = !success; 1951 } 1952 } else { 1953 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 1954 if (s->contains(c)) { 1955 success = !success; 1956 } 1957 } 1958 if (!success) { 1959 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1960 } 1961 } 1962 break; 1963 1964 1965 case URX_STAT_SETREF_N: 1966 { 1967 // Test input character for NOT being a member of one of 1968 // the predefined sets (Word Characters, for example) 1969 if (fp->fInputIdx >= fActiveLimit) { 1970 fHitEnd = TRUE; 1971 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1972 break; 1973 } 1974 1975 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 1976 UChar32 c; 1977 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 1978 if (c < 256) { 1979 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 1980 if (s8->contains(c) == FALSE) { 1981 break; 1982 } 1983 } else { 1984 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 1985 if (s->contains(c) == FALSE) { 1986 break; 1987 } 1988 } 1989 1990 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1991 } 1992 break; 1993 1994 1995 case URX_SETREF: 1996 if (fp->fInputIdx >= fActiveLimit) { 1997 fHitEnd = TRUE; 1998 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 1999 break; 2000 } 2001 // There is input left. Pick up one char and test it for set membership. 2002 UChar32 c; 2003 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 2004 U_ASSERT(opValue > 0 && opValue < sets->size()); 2005 if (c<256) { 2006 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 2007 if (s8->contains(c)) { 2008 break; 2009 } 2010 } else { 2011 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 2012 if (s->contains(c)) { 2013 // The character is in the set. A Match. 2014 break; 2015 } 2016 } 2017 // the character wasn't in the set. Back track out. 2018 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2019 break; 2020 2021 2022 case URX_DOTANY: 2023 { 2024 // . matches anything, but stops at end-of-line. 2025 if (fp->fInputIdx >= fActiveLimit) { 2026 // At end of input. Match failed. Backtrack out. 2027 fHitEnd = TRUE; 2028 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2029 break; 2030 } 2031 // There is input left. Advance over one char, unless we've hit end-of-line 2032 UChar32 c; 2033 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 2034 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 2035 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 2036 // End of line in normal mode. . does not match. 2037 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2038 break; 2039 } 2040 } 2041 break; 2042 2043 2044 case URX_DOTANY_ALL: 2045 { 2046 // ., in dot-matches-all (including new lines) mode 2047 if (fp->fInputIdx >= fActiveLimit) { 2048 // At end of input. Match failed. Backtrack out. 2049 fHitEnd = TRUE; 2050 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2051 break; 2052 } 2053 // There is input left. Advance over one char, except if we are 2054 // at a cr/lf, advance over both of them. 2055 UChar32 c; 2056 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 2057 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 2058 // In the case of a CR/LF, we need to advance over both. 2059 UChar nextc = inputBuf[fp->fInputIdx]; 2060 if (nextc == 0x0a) { 2061 fp->fInputIdx++; 2062 } 2063 } 2064 } 2065 break; 2066 2067 2068 case URX_DOTANY_UNIX: 2069 { 2070 // '.' operator, matches all, but stops at end-of-line. 2071 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 2072 if (fp->fInputIdx >= fActiveLimit) { 2073 // At end of input. Match failed. Backtrack out. 2074 fHitEnd = TRUE; 2075 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2076 break; 2077 } 2078 // There is input left. Advance over one char, unless we've hit end-of-line 2079 UChar32 c; 2080 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 2081 if (c == 0x0a) { 2082 // End of line in normal mode. '.' does not match the \n 2083 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2084 } 2085 } 2086 break; 2087 2088 2089 case URX_JMP: 2090 fp->fPatIdx = opValue; 2091 break; 2092 2093 case URX_FAIL: 2094 isMatch = FALSE; 2095 goto breakFromLoop; 2096 2097 case URX_JMP_SAV: 2098 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 2099 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 2100 fp->fPatIdx = opValue; // Then JMP. 2101 break; 2102 2103 case URX_JMP_SAV_X: 2104 // This opcode is used with (x)+, when x can match a zero length string. 2105 // Same as JMP_SAV, except conditional on the match having made forward progress. 2106 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 2107 // data address of the input position at the start of the loop. 2108 { 2109 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 2110 int32_t stoOp = pat[opValue-1]; 2111 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 2112 int32_t frameLoc = URX_VAL(stoOp); 2113 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 2114 int32_t prevInputIdx = fp->fExtra[frameLoc]; 2115 U_ASSERT(prevInputIdx <= fp->fInputIdx); 2116 if (prevInputIdx < fp->fInputIdx) { 2117 // The match did make progress. Repeat the loop. 2118 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 2119 fp->fPatIdx = opValue; 2120 fp->fExtra[frameLoc] = fp->fInputIdx; 2121 } 2122 // If the input position did not advance, we do nothing here, 2123 // execution will fall out of the loop. 2124 } 2125 break; 2126 2127 case URX_CTR_INIT: 2128 { 2129 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 2130 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 2131 2132 // Pick up the three extra operands that CTR_INIT has, and 2133 // skip the pattern location counter past 2134 int32_t instrOperandLoc = fp->fPatIdx; 2135 fp->fPatIdx += 3; 2136 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 2137 int32_t minCount = pat[instrOperandLoc+1]; 2138 int32_t maxCount = pat[instrOperandLoc+2]; 2139 U_ASSERT(minCount>=0); 2140 U_ASSERT(maxCount>=minCount || maxCount==-1); 2141 U_ASSERT(loopLoc>fp->fPatIdx); 2142 2143 if (minCount == 0) { 2144 fp = StateSave(fp, loopLoc+1, status); 2145 } 2146 if (maxCount == 0) { 2147 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2148 } 2149 } 2150 break; 2151 2152 case URX_CTR_LOOP: 2153 { 2154 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 2155 int32_t initOp = pat[opValue]; 2156 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 2157 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 2158 int32_t minCount = pat[opValue+2]; 2159 int32_t maxCount = pat[opValue+3]; 2160 // Increment the counter. Note: we're not worrying about counter 2161 // overflow, since the data comes from UnicodeStrings, which 2162 // stores its length in an int32_t. 2163 (*pCounter)++; 2164 U_ASSERT(*pCounter > 0); 2165 if ((uint32_t)*pCounter >= (uint32_t)maxCount) { 2166 U_ASSERT(*pCounter == maxCount || maxCount == -1); 2167 break; 2168 } 2169 if (*pCounter >= minCount) { 2170 fp = StateSave(fp, fp->fPatIdx, status); 2171 } 2172 fp->fPatIdx = opValue + 4; // Loop back. 2173 } 2174 break; 2175 2176 case URX_CTR_INIT_NG: 2177 { 2178 // Initialize a non-greedy loop 2179 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 2180 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 2181 2182 // Pick up the three extra operands that CTR_INIT has, and 2183 // skip the pattern location counter past 2184 int32_t instrOperandLoc = fp->fPatIdx; 2185 fp->fPatIdx += 3; 2186 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 2187 int32_t minCount = pat[instrOperandLoc+1]; 2188 int32_t maxCount = pat[instrOperandLoc+2]; 2189 U_ASSERT(minCount>=0); 2190 U_ASSERT(maxCount>=minCount || maxCount==-1); 2191 U_ASSERT(loopLoc>fp->fPatIdx); 2192 2193 if (minCount == 0) { 2194 if (maxCount != 0) { 2195 fp = StateSave(fp, fp->fPatIdx, status); 2196 } 2197 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 2198 } 2199 } 2200 break; 2201 2202 case URX_CTR_LOOP_NG: 2203 { 2204 // Non-greedy {min, max} loops 2205 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 2206 int32_t initOp = pat[opValue]; 2207 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 2208 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 2209 int32_t minCount = pat[opValue+2]; 2210 int32_t maxCount = pat[opValue+3]; 2211 // Increment the counter. Note: we're not worrying about counter 2212 // overflow, since the data comes from UnicodeStrings, which 2213 // stores its length in an int32_t. 2214 (*pCounter)++; 2215 U_ASSERT(*pCounter > 0); 2216 2217 if ((uint32_t)*pCounter >= (uint32_t)maxCount) { 2218 // The loop has matched the maximum permitted number of times. 2219 // Break out of here with no action. Matching will 2220 // continue with the following pattern. 2221 U_ASSERT(*pCounter == maxCount || maxCount == -1); 2222 break; 2223 } 2224 2225 if (*pCounter < minCount) { 2226 // We haven't met the minimum number of matches yet. 2227 // Loop back for another one. 2228 fp->fPatIdx = opValue + 4; // Loop back. 2229 } else { 2230 // We do have the minimum number of matches. 2231 // Fall into the following pattern, but first do 2232 // a state save to the top of the loop, so that a failure 2233 // in the following pattern will try another iteration of the loop. 2234 fp = StateSave(fp, opValue + 4, status); 2235 } 2236 } 2237 break; 2238 2239 case URX_STO_SP: 2240 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 2241 fData[opValue] = fStack->size(); 2242 break; 2243 2244 case URX_LD_SP: 2245 { 2246 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 2247 int32_t newStackSize = fData[opValue]; 2248 U_ASSERT(newStackSize <= fStack->size()); 2249 int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 2250 if (newFP == (int32_t *)fp) { 2251 break; 2252 } 2253 int32_t i; 2254 for (i=0; i<fFrameSize; i++) { 2255 newFP[i] = ((int32_t *)fp)[i]; 2256 } 2257 fp = (REStackFrame *)newFP; 2258 fStack->setSize(newStackSize); 2259 } 2260 break; 2261 2262 case URX_BACKREF: 2263 case URX_BACKREF_I: 2264 { 2265 U_ASSERT(opValue < fFrameSize); 2266 int32_t groupStartIdx = fp->fExtra[opValue]; 2267 int32_t groupEndIdx = fp->fExtra[opValue+1]; 2268 U_ASSERT(groupStartIdx <= groupEndIdx); 2269 int32_t len = groupEndIdx-groupStartIdx; 2270 if (groupStartIdx < 0) { 2271 // This capture group has not participated in the match thus far, 2272 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 2273 } 2274 2275 if (len == 0) { 2276 // The capture group match was of an empty string. 2277 // Verified by testing: Perl matches succeed in this case, so 2278 // we do too. 2279 break; 2280 } 2281 2282 UBool haveMatch = FALSE; 2283 if (fp->fInputIdx + len <= fActiveLimit) { 2284 if (opType == URX_BACKREF) { 2285 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) { 2286 haveMatch = TRUE; 2287 } 2288 } else { 2289 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, 2290 len, U_FOLD_CASE_DEFAULT) == 0) { 2291 haveMatch = TRUE; 2292 } 2293 } 2294 } else { 2295 // TODO: probably need to do a partial string comparison, and only 2296 // set HitEnd if the available input matched. Ticket #6074 2297 fHitEnd = TRUE; 2298 } 2299 if (haveMatch) { 2300 fp->fInputIdx += len; // Match. Advance current input position. 2301 } else { 2302 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 2303 } 2304 } 2305 break; 2306 2307 case URX_STO_INP_LOC: 2308 { 2309 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 2310 fp->fExtra[opValue] = fp->fInputIdx; 2311 } 2312 break; 2313 2314 case URX_JMPX: 2315 { 2316 int32_t instrOperandLoc = fp->fPatIdx; 2317 fp->fPatIdx += 1; 2318 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 2319 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 2320 int32_t savedInputIdx = fp->fExtra[dataLoc]; 2321 U_ASSERT(savedInputIdx <= fp->fInputIdx); 2322 if (savedInputIdx < fp->fInputIdx) { 2323 fp->fPatIdx = opValue; // JMP 2324 } else { 2325 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 2326 } 2327 } 2328 break; 2329 2330 case URX_LA_START: 2331 { 2332 // Entering a lookahead block. 2333 // Save Stack Ptr, Input Pos. 2334 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 2335 fData[opValue] = fStack->size(); 2336 fData[opValue+1] = fp->fInputIdx; 2337 fActiveStart = fLookStart; // Set the match region change for 2338 fActiveLimit = fLookLimit; // transparent bounds. 2339 } 2340 break; 2341 2342 case URX_LA_END: 2343 { 2344 // Leaving a look-ahead block. 2345 // restore Stack Ptr, Input Pos to positions they had on entry to block. 2346 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 2347 int32_t stackSize = fStack->size(); 2348 int32_t newStackSize = fData[opValue]; 2349 U_ASSERT(stackSize >= newStackSize); 2350 if (stackSize > newStackSize) { 2351 // Copy the current top frame back to the new (cut back) top frame. 2352 // This makes the capture groups from within the look-ahead 2353 // expression available. 2354 int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 2355 int32_t i; 2356 for (i=0; i<fFrameSize; i++) { 2357 newFP[i] = ((int32_t *)fp)[i]; 2358 } 2359 fp = (REStackFrame *)newFP; 2360 fStack->setSize(newStackSize); 2361 } 2362 fp->fInputIdx = fData[opValue+1]; 2363 2364 // Restore the active region bounds in the input string; they may have 2365 // been changed because of transparent bounds on a Region. 2366 fActiveStart = fRegionStart; 2367 fActiveLimit = fRegionLimit; 2368 } 2369 break; 2370 2371 case URX_ONECHAR_I: 2372 if (fp->fInputIdx < fActiveLimit) { 2373 UChar32 c; 2374 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 2375 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 2376 break; 2377 } 2378 } else { 2379 fHitEnd = TRUE; 2380 } 2381 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2382 break; 2383 2384 case URX_STRING_I: 2385 { 2386 // Test input against a literal string. 2387 // Strings require two slots in the compiled pattern, one for the 2388 // offset to the string text, and one for the length. 2389 int32_t stringStartIdx, stringLen; 2390 stringStartIdx = opValue; 2391 2392 op = pat[fp->fPatIdx]; 2393 fp->fPatIdx++; 2394 opType = URX_TYPE(op); 2395 opValue = URX_VAL(op); 2396 U_ASSERT(opType == URX_STRING_LEN); 2397 stringLen = opValue; 2398 2399 int32_t stringEndIndex = fp->fInputIdx + stringLen; 2400 if (stringEndIndex <= fActiveLimit) { 2401 if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx, 2402 stringLen, U_FOLD_CASE_DEFAULT) == 0) { 2403 // Success. Advance the current input position. 2404 fp->fInputIdx = stringEndIndex; 2405 break; 2406 } 2407 } else { 2408 // Insufficent input left for a match. 2409 fHitEnd = TRUE; // See ticket 6074 2410 } 2411 // No match. Back up matching to a saved state 2412 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2413 } 2414 break; 2415 2416 case URX_LB_START: 2417 { 2418 // Entering a look-behind block. 2419 // Save Stack Ptr, Input Pos. 2420 // TODO: implement transparent bounds. Ticket #6067 2421 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 2422 fData[opValue] = fStack->size(); 2423 fData[opValue+1] = fp->fInputIdx; 2424 // Init the variable containing the start index for attempted matches. 2425 fData[opValue+2] = -1; 2426 // Save input string length, then reset to pin any matches to end at 2427 // the current position. 2428 fData[opValue+3] = fActiveLimit; 2429 fActiveLimit = fp->fInputIdx; 2430 } 2431 break; 2432 2433 2434 case URX_LB_CONT: 2435 { 2436 // Positive Look-Behind, at top of loop checking for matches of LB expression 2437 // at all possible input starting positions. 2438 2439 // Fetch the min and max possible match lengths. They are the operands 2440 // of this op in the pattern. 2441 int32_t minML = pat[fp->fPatIdx++]; 2442 int32_t maxML = pat[fp->fPatIdx++]; 2443 U_ASSERT(minML <= maxML); 2444 U_ASSERT(minML >= 0); 2445 2446 // Fetch (from data) the last input index where a match was attempted. 2447 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 2448 int32_t *lbStartIdx = &fData[opValue+2]; 2449 if (*lbStartIdx < 0) { 2450 // First time through loop. 2451 *lbStartIdx = fp->fInputIdx - minML; 2452 } else { 2453 // 2nd through nth time through the loop. 2454 // Back up start position for match by one. 2455 if (*lbStartIdx == 0) { 2456 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. 2457 } else { 2458 U16_BACK_1(inputBuf, 0, *lbStartIdx); 2459 } 2460 } 2461 2462 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 2463 // We have tried all potential match starting points without 2464 // getting a match. Backtrack out, and out of the 2465 // Look Behind altogether. 2466 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2467 int32_t restoreInputLen = fData[opValue+3]; 2468 U_ASSERT(restoreInputLen >= fActiveLimit); 2469 U_ASSERT(restoreInputLen <= fInput->length()); 2470 fActiveLimit = restoreInputLen; 2471 break; 2472 } 2473 2474 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 2475 // (successful match will fall off the end of the loop.) 2476 fp = StateSave(fp, fp->fPatIdx-3, status); 2477 fp->fInputIdx = *lbStartIdx; 2478 } 2479 break; 2480 2481 case URX_LB_END: 2482 // End of a look-behind block, after a successful match. 2483 { 2484 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 2485 if (fp->fInputIdx != fActiveLimit) { 2486 // The look-behind expression matched, but the match did not 2487 // extend all the way to the point that we are looking behind from. 2488 // FAIL out of here, which will take us back to the LB_CONT, which 2489 // will retry the match starting at another position or fail 2490 // the look-behind altogether, whichever is appropriate. 2491 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2492 break; 2493 } 2494 2495 // Look-behind match is good. Restore the orignal input string length, 2496 // which had been truncated to pin the end of the lookbehind match to the 2497 // position being looked-behind. 2498 int32_t originalInputLen = fData[opValue+3]; 2499 U_ASSERT(originalInputLen >= fActiveLimit); 2500 U_ASSERT(originalInputLen <= fInput->length()); 2501 fActiveLimit = originalInputLen; 2502 } 2503 break; 2504 2505 2506 case URX_LBN_CONT: 2507 { 2508 // Negative Look-Behind, at top of loop checking for matches of LB expression 2509 // at all possible input starting positions. 2510 2511 // Fetch the extra parameters of this op. 2512 int32_t minML = pat[fp->fPatIdx++]; 2513 int32_t maxML = pat[fp->fPatIdx++]; 2514 int32_t continueLoc = pat[fp->fPatIdx++]; 2515 continueLoc = URX_VAL(continueLoc); 2516 U_ASSERT(minML <= maxML); 2517 U_ASSERT(minML >= 0); 2518 U_ASSERT(continueLoc > fp->fPatIdx); 2519 2520 // Fetch (from data) the last input index where a match was attempted. 2521 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 2522 int32_t *lbStartIdx = &fData[opValue+2]; 2523 if (*lbStartIdx < 0) { 2524 // First time through loop. 2525 *lbStartIdx = fp->fInputIdx - minML; 2526 } else { 2527 // 2nd through nth time through the loop. 2528 // Back up start position for match by one. 2529 if (*lbStartIdx == 0) { 2530 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. 2531 } else { 2532 U16_BACK_1(inputBuf, 0, *lbStartIdx); 2533 } 2534 } 2535 2536 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 2537 // We have tried all potential match starting points without 2538 // getting a match, which means that the negative lookbehind as 2539 // a whole has succeeded. Jump forward to the continue location 2540 int32_t restoreInputLen = fData[opValue+3]; 2541 U_ASSERT(restoreInputLen >= fActiveLimit); 2542 U_ASSERT(restoreInputLen <= fInput->length()); 2543 fActiveLimit = restoreInputLen; 2544 fp->fPatIdx = continueLoc; 2545 break; 2546 } 2547 2548 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 2549 // (successful match will cause a FAIL out of the loop altogether.) 2550 fp = StateSave(fp, fp->fPatIdx-4, status); 2551 fp->fInputIdx = *lbStartIdx; 2552 } 2553 break; 2554 2555 case URX_LBN_END: 2556 // End of a negative look-behind block, after a successful match. 2557 { 2558 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 2559 if (fp->fInputIdx != fActiveLimit) { 2560 // The look-behind expression matched, but the match did not 2561 // extend all the way to the point that we are looking behind from. 2562 // FAIL out of here, which will take us back to the LB_CONT, which 2563 // will retry the match starting at another position or succeed 2564 // the look-behind altogether, whichever is appropriate. 2565 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2566 break; 2567 } 2568 2569 // Look-behind expression matched, which means look-behind test as 2570 // a whole Fails 2571 2572 // Restore the orignal input string length, which had been truncated 2573 // inorder to pin the end of the lookbehind match 2574 // to the position being looked-behind. 2575 int32_t originalInputLen = fData[opValue+3]; 2576 U_ASSERT(originalInputLen >= fActiveLimit); 2577 U_ASSERT(originalInputLen <= fInput->length()); 2578 fActiveLimit = originalInputLen; 2579 2580 // Restore original stack position, discarding any state saved 2581 // by the successful pattern match. 2582 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 2583 int32_t newStackSize = fData[opValue]; 2584 U_ASSERT(fStack->size() > newStackSize); 2585 fStack->setSize(newStackSize); 2586 2587 // FAIL, which will take control back to someplace 2588 // prior to entering the look-behind test. 2589 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2590 } 2591 break; 2592 2593 2594 case URX_LOOP_SR_I: 2595 // Loop Initialization for the optimized implementation of 2596 // [some character set]* 2597 // This op scans through all matching input. 2598 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 2599 { 2600 U_ASSERT(opValue > 0 && opValue < sets->size()); 2601 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 2602 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 2603 2604 // Loop through input, until either the input is exhausted or 2605 // we reach a character that is not a member of the set. 2606 int32_t ix = fp->fInputIdx; 2607 for (;;) { 2608 if (ix >= fActiveLimit) { 2609 fHitEnd = TRUE; 2610 break; 2611 } 2612 UChar32 c; 2613 U16_NEXT(inputBuf, ix, fActiveLimit, c); 2614 if (c<256) { 2615 if (s8->contains(c) == FALSE) { 2616 U16_BACK_1(inputBuf, 0, ix); 2617 break; 2618 } 2619 } else { 2620 if (s->contains(c) == FALSE) { 2621 U16_BACK_1(inputBuf, 0, ix); 2622 break; 2623 } 2624 } 2625 } 2626 2627 // If there were no matching characters, skip over the loop altogether. 2628 // The loop doesn't run at all, a * op always succeeds. 2629 if (ix == fp->fInputIdx) { 2630 fp->fPatIdx++; // skip the URX_LOOP_C op. 2631 break; 2632 } 2633 2634 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 2635 // must follow. It's operand is the stack location 2636 // that holds the starting input index for the match of this [set]* 2637 int32_t loopcOp = pat[fp->fPatIdx]; 2638 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 2639 int32_t stackLoc = URX_VAL(loopcOp); 2640 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 2641 fp->fExtra[stackLoc] = fp->fInputIdx; 2642 fp->fInputIdx = ix; 2643 2644 // Save State to the URX_LOOP_C op that follows this one, 2645 // so that match failures in the following code will return to there. 2646 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 2647 fp = StateSave(fp, fp->fPatIdx, status); 2648 fp->fPatIdx++; 2649 } 2650 break; 2651 2652 2653 case URX_LOOP_DOT_I: 2654 // Loop Initialization for the optimized implementation of .* 2655 // This op scans through all remaining input. 2656 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 2657 { 2658 // Loop through input until the input is exhausted (we reach an end-of-line) 2659 // In DOTALL mode, we can just go straight to the end of the input. 2660 int32_t ix; 2661 if ((opValue & 1) == 1) { 2662 // Dot-matches-All mode. Jump straight to the end of the string. 2663 ix = fActiveLimit; 2664 fHitEnd = TRUE; 2665 } else { 2666 // NOT DOT ALL mode. Line endings do not match '.' 2667 // Scan forward until a line ending or end of input. 2668 ix = fp->fInputIdx; 2669 for (;;) { 2670 if (ix >= fActiveLimit) { 2671 fHitEnd = TRUE; 2672 ix = fActiveLimit; 2673 break; 2674 } 2675 UChar32 c; 2676 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] 2677 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 2678 if ((c == 0x0a) || // 0x0a is newline in both modes. 2679 ((opValue & 2) == 0) && // IF not UNIX_LINES mode 2680 (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) { 2681 // char is a line ending. Put the input pos back to the 2682 // line ending char, and exit the scanning loop. 2683 U16_BACK_1(inputBuf, 0, ix); 2684 break; 2685 } 2686 } 2687 } 2688 } 2689 2690 // If there were no matching characters, skip over the loop altogether. 2691 // The loop doesn't run at all, a * op always succeeds. 2692 if (ix == fp->fInputIdx) { 2693 fp->fPatIdx++; // skip the URX_LOOP_C op. 2694 break; 2695 } 2696 2697 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 2698 // must follow. It's operand is the stack location 2699 // that holds the starting input index for the match of this .* 2700 int32_t loopcOp = pat[fp->fPatIdx]; 2701 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 2702 int32_t stackLoc = URX_VAL(loopcOp); 2703 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 2704 fp->fExtra[stackLoc] = fp->fInputIdx; 2705 fp->fInputIdx = ix; 2706 2707 // Save State to the URX_LOOP_C op that follows this one, 2708 // so that match failures in the following code will return to there. 2709 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 2710 fp = StateSave(fp, fp->fPatIdx, status); 2711 fp->fPatIdx++; 2712 } 2713 break; 2714 2715 2716 case URX_LOOP_C: 2717 { 2718 U_ASSERT(opValue>=0 && opValue<fFrameSize); 2719 int32_t terminalIdx = fp->fExtra[opValue]; 2720 U_ASSERT(terminalIdx <= fp->fInputIdx); 2721 if (terminalIdx == fp->fInputIdx) { 2722 // We've backed up the input idx to the point that the loop started. 2723 // The loop is done. Leave here without saving state. 2724 // Subsequent failures won't come back here. 2725 break; 2726 } 2727 // Set up for the next iteration of the loop, with input index 2728 // backed up by one from the last time through, 2729 // and a state save to this instruction in case the following code fails again. 2730 // (We're going backwards because this loop emulates stack unwinding, not 2731 // the initial scan forward.) 2732 U_ASSERT(fp->fInputIdx > 0); 2733 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 2734 if (inputBuf[fp->fInputIdx] == 0x0a && 2735 fp->fInputIdx > terminalIdx && 2736 inputBuf[fp->fInputIdx-1] == 0x0d) { 2737 int32_t prevOp = pat[fp->fPatIdx-2]; 2738 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 2739 // .*, stepping back over CRLF pair. 2740 fp->fInputIdx--; 2741 } 2742 } 2743 2744 2745 fp = StateSave(fp, fp->fPatIdx-1, status); 2746 } 2747 break; 2748 2749 2750 2751 default: 2752 // Trouble. The compiled pattern contains an entry with an 2753 // unrecognized type tag. 2754 U_ASSERT(FALSE); 2755 } 2756 2757 if (U_FAILURE(status)) { 2758 isMatch = FALSE; 2759 break; 2760 } 2761 } 2762 2763 breakFromLoop: 2764 fMatch = isMatch; 2765 if (isMatch) { 2766 fLastMatchEnd = fMatchEnd; 2767 fMatchStart = startIdx; 2768 fMatchEnd = fp->fInputIdx; 2769 if (fTraceDebug) { 2770 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); 2771 } 2772 } 2773 else 2774 { 2775 if (fTraceDebug) { 2776 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 2777 } 2778 } 2779 2780 fFrame = fp; // The active stack frame when the engine stopped. 2781 // Contains the capture group results that we need to 2782 // access later. 2783 2784 return; 2785 } 2786 2787 2788 2789 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 2790 2791 U_NAMESPACE_END 2792 2793 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 2794 2795