1 /* 2 ************************************************************************** 3 * Copyright (C) 2002-2011 International Business Machines Corporation * 4 * and others. All rights reserved. * 5 ************************************************************************** 6 */ 7 // 8 // file: rematch.cpp 9 // 10 // Contains the implementation of class RegexMatcher, 11 // which is one of the main API classes for the ICU regular expression package. 12 // 13 14 #include "unicode/utypes.h" 15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 16 17 #include "unicode/regex.h" 18 #include "unicode/uniset.h" 19 #include "unicode/uchar.h" 20 #include "unicode/ustring.h" 21 #include "unicode/rbbi.h" 22 #include "uassert.h" 23 #include "cmemory.h" 24 #include "uvector.h" 25 #include "uvectr32.h" 26 #include "uvectr64.h" 27 #include "regeximp.h" 28 #include "regexst.h" 29 #include "regextxt.h" 30 #include "ucase.h" 31 32 // #include <malloc.h> // Needed for heapcheck testing 33 34 35 // Find progress callback 36 // ---------------------- 37 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call. 38 // 39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \ 40 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE) 41 42 43 // Smart Backtracking 44 // ------------------ 45 // When a failure would go back to a LOOP_C instruction, 46 // strings, characters, and setrefs scan backwards for a valid start 47 // character themselves, pop the stack, and save state, emulating the 48 // LOOP_C's effect but assured that the next character of input is a 49 // possible matching character. 50 // 51 // Good idea in theory; unfortunately it only helps out a few specific 52 // cases and slows the engine down a little in the rest. 53 54 //#define REGEX_SMART_BACKTRACKING 1 55 56 U_NAMESPACE_BEGIN 57 58 // Default limit for the size of the back track stack, to avoid system 59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. 60 // This value puts ICU's limits higher than most other regexp implementations, 61 // which use recursion rather than the heap, and take more storage per 62 // backtrack point. 63 // 64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 65 66 // Time limit counter constant. 67 // Time limits for expression evaluation are in terms of quanta of work by 68 // the engine, each of which is 10,000 state saves. 69 // This constant determines that state saves per tick number. 70 static const int32_t TIMER_INITIAL_VALUE = 10000; 71 72 //----------------------------------------------------------------------------- 73 // 74 // Constructor and Destructor 75 // 76 //----------------------------------------------------------------------------- 77 RegexMatcher::RegexMatcher(const RegexPattern *pat) { 78 fDeferredStatus = U_ZERO_ERROR; 79 init(fDeferredStatus); 80 if (U_FAILURE(fDeferredStatus)) { 81 return; 82 } 83 if (pat==NULL) { 84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 85 return; 86 } 87 fPattern = pat; 88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); 89 } 90 91 92 93 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 94 uint32_t flags, UErrorCode &status) { 95 init(status); 96 if (U_FAILURE(status)) { 97 return; 98 } 99 UParseError pe; 100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 101 fPattern = fPatternOwned; 102 103 UText inputText = UTEXT_INITIALIZER; 104 utext_openConstUnicodeString(&inputText, &input, &status); 105 init2(&inputText, status); 106 utext_close(&inputText); 107 108 fInputUniStrMaybeMutable = TRUE; 109 } 110 111 112 RegexMatcher::RegexMatcher(UText *regexp, UText *input, 113 uint32_t flags, UErrorCode &status) { 114 init(status); 115 if (U_FAILURE(status)) { 116 return; 117 } 118 UParseError pe; 119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 120 if (U_FAILURE(status)) { 121 return; 122 } 123 124 fPattern = fPatternOwned; 125 init2(input, status); 126 } 127 128 129 RegexMatcher::RegexMatcher(const UnicodeString ®exp, 130 uint32_t flags, UErrorCode &status) { 131 init(status); 132 if (U_FAILURE(status)) { 133 return; 134 } 135 UParseError pe; 136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 137 if (U_FAILURE(status)) { 138 return; 139 } 140 fPattern = fPatternOwned; 141 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 142 } 143 144 RegexMatcher::RegexMatcher(UText *regexp, 145 uint32_t flags, UErrorCode &status) { 146 init(status); 147 if (U_FAILURE(status)) { 148 return; 149 } 150 UParseError pe; 151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 152 if (U_FAILURE(status)) { 153 return; 154 } 155 156 fPattern = fPatternOwned; 157 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 158 } 159 160 161 162 163 RegexMatcher::~RegexMatcher() { 164 delete fStack; 165 if (fData != fSmallData) { 166 uprv_free(fData); 167 fData = NULL; 168 } 169 if (fPatternOwned) { 170 delete fPatternOwned; 171 fPatternOwned = NULL; 172 fPattern = NULL; 173 } 174 175 if (fInput) { 176 delete fInput; 177 } 178 if (fInputText) { 179 utext_close(fInputText); 180 } 181 if (fAltInputText) { 182 utext_close(fAltInputText); 183 } 184 185 #if UCONFIG_NO_BREAK_ITERATION==0 186 delete fWordBreakItr; 187 #endif 188 } 189 190 // 191 // init() common initialization for use by all constructors. 192 // Initialize all fields, get the object into a consistent state. 193 // This must be done even when the initial status shows an error, 194 // so that the object is initialized sufficiently well for the destructor 195 // to run safely. 196 // 197 void RegexMatcher::init(UErrorCode &status) { 198 fPattern = NULL; 199 fPatternOwned = NULL; 200 fFrameSize = 0; 201 fRegionStart = 0; 202 fRegionLimit = 0; 203 fAnchorStart = 0; 204 fAnchorLimit = 0; 205 fLookStart = 0; 206 fLookLimit = 0; 207 fActiveStart = 0; 208 fActiveLimit = 0; 209 fTransparentBounds = FALSE; 210 fAnchoringBounds = TRUE; 211 fMatch = FALSE; 212 fMatchStart = 0; 213 fMatchEnd = 0; 214 fLastMatchEnd = -1; 215 fAppendPosition = 0; 216 fHitEnd = FALSE; 217 fRequireEnd = FALSE; 218 fStack = NULL; 219 fFrame = NULL; 220 fTimeLimit = 0; 221 fTime = 0; 222 fTickCounter = 0; 223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 224 fCallbackFn = NULL; 225 fCallbackContext = NULL; 226 fFindProgressCallbackFn = NULL; 227 fFindProgressCallbackContext = NULL; 228 fTraceDebug = FALSE; 229 fDeferredStatus = status; 230 fData = fSmallData; 231 fWordBreakItr = NULL; 232 233 fStack = new UVector64(status); 234 fInputText = NULL; 235 fAltInputText = NULL; 236 fInput = NULL; 237 fInputLength = 0; 238 fInputUniStrMaybeMutable = FALSE; 239 240 if (U_FAILURE(status)) { 241 fDeferredStatus = status; 242 } 243 } 244 245 // 246 // init2() Common initialization for use by RegexMatcher constructors, part 2. 247 // This handles the common setup to be done after the Pattern is available. 248 // 249 void RegexMatcher::init2(UText *input, UErrorCode &status) { 250 if (U_FAILURE(status)) { 251 fDeferredStatus = status; 252 return; 253 } 254 255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { 256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 257 if (fData == NULL) { 258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 259 return; 260 } 261 } 262 263 reset(input); 264 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 265 if (U_FAILURE(status)) { 266 fDeferredStatus = status; 267 return; 268 } 269 } 270 271 272 static const UChar BACKSLASH = 0x5c; 273 static const UChar DOLLARSIGN = 0x24; 274 //-------------------------------------------------------------------------------- 275 // 276 // appendReplacement 277 // 278 //-------------------------------------------------------------------------------- 279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 280 const UnicodeString &replacement, 281 UErrorCode &status) { 282 UText replacementText = UTEXT_INITIALIZER; 283 284 utext_openConstUnicodeString(&replacementText, &replacement, &status); 285 if (U_SUCCESS(status)) { 286 UText resultText = UTEXT_INITIALIZER; 287 utext_openUnicodeString(&resultText, &dest, &status); 288 289 if (U_SUCCESS(status)) { 290 appendReplacement(&resultText, &replacementText, status); 291 utext_close(&resultText); 292 } 293 utext_close(&replacementText); 294 } 295 296 return *this; 297 } 298 299 // 300 // appendReplacement, UText mode 301 // 302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, 303 UText *replacement, 304 UErrorCode &status) { 305 if (U_FAILURE(status)) { 306 return *this; 307 } 308 if (U_FAILURE(fDeferredStatus)) { 309 status = fDeferredStatus; 310 return *this; 311 } 312 if (fMatch == FALSE) { 313 status = U_REGEX_INVALID_STATE; 314 return *this; 315 } 316 317 // Copy input string from the end of previous match to start of current match 318 int64_t destLen = utext_nativeLength(dest); 319 if (fMatchStart > fAppendPosition) { 320 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 321 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 322 (int32_t)(fMatchStart-fAppendPosition), &status); 323 } else { 324 int32_t len16; 325 if (UTEXT_USES_U16(fInputText)) { 326 len16 = (int32_t)(fMatchStart-fAppendPosition); 327 } else { 328 UErrorCode lengthStatus = U_ZERO_ERROR; 329 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); 330 } 331 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 332 if (inputChars == NULL) { 333 status = U_MEMORY_ALLOCATION_ERROR; 334 return *this; 335 } 336 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); 337 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); 338 uprv_free(inputChars); 339 } 340 } 341 fAppendPosition = fMatchEnd; 342 343 344 // scan the replacement text, looking for substitutions ($n) and \escapes. 345 // TODO: optimize this loop by efficiently scanning for '$' or '\', 346 // move entire ranges not containing substitutions. 347 UTEXT_SETNATIVEINDEX(replacement, 0); 348 UChar32 c = UTEXT_NEXT32(replacement); 349 while (c != U_SENTINEL) { 350 if (c == BACKSLASH) { 351 // Backslash Escape. Copy the following char out without further checks. 352 // Note: Surrogate pairs don't need any special handling 353 // The second half wont be a '$' or a '\', and 354 // will move to the dest normally on the next 355 // loop iteration. 356 c = UTEXT_CURRENT32(replacement); 357 if (c == U_SENTINEL) { 358 break; 359 } 360 361 if (c==0x55/*U*/ || c==0x75/*u*/) { 362 // We have a \udddd or \Udddddddd escape sequence. 363 int32_t offset = 0; 364 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); 365 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 366 if (escapedChar != (UChar32)0xFFFFFFFF) { 367 if (U_IS_BMP(escapedChar)) { 368 UChar c16 = (UChar)escapedChar; 369 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 370 } else { 371 UChar surrogate[2]; 372 surrogate[0] = U16_LEAD(escapedChar); 373 surrogate[1] = U16_TRAIL(escapedChar); 374 if (U_SUCCESS(status)) { 375 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 376 } 377 } 378 // TODO: Report errors for mal-formed \u escapes? 379 // As this is, the original sequence is output, which may be OK. 380 if (context.lastOffset == offset) { 381 (void)UTEXT_PREVIOUS32(replacement); 382 } else if (context.lastOffset != offset-1) { 383 utext_moveIndex32(replacement, offset - context.lastOffset - 1); 384 } 385 } 386 } else { 387 (void)UTEXT_NEXT32(replacement); 388 // Plain backslash escape. Just put out the escaped character. 389 if (U_IS_BMP(c)) { 390 UChar c16 = (UChar)c; 391 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 392 } else { 393 UChar surrogate[2]; 394 surrogate[0] = U16_LEAD(c); 395 surrogate[1] = U16_TRAIL(c); 396 if (U_SUCCESS(status)) { 397 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 398 } 399 } 400 } 401 } else if (c != DOLLARSIGN) { 402 // Normal char, not a $. Copy it out without further checks. 403 if (U_IS_BMP(c)) { 404 UChar c16 = (UChar)c; 405 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 406 } else { 407 UChar surrogate[2]; 408 surrogate[0] = U16_LEAD(c); 409 surrogate[1] = U16_TRAIL(c); 410 if (U_SUCCESS(status)) { 411 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 412 } 413 } 414 } else { 415 // We've got a $. Pick up a capture group number if one follows. 416 // Consume at most the number of digits necessary for the largest capture 417 // number that is valid for this pattern. 418 419 int32_t numDigits = 0; 420 int32_t groupNum = 0; 421 UChar32 digitC; 422 for (;;) { 423 digitC = UTEXT_CURRENT32(replacement); 424 if (digitC == U_SENTINEL) { 425 break; 426 } 427 if (u_isdigit(digitC) == FALSE) { 428 break; 429 } 430 (void)UTEXT_NEXT32(replacement); 431 groupNum=groupNum*10 + u_charDigitValue(digitC); 432 numDigits++; 433 if (numDigits >= fPattern->fMaxCaptureDigits) { 434 break; 435 } 436 } 437 438 439 if (numDigits == 0) { 440 // The $ didn't introduce a group number at all. 441 // Treat it as just part of the substitution text. 442 UChar c16 = DOLLARSIGN; 443 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 444 } else { 445 // Finally, append the capture group data to the destination. 446 destLen += appendGroup(groupNum, dest, status); 447 if (U_FAILURE(status)) { 448 // Can fail if group number is out of range. 449 break; 450 } 451 } 452 } 453 454 if (U_FAILURE(status)) { 455 break; 456 } else { 457 c = UTEXT_NEXT32(replacement); 458 } 459 } 460 461 return *this; 462 } 463 464 465 466 //-------------------------------------------------------------------------------- 467 // 468 // appendTail Intended to be used in conjunction with appendReplacement() 469 // To the destination string, append everything following 470 // the last match position from the input string. 471 // 472 // Note: Match ranges do not affect appendTail or appendReplacement 473 // 474 //-------------------------------------------------------------------------------- 475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 476 UErrorCode status = U_ZERO_ERROR; 477 UText resultText = UTEXT_INITIALIZER; 478 utext_openUnicodeString(&resultText, &dest, &status); 479 480 if (U_SUCCESS(status)) { 481 appendTail(&resultText, status); 482 utext_close(&resultText); 483 } 484 485 return dest; 486 } 487 488 // 489 // appendTail, UText mode 490 // 491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { 492 UBool bailOut = FALSE; 493 if (U_FAILURE(status)) { 494 bailOut = TRUE; 495 } 496 if (U_FAILURE(fDeferredStatus)) { 497 status = fDeferredStatus; 498 bailOut = TRUE; 499 } 500 501 if (bailOut) { 502 // dest must not be NULL 503 if (dest) { 504 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status); 505 return dest; 506 } 507 } 508 509 if (fInputLength > fAppendPosition) { 510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 511 int64_t destLen = utext_nativeLength(dest); 512 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 513 (int32_t)(fInputLength-fAppendPosition), &status); 514 } else { 515 int32_t len16; 516 if (UTEXT_USES_U16(fInputText)) { 517 len16 = (int32_t)(fInputLength-fAppendPosition); 518 } else { 519 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); 520 status = U_ZERO_ERROR; // buffer overflow 521 } 522 523 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); 524 if (inputChars == NULL) { 525 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 526 } else { 527 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 528 int64_t destLen = utext_nativeLength(dest); 529 utext_replace(dest, destLen, destLen, inputChars, len16, &status); 530 uprv_free(inputChars); 531 } 532 } 533 } 534 return dest; 535 } 536 537 538 539 //-------------------------------------------------------------------------------- 540 // 541 // end 542 // 543 //-------------------------------------------------------------------------------- 544 int32_t RegexMatcher::end(UErrorCode &err) const { 545 return end(0, err); 546 } 547 548 int64_t RegexMatcher::end64(UErrorCode &err) const { 549 return end64(0, err); 550 } 551 552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { 553 if (U_FAILURE(err)) { 554 return -1; 555 } 556 if (fMatch == FALSE) { 557 err = U_REGEX_INVALID_STATE; 558 return -1; 559 } 560 if (group < 0 || group > fPattern->fGroupMap->size()) { 561 err = U_INDEX_OUTOFBOUNDS_ERROR; 562 return -1; 563 } 564 int64_t e = -1; 565 if (group == 0) { 566 e = fMatchEnd; 567 } else { 568 // Get the position within the stack frame of the variables for 569 // this capture group. 570 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 571 U_ASSERT(groupOffset < fPattern->fFrameSize); 572 U_ASSERT(groupOffset >= 0); 573 e = fFrame->fExtra[groupOffset + 1]; 574 } 575 576 return e; 577 } 578 579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 580 return (int32_t)end64(group, err); 581 } 582 583 584 //-------------------------------------------------------------------------------- 585 // 586 // find() 587 // 588 //-------------------------------------------------------------------------------- 589 UBool RegexMatcher::find() { 590 // Start at the position of the last match end. (Will be zero if the 591 // matcher has been reset.) 592 // 593 if (U_FAILURE(fDeferredStatus)) { 594 return FALSE; 595 } 596 597 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 598 return findUsingChunk(); 599 } 600 601 int64_t startPos = fMatchEnd; 602 if (startPos==0) { 603 startPos = fActiveStart; 604 } 605 606 if (fMatch) { 607 // Save the position of any previous successful match. 608 fLastMatchEnd = fMatchEnd; 609 610 if (fMatchStart == fMatchEnd) { 611 // Previous match had zero length. Move start position up one position 612 // to avoid sending find() into a loop on zero-length matches. 613 if (startPos >= fActiveLimit) { 614 fMatch = FALSE; 615 fHitEnd = TRUE; 616 return FALSE; 617 } 618 UTEXT_SETNATIVEINDEX(fInputText, startPos); 619 (void)UTEXT_NEXT32(fInputText); 620 startPos = UTEXT_GETNATIVEINDEX(fInputText); 621 } 622 } else { 623 if (fLastMatchEnd >= 0) { 624 // A previous find() failed to match. Don't try again. 625 // (without this test, a pattern with a zero-length match 626 // could match again at the end of an input string.) 627 fHitEnd = TRUE; 628 return FALSE; 629 } 630 } 631 632 633 // Compute the position in the input string beyond which a match can not begin, because 634 // the minimum length match would extend past the end of the input. 635 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 636 // Be aware of possible overflows if making changes here. 637 int64_t testStartLimit; 638 if (UTEXT_USES_U16(fInputText)) { 639 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; 640 if (startPos > testStartLimit) { 641 fMatch = FALSE; 642 fHitEnd = TRUE; 643 return FALSE; 644 } 645 } else { 646 // For now, let the matcher discover that it can't match on its own 647 // We don't know how long the match len is in native characters 648 testStartLimit = fActiveLimit; 649 } 650 651 UChar32 c; 652 U_ASSERT(startPos >= 0); 653 654 switch (fPattern->fStartType) { 655 case START_NO_INFO: 656 // No optimization was found. 657 // Try a match at each input position. 658 for (;;) { 659 MatchAt(startPos, FALSE, fDeferredStatus); 660 if (U_FAILURE(fDeferredStatus)) { 661 return FALSE; 662 } 663 if (fMatch) { 664 return TRUE; 665 } 666 if (startPos >= testStartLimit) { 667 fHitEnd = TRUE; 668 return FALSE; 669 } 670 UTEXT_SETNATIVEINDEX(fInputText, startPos); 671 (void)UTEXT_NEXT32(fInputText); 672 startPos = UTEXT_GETNATIVEINDEX(fInputText); 673 // Note that it's perfectly OK for a pattern to have a zero-length 674 // match at the end of a string, so we must make sure that the loop 675 // runs with startPos == testStartLimit the last time through. 676 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 677 return FALSE; 678 } 679 U_ASSERT(FALSE); 680 681 case START_START: 682 // Matches are only possible at the start of the input string 683 // (pattern begins with ^ or \A) 684 if (startPos > fActiveStart) { 685 fMatch = FALSE; 686 return FALSE; 687 } 688 MatchAt(startPos, FALSE, fDeferredStatus); 689 if (U_FAILURE(fDeferredStatus)) { 690 return FALSE; 691 } 692 return fMatch; 693 694 695 case START_SET: 696 { 697 // Match may start on any char from a pre-computed set. 698 U_ASSERT(fPattern->fMinMatchLen > 0); 699 int64_t pos; 700 UTEXT_SETNATIVEINDEX(fInputText, startPos); 701 for (;;) { 702 c = UTEXT_NEXT32(fInputText); 703 pos = UTEXT_GETNATIVEINDEX(fInputText); 704 // c will be -1 (U_SENTINEL) at end of text, in which case we 705 // skip this next block (so we don't have a negative array index) 706 // and handle end of text in the following block. 707 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || 708 (c>=256 && fPattern->fInitialChars->contains(c)))) { 709 MatchAt(startPos, FALSE, fDeferredStatus); 710 if (U_FAILURE(fDeferredStatus)) { 711 return FALSE; 712 } 713 if (fMatch) { 714 return TRUE; 715 } 716 UTEXT_SETNATIVEINDEX(fInputText, pos); 717 } 718 if (startPos >= testStartLimit) { 719 fMatch = FALSE; 720 fHitEnd = TRUE; 721 return FALSE; 722 } 723 startPos = pos; 724 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 725 return FALSE; 726 } 727 } 728 U_ASSERT(FALSE); 729 730 case START_STRING: 731 case START_CHAR: 732 { 733 // Match starts on exactly one char. 734 U_ASSERT(fPattern->fMinMatchLen > 0); 735 UChar32 theChar = fPattern->fInitialChar; 736 int64_t pos; 737 UTEXT_SETNATIVEINDEX(fInputText, startPos); 738 for (;;) { 739 c = UTEXT_NEXT32(fInputText); 740 pos = UTEXT_GETNATIVEINDEX(fInputText); 741 if (c == theChar) { 742 MatchAt(startPos, FALSE, fDeferredStatus); 743 if (U_FAILURE(fDeferredStatus)) { 744 return FALSE; 745 } 746 if (fMatch) { 747 return TRUE; 748 } 749 UTEXT_SETNATIVEINDEX(fInputText, pos); 750 } 751 if (startPos >= testStartLimit) { 752 fMatch = FALSE; 753 fHitEnd = TRUE; 754 return FALSE; 755 } 756 startPos = pos; 757 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 758 return FALSE; 759 } 760 } 761 U_ASSERT(FALSE); 762 763 case START_LINE: 764 { 765 UChar32 c; 766 if (startPos == fAnchorStart) { 767 MatchAt(startPos, FALSE, fDeferredStatus); 768 if (U_FAILURE(fDeferredStatus)) { 769 return FALSE; 770 } 771 if (fMatch) { 772 return TRUE; 773 } 774 UTEXT_SETNATIVEINDEX(fInputText, startPos); 775 c = UTEXT_NEXT32(fInputText); 776 startPos = UTEXT_GETNATIVEINDEX(fInputText); 777 } else { 778 UTEXT_SETNATIVEINDEX(fInputText, startPos); 779 c = UTEXT_PREVIOUS32(fInputText); 780 UTEXT_SETNATIVEINDEX(fInputText, startPos); 781 } 782 783 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 784 for (;;) { 785 if (c == 0x0a) { 786 MatchAt(startPos, FALSE, fDeferredStatus); 787 if (U_FAILURE(fDeferredStatus)) { 788 return FALSE; 789 } 790 if (fMatch) { 791 return TRUE; 792 } 793 UTEXT_SETNATIVEINDEX(fInputText, startPos); 794 } 795 if (startPos >= testStartLimit) { 796 fMatch = FALSE; 797 fHitEnd = TRUE; 798 return FALSE; 799 } 800 c = UTEXT_NEXT32(fInputText); 801 startPos = UTEXT_GETNATIVEINDEX(fInputText); 802 // Note that it's perfectly OK for a pattern to have a zero-length 803 // match at the end of a string, so we must make sure that the loop 804 // runs with startPos == testStartLimit the last time through. 805 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 806 return FALSE; 807 } 808 } else { 809 for (;;) { 810 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 811 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 812 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 813 (void)UTEXT_NEXT32(fInputText); 814 startPos = UTEXT_GETNATIVEINDEX(fInputText); 815 } 816 MatchAt(startPos, FALSE, fDeferredStatus); 817 if (U_FAILURE(fDeferredStatus)) { 818 return FALSE; 819 } 820 if (fMatch) { 821 return TRUE; 822 } 823 UTEXT_SETNATIVEINDEX(fInputText, startPos); 824 } 825 if (startPos >= testStartLimit) { 826 fMatch = FALSE; 827 fHitEnd = TRUE; 828 return FALSE; 829 } 830 c = UTEXT_NEXT32(fInputText); 831 startPos = UTEXT_GETNATIVEINDEX(fInputText); 832 // Note that it's perfectly OK for a pattern to have a zero-length 833 // match at the end of a string, so we must make sure that the loop 834 // runs with startPos == testStartLimit the last time through. 835 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 836 return FALSE; 837 } 838 } 839 } 840 841 default: 842 U_ASSERT(FALSE); 843 } 844 845 U_ASSERT(FALSE); 846 return FALSE; 847 } 848 849 850 851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { 852 if (U_FAILURE(status)) { 853 return FALSE; 854 } 855 if (U_FAILURE(fDeferredStatus)) { 856 status = fDeferredStatus; 857 return FALSE; 858 } 859 this->reset(); // Note: Reset() is specified by Java Matcher documentation. 860 // This will reset the region to be the full input length. 861 if (start < 0) { 862 status = U_INDEX_OUTOFBOUNDS_ERROR; 863 return FALSE; 864 } 865 866 int64_t nativeStart = start; 867 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 868 status = U_INDEX_OUTOFBOUNDS_ERROR; 869 return FALSE; 870 } 871 fMatchEnd = nativeStart; 872 return find(); 873 } 874 875 876 //-------------------------------------------------------------------------------- 877 // 878 // findUsingChunk() -- like find(), but with the advance knowledge that the 879 // entire string is available in the UText's chunk buffer. 880 // 881 //-------------------------------------------------------------------------------- 882 UBool RegexMatcher::findUsingChunk() { 883 // Start at the position of the last match end. (Will be zero if the 884 // matcher has been reset. 885 // 886 887 int32_t startPos = (int32_t)fMatchEnd; 888 if (startPos==0) { 889 startPos = (int32_t)fActiveStart; 890 } 891 892 const UChar *inputBuf = fInputText->chunkContents; 893 894 if (fMatch) { 895 // Save the position of any previous successful match. 896 fLastMatchEnd = fMatchEnd; 897 898 if (fMatchStart == fMatchEnd) { 899 // Previous match had zero length. Move start position up one position 900 // to avoid sending find() into a loop on zero-length matches. 901 if (startPos >= fActiveLimit) { 902 fMatch = FALSE; 903 fHitEnd = TRUE; 904 return FALSE; 905 } 906 U16_FWD_1(inputBuf, startPos, fInputLength); 907 } 908 } else { 909 if (fLastMatchEnd >= 0) { 910 // A previous find() failed to match. Don't try again. 911 // (without this test, a pattern with a zero-length match 912 // could match again at the end of an input string.) 913 fHitEnd = TRUE; 914 return FALSE; 915 } 916 } 917 918 919 // Compute the position in the input string beyond which a match can not begin, because 920 // the minimum length match would extend past the end of the input. 921 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 922 // Be aware of possible overflows if making changes here. 923 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); 924 if (startPos > testLen) { 925 fMatch = FALSE; 926 fHitEnd = TRUE; 927 return FALSE; 928 } 929 930 UChar32 c; 931 U_ASSERT(startPos >= 0); 932 933 switch (fPattern->fStartType) { 934 case START_NO_INFO: 935 // No optimization was found. 936 // Try a match at each input position. 937 for (;;) { 938 MatchChunkAt(startPos, FALSE, fDeferredStatus); 939 if (U_FAILURE(fDeferredStatus)) { 940 return FALSE; 941 } 942 if (fMatch) { 943 return TRUE; 944 } 945 if (startPos >= testLen) { 946 fHitEnd = TRUE; 947 return FALSE; 948 } 949 U16_FWD_1(inputBuf, startPos, fActiveLimit); 950 // Note that it's perfectly OK for a pattern to have a zero-length 951 // match at the end of a string, so we must make sure that the loop 952 // runs with startPos == testLen the last time through. 953 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 954 return FALSE; 955 } 956 U_ASSERT(FALSE); 957 958 case START_START: 959 // Matches are only possible at the start of the input string 960 // (pattern begins with ^ or \A) 961 if (startPos > fActiveStart) { 962 fMatch = FALSE; 963 return FALSE; 964 } 965 MatchChunkAt(startPos, FALSE, fDeferredStatus); 966 if (U_FAILURE(fDeferredStatus)) { 967 return FALSE; 968 } 969 return fMatch; 970 971 972 case START_SET: 973 { 974 // Match may start on any char from a pre-computed set. 975 U_ASSERT(fPattern->fMinMatchLen > 0); 976 for (;;) { 977 int32_t pos = startPos; 978 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 979 if ((c<256 && fPattern->fInitialChars8->contains(c)) || 980 (c>=256 && fPattern->fInitialChars->contains(c))) { 981 MatchChunkAt(pos, FALSE, fDeferredStatus); 982 if (U_FAILURE(fDeferredStatus)) { 983 return FALSE; 984 } 985 if (fMatch) { 986 return TRUE; 987 } 988 } 989 if (pos >= testLen) { 990 fMatch = FALSE; 991 fHitEnd = TRUE; 992 return FALSE; 993 } 994 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 995 return FALSE; 996 } 997 } 998 U_ASSERT(FALSE); 999 1000 case START_STRING: 1001 case START_CHAR: 1002 { 1003 // Match starts on exactly one char. 1004 U_ASSERT(fPattern->fMinMatchLen > 0); 1005 UChar32 theChar = fPattern->fInitialChar; 1006 for (;;) { 1007 int32_t pos = startPos; 1008 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1009 if (c == theChar) { 1010 MatchChunkAt(pos, FALSE, fDeferredStatus); 1011 if (U_FAILURE(fDeferredStatus)) { 1012 return FALSE; 1013 } 1014 if (fMatch) { 1015 return TRUE; 1016 } 1017 } 1018 if (pos >= testLen) { 1019 fMatch = FALSE; 1020 fHitEnd = TRUE; 1021 return FALSE; 1022 } 1023 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1024 return FALSE; 1025 } 1026 } 1027 U_ASSERT(FALSE); 1028 1029 case START_LINE: 1030 { 1031 UChar32 c; 1032 if (startPos == fAnchorStart) { 1033 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1034 if (U_FAILURE(fDeferredStatus)) { 1035 return FALSE; 1036 } 1037 if (fMatch) { 1038 return TRUE; 1039 } 1040 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1041 } 1042 1043 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1044 for (;;) { 1045 c = inputBuf[startPos-1]; 1046 if (c == 0x0a) { 1047 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1048 if (U_FAILURE(fDeferredStatus)) { 1049 return FALSE; 1050 } 1051 if (fMatch) { 1052 return TRUE; 1053 } 1054 } 1055 if (startPos >= testLen) { 1056 fMatch = FALSE; 1057 fHitEnd = TRUE; 1058 return FALSE; 1059 } 1060 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1061 // Note that it's perfectly OK for a pattern to have a zero-length 1062 // match at the end of a string, so we must make sure that the loop 1063 // runs with startPos == testLen the last time through. 1064 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1065 return FALSE; 1066 } 1067 } else { 1068 for (;;) { 1069 c = inputBuf[startPos-1]; 1070 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1071 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 1072 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { 1073 startPos++; 1074 } 1075 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1076 if (U_FAILURE(fDeferredStatus)) { 1077 return FALSE; 1078 } 1079 if (fMatch) { 1080 return TRUE; 1081 } 1082 } 1083 if (startPos >= testLen) { 1084 fMatch = FALSE; 1085 fHitEnd = TRUE; 1086 return FALSE; 1087 } 1088 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1089 // Note that it's perfectly OK for a pattern to have a zero-length 1090 // match at the end of a string, so we must make sure that the loop 1091 // runs with startPos == testLen the last time through. 1092 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1093 return FALSE; 1094 } 1095 } 1096 } 1097 1098 default: 1099 U_ASSERT(FALSE); 1100 } 1101 1102 U_ASSERT(FALSE); 1103 return FALSE; 1104 } 1105 1106 1107 1108 //-------------------------------------------------------------------------------- 1109 // 1110 // group() 1111 // 1112 //-------------------------------------------------------------------------------- 1113 UnicodeString RegexMatcher::group(UErrorCode &status) const { 1114 return group(0, status); 1115 } 1116 1117 // Return immutable shallow clone 1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { 1119 return group(0, dest, group_len, status); 1120 } 1121 1122 // Return immutable shallow clone 1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { 1124 group_len = 0; 1125 UBool bailOut = FALSE; 1126 if (U_FAILURE(status)) { 1127 return dest; 1128 } 1129 if (U_FAILURE(fDeferredStatus)) { 1130 status = fDeferredStatus; 1131 bailOut = TRUE; 1132 } 1133 if (fMatch == FALSE) { 1134 status = U_REGEX_INVALID_STATE; 1135 bailOut = TRUE; 1136 } 1137 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1138 status = U_INDEX_OUTOFBOUNDS_ERROR; 1139 bailOut = TRUE; 1140 } 1141 1142 if (bailOut) { 1143 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status); 1144 } 1145 1146 int64_t s, e; 1147 if (groupNum == 0) { 1148 s = fMatchStart; 1149 e = fMatchEnd; 1150 } else { 1151 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1152 U_ASSERT(groupOffset < fPattern->fFrameSize); 1153 U_ASSERT(groupOffset >= 0); 1154 s = fFrame->fExtra[groupOffset]; 1155 e = fFrame->fExtra[groupOffset+1]; 1156 } 1157 1158 if (s < 0) { 1159 // A capture group wasn't part of the match 1160 return utext_clone(dest, fInputText, FALSE, TRUE, &status); 1161 } 1162 U_ASSERT(s <= e); 1163 group_len = e - s; 1164 1165 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); 1166 if (dest) 1167 UTEXT_SETNATIVEINDEX(dest, s); 1168 return dest; 1169 } 1170 1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1172 UnicodeString result; 1173 if (U_FAILURE(status)) { 1174 return result; 1175 } 1176 UText resultText = UTEXT_INITIALIZER; 1177 utext_openUnicodeString(&resultText, &result, &status); 1178 group(groupNum, &resultText, status); 1179 utext_close(&resultText); 1180 return result; 1181 } 1182 1183 1184 // Return deep (mutable) clone 1185 // Technology Preview (as an API), but note that the UnicodeString API is implemented 1186 // using this function. 1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { 1188 UBool bailOut = FALSE; 1189 if (U_FAILURE(status)) { 1190 return dest; 1191 } 1192 if (U_FAILURE(fDeferredStatus)) { 1193 status = fDeferredStatus; 1194 bailOut = TRUE; 1195 } 1196 1197 if (fMatch == FALSE) { 1198 status = U_REGEX_INVALID_STATE; 1199 bailOut = TRUE; 1200 } 1201 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1202 status = U_INDEX_OUTOFBOUNDS_ERROR; 1203 bailOut = TRUE; 1204 } 1205 1206 if (bailOut) { 1207 if (dest) { 1208 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1209 return dest; 1210 } else { 1211 return utext_openUChars(NULL, NULL, 0, &status); 1212 } 1213 } 1214 1215 int64_t s, e; 1216 if (groupNum == 0) { 1217 s = fMatchStart; 1218 e = fMatchEnd; 1219 } else { 1220 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1221 U_ASSERT(groupOffset < fPattern->fFrameSize); 1222 U_ASSERT(groupOffset >= 0); 1223 s = fFrame->fExtra[groupOffset]; 1224 e = fFrame->fExtra[groupOffset+1]; 1225 } 1226 1227 if (s < 0) { 1228 // A capture group wasn't part of the match 1229 if (dest) { 1230 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1231 return dest; 1232 } else { 1233 return utext_openUChars(NULL, NULL, 0, &status); 1234 } 1235 } 1236 U_ASSERT(s <= e); 1237 1238 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1239 U_ASSERT(e <= fInputLength); 1240 if (dest) { 1241 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status); 1242 } else { 1243 UText groupText = UTEXT_INITIALIZER; 1244 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); 1245 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1246 utext_close(&groupText); 1247 } 1248 } else { 1249 int32_t len16; 1250 if (UTEXT_USES_U16(fInputText)) { 1251 len16 = (int32_t)(e-s); 1252 } else { 1253 UErrorCode lengthStatus = U_ZERO_ERROR; 1254 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1255 } 1256 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1257 if (groupChars == NULL) { 1258 status = U_MEMORY_ALLOCATION_ERROR; 1259 return dest; 1260 } 1261 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1262 1263 if (dest) { 1264 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); 1265 } else { 1266 UText groupText = UTEXT_INITIALIZER; 1267 utext_openUChars(&groupText, groupChars, len16, &status); 1268 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1269 utext_close(&groupText); 1270 } 1271 1272 uprv_free(groupChars); 1273 } 1274 return dest; 1275 } 1276 1277 //-------------------------------------------------------------------------------- 1278 // 1279 // appendGroup() -- currently internal only, appends a group to a UText rather 1280 // than replacing its contents 1281 // 1282 //-------------------------------------------------------------------------------- 1283 1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { 1285 if (U_FAILURE(status)) { 1286 return 0; 1287 } 1288 if (U_FAILURE(fDeferredStatus)) { 1289 status = fDeferredStatus; 1290 return 0; 1291 } 1292 int64_t destLen = utext_nativeLength(dest); 1293 1294 if (fMatch == FALSE) { 1295 status = U_REGEX_INVALID_STATE; 1296 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1297 } 1298 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1299 status = U_INDEX_OUTOFBOUNDS_ERROR; 1300 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1301 } 1302 1303 int64_t s, e; 1304 if (groupNum == 0) { 1305 s = fMatchStart; 1306 e = fMatchEnd; 1307 } else { 1308 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1309 U_ASSERT(groupOffset < fPattern->fFrameSize); 1310 U_ASSERT(groupOffset >= 0); 1311 s = fFrame->fExtra[groupOffset]; 1312 e = fFrame->fExtra[groupOffset+1]; 1313 } 1314 1315 if (s < 0) { 1316 // A capture group wasn't part of the match 1317 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1318 } 1319 U_ASSERT(s <= e); 1320 1321 int64_t deltaLen; 1322 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1323 U_ASSERT(e <= fInputLength); 1324 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); 1325 } else { 1326 int32_t len16; 1327 if (UTEXT_USES_U16(fInputText)) { 1328 len16 = (int32_t)(e-s); 1329 } else { 1330 UErrorCode lengthStatus = U_ZERO_ERROR; 1331 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1332 } 1333 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1334 if (groupChars == NULL) { 1335 status = U_MEMORY_ALLOCATION_ERROR; 1336 return 0; 1337 } 1338 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1339 1340 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); 1341 uprv_free(groupChars); 1342 } 1343 return deltaLen; 1344 } 1345 1346 1347 1348 //-------------------------------------------------------------------------------- 1349 // 1350 // groupCount() 1351 // 1352 //-------------------------------------------------------------------------------- 1353 int32_t RegexMatcher::groupCount() const { 1354 return fPattern->fGroupMap->size(); 1355 } 1356 1357 1358 1359 //-------------------------------------------------------------------------------- 1360 // 1361 // hasAnchoringBounds() 1362 // 1363 //-------------------------------------------------------------------------------- 1364 UBool RegexMatcher::hasAnchoringBounds() const { 1365 return fAnchoringBounds; 1366 } 1367 1368 1369 //-------------------------------------------------------------------------------- 1370 // 1371 // hasTransparentBounds() 1372 // 1373 //-------------------------------------------------------------------------------- 1374 UBool RegexMatcher::hasTransparentBounds() const { 1375 return fTransparentBounds; 1376 } 1377 1378 1379 1380 //-------------------------------------------------------------------------------- 1381 // 1382 // hitEnd() 1383 // 1384 //-------------------------------------------------------------------------------- 1385 UBool RegexMatcher::hitEnd() const { 1386 return fHitEnd; 1387 } 1388 1389 1390 //-------------------------------------------------------------------------------- 1391 // 1392 // input() 1393 // 1394 //-------------------------------------------------------------------------------- 1395 const UnicodeString &RegexMatcher::input() const { 1396 if (!fInput) { 1397 UErrorCode status = U_ZERO_ERROR; 1398 int32_t len16; 1399 if (UTEXT_USES_U16(fInputText)) { 1400 len16 = (int32_t)fInputLength; 1401 } else { 1402 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); 1403 status = U_ZERO_ERROR; // overflow, length status 1404 } 1405 UnicodeString *result = new UnicodeString(len16, 0, 0); 1406 1407 UChar *inputChars = result->getBuffer(len16); 1408 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning 1409 result->releaseBuffer(len16); 1410 1411 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= 1412 } 1413 1414 return *fInput; 1415 } 1416 1417 //-------------------------------------------------------------------------------- 1418 // 1419 // inputText() 1420 // 1421 //-------------------------------------------------------------------------------- 1422 UText *RegexMatcher::inputText() const { 1423 return fInputText; 1424 } 1425 1426 1427 //-------------------------------------------------------------------------------- 1428 // 1429 // getInput() -- like inputText(), but makes a clone or copies into another UText 1430 // 1431 //-------------------------------------------------------------------------------- 1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { 1433 UBool bailOut = FALSE; 1434 if (U_FAILURE(status)) { 1435 return dest; 1436 } 1437 if (U_FAILURE(fDeferredStatus)) { 1438 status = fDeferredStatus; 1439 bailOut = TRUE; 1440 } 1441 1442 if (bailOut) { 1443 if (dest) { 1444 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1445 return dest; 1446 } else { 1447 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1448 } 1449 } 1450 1451 if (dest) { 1452 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1453 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); 1454 } else { 1455 int32_t input16Len; 1456 if (UTEXT_USES_U16(fInputText)) { 1457 input16Len = (int32_t)fInputLength; 1458 } else { 1459 UErrorCode lengthStatus = U_ZERO_ERROR; 1460 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error 1461 } 1462 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); 1463 if (inputChars == NULL) { 1464 return dest; 1465 } 1466 1467 status = U_ZERO_ERROR; 1468 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning 1469 status = U_ZERO_ERROR; 1470 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); 1471 1472 uprv_free(inputChars); 1473 } 1474 return dest; 1475 } else { 1476 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1477 } 1478 } 1479 1480 1481 static UBool compat_SyncMutableUTextContents(UText *ut); 1482 static UBool compat_SyncMutableUTextContents(UText *ut) { 1483 UBool retVal = FALSE; 1484 1485 // In the following test, we're really only interested in whether the UText should switch 1486 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents 1487 // will still point to the correct data. 1488 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { 1489 UnicodeString *us=(UnicodeString *)ut->context; 1490 1491 // Update to the latest length. 1492 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). 1493 int32_t newLength = us->length(); 1494 1495 // Update the chunk description. 1496 // The buffer may have switched between stack- and heap-based. 1497 ut->chunkContents = us->getBuffer(); 1498 ut->chunkLength = newLength; 1499 ut->chunkNativeLimit = newLength; 1500 ut->nativeIndexingLimit = newLength; 1501 retVal = TRUE; 1502 } 1503 1504 return retVal; 1505 } 1506 1507 //-------------------------------------------------------------------------------- 1508 // 1509 // lookingAt() 1510 // 1511 //-------------------------------------------------------------------------------- 1512 UBool RegexMatcher::lookingAt(UErrorCode &status) { 1513 if (U_FAILURE(status)) { 1514 return FALSE; 1515 } 1516 if (U_FAILURE(fDeferredStatus)) { 1517 status = fDeferredStatus; 1518 return FALSE; 1519 } 1520 1521 if (fInputUniStrMaybeMutable) { 1522 if (compat_SyncMutableUTextContents(fInputText)) { 1523 fInputLength = utext_nativeLength(fInputText); 1524 reset(); 1525 } 1526 } 1527 else { 1528 resetPreserveRegion(); 1529 } 1530 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1531 MatchChunkAt((int32_t)fActiveStart, FALSE, status); 1532 } else { 1533 MatchAt(fActiveStart, FALSE, status); 1534 } 1535 return fMatch; 1536 } 1537 1538 1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { 1540 if (U_FAILURE(status)) { 1541 return FALSE; 1542 } 1543 if (U_FAILURE(fDeferredStatus)) { 1544 status = fDeferredStatus; 1545 return FALSE; 1546 } 1547 reset(); 1548 1549 if (start < 0) { 1550 status = U_INDEX_OUTOFBOUNDS_ERROR; 1551 return FALSE; 1552 } 1553 1554 if (fInputUniStrMaybeMutable) { 1555 if (compat_SyncMutableUTextContents(fInputText)) { 1556 fInputLength = utext_nativeLength(fInputText); 1557 reset(); 1558 } 1559 } 1560 1561 int64_t nativeStart; 1562 nativeStart = start; 1563 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1564 status = U_INDEX_OUTOFBOUNDS_ERROR; 1565 return FALSE; 1566 } 1567 1568 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1569 MatchChunkAt((int32_t)nativeStart, FALSE, status); 1570 } else { 1571 MatchAt(nativeStart, FALSE, status); 1572 } 1573 return fMatch; 1574 } 1575 1576 1577 1578 //-------------------------------------------------------------------------------- 1579 // 1580 // matches() 1581 // 1582 //-------------------------------------------------------------------------------- 1583 UBool RegexMatcher::matches(UErrorCode &status) { 1584 if (U_FAILURE(status)) { 1585 return FALSE; 1586 } 1587 if (U_FAILURE(fDeferredStatus)) { 1588 status = fDeferredStatus; 1589 return FALSE; 1590 } 1591 1592 if (fInputUniStrMaybeMutable) { 1593 if (compat_SyncMutableUTextContents(fInputText)) { 1594 fInputLength = utext_nativeLength(fInputText); 1595 reset(); 1596 } 1597 } 1598 else { 1599 resetPreserveRegion(); 1600 } 1601 1602 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1603 MatchChunkAt((int32_t)fActiveStart, TRUE, status); 1604 } else { 1605 MatchAt(fActiveStart, TRUE, status); 1606 } 1607 return fMatch; 1608 } 1609 1610 1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { 1612 if (U_FAILURE(status)) { 1613 return FALSE; 1614 } 1615 if (U_FAILURE(fDeferredStatus)) { 1616 status = fDeferredStatus; 1617 return FALSE; 1618 } 1619 reset(); 1620 1621 if (start < 0) { 1622 status = U_INDEX_OUTOFBOUNDS_ERROR; 1623 return FALSE; 1624 } 1625 1626 if (fInputUniStrMaybeMutable) { 1627 if (compat_SyncMutableUTextContents(fInputText)) { 1628 fInputLength = utext_nativeLength(fInputText); 1629 reset(); 1630 } 1631 } 1632 1633 int64_t nativeStart; 1634 nativeStart = start; 1635 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1636 status = U_INDEX_OUTOFBOUNDS_ERROR; 1637 return FALSE; 1638 } 1639 1640 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1641 MatchChunkAt((int32_t)nativeStart, TRUE, status); 1642 } else { 1643 MatchAt(nativeStart, TRUE, status); 1644 } 1645 return fMatch; 1646 } 1647 1648 1649 1650 //-------------------------------------------------------------------------------- 1651 // 1652 // pattern 1653 // 1654 //-------------------------------------------------------------------------------- 1655 const RegexPattern &RegexMatcher::pattern() const { 1656 return *fPattern; 1657 } 1658 1659 1660 1661 //-------------------------------------------------------------------------------- 1662 // 1663 // region 1664 // 1665 //-------------------------------------------------------------------------------- 1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { 1667 if (U_FAILURE(status)) { 1668 return *this; 1669 } 1670 1671 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { 1672 status = U_ILLEGAL_ARGUMENT_ERROR; 1673 } 1674 1675 int64_t nativeStart = regionStart; 1676 int64_t nativeLimit = regionLimit; 1677 if (nativeStart > fInputLength || nativeLimit > fInputLength) { 1678 status = U_ILLEGAL_ARGUMENT_ERROR; 1679 } 1680 1681 if (startIndex == -1) 1682 this->reset(); 1683 else 1684 resetPreserveRegion(); 1685 1686 fRegionStart = nativeStart; 1687 fRegionLimit = nativeLimit; 1688 fActiveStart = nativeStart; 1689 fActiveLimit = nativeLimit; 1690 1691 if (startIndex != -1) { 1692 if (startIndex < fActiveStart || startIndex > fActiveLimit) { 1693 status = U_INDEX_OUTOFBOUNDS_ERROR; 1694 } 1695 fMatchEnd = startIndex; 1696 } 1697 1698 if (!fTransparentBounds) { 1699 fLookStart = nativeStart; 1700 fLookLimit = nativeLimit; 1701 } 1702 if (fAnchoringBounds) { 1703 fAnchorStart = nativeStart; 1704 fAnchorLimit = nativeLimit; 1705 } 1706 return *this; 1707 } 1708 1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { 1710 return region(start, limit, -1, status); 1711 } 1712 1713 //-------------------------------------------------------------------------------- 1714 // 1715 // regionEnd 1716 // 1717 //-------------------------------------------------------------------------------- 1718 int32_t RegexMatcher::regionEnd() const { 1719 return (int32_t)fRegionLimit; 1720 } 1721 1722 int64_t RegexMatcher::regionEnd64() const { 1723 return fRegionLimit; 1724 } 1725 1726 //-------------------------------------------------------------------------------- 1727 // 1728 // regionStart 1729 // 1730 //-------------------------------------------------------------------------------- 1731 int32_t RegexMatcher::regionStart() const { 1732 return (int32_t)fRegionStart; 1733 } 1734 1735 int64_t RegexMatcher::regionStart64() const { 1736 return fRegionStart; 1737 } 1738 1739 1740 //-------------------------------------------------------------------------------- 1741 // 1742 // replaceAll 1743 // 1744 //-------------------------------------------------------------------------------- 1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { 1746 UText replacementText = UTEXT_INITIALIZER; 1747 UText resultText = UTEXT_INITIALIZER; 1748 UnicodeString resultString; 1749 if (U_FAILURE(status)) { 1750 return resultString; 1751 } 1752 1753 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1754 utext_openUnicodeString(&resultText, &resultString, &status); 1755 1756 replaceAll(&replacementText, &resultText, status); 1757 1758 utext_close(&resultText); 1759 utext_close(&replacementText); 1760 1761 return resultString; 1762 } 1763 1764 1765 // 1766 // replaceAll, UText mode 1767 // 1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { 1769 if (U_FAILURE(status)) { 1770 return dest; 1771 } 1772 if (U_FAILURE(fDeferredStatus)) { 1773 status = fDeferredStatus; 1774 return dest; 1775 } 1776 1777 if (dest == NULL) { 1778 UnicodeString emptyString; 1779 UText empty = UTEXT_INITIALIZER; 1780 1781 utext_openUnicodeString(&empty, &emptyString, &status); 1782 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1783 utext_close(&empty); 1784 } 1785 1786 if (U_SUCCESS(status)) { 1787 reset(); 1788 while (find()) { 1789 appendReplacement(dest, replacement, status); 1790 if (U_FAILURE(status)) { 1791 break; 1792 } 1793 } 1794 appendTail(dest, status); 1795 } 1796 1797 return dest; 1798 } 1799 1800 1801 //-------------------------------------------------------------------------------- 1802 // 1803 // replaceFirst 1804 // 1805 //-------------------------------------------------------------------------------- 1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { 1807 UText replacementText = UTEXT_INITIALIZER; 1808 UText resultText = UTEXT_INITIALIZER; 1809 UnicodeString resultString; 1810 1811 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1812 utext_openUnicodeString(&resultText, &resultString, &status); 1813 1814 replaceFirst(&replacementText, &resultText, status); 1815 1816 utext_close(&resultText); 1817 utext_close(&replacementText); 1818 1819 return resultString; 1820 } 1821 1822 // 1823 // replaceFirst, UText mode 1824 // 1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { 1826 if (U_FAILURE(status)) { 1827 return dest; 1828 } 1829 if (U_FAILURE(fDeferredStatus)) { 1830 status = fDeferredStatus; 1831 return dest; 1832 } 1833 1834 reset(); 1835 if (!find()) { 1836 return getInput(dest, status); 1837 } 1838 1839 if (dest == NULL) { 1840 UnicodeString emptyString; 1841 UText empty = UTEXT_INITIALIZER; 1842 1843 utext_openUnicodeString(&empty, &emptyString, &status); 1844 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1845 utext_close(&empty); 1846 } 1847 1848 appendReplacement(dest, replacement, status); 1849 appendTail(dest, status); 1850 1851 return dest; 1852 } 1853 1854 1855 //-------------------------------------------------------------------------------- 1856 // 1857 // requireEnd 1858 // 1859 //-------------------------------------------------------------------------------- 1860 UBool RegexMatcher::requireEnd() const { 1861 return fRequireEnd; 1862 } 1863 1864 1865 //-------------------------------------------------------------------------------- 1866 // 1867 // reset 1868 // 1869 //-------------------------------------------------------------------------------- 1870 RegexMatcher &RegexMatcher::reset() { 1871 fRegionStart = 0; 1872 fRegionLimit = fInputLength; 1873 fActiveStart = 0; 1874 fActiveLimit = fInputLength; 1875 fAnchorStart = 0; 1876 fAnchorLimit = fInputLength; 1877 fLookStart = 0; 1878 fLookLimit = fInputLength; 1879 resetPreserveRegion(); 1880 return *this; 1881 } 1882 1883 1884 1885 void RegexMatcher::resetPreserveRegion() { 1886 fMatchStart = 0; 1887 fMatchEnd = 0; 1888 fLastMatchEnd = -1; 1889 fAppendPosition = 0; 1890 fMatch = FALSE; 1891 fHitEnd = FALSE; 1892 fRequireEnd = FALSE; 1893 fTime = 0; 1894 fTickCounter = TIMER_INITIAL_VALUE; 1895 //resetStack(); // more expensive than it looks... 1896 } 1897 1898 1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1900 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); 1901 if (fPattern->fNeedsAltInput) { 1902 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1903 } 1904 fInputLength = utext_nativeLength(fInputText); 1905 1906 reset(); 1907 delete fInput; 1908 fInput = NULL; 1909 1910 // Do the following for any UnicodeString. 1911 // This is for compatibility for those clients who modify the input string "live" during regex operations. 1912 fInputUniStrMaybeMutable = TRUE; 1913 1914 if (fWordBreakItr != NULL) { 1915 #if UCONFIG_NO_BREAK_ITERATION==0 1916 UErrorCode status = U_ZERO_ERROR; 1917 fWordBreakItr->setText(fInputText, status); 1918 #endif 1919 } 1920 return *this; 1921 } 1922 1923 1924 RegexMatcher &RegexMatcher::reset(UText *input) { 1925 if (fInputText != input) { 1926 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); 1927 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1928 fInputLength = utext_nativeLength(fInputText); 1929 1930 delete fInput; 1931 fInput = NULL; 1932 1933 if (fWordBreakItr != NULL) { 1934 #if UCONFIG_NO_BREAK_ITERATION==0 1935 UErrorCode status = U_ZERO_ERROR; 1936 fWordBreakItr->setText(input, status); 1937 #endif 1938 } 1939 } 1940 reset(); 1941 fInputUniStrMaybeMutable = FALSE; 1942 1943 return *this; 1944 } 1945 1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) { 1947 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 1948 return *this; 1949 }*/ 1950 1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { 1952 if (U_FAILURE(status)) { 1953 return *this; 1954 } 1955 reset(); // Reset also resets the region to be the entire string. 1956 1957 if (position < 0 || position > fActiveLimit) { 1958 status = U_INDEX_OUTOFBOUNDS_ERROR; 1959 return *this; 1960 } 1961 fMatchEnd = position; 1962 return *this; 1963 } 1964 1965 1966 //-------------------------------------------------------------------------------- 1967 // 1968 // refresh 1969 // 1970 //-------------------------------------------------------------------------------- 1971 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { 1972 if (U_FAILURE(status)) { 1973 return *this; 1974 } 1975 if (input == NULL) { 1976 status = U_ILLEGAL_ARGUMENT_ERROR; 1977 return *this; 1978 } 1979 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { 1980 status = U_ILLEGAL_ARGUMENT_ERROR; 1981 return *this; 1982 } 1983 int64_t pos = utext_getNativeIndex(fInputText); 1984 // Shallow read-only clone of the new UText into the existing input UText 1985 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status); 1986 if (U_FAILURE(status)) { 1987 return *this; 1988 } 1989 utext_setNativeIndex(fInputText, pos); 1990 1991 if (fAltInputText != NULL) { 1992 pos = utext_getNativeIndex(fAltInputText); 1993 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status); 1994 if (U_FAILURE(status)) { 1995 return *this; 1996 } 1997 utext_setNativeIndex(fAltInputText, pos); 1998 } 1999 return *this; 2000 } 2001 2002 2003 2004 //-------------------------------------------------------------------------------- 2005 // 2006 // setTrace 2007 // 2008 //-------------------------------------------------------------------------------- 2009 void RegexMatcher::setTrace(UBool state) { 2010 fTraceDebug = state; 2011 } 2012 2013 2014 2015 //--------------------------------------------------------------------- 2016 // 2017 // split 2018 // 2019 //--------------------------------------------------------------------- 2020 int32_t RegexMatcher::split(const UnicodeString &input, 2021 UnicodeString dest[], 2022 int32_t destCapacity, 2023 UErrorCode &status) 2024 { 2025 UText inputText = UTEXT_INITIALIZER; 2026 utext_openConstUnicodeString(&inputText, &input, &status); 2027 if (U_FAILURE(status)) { 2028 return 0; 2029 } 2030 2031 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); 2032 if (destText == NULL) { 2033 status = U_MEMORY_ALLOCATION_ERROR; 2034 return 0; 2035 } 2036 int32_t i; 2037 for (i = 0; i < destCapacity; i++) { 2038 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); 2039 } 2040 2041 int32_t fieldCount = split(&inputText, destText, destCapacity, status); 2042 2043 for (i = 0; i < destCapacity; i++) { 2044 utext_close(destText[i]); 2045 } 2046 2047 uprv_free(destText); 2048 utext_close(&inputText); 2049 return fieldCount; 2050 } 2051 2052 // 2053 // split, UText mode 2054 // 2055 int32_t RegexMatcher::split(UText *input, 2056 UText *dest[], 2057 int32_t destCapacity, 2058 UErrorCode &status) 2059 { 2060 // 2061 // Check arguements for validity 2062 // 2063 if (U_FAILURE(status)) { 2064 return 0; 2065 }; 2066 2067 if (destCapacity < 1) { 2068 status = U_ILLEGAL_ARGUMENT_ERROR; 2069 return 0; 2070 } 2071 2072 // 2073 // Reset for the input text 2074 // 2075 reset(input); 2076 int64_t nextOutputStringStart = 0; 2077 if (fActiveLimit == 0) { 2078 return 0; 2079 } 2080 2081 // 2082 // Loop through the input text, searching for the delimiter pattern 2083 // 2084 int32_t i; 2085 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 2086 for (i=0; ; i++) { 2087 if (i>=destCapacity-1) { 2088 // There is one or zero output string left. 2089 // Fill the last output string with whatever is left from the input, then exit the loop. 2090 // ( i will be == destCapacity if we filled the output array while processing 2091 // capture groups of the delimiter expression, in which case we will discard the 2092 // last capture group saved in favor of the unprocessed remainder of the 2093 // input string.) 2094 i = destCapacity-1; 2095 if (fActiveLimit > nextOutputStringStart) { 2096 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2097 if (dest[i]) { 2098 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2099 input->chunkContents+nextOutputStringStart, 2100 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2101 } else { 2102 UText remainingText = UTEXT_INITIALIZER; 2103 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2104 fActiveLimit-nextOutputStringStart, &status); 2105 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2106 utext_close(&remainingText); 2107 } 2108 } else { 2109 UErrorCode lengthStatus = U_ZERO_ERROR; 2110 int32_t remaining16Length = 2111 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2112 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2113 if (remainingChars == NULL) { 2114 status = U_MEMORY_ALLOCATION_ERROR; 2115 break; 2116 } 2117 2118 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2119 if (dest[i]) { 2120 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2121 } else { 2122 UText remainingText = UTEXT_INITIALIZER; 2123 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2124 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2125 utext_close(&remainingText); 2126 } 2127 2128 uprv_free(remainingChars); 2129 } 2130 } 2131 break; 2132 } 2133 if (find()) { 2134 // We found another delimiter. Move everything from where we started looking 2135 // up until the start of the delimiter into the next output string. 2136 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2137 if (dest[i]) { 2138 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2139 input->chunkContents+nextOutputStringStart, 2140 (int32_t)(fMatchStart-nextOutputStringStart), &status); 2141 } else { 2142 UText remainingText = UTEXT_INITIALIZER; 2143 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2144 fMatchStart-nextOutputStringStart, &status); 2145 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2146 utext_close(&remainingText); 2147 } 2148 } else { 2149 UErrorCode lengthStatus = U_ZERO_ERROR; 2150 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); 2151 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2152 if (remainingChars == NULL) { 2153 status = U_MEMORY_ALLOCATION_ERROR; 2154 break; 2155 } 2156 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); 2157 if (dest[i]) { 2158 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2159 } else { 2160 UText remainingText = UTEXT_INITIALIZER; 2161 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2162 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2163 utext_close(&remainingText); 2164 } 2165 2166 uprv_free(remainingChars); 2167 } 2168 nextOutputStringStart = fMatchEnd; 2169 2170 // If the delimiter pattern has capturing parentheses, the captured 2171 // text goes out into the next n destination strings. 2172 int32_t groupNum; 2173 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 2174 if (i >= destCapacity-2) { 2175 // Never fill the last available output string with capture group text. 2176 // It will filled with the last field, the remainder of the 2177 // unsplit input text. 2178 break; 2179 } 2180 i++; 2181 dest[i] = group(groupNum, dest[i], status); 2182 } 2183 2184 if (nextOutputStringStart == fActiveLimit) { 2185 // The delimiter was at the end of the string. We're done, but first 2186 // we output one last empty string, for the empty field following 2187 // the delimiter at the end of input. 2188 if (i+1 < destCapacity) { 2189 ++i; 2190 if (dest[i] == NULL) { 2191 dest[i] = utext_openUChars(NULL, NULL, 0, &status); 2192 } else { 2193 static UChar emptyString[] = {(UChar)0}; 2194 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status); 2195 } 2196 } 2197 break; 2198 2199 } 2200 } 2201 else 2202 { 2203 // We ran off the end of the input while looking for the next delimiter. 2204 // All the remaining text goes into the current output string. 2205 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2206 if (dest[i]) { 2207 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2208 input->chunkContents+nextOutputStringStart, 2209 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2210 } else { 2211 UText remainingText = UTEXT_INITIALIZER; 2212 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2213 fActiveLimit-nextOutputStringStart, &status); 2214 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2215 utext_close(&remainingText); 2216 } 2217 } else { 2218 UErrorCode lengthStatus = U_ZERO_ERROR; 2219 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2220 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2221 if (remainingChars == NULL) { 2222 status = U_MEMORY_ALLOCATION_ERROR; 2223 break; 2224 } 2225 2226 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2227 if (dest[i]) { 2228 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2229 } else { 2230 UText remainingText = UTEXT_INITIALIZER; 2231 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2232 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2233 utext_close(&remainingText); 2234 } 2235 2236 uprv_free(remainingChars); 2237 } 2238 break; 2239 } 2240 if (U_FAILURE(status)) { 2241 break; 2242 } 2243 } // end of for loop 2244 return i+1; 2245 } 2246 2247 2248 //-------------------------------------------------------------------------------- 2249 // 2250 // start 2251 // 2252 //-------------------------------------------------------------------------------- 2253 int32_t RegexMatcher::start(UErrorCode &status) const { 2254 return start(0, status); 2255 } 2256 2257 int64_t RegexMatcher::start64(UErrorCode &status) const { 2258 return start64(0, status); 2259 } 2260 2261 //-------------------------------------------------------------------------------- 2262 // 2263 // start(int32_t group, UErrorCode &status) 2264 // 2265 //-------------------------------------------------------------------------------- 2266 2267 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { 2268 if (U_FAILURE(status)) { 2269 return -1; 2270 } 2271 if (U_FAILURE(fDeferredStatus)) { 2272 status = fDeferredStatus; 2273 return -1; 2274 } 2275 if (fMatch == FALSE) { 2276 status = U_REGEX_INVALID_STATE; 2277 return -1; 2278 } 2279 if (group < 0 || group > fPattern->fGroupMap->size()) { 2280 status = U_INDEX_OUTOFBOUNDS_ERROR; 2281 return -1; 2282 } 2283 int64_t s; 2284 if (group == 0) { 2285 s = fMatchStart; 2286 } else { 2287 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 2288 U_ASSERT(groupOffset < fPattern->fFrameSize); 2289 U_ASSERT(groupOffset >= 0); 2290 s = fFrame->fExtra[groupOffset]; 2291 } 2292 2293 return s; 2294 } 2295 2296 2297 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { 2298 return (int32_t)start64(group, status); 2299 } 2300 2301 //-------------------------------------------------------------------------------- 2302 // 2303 // useAnchoringBounds 2304 // 2305 //-------------------------------------------------------------------------------- 2306 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { 2307 fAnchoringBounds = b; 2308 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); 2309 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); 2310 return *this; 2311 } 2312 2313 2314 //-------------------------------------------------------------------------------- 2315 // 2316 // useTransparentBounds 2317 // 2318 //-------------------------------------------------------------------------------- 2319 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { 2320 fTransparentBounds = b; 2321 fLookStart = (fTransparentBounds ? 0 : fRegionStart); 2322 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); 2323 return *this; 2324 } 2325 2326 //-------------------------------------------------------------------------------- 2327 // 2328 // setTimeLimit 2329 // 2330 //-------------------------------------------------------------------------------- 2331 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { 2332 if (U_FAILURE(status)) { 2333 return; 2334 } 2335 if (U_FAILURE(fDeferredStatus)) { 2336 status = fDeferredStatus; 2337 return; 2338 } 2339 if (limit < 0) { 2340 status = U_ILLEGAL_ARGUMENT_ERROR; 2341 return; 2342 } 2343 fTimeLimit = limit; 2344 } 2345 2346 2347 //-------------------------------------------------------------------------------- 2348 // 2349 // getTimeLimit 2350 // 2351 //-------------------------------------------------------------------------------- 2352 int32_t RegexMatcher::getTimeLimit() const { 2353 return fTimeLimit; 2354 } 2355 2356 2357 //-------------------------------------------------------------------------------- 2358 // 2359 // setStackLimit 2360 // 2361 //-------------------------------------------------------------------------------- 2362 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { 2363 if (U_FAILURE(status)) { 2364 return; 2365 } 2366 if (U_FAILURE(fDeferredStatus)) { 2367 status = fDeferredStatus; 2368 return; 2369 } 2370 if (limit < 0) { 2371 status = U_ILLEGAL_ARGUMENT_ERROR; 2372 return; 2373 } 2374 2375 // Reset the matcher. This is needed here in case there is a current match 2376 // whose final stack frame (containing the match results, pointed to by fFrame) 2377 // would be lost by resizing to a smaller stack size. 2378 reset(); 2379 2380 if (limit == 0) { 2381 // Unlimited stack expansion 2382 fStack->setMaxCapacity(0); 2383 } else { 2384 // Change the units of the limit from bytes to ints, and bump the size up 2385 // to be big enough to hold at least one stack frame for the pattern, 2386 // if it isn't there already. 2387 int32_t adjustedLimit = limit / sizeof(int32_t); 2388 if (adjustedLimit < fPattern->fFrameSize) { 2389 adjustedLimit = fPattern->fFrameSize; 2390 } 2391 fStack->setMaxCapacity(adjustedLimit); 2392 } 2393 fStackLimit = limit; 2394 } 2395 2396 2397 //-------------------------------------------------------------------------------- 2398 // 2399 // getStackLimit 2400 // 2401 //-------------------------------------------------------------------------------- 2402 int32_t RegexMatcher::getStackLimit() const { 2403 return fStackLimit; 2404 } 2405 2406 2407 //-------------------------------------------------------------------------------- 2408 // 2409 // setMatchCallback 2410 // 2411 //-------------------------------------------------------------------------------- 2412 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, 2413 const void *context, 2414 UErrorCode &status) { 2415 if (U_FAILURE(status)) { 2416 return; 2417 } 2418 fCallbackFn = callback; 2419 fCallbackContext = context; 2420 } 2421 2422 2423 //-------------------------------------------------------------------------------- 2424 // 2425 // getMatchCallback 2426 // 2427 //-------------------------------------------------------------------------------- 2428 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, 2429 const void *&context, 2430 UErrorCode &status) { 2431 if (U_FAILURE(status)) { 2432 return; 2433 } 2434 callback = fCallbackFn; 2435 context = fCallbackContext; 2436 } 2437 2438 2439 //-------------------------------------------------------------------------------- 2440 // 2441 // setMatchCallback 2442 // 2443 //-------------------------------------------------------------------------------- 2444 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, 2445 const void *context, 2446 UErrorCode &status) { 2447 if (U_FAILURE(status)) { 2448 return; 2449 } 2450 fFindProgressCallbackFn = callback; 2451 fFindProgressCallbackContext = context; 2452 } 2453 2454 2455 //-------------------------------------------------------------------------------- 2456 // 2457 // getMatchCallback 2458 // 2459 //-------------------------------------------------------------------------------- 2460 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, 2461 const void *&context, 2462 UErrorCode &status) { 2463 if (U_FAILURE(status)) { 2464 return; 2465 } 2466 callback = fFindProgressCallbackFn; 2467 context = fFindProgressCallbackContext; 2468 } 2469 2470 2471 //================================================================================ 2472 // 2473 // Code following this point in this file is the internal 2474 // Match Engine Implementation. 2475 // 2476 //================================================================================ 2477 2478 2479 //-------------------------------------------------------------------------------- 2480 // 2481 // resetStack 2482 // Discard any previous contents of the state save stack, and initialize a 2483 // new stack frame to all -1. The -1s are needed for capture group limits, 2484 // where they indicate that a group has not yet matched anything. 2485 //-------------------------------------------------------------------------------- 2486 REStackFrame *RegexMatcher::resetStack() { 2487 // Discard any previous contents of the state save stack, and initialize a 2488 // new stack frame with all -1 data. The -1s are needed for capture group limits, 2489 // where they indicate that a group has not yet matched anything. 2490 fStack->removeAllElements(); 2491 2492 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); 2493 int32_t i; 2494 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 2495 iFrame->fExtra[i] = -1; 2496 } 2497 return iFrame; 2498 } 2499 2500 2501 2502 //-------------------------------------------------------------------------------- 2503 // 2504 // isWordBoundary 2505 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 2506 // For us, 2507 // If the current char is a combining mark, 2508 // \b is FALSE. 2509 // Else Scan backwards to the first non-combining char. 2510 // We are at a boundary if the this char and the original chars are 2511 // opposite in membership in \w set 2512 // 2513 // parameters: pos - the current position in the input buffer 2514 // 2515 // TODO: double-check edge cases at region boundaries. 2516 // 2517 //-------------------------------------------------------------------------------- 2518 UBool RegexMatcher::isWordBoundary(int64_t pos) { 2519 UBool isBoundary = FALSE; 2520 UBool cIsWord = FALSE; 2521 2522 if (pos >= fLookLimit) { 2523 fHitEnd = TRUE; 2524 } else { 2525 // Determine whether char c at current position is a member of the word set of chars. 2526 // If we're off the end of the string, behave as though we're not at a word char. 2527 UTEXT_SETNATIVEINDEX(fInputText, pos); 2528 UChar32 c = UTEXT_CURRENT32(fInputText); 2529 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2530 // Current char is a combining one. Not a boundary. 2531 return FALSE; 2532 } 2533 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2534 } 2535 2536 // Back up until we come to a non-combining char, determine whether 2537 // that char is a word char. 2538 UBool prevCIsWord = FALSE; 2539 for (;;) { 2540 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { 2541 break; 2542 } 2543 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); 2544 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2545 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2546 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2547 break; 2548 } 2549 } 2550 isBoundary = cIsWord ^ prevCIsWord; 2551 return isBoundary; 2552 } 2553 2554 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { 2555 UBool isBoundary = FALSE; 2556 UBool cIsWord = FALSE; 2557 2558 const UChar *inputBuf = fInputText->chunkContents; 2559 2560 if (pos >= fLookLimit) { 2561 fHitEnd = TRUE; 2562 } else { 2563 // Determine whether char c at current position is a member of the word set of chars. 2564 // If we're off the end of the string, behave as though we're not at a word char. 2565 UChar32 c; 2566 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); 2567 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2568 // Current char is a combining one. Not a boundary. 2569 return FALSE; 2570 } 2571 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2572 } 2573 2574 // Back up until we come to a non-combining char, determine whether 2575 // that char is a word char. 2576 UBool prevCIsWord = FALSE; 2577 for (;;) { 2578 if (pos <= fLookStart) { 2579 break; 2580 } 2581 UChar32 prevChar; 2582 U16_PREV(inputBuf, fLookStart, pos, prevChar); 2583 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2584 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2585 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2586 break; 2587 } 2588 } 2589 isBoundary = cIsWord ^ prevCIsWord; 2590 return isBoundary; 2591 } 2592 2593 //-------------------------------------------------------------------------------- 2594 // 2595 // isUWordBoundary 2596 // 2597 // Test for a word boundary using RBBI word break. 2598 // 2599 // parameters: pos - the current position in the input buffer 2600 // 2601 //-------------------------------------------------------------------------------- 2602 UBool RegexMatcher::isUWordBoundary(int64_t pos) { 2603 UBool returnVal = FALSE; 2604 #if UCONFIG_NO_BREAK_ITERATION==0 2605 2606 // If we haven't yet created a break iterator for this matcher, do it now. 2607 if (fWordBreakItr == NULL) { 2608 fWordBreakItr = 2609 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); 2610 if (U_FAILURE(fDeferredStatus)) { 2611 return FALSE; 2612 } 2613 fWordBreakItr->setText(fInputText, fDeferredStatus); 2614 } 2615 2616 if (pos >= fLookLimit) { 2617 fHitEnd = TRUE; 2618 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" 2619 // words are not boundaries. All non-word chars stand by themselves, 2620 // with word boundaries on both sides. 2621 } else { 2622 if (!UTEXT_USES_U16(fInputText)) { 2623 // !!!: Would like a better way to do this! 2624 UErrorCode status = U_ZERO_ERROR; 2625 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); 2626 } 2627 returnVal = fWordBreakItr->isBoundary((int32_t)pos); 2628 } 2629 #endif 2630 return returnVal; 2631 } 2632 2633 //-------------------------------------------------------------------------------- 2634 // 2635 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state 2636 // saves. Increment the "time" counter, and call the 2637 // user callback function if there is one installed. 2638 // 2639 // If the match operation needs to be aborted, either for a time-out 2640 // or because the user callback asked for it, just set an error status. 2641 // The engine will pick that up and stop in its outer loop. 2642 // 2643 //-------------------------------------------------------------------------------- 2644 void RegexMatcher::IncrementTime(UErrorCode &status) { 2645 fTickCounter = TIMER_INITIAL_VALUE; 2646 fTime++; 2647 if (fCallbackFn != NULL) { 2648 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { 2649 status = U_REGEX_STOPPED_BY_CALLER; 2650 return; 2651 } 2652 } 2653 if (fTimeLimit > 0 && fTime >= fTimeLimit) { 2654 status = U_REGEX_TIME_OUT; 2655 } 2656 } 2657 2658 //-------------------------------------------------------------------------------- 2659 // 2660 // ReportFindProgress This function is called once for each advance in the target 2661 // string from the find() function, and calls the user progress callback 2662 // function if there is one installed. 2663 // 2664 // NOTE: 2665 // 2666 // If the match operation needs to be aborted because the user 2667 // callback asked for it, just set an error status. 2668 // The engine will pick that up and stop in its outer loop. 2669 // 2670 //-------------------------------------------------------------------------------- 2671 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) { 2672 if (fFindProgressCallbackFn != NULL) { 2673 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) { 2674 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/; 2675 return FALSE; 2676 } 2677 } 2678 return TRUE; 2679 } 2680 2681 //-------------------------------------------------------------------------------- 2682 // 2683 // StateSave 2684 // Make a new stack frame, initialized as a copy of the current stack frame. 2685 // Set the pattern index in the original stack frame from the operand value 2686 // in the opcode. Execution of the engine continues with the state in 2687 // the newly created stack frame 2688 // 2689 // Note that reserveBlock() may grow the stack, resulting in the 2690 // whole thing being relocated in memory. 2691 // 2692 // Parameters: 2693 // fp The top frame pointer when called. At return, a new 2694 // fame will be present 2695 // savePatIdx An index into the compiled pattern. Goes into the original 2696 // (not new) frame. If execution ever back-tracks out of the 2697 // new frame, this will be where we continue from in the pattern. 2698 // Return 2699 // The new frame pointer. 2700 // 2701 //-------------------------------------------------------------------------------- 2702 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { 2703 // push storage for a new frame. 2704 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); 2705 if (newFP == NULL) { 2706 // Failure on attempted stack expansion. 2707 // Stack function set some other error code, change it to a more 2708 // specific one for regular expressions. 2709 status = U_REGEX_STACK_OVERFLOW; 2710 // We need to return a writable stack frame, so just return the 2711 // previous frame. The match operation will stop quickly 2712 // because of the error status, after which the frame will never 2713 // be looked at again. 2714 return fp; 2715 } 2716 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. 2717 2718 // New stack frame = copy of old top frame. 2719 int64_t *source = (int64_t *)fp; 2720 int64_t *dest = newFP; 2721 for (;;) { 2722 *dest++ = *source++; 2723 if (source == newFP) { 2724 break; 2725 } 2726 } 2727 2728 fTickCounter--; 2729 if (fTickCounter <= 0) { 2730 IncrementTime(status); // Re-initializes fTickCounter 2731 } 2732 fp->fPatIdx = savePatIdx; 2733 return (REStackFrame *)newFP; 2734 } 2735 2736 2737 //-------------------------------------------------------------------------------- 2738 // 2739 // MatchAt This is the actual matching engine. 2740 // 2741 // startIdx: begin matching a this index. 2742 // toEnd: if true, match must extend to end of the input region 2743 // 2744 //-------------------------------------------------------------------------------- 2745 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { 2746 UBool isMatch = FALSE; // True if the we have a match. 2747 2748 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards 2749 2750 int32_t op; // Operation from the compiled pattern, split into 2751 int32_t opType; // the opcode 2752 int32_t opValue; // and the operand value. 2753 2754 #ifdef REGEX_RUN_DEBUG 2755 if (fTraceDebug) 2756 { 2757 printf("MatchAt(startIdx=%ld)\n", startIdx); 2758 printf("Original Pattern: "); 2759 UChar32 c = utext_next32From(fPattern->fPattern, 0); 2760 while (c != U_SENTINEL) { 2761 if (c<32 || c>256) { 2762 c = '.'; 2763 } 2764 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 2765 2766 c = UTEXT_NEXT32(fPattern->fPattern); 2767 } 2768 printf("\n"); 2769 printf("Input String: "); 2770 c = utext_next32From(fInputText, 0); 2771 while (c != U_SENTINEL) { 2772 if (c<32 || c>256) { 2773 c = '.'; 2774 } 2775 printf("%c", c); 2776 2777 c = UTEXT_NEXT32(fInputText); 2778 } 2779 printf("\n"); 2780 printf("\n"); 2781 } 2782 #endif 2783 2784 if (U_FAILURE(status)) { 2785 return; 2786 } 2787 2788 // Cache frequently referenced items from the compiled pattern 2789 // 2790 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 2791 2792 const UChar *litText = fPattern->fLiteralText.getBuffer(); 2793 UVector *sets = fPattern->fSets; 2794 2795 fFrameSize = fPattern->fFrameSize; 2796 REStackFrame *fp = resetStack(); 2797 2798 fp->fPatIdx = 0; 2799 fp->fInputIdx = startIdx; 2800 2801 // Zero out the pattern's static data 2802 int32_t i; 2803 for (i = 0; i<fPattern->fDataSize; i++) { 2804 fData[i] = 0; 2805 } 2806 2807 // 2808 // Main loop for interpreting the compiled pattern. 2809 // One iteration of the loop per pattern operation performed. 2810 // 2811 for (;;) { 2812 #if 0 2813 if (_heapchk() != _HEAPOK) { 2814 fprintf(stderr, "Heap Trouble\n"); 2815 } 2816 #endif 2817 2818 op = (int32_t)pat[fp->fPatIdx]; 2819 opType = URX_TYPE(op); 2820 opValue = URX_VAL(op); 2821 #ifdef REGEX_RUN_DEBUG 2822 if (fTraceDebug) { 2823 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2824 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, 2825 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 2826 fPattern->dumpOp(fp->fPatIdx); 2827 } 2828 #endif 2829 fp->fPatIdx++; 2830 2831 switch (opType) { 2832 2833 2834 case URX_NOP: 2835 break; 2836 2837 2838 case URX_BACKTRACK: 2839 // Force a backtrack. In some circumstances, the pattern compiler 2840 // will notice that the pattern can't possibly match anything, and will 2841 // emit one of these at that point. 2842 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2843 break; 2844 2845 2846 case URX_ONECHAR: 2847 if (fp->fInputIdx < fActiveLimit) { 2848 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2849 UChar32 c = UTEXT_NEXT32(fInputText); 2850 if (c == opValue) { 2851 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2852 break; 2853 } 2854 } else { 2855 fHitEnd = TRUE; 2856 } 2857 2858 #ifdef REGEX_SMART_BACKTRACKING 2859 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 2860 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 2861 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 2862 UBool success = FALSE; 2863 UChar32 c = UTEXT_PREVIOUS32(fInputText); 2864 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { 2865 if (c == opValue) { 2866 success = TRUE; 2867 break; 2868 } else if (c == U_SENTINEL) { 2869 break; 2870 } 2871 c = UTEXT_PREVIOUS32(fInputText); 2872 } 2873 if (success) { 2874 fHitEnd = FALSE; 2875 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2876 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2877 if (fp->fInputIdx > backSearchIndex) { 2878 fp = StateSave(fp, fp->fPatIdx, status); 2879 } 2880 fp->fPatIdx++; // Skip the LOOP_C, we just did that 2881 break; 2882 } 2883 } 2884 } 2885 #endif 2886 2887 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2888 break; 2889 2890 2891 case URX_STRING: 2892 { 2893 // Test input against a literal string. 2894 // Strings require two slots in the compiled pattern, one for the 2895 // offset to the string text, and one for the length. 2896 int32_t stringStartIdx = opValue; 2897 int32_t stringLen; 2898 2899 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 2900 fp->fPatIdx++; 2901 opType = URX_TYPE(op); 2902 stringLen = URX_VAL(op); 2903 U_ASSERT(opType == URX_STRING_LEN); 2904 U_ASSERT(stringLen >= 2); 2905 2906 const UChar *patternChars = litText+stringStartIdx; 2907 const UChar *patternEnd = patternChars+stringLen; 2908 2909 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2910 UChar32 c; 2911 UBool success = TRUE; 2912 2913 while (patternChars < patternEnd && success) { 2914 c = UTEXT_NEXT32(fInputText); 2915 2916 if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) { 2917 if (U_IS_BMP(c)) { 2918 success = (*patternChars == c); 2919 patternChars += 1; 2920 } else if (patternChars+1 < patternEnd) { 2921 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 2922 patternChars += 2; 2923 } 2924 } else { 2925 success = FALSE; 2926 fHitEnd = TRUE; // TODO: See ticket 6074 2927 } 2928 } 2929 2930 if (success) { 2931 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2932 } else { 2933 #ifdef REGEX_SMART_BACKTRACKING 2934 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 2935 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 2936 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 2937 // Reset to last start point 2938 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2939 patternChars = litText+stringStartIdx; 2940 2941 // Search backwards for a possible start 2942 do { 2943 c = UTEXT_PREVIOUS32(fInputText); 2944 if (c == U_SENTINEL) { 2945 break; 2946 } else if ((U_IS_BMP(c) && *patternChars == c) || 2947 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 2948 success = TRUE; 2949 break; 2950 } 2951 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 2952 2953 // And try again 2954 if (success) { 2955 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2956 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2957 if (fp->fInputIdx > backSearchIndex) { 2958 fp = StateSave(fp, fp->fPatIdx, status); 2959 } 2960 fp->fPatIdx++; // Skip the LOOP_C, we just did that 2961 break; 2962 } 2963 } 2964 } 2965 #endif 2966 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2967 } 2968 } 2969 break; 2970 2971 2972 case URX_STATE_SAVE: 2973 fp = StateSave(fp, opValue, status); 2974 break; 2975 2976 2977 case URX_END: 2978 // The match loop will exit via this path on a successful match, 2979 // when we reach the end of the pattern. 2980 if (toEnd && fp->fInputIdx != fActiveLimit) { 2981 // The pattern matched, but not to the end of input. Try some more. 2982 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2983 break; 2984 } 2985 isMatch = TRUE; 2986 goto breakFromLoop; 2987 2988 // Start and End Capture stack frame variables are laid out out like this: 2989 // fp->fExtra[opValue] - The start of a completed capture group 2990 // opValue+1 - The end of a completed capture group 2991 // opValue+2 - the start of a capture group whose end 2992 // has not yet been reached (and might not ever be). 2993 case URX_START_CAPTURE: 2994 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2995 fp->fExtra[opValue+2] = fp->fInputIdx; 2996 break; 2997 2998 2999 case URX_END_CAPTURE: 3000 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 3001 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 3002 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 3003 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 3004 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 3005 break; 3006 3007 3008 case URX_DOLLAR: // $, test for End of line 3009 // or for position before new line at end of input 3010 { 3011 if (fp->fInputIdx >= fAnchorLimit) { 3012 // We really are at the end of input. Success. 3013 fHitEnd = TRUE; 3014 fRequireEnd = TRUE; 3015 break; 3016 } 3017 3018 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3019 3020 // If we are positioned just before a new-line that is located at the 3021 // end of input, succeed. 3022 UChar32 c = UTEXT_NEXT32(fInputText); 3023 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 3024 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 3025 // If not in the middle of a CR/LF sequence 3026 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 3027 // At new-line at end of input. Success 3028 fHitEnd = TRUE; 3029 fRequireEnd = TRUE; 3030 3031 break; 3032 } 3033 } 3034 } else { 3035 UChar32 nextC = UTEXT_NEXT32(fInputText); 3036 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 3037 fHitEnd = TRUE; 3038 fRequireEnd = TRUE; 3039 break; // At CR/LF at end of input. Success 3040 } 3041 } 3042 3043 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3044 } 3045 break; 3046 3047 3048 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 3049 if (fp->fInputIdx >= fAnchorLimit) { 3050 // Off the end of input. Success. 3051 fHitEnd = TRUE; 3052 fRequireEnd = TRUE; 3053 break; 3054 } else { 3055 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3056 UChar32 c = UTEXT_NEXT32(fInputText); 3057 // Either at the last character of input, or off the end. 3058 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { 3059 fHitEnd = TRUE; 3060 fRequireEnd = TRUE; 3061 break; 3062 } 3063 } 3064 3065 // Not at end of input. Back-track out. 3066 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3067 break; 3068 3069 3070 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 3071 { 3072 if (fp->fInputIdx >= fAnchorLimit) { 3073 // We really are at the end of input. Success. 3074 fHitEnd = TRUE; 3075 fRequireEnd = TRUE; 3076 break; 3077 } 3078 // If we are positioned just before a new-line, succeed. 3079 // It makes no difference where the new-line is within the input. 3080 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3081 UChar32 c = UTEXT_CURRENT32(fInputText); 3082 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 3083 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 3084 // In multi-line mode, hitting a new-line just before the end of input does not 3085 // set the hitEnd or requireEnd flags 3086 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { 3087 break; 3088 } 3089 } 3090 // not at a new line. Fail. 3091 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3092 } 3093 break; 3094 3095 3096 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 3097 { 3098 if (fp->fInputIdx >= fAnchorLimit) { 3099 // We really are at the end of input. Success. 3100 fHitEnd = TRUE; 3101 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 3102 break; // adding a new-line would not lose the match. 3103 } 3104 // If we are not positioned just before a new-line, the test fails; backtrack out. 3105 // It makes no difference where the new-line is within the input. 3106 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3107 if (UTEXT_CURRENT32(fInputText) != 0x0a) { 3108 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3109 } 3110 } 3111 break; 3112 3113 3114 case URX_CARET: // ^, test for start of line 3115 if (fp->fInputIdx != fAnchorStart) { 3116 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3117 } 3118 break; 3119 3120 3121 case URX_CARET_M: // ^, test for start of line in mulit-line mode 3122 { 3123 if (fp->fInputIdx == fAnchorStart) { 3124 // We are at the start input. Success. 3125 break; 3126 } 3127 // Check whether character just before the current pos is a new-line 3128 // unless we are at the end of input 3129 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3130 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3131 if ((fp->fInputIdx < fAnchorLimit) && 3132 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3133 // It's a new-line. ^ is true. Success. 3134 // TODO: what should be done with positions between a CR and LF? 3135 break; 3136 } 3137 // Not at the start of a line. Fail. 3138 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3139 } 3140 break; 3141 3142 3143 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 3144 { 3145 U_ASSERT(fp->fInputIdx >= fAnchorStart); 3146 if (fp->fInputIdx <= fAnchorStart) { 3147 // We are at the start input. Success. 3148 break; 3149 } 3150 // Check whether character just before the current pos is a new-line 3151 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 3152 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3153 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3154 if (c != 0x0a) { 3155 // Not at the start of a line. Back-track out. 3156 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3157 } 3158 } 3159 break; 3160 3161 case URX_BACKSLASH_B: // Test for word boundaries 3162 { 3163 UBool success = isWordBoundary(fp->fInputIdx); 3164 success ^= (opValue != 0); // flip sense for \B 3165 if (!success) { 3166 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3167 } 3168 } 3169 break; 3170 3171 3172 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 3173 { 3174 UBool success = isUWordBoundary(fp->fInputIdx); 3175 success ^= (opValue != 0); // flip sense for \B 3176 if (!success) { 3177 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3178 } 3179 } 3180 break; 3181 3182 3183 case URX_BACKSLASH_D: // Test for decimal digit 3184 { 3185 if (fp->fInputIdx >= fActiveLimit) { 3186 fHitEnd = TRUE; 3187 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3188 break; 3189 } 3190 3191 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3192 3193 UChar32 c = UTEXT_NEXT32(fInputText); 3194 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 3195 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 3196 success ^= (opValue != 0); // flip sense for \D 3197 if (success) { 3198 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3199 } else { 3200 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3201 } 3202 } 3203 break; 3204 3205 3206 case URX_BACKSLASH_G: // Test for position at end of previous match 3207 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 3208 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3209 } 3210 break; 3211 3212 3213 case URX_BACKSLASH_X: 3214 // Match a Grapheme, as defined by Unicode TR 29. 3215 // Differs slightly from Perl, which consumes combining marks independently 3216 // of context. 3217 { 3218 3219 // Fail if at end of input 3220 if (fp->fInputIdx >= fActiveLimit) { 3221 fHitEnd = TRUE; 3222 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3223 break; 3224 } 3225 3226 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3227 3228 // Examine (and consume) the current char. 3229 // Dispatch into a little state machine, based on the char. 3230 UChar32 c; 3231 c = UTEXT_NEXT32(fInputText); 3232 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3233 UnicodeSet **sets = fPattern->fStaticSets; 3234 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 3235 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 3236 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3237 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3238 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3239 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3240 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3241 goto GC_Extend; 3242 3243 3244 3245 GC_L: 3246 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3247 c = UTEXT_NEXT32(fInputText); 3248 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3249 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3250 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3251 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3252 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3253 (void)UTEXT_PREVIOUS32(fInputText); 3254 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3255 goto GC_Extend; 3256 3257 GC_V: 3258 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3259 c = UTEXT_NEXT32(fInputText); 3260 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3261 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3262 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3263 (void)UTEXT_PREVIOUS32(fInputText); 3264 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3265 goto GC_Extend; 3266 3267 GC_T: 3268 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3269 c = UTEXT_NEXT32(fInputText); 3270 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3271 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3272 (void)UTEXT_PREVIOUS32(fInputText); 3273 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3274 goto GC_Extend; 3275 3276 GC_Extend: 3277 // Combining characters are consumed here 3278 for (;;) { 3279 if (fp->fInputIdx >= fActiveLimit) { 3280 break; 3281 } 3282 c = UTEXT_CURRENT32(fInputText); 3283 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 3284 break; 3285 } 3286 (void)UTEXT_NEXT32(fInputText); 3287 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3288 } 3289 goto GC_Done; 3290 3291 GC_Control: 3292 // Most control chars stand alone (don't combine with combining chars), 3293 // except for that CR/LF sequence is a single grapheme cluster. 3294 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 3295 c = UTEXT_NEXT32(fInputText); 3296 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3297 } 3298 3299 GC_Done: 3300 if (fp->fInputIdx >= fActiveLimit) { 3301 fHitEnd = TRUE; 3302 } 3303 break; 3304 } 3305 3306 3307 3308 3309 case URX_BACKSLASH_Z: // Test for end of Input 3310 if (fp->fInputIdx < fAnchorLimit) { 3311 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3312 } else { 3313 fHitEnd = TRUE; 3314 fRequireEnd = TRUE; 3315 } 3316 break; 3317 3318 3319 3320 case URX_STATIC_SETREF: 3321 { 3322 // Test input character against one of the predefined sets 3323 // (Word Characters, for example) 3324 // The high bit of the op value is a flag for the match polarity. 3325 // 0: success if input char is in set. 3326 // 1: success if input char is not in set. 3327 if (fp->fInputIdx >= fActiveLimit) { 3328 fHitEnd = TRUE; 3329 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3330 break; 3331 } 3332 3333 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 3334 opValue &= ~URX_NEG_SET; 3335 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3336 3337 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3338 UChar32 c = UTEXT_NEXT32(fInputText); 3339 if (c < 256) { 3340 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3341 if (s8->contains(c)) { 3342 success = !success; 3343 } 3344 } else { 3345 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3346 if (s->contains(c)) { 3347 success = !success; 3348 } 3349 } 3350 if (success) { 3351 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3352 } else { 3353 // the character wasn't in the set. 3354 #ifdef REGEX_SMART_BACKTRACKING 3355 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3356 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3357 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3358 // Try to find it, backwards 3359 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3360 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset 3361 do { 3362 c = UTEXT_PREVIOUS32(fInputText); 3363 if (c == U_SENTINEL) { 3364 break; 3365 } else if (c < 256) { 3366 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3367 if (s8->contains(c)) { 3368 success = !success; 3369 } 3370 } else { 3371 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3372 if (s->contains(c)) { 3373 success = !success; 3374 } 3375 } 3376 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success); 3377 3378 if (success && c != U_SENTINEL) { 3379 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3380 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3381 if (fp->fInputIdx > backSearchIndex) { 3382 fp = StateSave(fp, fp->fPatIdx, status); 3383 } 3384 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3385 break; 3386 } 3387 } 3388 } 3389 #endif 3390 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3391 } 3392 } 3393 break; 3394 3395 3396 case URX_STAT_SETREF_N: 3397 { 3398 // Test input character for NOT being a member of one of 3399 // the predefined sets (Word Characters, for example) 3400 if (fp->fInputIdx >= fActiveLimit) { 3401 fHitEnd = TRUE; 3402 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3403 break; 3404 } 3405 3406 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3407 3408 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3409 3410 UChar32 c = UTEXT_NEXT32(fInputText); 3411 if (c < 256) { 3412 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3413 if (s8->contains(c) == FALSE) { 3414 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3415 break; 3416 } 3417 } else { 3418 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3419 if (s->contains(c) == FALSE) { 3420 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3421 break; 3422 } 3423 } 3424 // the character wasn't in the set. 3425 #ifdef REGEX_SMART_BACKTRACKING 3426 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3427 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3428 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3429 // Try to find it, backwards 3430 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3431 UBool success = FALSE; 3432 do { 3433 c = UTEXT_PREVIOUS32(fInputText); 3434 if (c == U_SENTINEL) { 3435 break; 3436 } else if (c < 256) { 3437 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3438 if (s8->contains(c) == FALSE) { 3439 success = TRUE; 3440 break; 3441 } 3442 } else { 3443 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3444 if (s->contains(c) == FALSE) { 3445 success = TRUE; 3446 break; 3447 } 3448 } 3449 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 3450 3451 if (success) { 3452 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3453 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3454 if (fp->fInputIdx > backSearchIndex) { 3455 fp = StateSave(fp, fp->fPatIdx, status); 3456 } 3457 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3458 break; 3459 } 3460 } 3461 } 3462 #endif 3463 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3464 } 3465 break; 3466 3467 3468 case URX_SETREF: 3469 if (fp->fInputIdx >= fActiveLimit) { 3470 fHitEnd = TRUE; 3471 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3472 break; 3473 } else { 3474 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3475 3476 // There is input left. Pick up one char and test it for set membership. 3477 UChar32 c = UTEXT_NEXT32(fInputText); 3478 U_ASSERT(opValue > 0 && opValue < sets->size()); 3479 if (c<256) { 3480 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3481 if (s8->contains(c)) { 3482 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3483 break; 3484 } 3485 } else { 3486 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3487 if (s->contains(c)) { 3488 // The character is in the set. A Match. 3489 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3490 break; 3491 } 3492 } 3493 3494 // the character wasn't in the set. 3495 #ifdef REGEX_SMART_BACKTRACKING 3496 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3497 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3498 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3499 // Try to find it, backwards 3500 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3501 UBool success = FALSE; 3502 do { 3503 c = UTEXT_PREVIOUS32(fInputText); 3504 if (c == U_SENTINEL) { 3505 break; 3506 } else if (c < 256) { 3507 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3508 if (s8->contains(c)) { 3509 success = TRUE; 3510 break; 3511 } 3512 } else { 3513 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3514 if (s->contains(c)) { 3515 success = TRUE; 3516 break; 3517 } 3518 } 3519 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 3520 3521 if (success) { 3522 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3523 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3524 if (fp->fInputIdx > backSearchIndex) { 3525 fp = StateSave(fp, fp->fPatIdx, status); 3526 } 3527 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3528 break; 3529 } 3530 } 3531 } 3532 #endif 3533 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3534 } 3535 break; 3536 3537 3538 case URX_DOTANY: 3539 { 3540 // . matches anything, but stops at end-of-line. 3541 if (fp->fInputIdx >= fActiveLimit) { 3542 // At end of input. Match failed. Backtrack out. 3543 fHitEnd = TRUE; 3544 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3545 break; 3546 } 3547 3548 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3549 3550 // There is input left. Advance over one char, unless we've hit end-of-line 3551 UChar32 c = UTEXT_NEXT32(fInputText); 3552 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 3553 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3554 // End of line in normal mode. . does not match. 3555 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3556 break; 3557 } 3558 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3559 } 3560 break; 3561 3562 3563 case URX_DOTANY_ALL: 3564 { 3565 // ., in dot-matches-all (including new lines) mode 3566 if (fp->fInputIdx >= fActiveLimit) { 3567 // At end of input. Match failed. Backtrack out. 3568 fHitEnd = TRUE; 3569 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3570 break; 3571 } 3572 3573 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3574 3575 // There is input left. Advance over one char, except if we are 3576 // at a cr/lf, advance over both of them. 3577 UChar32 c; 3578 c = UTEXT_NEXT32(fInputText); 3579 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3580 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 3581 // In the case of a CR/LF, we need to advance over both. 3582 UChar32 nextc = UTEXT_CURRENT32(fInputText); 3583 if (nextc == 0x0a) { 3584 (void)UTEXT_NEXT32(fInputText); 3585 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3586 } 3587 } 3588 } 3589 break; 3590 3591 3592 case URX_DOTANY_UNIX: 3593 { 3594 // '.' operator, matches all, but stops at end-of-line. 3595 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 3596 if (fp->fInputIdx >= fActiveLimit) { 3597 // At end of input. Match failed. Backtrack out. 3598 fHitEnd = TRUE; 3599 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3600 break; 3601 } 3602 3603 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3604 3605 // There is input left. Advance over one char, unless we've hit end-of-line 3606 UChar32 c = UTEXT_NEXT32(fInputText); 3607 if (c == 0x0a) { 3608 // End of line in normal mode. '.' does not match the \n 3609 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3610 } else { 3611 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3612 } 3613 } 3614 break; 3615 3616 3617 case URX_JMP: 3618 fp->fPatIdx = opValue; 3619 break; 3620 3621 case URX_FAIL: 3622 isMatch = FALSE; 3623 goto breakFromLoop; 3624 3625 case URX_JMP_SAV: 3626 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 3627 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3628 fp->fPatIdx = opValue; // Then JMP. 3629 break; 3630 3631 case URX_JMP_SAV_X: 3632 // This opcode is used with (x)+, when x can match a zero length string. 3633 // Same as JMP_SAV, except conditional on the match having made forward progress. 3634 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 3635 // data address of the input position at the start of the loop. 3636 { 3637 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 3638 int32_t stoOp = (int32_t)pat[opValue-1]; 3639 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 3640 int32_t frameLoc = URX_VAL(stoOp); 3641 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 3642 int64_t prevInputIdx = fp->fExtra[frameLoc]; 3643 U_ASSERT(prevInputIdx <= fp->fInputIdx); 3644 if (prevInputIdx < fp->fInputIdx) { 3645 // The match did make progress. Repeat the loop. 3646 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3647 fp->fPatIdx = opValue; 3648 fp->fExtra[frameLoc] = fp->fInputIdx; 3649 } 3650 // If the input position did not advance, we do nothing here, 3651 // execution will fall out of the loop. 3652 } 3653 break; 3654 3655 case URX_CTR_INIT: 3656 { 3657 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3658 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3659 3660 // Pick up the three extra operands that CTR_INIT has, and 3661 // skip the pattern location counter past 3662 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3663 fp->fPatIdx += 3; 3664 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3665 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3666 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3667 U_ASSERT(minCount>=0); 3668 U_ASSERT(maxCount>=minCount || maxCount==-1); 3669 U_ASSERT(loopLoc>fp->fPatIdx); 3670 3671 if (minCount == 0) { 3672 fp = StateSave(fp, loopLoc+1, status); 3673 } 3674 if (maxCount == 0) { 3675 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3676 } 3677 } 3678 break; 3679 3680 case URX_CTR_LOOP: 3681 { 3682 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3683 int32_t initOp = (int32_t)pat[opValue]; 3684 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 3685 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3686 int32_t minCount = (int32_t)pat[opValue+2]; 3687 int32_t maxCount = (int32_t)pat[opValue+3]; 3688 // Increment the counter. Note: we DIDN'T worry about counter 3689 // overflow, since the data comes from UnicodeStrings, which 3690 // stores its length in an int32_t. Do we have to think about 3691 // this now that we're using UText? Probably not, since the length 3692 // in UChar32s is still an int32_t. 3693 (*pCounter)++; 3694 U_ASSERT(*pCounter > 0); 3695 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 3696 U_ASSERT(*pCounter == maxCount || maxCount == -1); 3697 break; 3698 } 3699 if (*pCounter >= minCount) { 3700 fp = StateSave(fp, fp->fPatIdx, status); 3701 } 3702 fp->fPatIdx = opValue + 4; // Loop back. 3703 } 3704 break; 3705 3706 case URX_CTR_INIT_NG: 3707 { 3708 // Initialize a non-greedy loop 3709 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3710 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3711 3712 // Pick up the three extra operands that CTR_INIT has, and 3713 // skip the pattern location counter past 3714 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3715 fp->fPatIdx += 3; 3716 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3717 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3718 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3719 U_ASSERT(minCount>=0); 3720 U_ASSERT(maxCount>=minCount || maxCount==-1); 3721 U_ASSERT(loopLoc>fp->fPatIdx); 3722 3723 if (minCount == 0) { 3724 if (maxCount != 0) { 3725 fp = StateSave(fp, fp->fPatIdx, status); 3726 } 3727 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 3728 } 3729 } 3730 break; 3731 3732 case URX_CTR_LOOP_NG: 3733 { 3734 // Non-greedy {min, max} loops 3735 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3736 int32_t initOp = (int32_t)pat[opValue]; 3737 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 3738 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3739 int32_t minCount = (int32_t)pat[opValue+2]; 3740 int32_t maxCount = (int32_t)pat[opValue+3]; 3741 // Increment the counter. Note: we DIDN'T worry about counter 3742 // overflow, since the data comes from UnicodeStrings, which 3743 // stores its length in an int32_t. Do we have to think about 3744 // this now that we're using UText? Probably not, since the length 3745 // in UChar32s is still an int32_t. 3746 (*pCounter)++; 3747 U_ASSERT(*pCounter > 0); 3748 3749 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 3750 // The loop has matched the maximum permitted number of times. 3751 // Break out of here with no action. Matching will 3752 // continue with the following pattern. 3753 U_ASSERT(*pCounter == maxCount || maxCount == -1); 3754 break; 3755 } 3756 3757 if (*pCounter < minCount) { 3758 // We haven't met the minimum number of matches yet. 3759 // Loop back for another one. 3760 fp->fPatIdx = opValue + 4; // Loop back. 3761 } else { 3762 // We do have the minimum number of matches. 3763 // Fall into the following pattern, but first do 3764 // a state save to the top of the loop, so that a failure 3765 // in the following pattern will try another iteration of the loop. 3766 fp = StateSave(fp, opValue + 4, status); 3767 } 3768 } 3769 break; 3770 3771 case URX_STO_SP: 3772 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3773 fData[opValue] = fStack->size(); 3774 break; 3775 3776 case URX_LD_SP: 3777 { 3778 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3779 int32_t newStackSize = (int32_t)fData[opValue]; 3780 U_ASSERT(newStackSize <= fStack->size()); 3781 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3782 if (newFP == (int64_t *)fp) { 3783 break; 3784 } 3785 int32_t i; 3786 for (i=0; i<fFrameSize; i++) { 3787 newFP[i] = ((int64_t *)fp)[i]; 3788 } 3789 fp = (REStackFrame *)newFP; 3790 fStack->setSize(newStackSize); 3791 } 3792 break; 3793 3794 case URX_BACKREF: 3795 case URX_BACKREF_I: 3796 { 3797 U_ASSERT(opValue < fFrameSize); 3798 int64_t groupStartIdx = fp->fExtra[opValue]; 3799 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3800 U_ASSERT(groupStartIdx <= groupEndIdx); 3801 if (groupStartIdx < 0) { 3802 // This capture group has not participated in the match thus far, 3803 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3804 } 3805 3806 if (groupEndIdx == groupStartIdx) { 3807 // The capture group match was of an empty string. 3808 // Verified by testing: Perl matches succeed in this case, so 3809 // we do too. 3810 break; 3811 } 3812 3813 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); 3814 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3815 3816 UBool haveMatch = (opType == URX_BACKREF ? 3817 (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) : 3818 (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status))); 3819 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3820 3821 if (fp->fInputIdx > fActiveLimit) { 3822 fHitEnd = TRUE; 3823 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3824 } else if (!haveMatch) { 3825 if (fp->fInputIdx == fActiveLimit) { 3826 fHitEnd = TRUE; 3827 } 3828 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3829 } 3830 } 3831 break; 3832 3833 case URX_STO_INP_LOC: 3834 { 3835 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 3836 fp->fExtra[opValue] = fp->fInputIdx; 3837 } 3838 break; 3839 3840 case URX_JMPX: 3841 { 3842 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3843 fp->fPatIdx += 1; 3844 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 3845 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 3846 int64_t savedInputIdx = fp->fExtra[dataLoc]; 3847 U_ASSERT(savedInputIdx <= fp->fInputIdx); 3848 if (savedInputIdx < fp->fInputIdx) { 3849 fp->fPatIdx = opValue; // JMP 3850 } else { 3851 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 3852 } 3853 } 3854 break; 3855 3856 case URX_LA_START: 3857 { 3858 // Entering a lookahead block. 3859 // Save Stack Ptr, Input Pos. 3860 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3861 fData[opValue] = fStack->size(); 3862 fData[opValue+1] = fp->fInputIdx; 3863 fActiveStart = fLookStart; // Set the match region change for 3864 fActiveLimit = fLookLimit; // transparent bounds. 3865 } 3866 break; 3867 3868 case URX_LA_END: 3869 { 3870 // Leaving a look-ahead block. 3871 // restore Stack Ptr, Input Pos to positions they had on entry to block. 3872 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3873 int32_t stackSize = fStack->size(); 3874 int32_t newStackSize =(int32_t)fData[opValue]; 3875 U_ASSERT(stackSize >= newStackSize); 3876 if (stackSize > newStackSize) { 3877 // Copy the current top frame back to the new (cut back) top frame. 3878 // This makes the capture groups from within the look-ahead 3879 // expression available. 3880 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3881 int32_t i; 3882 for (i=0; i<fFrameSize; i++) { 3883 newFP[i] = ((int64_t *)fp)[i]; 3884 } 3885 fp = (REStackFrame *)newFP; 3886 fStack->setSize(newStackSize); 3887 } 3888 fp->fInputIdx = fData[opValue+1]; 3889 3890 // Restore the active region bounds in the input string; they may have 3891 // been changed because of transparent bounds on a Region. 3892 fActiveStart = fRegionStart; 3893 fActiveLimit = fRegionLimit; 3894 } 3895 break; 3896 3897 case URX_ONECHAR_I: 3898 if (fp->fInputIdx < fActiveLimit) { 3899 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3900 3901 UChar32 c = UTEXT_NEXT32(fInputText); 3902 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3903 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3904 break; 3905 } 3906 } else { 3907 fHitEnd = TRUE; 3908 } 3909 3910 #ifdef REGEX_SMART_BACKTRACKING 3911 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3912 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3913 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3914 UBool success = FALSE; 3915 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3916 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { 3917 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3918 success = TRUE; 3919 break; 3920 } else if (c == U_SENTINEL) { 3921 break; 3922 } 3923 c = UTEXT_PREVIOUS32(fInputText); 3924 } 3925 if (success) { 3926 fHitEnd = FALSE; 3927 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3928 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3929 if (fp->fInputIdx > backSearchIndex) { 3930 fp = StateSave(fp, fp->fPatIdx, status); 3931 } 3932 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3933 break; 3934 } 3935 } 3936 } 3937 #endif 3938 3939 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3940 break; 3941 3942 case URX_STRING_I: 3943 { 3944 // Test input against a literal string. 3945 // Strings require two slots in the compiled pattern, one for the 3946 // offset to the string text, and one for the length. 3947 const UCaseProps *csp = ucase_getSingleton(); 3948 { 3949 int32_t stringStartIdx, stringLen; 3950 stringStartIdx = opValue; 3951 3952 op = (int32_t)pat[fp->fPatIdx]; 3953 fp->fPatIdx++; 3954 opType = URX_TYPE(op); 3955 opValue = URX_VAL(op); 3956 U_ASSERT(opType == URX_STRING_LEN); 3957 stringLen = opValue; 3958 3959 const UChar *patternChars = litText+stringStartIdx; 3960 const UChar *patternEnd = patternChars+stringLen; 3961 3962 const UChar *foldChars = NULL; 3963 int32_t foldOffset, foldLength; 3964 UChar32 c; 3965 3966 foldOffset = foldLength = 0; 3967 UBool success = TRUE; 3968 3969 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3970 while (patternChars < patternEnd && success) { 3971 if(foldOffset < foldLength) { 3972 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 3973 } else { 3974 c = UTEXT_NEXT32(fInputText); 3975 if (c != U_SENTINEL) { 3976 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 3977 if(foldLength >= 0) { 3978 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 3979 foldOffset = 0; 3980 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 3981 } else { 3982 c = foldLength; 3983 foldLength = foldOffset; // to avoid reading chars from the folding buffer 3984 } 3985 } 3986 } 3987 3988 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3989 } 3990 3991 success = FALSE; 3992 if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) { 3993 if (U_IS_BMP(c)) { 3994 success = (*patternChars == c); 3995 patternChars += 1; 3996 } else if (patternChars+1 < patternEnd) { 3997 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 3998 patternChars += 2; 3999 } 4000 } else { 4001 fHitEnd = TRUE; // TODO: See ticket 6074 4002 } 4003 } 4004 4005 if (!success) { 4006 #ifdef REGEX_SMART_BACKTRACKING 4007 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 4008 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4009 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4010 // Reset to last start point 4011 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4012 patternChars = litText+stringStartIdx; 4013 4014 // Search backwards for a possible start 4015 do { 4016 c = UTEXT_PREVIOUS32(fInputText); 4017 if (c == U_SENTINEL) { 4018 break; 4019 } else { 4020 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 4021 if(foldLength >= 0) { 4022 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 4023 foldOffset = 0; 4024 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 4025 } else { 4026 c = foldLength; 4027 foldLength = foldOffset; // to avoid reading chars from the folding buffer 4028 } 4029 } 4030 4031 if ((U_IS_BMP(c) && *patternChars == c) || 4032 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 4033 success = TRUE; 4034 break; 4035 } 4036 } 4037 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 4038 4039 // And try again 4040 if (success) { 4041 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4042 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4043 if (fp->fInputIdx > backSearchIndex) { 4044 fp = StateSave(fp, fp->fPatIdx, status); 4045 } 4046 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4047 break; 4048 } 4049 } 4050 } 4051 #endif 4052 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4053 } 4054 } 4055 } 4056 break; 4057 4058 case URX_LB_START: 4059 { 4060 // Entering a look-behind block. 4061 // Save Stack Ptr, Input Pos. 4062 // TODO: implement transparent bounds. Ticket #6067 4063 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4064 fData[opValue] = fStack->size(); 4065 fData[opValue+1] = fp->fInputIdx; 4066 // Init the variable containing the start index for attempted matches. 4067 fData[opValue+2] = -1; 4068 // Save input string length, then reset to pin any matches to end at 4069 // the current position. 4070 fData[opValue+3] = fActiveLimit; 4071 fActiveLimit = fp->fInputIdx; 4072 } 4073 break; 4074 4075 4076 case URX_LB_CONT: 4077 { 4078 // Positive Look-Behind, at top of loop checking for matches of LB expression 4079 // at all possible input starting positions. 4080 4081 // Fetch the min and max possible match lengths. They are the operands 4082 // of this op in the pattern. 4083 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 4084 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 4085 U_ASSERT(minML <= maxML); 4086 U_ASSERT(minML >= 0); 4087 4088 // Fetch (from data) the last input index where a match was attempted. 4089 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4090 int64_t *lbStartIdx = &fData[opValue+2]; 4091 if (*lbStartIdx < 0) { 4092 // First time through loop. 4093 *lbStartIdx = fp->fInputIdx - minML; 4094 } else { 4095 // 2nd through nth time through the loop. 4096 // Back up start position for match by one. 4097 if (*lbStartIdx == 0) { 4098 (*lbStartIdx)--; 4099 } else { 4100 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 4101 (void)UTEXT_PREVIOUS32(fInputText); 4102 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4103 } 4104 } 4105 4106 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 4107 // We have tried all potential match starting points without 4108 // getting a match. Backtrack out, and out of the 4109 // Look Behind altogether. 4110 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4111 int64_t restoreInputLen = fData[opValue+3]; 4112 U_ASSERT(restoreInputLen >= fActiveLimit); 4113 U_ASSERT(restoreInputLen <= fInputLength); 4114 fActiveLimit = restoreInputLen; 4115 break; 4116 } 4117 4118 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 4119 // (successful match will fall off the end of the loop.) 4120 fp = StateSave(fp, fp->fPatIdx-3, status); 4121 fp->fInputIdx = *lbStartIdx; 4122 } 4123 break; 4124 4125 case URX_LB_END: 4126 // End of a look-behind block, after a successful match. 4127 { 4128 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4129 if (fp->fInputIdx != fActiveLimit) { 4130 // The look-behind expression matched, but the match did not 4131 // extend all the way to the point that we are looking behind from. 4132 // FAIL out of here, which will take us back to the LB_CONT, which 4133 // will retry the match starting at another position or fail 4134 // the look-behind altogether, whichever is appropriate. 4135 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4136 break; 4137 } 4138 4139 // Look-behind match is good. Restore the orignal input string length, 4140 // which had been truncated to pin the end of the lookbehind match to the 4141 // position being looked-behind. 4142 int64_t originalInputLen = fData[opValue+3]; 4143 U_ASSERT(originalInputLen >= fActiveLimit); 4144 U_ASSERT(originalInputLen <= fInputLength); 4145 fActiveLimit = originalInputLen; 4146 } 4147 break; 4148 4149 4150 case URX_LBN_CONT: 4151 { 4152 // Negative Look-Behind, at top of loop checking for matches of LB expression 4153 // at all possible input starting positions. 4154 4155 // Fetch the extra parameters of this op. 4156 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 4157 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 4158 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 4159 continueLoc = URX_VAL(continueLoc); 4160 U_ASSERT(minML <= maxML); 4161 U_ASSERT(minML >= 0); 4162 U_ASSERT(continueLoc > fp->fPatIdx); 4163 4164 // Fetch (from data) the last input index where a match was attempted. 4165 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4166 int64_t *lbStartIdx = &fData[opValue+2]; 4167 if (*lbStartIdx < 0) { 4168 // First time through loop. 4169 *lbStartIdx = fp->fInputIdx - minML; 4170 } else { 4171 // 2nd through nth time through the loop. 4172 // Back up start position for match by one. 4173 if (*lbStartIdx == 0) { 4174 (*lbStartIdx)--; 4175 } else { 4176 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 4177 (void)UTEXT_PREVIOUS32(fInputText); 4178 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4179 } 4180 } 4181 4182 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 4183 // We have tried all potential match starting points without 4184 // getting a match, which means that the negative lookbehind as 4185 // a whole has succeeded. Jump forward to the continue location 4186 int64_t restoreInputLen = fData[opValue+3]; 4187 U_ASSERT(restoreInputLen >= fActiveLimit); 4188 U_ASSERT(restoreInputLen <= fInputLength); 4189 fActiveLimit = restoreInputLen; 4190 fp->fPatIdx = continueLoc; 4191 break; 4192 } 4193 4194 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 4195 // (successful match will cause a FAIL out of the loop altogether.) 4196 fp = StateSave(fp, fp->fPatIdx-4, status); 4197 fp->fInputIdx = *lbStartIdx; 4198 } 4199 break; 4200 4201 case URX_LBN_END: 4202 // End of a negative look-behind block, after a successful match. 4203 { 4204 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4205 if (fp->fInputIdx != fActiveLimit) { 4206 // The look-behind expression matched, but the match did not 4207 // extend all the way to the point that we are looking behind from. 4208 // FAIL out of here, which will take us back to the LB_CONT, which 4209 // will retry the match starting at another position or succeed 4210 // the look-behind altogether, whichever is appropriate. 4211 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4212 break; 4213 } 4214 4215 // Look-behind expression matched, which means look-behind test as 4216 // a whole Fails 4217 4218 // Restore the orignal input string length, which had been truncated 4219 // inorder to pin the end of the lookbehind match 4220 // to the position being looked-behind. 4221 int64_t originalInputLen = fData[opValue+3]; 4222 U_ASSERT(originalInputLen >= fActiveLimit); 4223 U_ASSERT(originalInputLen <= fInputLength); 4224 fActiveLimit = originalInputLen; 4225 4226 // Restore original stack position, discarding any state saved 4227 // by the successful pattern match. 4228 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4229 int32_t newStackSize = (int32_t)fData[opValue]; 4230 U_ASSERT(fStack->size() > newStackSize); 4231 fStack->setSize(newStackSize); 4232 4233 // FAIL, which will take control back to someplace 4234 // prior to entering the look-behind test. 4235 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4236 } 4237 break; 4238 4239 4240 case URX_LOOP_SR_I: 4241 // Loop Initialization for the optimized implementation of 4242 // [some character set]* 4243 // This op scans through all matching input. 4244 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4245 { 4246 U_ASSERT(opValue > 0 && opValue < sets->size()); 4247 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4248 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 4249 4250 // Loop through input, until either the input is exhausted or 4251 // we reach a character that is not a member of the set. 4252 int64_t ix = fp->fInputIdx; 4253 UTEXT_SETNATIVEINDEX(fInputText, ix); 4254 for (;;) { 4255 if (ix >= fActiveLimit) { 4256 fHitEnd = TRUE; 4257 break; 4258 } 4259 UChar32 c = UTEXT_NEXT32(fInputText); 4260 if (c<256) { 4261 if (s8->contains(c) == FALSE) { 4262 break; 4263 } 4264 } else { 4265 if (s->contains(c) == FALSE) { 4266 break; 4267 } 4268 } 4269 ix = UTEXT_GETNATIVEINDEX(fInputText); 4270 } 4271 4272 // If there were no matching characters, skip over the loop altogether. 4273 // The loop doesn't run at all, a * op always succeeds. 4274 if (ix == fp->fInputIdx) { 4275 fp->fPatIdx++; // skip the URX_LOOP_C op. 4276 break; 4277 } 4278 4279 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4280 // must follow. It's operand is the stack location 4281 // that holds the starting input index for the match of this [set]* 4282 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4283 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4284 int32_t stackLoc = URX_VAL(loopcOp); 4285 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4286 fp->fExtra[stackLoc] = fp->fInputIdx; 4287 #ifdef REGEX_SMART_BACKTRACKING 4288 backSearchIndex = fp->fInputIdx; 4289 #endif 4290 fp->fInputIdx = ix; 4291 4292 // Save State to the URX_LOOP_C op that follows this one, 4293 // so that match failures in the following code will return to there. 4294 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4295 fp = StateSave(fp, fp->fPatIdx, status); 4296 fp->fPatIdx++; 4297 } 4298 break; 4299 4300 4301 case URX_LOOP_DOT_I: 4302 // Loop Initialization for the optimized implementation of .* 4303 // This op scans through all remaining input. 4304 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4305 { 4306 // Loop through input until the input is exhausted (we reach an end-of-line) 4307 // In DOTALL mode, we can just go straight to the end of the input. 4308 int64_t ix; 4309 if ((opValue & 1) == 1) { 4310 // Dot-matches-All mode. Jump straight to the end of the string. 4311 ix = fActiveLimit; 4312 fHitEnd = TRUE; 4313 } else { 4314 // NOT DOT ALL mode. Line endings do not match '.' 4315 // Scan forward until a line ending or end of input. 4316 ix = fp->fInputIdx; 4317 UTEXT_SETNATIVEINDEX(fInputText, ix); 4318 for (;;) { 4319 if (ix >= fActiveLimit) { 4320 fHitEnd = TRUE; 4321 break; 4322 } 4323 UChar32 c = UTEXT_NEXT32(fInputText); 4324 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 4325 if ((c == 0x0a) || // 0x0a is newline in both modes. 4326 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 4327 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) { 4328 // char is a line ending. Exit the scanning loop. 4329 break; 4330 } 4331 } 4332 ix = UTEXT_GETNATIVEINDEX(fInputText); 4333 } 4334 } 4335 4336 // If there were no matching characters, skip over the loop altogether. 4337 // The loop doesn't run at all, a * op always succeeds. 4338 if (ix == fp->fInputIdx) { 4339 fp->fPatIdx++; // skip the URX_LOOP_C op. 4340 break; 4341 } 4342 4343 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4344 // must follow. It's operand is the stack location 4345 // that holds the starting input index for the match of this .* 4346 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4347 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4348 int32_t stackLoc = URX_VAL(loopcOp); 4349 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4350 fp->fExtra[stackLoc] = fp->fInputIdx; 4351 #ifdef REGEX_SMART_BACKTRACKING 4352 backSearchIndex = fp->fInputIdx; 4353 #endif 4354 fp->fInputIdx = ix; 4355 4356 // Save State to the URX_LOOP_C op that follows this one, 4357 // so that match failures in the following code will return to there. 4358 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4359 fp = StateSave(fp, fp->fPatIdx, status); 4360 fp->fPatIdx++; 4361 } 4362 break; 4363 4364 4365 case URX_LOOP_C: 4366 { 4367 U_ASSERT(opValue>=0 && opValue<fFrameSize); 4368 backSearchIndex = fp->fExtra[opValue]; 4369 U_ASSERT(backSearchIndex <= fp->fInputIdx); 4370 if (backSearchIndex == fp->fInputIdx) { 4371 // We've backed up the input idx to the point that the loop started. 4372 // The loop is done. Leave here without saving state. 4373 // Subsequent failures won't come back here. 4374 break; 4375 } 4376 // Set up for the next iteration of the loop, with input index 4377 // backed up by one from the last time through, 4378 // and a state save to this instruction in case the following code fails again. 4379 // (We're going backwards because this loop emulates stack unwinding, not 4380 // the initial scan forward.) 4381 U_ASSERT(fp->fInputIdx > 0); 4382 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4383 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); 4384 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4385 4386 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); 4387 if (prevC == 0x0a && 4388 fp->fInputIdx > backSearchIndex && 4389 twoPrevC == 0x0d) { 4390 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 4391 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 4392 // .*, stepping back over CRLF pair. 4393 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4394 } 4395 } 4396 4397 4398 fp = StateSave(fp, fp->fPatIdx-1, status); 4399 } 4400 break; 4401 4402 4403 4404 default: 4405 // Trouble. The compiled pattern contains an entry with an 4406 // unrecognized type tag. 4407 U_ASSERT(FALSE); 4408 } 4409 4410 if (U_FAILURE(status)) { 4411 isMatch = FALSE; 4412 break; 4413 } 4414 } 4415 4416 breakFromLoop: 4417 fMatch = isMatch; 4418 if (isMatch) { 4419 fLastMatchEnd = fMatchEnd; 4420 fMatchStart = startIdx; 4421 fMatchEnd = fp->fInputIdx; 4422 if (fTraceDebug) { 4423 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); 4424 } 4425 } 4426 else 4427 { 4428 if (fTraceDebug) { 4429 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 4430 } 4431 } 4432 4433 fFrame = fp; // The active stack frame when the engine stopped. 4434 // Contains the capture group results that we need to 4435 // access later. 4436 return; 4437 } 4438 4439 4440 //-------------------------------------------------------------------------------- 4441 // 4442 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the 4443 // assumption that the entire string is available in the UText's 4444 // chunk buffer. For now, that means we can use int32_t indexes, 4445 // except for anything that needs to be saved (like group starts 4446 // and ends). 4447 // 4448 // startIdx: begin matching a this index. 4449 // toEnd: if true, match must extend to end of the input region 4450 // 4451 //-------------------------------------------------------------------------------- 4452 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { 4453 UBool isMatch = FALSE; // True if the we have a match. 4454 4455 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards 4456 4457 int32_t op; // Operation from the compiled pattern, split into 4458 int32_t opType; // the opcode 4459 int32_t opValue; // and the operand value. 4460 4461 #ifdef REGEX_RUN_DEBUG 4462 if (fTraceDebug) 4463 { 4464 printf("MatchAt(startIdx=%ld)\n", startIdx); 4465 printf("Original Pattern: "); 4466 UChar32 c = utext_next32From(fPattern->fPattern, 0); 4467 while (c != U_SENTINEL) { 4468 if (c<32 || c>256) { 4469 c = '.'; 4470 } 4471 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 4472 4473 c = UTEXT_NEXT32(fPattern->fPattern); 4474 } 4475 printf("\n"); 4476 printf("Input String: "); 4477 c = utext_next32From(fInputText, 0); 4478 while (c != U_SENTINEL) { 4479 if (c<32 || c>256) { 4480 c = '.'; 4481 } 4482 printf("%c", c); 4483 4484 c = UTEXT_NEXT32(fInputText); 4485 } 4486 printf("\n"); 4487 printf("\n"); 4488 } 4489 #endif 4490 4491 if (U_FAILURE(status)) { 4492 return; 4493 } 4494 4495 // Cache frequently referenced items from the compiled pattern 4496 // 4497 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 4498 4499 const UChar *litText = fPattern->fLiteralText.getBuffer(); 4500 UVector *sets = fPattern->fSets; 4501 4502 const UChar *inputBuf = fInputText->chunkContents; 4503 4504 fFrameSize = fPattern->fFrameSize; 4505 REStackFrame *fp = resetStack(); 4506 4507 fp->fPatIdx = 0; 4508 fp->fInputIdx = startIdx; 4509 4510 // Zero out the pattern's static data 4511 int32_t i; 4512 for (i = 0; i<fPattern->fDataSize; i++) { 4513 fData[i] = 0; 4514 } 4515 4516 // 4517 // Main loop for interpreting the compiled pattern. 4518 // One iteration of the loop per pattern operation performed. 4519 // 4520 for (;;) { 4521 #if 0 4522 if (_heapchk() != _HEAPOK) { 4523 fprintf(stderr, "Heap Trouble\n"); 4524 } 4525 #endif 4526 4527 op = (int32_t)pat[fp->fPatIdx]; 4528 opType = URX_TYPE(op); 4529 opValue = URX_VAL(op); 4530 #ifdef REGEX_RUN_DEBUG 4531 if (fTraceDebug) { 4532 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4533 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, 4534 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 4535 fPattern->dumpOp(fp->fPatIdx); 4536 } 4537 #endif 4538 fp->fPatIdx++; 4539 4540 switch (opType) { 4541 4542 4543 case URX_NOP: 4544 break; 4545 4546 4547 case URX_BACKTRACK: 4548 // Force a backtrack. In some circumstances, the pattern compiler 4549 // will notice that the pattern can't possibly match anything, and will 4550 // emit one of these at that point. 4551 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4552 break; 4553 4554 4555 case URX_ONECHAR: 4556 if (fp->fInputIdx < fActiveLimit) { 4557 UChar32 c; 4558 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4559 if (c == opValue) { 4560 break; 4561 } 4562 } else { 4563 fHitEnd = TRUE; 4564 } 4565 4566 #ifdef REGEX_SMART_BACKTRACKING 4567 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 4568 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4569 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4570 int64_t reverseIndex = fp->fInputIdx; 4571 UChar32 c; 4572 do { 4573 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 4574 if (c == opValue) { 4575 break; 4576 } 4577 } while (reverseIndex > backSearchIndex); 4578 if (c == opValue) { 4579 fHitEnd = FALSE; 4580 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4581 fp->fInputIdx = reverseIndex; 4582 if (fp->fInputIdx > backSearchIndex) { 4583 fp = StateSave(fp, fp->fPatIdx, status); 4584 } 4585 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4586 break; 4587 } 4588 } 4589 } 4590 #endif 4591 4592 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4593 break; 4594 4595 4596 case URX_STRING: 4597 { 4598 // Test input against a literal string. 4599 // Strings require two slots in the compiled pattern, one for the 4600 // offset to the string text, and one for the length. 4601 int32_t stringStartIdx = opValue; 4602 int32_t stringLen; 4603 4604 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 4605 fp->fPatIdx++; 4606 opType = URX_TYPE(op); 4607 stringLen = URX_VAL(op); 4608 U_ASSERT(opType == URX_STRING_LEN); 4609 U_ASSERT(stringLen >= 2); 4610 4611 if (fp->fInputIdx + stringLen > fActiveLimit) { 4612 // No match. String is longer than the remaining input text. 4613 fHitEnd = TRUE; // TODO: See ticket 6074 4614 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4615 break; 4616 } 4617 4618 const UChar * pInp = inputBuf + fp->fInputIdx; 4619 const UChar * pPat = litText+stringStartIdx; 4620 const UChar * pEnd = pInp + stringLen; 4621 UBool success = FALSE; 4622 for(;;) { 4623 if (*pInp == *pPat) { 4624 pInp++; 4625 pPat++; 4626 if (pInp == pEnd) { 4627 // Successful Match. 4628 success = TRUE; 4629 break; 4630 } 4631 } else { 4632 // Match failed. 4633 break; 4634 } 4635 } 4636 4637 if (success) { 4638 fp->fInputIdx += stringLen; 4639 } else { 4640 #ifdef REGEX_SMART_BACKTRACKING 4641 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 4642 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4643 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4644 // Reset to last start point 4645 int64_t reverseIndex = fp->fInputIdx; 4646 UChar32 c; 4647 pPat = litText+stringStartIdx; 4648 4649 // Search backwards for a possible start 4650 do { 4651 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 4652 if ((U_IS_BMP(c) && *pPat == c) || 4653 (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) { 4654 success = TRUE; 4655 break; 4656 } 4657 } while (reverseIndex > backSearchIndex); 4658 4659 // And try again 4660 if (success) { 4661 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4662 fp->fInputIdx = reverseIndex; 4663 if (fp->fInputIdx > backSearchIndex) { 4664 fp = StateSave(fp, fp->fPatIdx, status); 4665 } 4666 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4667 break; 4668 } 4669 } 4670 } 4671 #endif 4672 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4673 } 4674 } 4675 break; 4676 4677 4678 case URX_STATE_SAVE: 4679 fp = StateSave(fp, opValue, status); 4680 break; 4681 4682 4683 case URX_END: 4684 // The match loop will exit via this path on a successful match, 4685 // when we reach the end of the pattern. 4686 if (toEnd && fp->fInputIdx != fActiveLimit) { 4687 // The pattern matched, but not to the end of input. Try some more. 4688 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4689 break; 4690 } 4691 isMatch = TRUE; 4692 goto breakFromLoop; 4693 4694 // Start and End Capture stack frame variables are laid out out like this: 4695 // fp->fExtra[opValue] - The start of a completed capture group 4696 // opValue+1 - The end of a completed capture group 4697 // opValue+2 - the start of a capture group whose end 4698 // has not yet been reached (and might not ever be). 4699 case URX_START_CAPTURE: 4700 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4701 fp->fExtra[opValue+2] = fp->fInputIdx; 4702 break; 4703 4704 4705 case URX_END_CAPTURE: 4706 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4707 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 4708 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 4709 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 4710 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 4711 break; 4712 4713 4714 case URX_DOLLAR: // $, test for End of line 4715 // or for position before new line at end of input 4716 if (fp->fInputIdx < fAnchorLimit-2) { 4717 // We are no where near the end of input. Fail. 4718 // This is the common case. Keep it first. 4719 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4720 break; 4721 } 4722 if (fp->fInputIdx >= fAnchorLimit) { 4723 // We really are at the end of input. Success. 4724 fHitEnd = TRUE; 4725 fRequireEnd = TRUE; 4726 break; 4727 } 4728 4729 // If we are positioned just before a new-line that is located at the 4730 // end of input, succeed. 4731 if (fp->fInputIdx == fAnchorLimit-1) { 4732 UChar32 c; 4733 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 4734 4735 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 4736 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4737 // At new-line at end of input. Success 4738 fHitEnd = TRUE; 4739 fRequireEnd = TRUE; 4740 break; 4741 } 4742 } 4743 } else if (fp->fInputIdx == fAnchorLimit-2 && 4744 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { 4745 fHitEnd = TRUE; 4746 fRequireEnd = TRUE; 4747 break; // At CR/LF at end of input. Success 4748 } 4749 4750 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4751 4752 break; 4753 4754 4755 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 4756 if (fp->fInputIdx >= fAnchorLimit-1) { 4757 // Either at the last character of input, or off the end. 4758 if (fp->fInputIdx == fAnchorLimit-1) { 4759 // At last char of input. Success if it's a new line. 4760 if (inputBuf[fp->fInputIdx] == 0x0a) { 4761 fHitEnd = TRUE; 4762 fRequireEnd = TRUE; 4763 break; 4764 } 4765 } else { 4766 // Off the end of input. Success. 4767 fHitEnd = TRUE; 4768 fRequireEnd = TRUE; 4769 break; 4770 } 4771 } 4772 4773 // Not at end of input. Back-track out. 4774 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4775 break; 4776 4777 4778 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 4779 { 4780 if (fp->fInputIdx >= fAnchorLimit) { 4781 // We really are at the end of input. Success. 4782 fHitEnd = TRUE; 4783 fRequireEnd = TRUE; 4784 break; 4785 } 4786 // If we are positioned just before a new-line, succeed. 4787 // It makes no difference where the new-line is within the input. 4788 UChar32 c = inputBuf[fp->fInputIdx]; 4789 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 4790 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 4791 // In multi-line mode, hitting a new-line just before the end of input does not 4792 // set the hitEnd or requireEnd flags 4793 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4794 break; 4795 } 4796 } 4797 // not at a new line. Fail. 4798 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4799 } 4800 break; 4801 4802 4803 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 4804 { 4805 if (fp->fInputIdx >= fAnchorLimit) { 4806 // We really are at the end of input. Success. 4807 fHitEnd = TRUE; 4808 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 4809 break; // adding a new-line would not lose the match. 4810 } 4811 // If we are not positioned just before a new-line, the test fails; backtrack out. 4812 // It makes no difference where the new-line is within the input. 4813 if (inputBuf[fp->fInputIdx] != 0x0a) { 4814 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4815 } 4816 } 4817 break; 4818 4819 4820 case URX_CARET: // ^, test for start of line 4821 if (fp->fInputIdx != fAnchorStart) { 4822 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4823 } 4824 break; 4825 4826 4827 case URX_CARET_M: // ^, test for start of line in mulit-line mode 4828 { 4829 if (fp->fInputIdx == fAnchorStart) { 4830 // We are at the start input. Success. 4831 break; 4832 } 4833 // Check whether character just before the current pos is a new-line 4834 // unless we are at the end of input 4835 UChar c = inputBuf[fp->fInputIdx - 1]; 4836 if ((fp->fInputIdx < fAnchorLimit) && 4837 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 4838 // It's a new-line. ^ is true. Success. 4839 // TODO: what should be done with positions between a CR and LF? 4840 break; 4841 } 4842 // Not at the start of a line. Fail. 4843 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4844 } 4845 break; 4846 4847 4848 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 4849 { 4850 U_ASSERT(fp->fInputIdx >= fAnchorStart); 4851 if (fp->fInputIdx <= fAnchorStart) { 4852 // We are at the start input. Success. 4853 break; 4854 } 4855 // Check whether character just before the current pos is a new-line 4856 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 4857 UChar c = inputBuf[fp->fInputIdx - 1]; 4858 if (c != 0x0a) { 4859 // Not at the start of a line. Back-track out. 4860 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4861 } 4862 } 4863 break; 4864 4865 case URX_BACKSLASH_B: // Test for word boundaries 4866 { 4867 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); 4868 success ^= (opValue != 0); // flip sense for \B 4869 if (!success) { 4870 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4871 } 4872 } 4873 break; 4874 4875 4876 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 4877 { 4878 UBool success = isUWordBoundary(fp->fInputIdx); 4879 success ^= (opValue != 0); // flip sense for \B 4880 if (!success) { 4881 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4882 } 4883 } 4884 break; 4885 4886 4887 case URX_BACKSLASH_D: // Test for decimal digit 4888 { 4889 if (fp->fInputIdx >= fActiveLimit) { 4890 fHitEnd = TRUE; 4891 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4892 break; 4893 } 4894 4895 UChar32 c; 4896 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4897 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 4898 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 4899 success ^= (opValue != 0); // flip sense for \D 4900 if (!success) { 4901 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4902 } 4903 } 4904 break; 4905 4906 4907 case URX_BACKSLASH_G: // Test for position at end of previous match 4908 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 4909 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4910 } 4911 break; 4912 4913 4914 case URX_BACKSLASH_X: 4915 // Match a Grapheme, as defined by Unicode TR 29. 4916 // Differs slightly from Perl, which consumes combining marks independently 4917 // of context. 4918 { 4919 4920 // Fail if at end of input 4921 if (fp->fInputIdx >= fActiveLimit) { 4922 fHitEnd = TRUE; 4923 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4924 break; 4925 } 4926 4927 // Examine (and consume) the current char. 4928 // Dispatch into a little state machine, based on the char. 4929 UChar32 c; 4930 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4931 UnicodeSet **sets = fPattern->fStaticSets; 4932 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 4933 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 4934 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4935 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4936 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4937 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4938 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4939 goto GC_Extend; 4940 4941 4942 4943 GC_L: 4944 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4945 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4946 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4947 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4948 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4949 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4950 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4951 goto GC_Extend; 4952 4953 GC_V: 4954 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4955 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4956 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4957 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4958 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4959 goto GC_Extend; 4960 4961 GC_T: 4962 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4963 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4964 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4965 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4966 goto GC_Extend; 4967 4968 GC_Extend: 4969 // Combining characters are consumed here 4970 for (;;) { 4971 if (fp->fInputIdx >= fActiveLimit) { 4972 break; 4973 } 4974 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4975 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 4976 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 4977 break; 4978 } 4979 } 4980 goto GC_Done; 4981 4982 GC_Control: 4983 // Most control chars stand alone (don't combine with combining chars), 4984 // except for that CR/LF sequence is a single grapheme cluster. 4985 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { 4986 fp->fInputIdx++; 4987 } 4988 4989 GC_Done: 4990 if (fp->fInputIdx >= fActiveLimit) { 4991 fHitEnd = TRUE; 4992 } 4993 break; 4994 } 4995 4996 4997 4998 4999 case URX_BACKSLASH_Z: // Test for end of Input 5000 if (fp->fInputIdx < fAnchorLimit) { 5001 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5002 } else { 5003 fHitEnd = TRUE; 5004 fRequireEnd = TRUE; 5005 } 5006 break; 5007 5008 5009 5010 case URX_STATIC_SETREF: 5011 { 5012 // Test input character against one of the predefined sets 5013 // (Word Characters, for example) 5014 // The high bit of the op value is a flag for the match polarity. 5015 // 0: success if input char is in set. 5016 // 1: success if input char is not in set. 5017 if (fp->fInputIdx >= fActiveLimit) { 5018 fHitEnd = TRUE; 5019 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5020 break; 5021 } 5022 5023 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 5024 opValue &= ~URX_NEG_SET; 5025 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 5026 5027 UChar32 c; 5028 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5029 if (c < 256) { 5030 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 5031 if (s8->contains(c)) { 5032 success = !success; 5033 } 5034 } else { 5035 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5036 if (s->contains(c)) { 5037 success = !success; 5038 } 5039 } 5040 if (!success) { 5041 #ifdef REGEX_SMART_BACKTRACKING 5042 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5043 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5044 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5045 // Try to find it, backwards 5046 int64_t reverseIndex = fp->fInputIdx; 5047 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 5048 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset 5049 do { 5050 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5051 if (c < 256) { 5052 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 5053 if (s8->contains(c)) { 5054 success = !success; 5055 } 5056 } else { 5057 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5058 if (s->contains(c)) { 5059 success = !success; 5060 } 5061 } 5062 } while (reverseIndex > backSearchIndex && !success); 5063 5064 if (success) { 5065 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5066 fp->fInputIdx = reverseIndex; 5067 if (fp->fInputIdx > backSearchIndex) { 5068 fp = StateSave(fp, fp->fPatIdx, status); 5069 } 5070 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5071 break; 5072 } 5073 } 5074 } 5075 #endif 5076 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5077 } 5078 } 5079 break; 5080 5081 5082 case URX_STAT_SETREF_N: 5083 { 5084 // Test input character for NOT being a member of one of 5085 // the predefined sets (Word Characters, for example) 5086 if (fp->fInputIdx >= fActiveLimit) { 5087 fHitEnd = TRUE; 5088 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5089 break; 5090 } 5091 5092 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 5093 5094 UChar32 c; 5095 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5096 if (c < 256) { 5097 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 5098 if (s8->contains(c) == FALSE) { 5099 break; 5100 } 5101 } else { 5102 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5103 if (s->contains(c) == FALSE) { 5104 break; 5105 } 5106 } 5107 5108 #ifdef REGEX_SMART_BACKTRACKING 5109 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5110 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5111 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5112 // Try to find it, backwards 5113 int64_t reverseIndex = fp->fInputIdx; 5114 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 5115 UBool success = FALSE; 5116 do { 5117 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5118 if (c < 256) { 5119 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 5120 if (s8->contains(c) == FALSE) { 5121 success = TRUE; 5122 break; 5123 } 5124 } else { 5125 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5126 if (s->contains(c) == FALSE) { 5127 success = TRUE; 5128 break; 5129 } 5130 } 5131 } while (reverseIndex > backSearchIndex); 5132 5133 if (success) { 5134 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5135 fp->fInputIdx = reverseIndex; 5136 if (fp->fInputIdx > backSearchIndex) { 5137 fp = StateSave(fp, fp->fPatIdx, status); 5138 } 5139 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5140 break; 5141 } 5142 } 5143 } 5144 #endif 5145 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5146 } 5147 break; 5148 5149 5150 case URX_SETREF: 5151 { 5152 if (fp->fInputIdx >= fActiveLimit) { 5153 fHitEnd = TRUE; 5154 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5155 break; 5156 } 5157 5158 U_ASSERT(opValue > 0 && opValue < sets->size()); 5159 5160 // There is input left. Pick up one char and test it for set membership. 5161 UChar32 c; 5162 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5163 if (c<256) { 5164 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5165 if (s8->contains(c)) { 5166 // The character is in the set. A Match. 5167 break; 5168 } 5169 } else { 5170 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5171 if (s->contains(c)) { 5172 // The character is in the set. A Match. 5173 break; 5174 } 5175 } 5176 5177 // the character wasn't in the set. 5178 #ifdef REGEX_SMART_BACKTRACKING 5179 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5180 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5181 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5182 // Try to find it, backwards 5183 int64_t reverseIndex = fp->fInputIdx; 5184 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 5185 UBool success = FALSE; 5186 do { 5187 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5188 if (c < 256) { 5189 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5190 if (s8->contains(c)) { 5191 success = TRUE; 5192 break; 5193 } 5194 } else { 5195 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5196 if (s->contains(c)) { 5197 success = TRUE; 5198 break; 5199 } 5200 } 5201 } while (reverseIndex > backSearchIndex); 5202 5203 if (success) { 5204 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5205 fp->fInputIdx = reverseIndex; 5206 if (fp->fInputIdx > reverseIndex) { 5207 fp = StateSave(fp, fp->fPatIdx, status); 5208 } 5209 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5210 break; 5211 } 5212 } 5213 } 5214 #endif 5215 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5216 } 5217 break; 5218 5219 5220 case URX_DOTANY: 5221 { 5222 // . matches anything, but stops at end-of-line. 5223 if (fp->fInputIdx >= fActiveLimit) { 5224 // At end of input. Match failed. Backtrack out. 5225 fHitEnd = TRUE; 5226 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5227 break; 5228 } 5229 5230 // There is input left. Advance over one char, unless we've hit end-of-line 5231 UChar32 c; 5232 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5233 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 5234 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 5235 // End of line in normal mode. . does not match. 5236 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5237 break; 5238 } 5239 } 5240 break; 5241 5242 5243 case URX_DOTANY_ALL: 5244 { 5245 // . in dot-matches-all (including new lines) mode 5246 if (fp->fInputIdx >= fActiveLimit) { 5247 // At end of input. Match failed. Backtrack out. 5248 fHitEnd = TRUE; 5249 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5250 break; 5251 } 5252 5253 // There is input left. Advance over one char, except if we are 5254 // at a cr/lf, advance over both of them. 5255 UChar32 c; 5256 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5257 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 5258 // In the case of a CR/LF, we need to advance over both. 5259 if (inputBuf[fp->fInputIdx] == 0x0a) { 5260 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 5261 } 5262 } 5263 } 5264 break; 5265 5266 5267 case URX_DOTANY_UNIX: 5268 { 5269 // '.' operator, matches all, but stops at end-of-line. 5270 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 5271 if (fp->fInputIdx >= fActiveLimit) { 5272 // At end of input. Match failed. Backtrack out. 5273 fHitEnd = TRUE; 5274 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5275 break; 5276 } 5277 5278 // There is input left. Advance over one char, unless we've hit end-of-line 5279 UChar32 c; 5280 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5281 if (c == 0x0a) { 5282 // End of line in normal mode. '.' does not match the \n 5283 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5284 } 5285 } 5286 break; 5287 5288 5289 case URX_JMP: 5290 fp->fPatIdx = opValue; 5291 break; 5292 5293 case URX_FAIL: 5294 isMatch = FALSE; 5295 goto breakFromLoop; 5296 5297 case URX_JMP_SAV: 5298 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 5299 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5300 fp->fPatIdx = opValue; // Then JMP. 5301 break; 5302 5303 case URX_JMP_SAV_X: 5304 // This opcode is used with (x)+, when x can match a zero length string. 5305 // Same as JMP_SAV, except conditional on the match having made forward progress. 5306 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 5307 // data address of the input position at the start of the loop. 5308 { 5309 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 5310 int32_t stoOp = (int32_t)pat[opValue-1]; 5311 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 5312 int32_t frameLoc = URX_VAL(stoOp); 5313 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 5314 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; 5315 U_ASSERT(prevInputIdx <= fp->fInputIdx); 5316 if (prevInputIdx < fp->fInputIdx) { 5317 // The match did make progress. Repeat the loop. 5318 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5319 fp->fPatIdx = opValue; 5320 fp->fExtra[frameLoc] = fp->fInputIdx; 5321 } 5322 // If the input position did not advance, we do nothing here, 5323 // execution will fall out of the loop. 5324 } 5325 break; 5326 5327 case URX_CTR_INIT: 5328 { 5329 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5330 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5331 5332 // Pick up the three extra operands that CTR_INIT has, and 5333 // skip the pattern location counter past 5334 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5335 fp->fPatIdx += 3; 5336 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5337 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5338 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5339 U_ASSERT(minCount>=0); 5340 U_ASSERT(maxCount>=minCount || maxCount==-1); 5341 U_ASSERT(loopLoc>fp->fPatIdx); 5342 5343 if (minCount == 0) { 5344 fp = StateSave(fp, loopLoc+1, status); 5345 } 5346 if (maxCount == 0) { 5347 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5348 } 5349 } 5350 break; 5351 5352 case URX_CTR_LOOP: 5353 { 5354 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5355 int32_t initOp = (int32_t)pat[opValue]; 5356 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 5357 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5358 int32_t minCount = (int32_t)pat[opValue+2]; 5359 int32_t maxCount = (int32_t)pat[opValue+3]; 5360 // Increment the counter. Note: we DIDN'T worry about counter 5361 // overflow, since the data comes from UnicodeStrings, which 5362 // stores its length in an int32_t. Do we have to think about 5363 // this now that we're using UText? Probably not, since the length 5364 // in UChar32s is still an int32_t. 5365 (*pCounter)++; 5366 U_ASSERT(*pCounter > 0); 5367 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 5368 U_ASSERT(*pCounter == maxCount || maxCount == -1); 5369 break; 5370 } 5371 if (*pCounter >= minCount) { 5372 fp = StateSave(fp, fp->fPatIdx, status); 5373 } 5374 fp->fPatIdx = opValue + 4; // Loop back. 5375 } 5376 break; 5377 5378 case URX_CTR_INIT_NG: 5379 { 5380 // Initialize a non-greedy loop 5381 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5382 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5383 5384 // Pick up the three extra operands that CTR_INIT has, and 5385 // skip the pattern location counter past 5386 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5387 fp->fPatIdx += 3; 5388 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5389 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5390 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5391 U_ASSERT(minCount>=0); 5392 U_ASSERT(maxCount>=minCount || maxCount==-1); 5393 U_ASSERT(loopLoc>fp->fPatIdx); 5394 5395 if (minCount == 0) { 5396 if (maxCount != 0) { 5397 fp = StateSave(fp, fp->fPatIdx, status); 5398 } 5399 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 5400 } 5401 } 5402 break; 5403 5404 case URX_CTR_LOOP_NG: 5405 { 5406 // Non-greedy {min, max} loops 5407 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5408 int32_t initOp = (int32_t)pat[opValue]; 5409 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 5410 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5411 int32_t minCount = (int32_t)pat[opValue+2]; 5412 int32_t maxCount = (int32_t)pat[opValue+3]; 5413 // Increment the counter. Note: we DIDN'T worry about counter 5414 // overflow, since the data comes from UnicodeStrings, which 5415 // stores its length in an int32_t. Do we have to think about 5416 // this now that we're using UText? Probably not, since the length 5417 // in UChar32s is still an int32_t. 5418 (*pCounter)++; 5419 U_ASSERT(*pCounter > 0); 5420 5421 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 5422 // The loop has matched the maximum permitted number of times. 5423 // Break out of here with no action. Matching will 5424 // continue with the following pattern. 5425 U_ASSERT(*pCounter == maxCount || maxCount == -1); 5426 break; 5427 } 5428 5429 if (*pCounter < minCount) { 5430 // We haven't met the minimum number of matches yet. 5431 // Loop back for another one. 5432 fp->fPatIdx = opValue + 4; // Loop back. 5433 } else { 5434 // We do have the minimum number of matches. 5435 // Fall into the following pattern, but first do 5436 // a state save to the top of the loop, so that a failure 5437 // in the following pattern will try another iteration of the loop. 5438 fp = StateSave(fp, opValue + 4, status); 5439 } 5440 } 5441 break; 5442 5443 case URX_STO_SP: 5444 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5445 fData[opValue] = fStack->size(); 5446 break; 5447 5448 case URX_LD_SP: 5449 { 5450 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5451 int32_t newStackSize = (int32_t)fData[opValue]; 5452 U_ASSERT(newStackSize <= fStack->size()); 5453 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5454 if (newFP == (int64_t *)fp) { 5455 break; 5456 } 5457 int32_t i; 5458 for (i=0; i<fFrameSize; i++) { 5459 newFP[i] = ((int64_t *)fp)[i]; 5460 } 5461 fp = (REStackFrame *)newFP; 5462 fStack->setSize(newStackSize); 5463 } 5464 break; 5465 5466 case URX_BACKREF: 5467 case URX_BACKREF_I: 5468 { 5469 U_ASSERT(opValue < fFrameSize); 5470 int64_t groupStartIdx = fp->fExtra[opValue]; 5471 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5472 U_ASSERT(groupStartIdx <= groupEndIdx); 5473 int64_t len = groupEndIdx-groupStartIdx; 5474 if (groupStartIdx < 0) { 5475 // This capture group has not participated in the match thus far, 5476 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5477 } 5478 5479 if (len == 0) { 5480 // The capture group match was of an empty string. 5481 // Verified by testing: Perl matches succeed in this case, so 5482 // we do too. 5483 break; 5484 } 5485 5486 UBool haveMatch = FALSE; 5487 if (fp->fInputIdx + len <= fActiveLimit) { 5488 if (opType == URX_BACKREF) { 5489 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) { 5490 haveMatch = TRUE; 5491 } 5492 } else { 5493 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, 5494 (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) { 5495 haveMatch = TRUE; 5496 } 5497 } 5498 } else { 5499 // TODO: probably need to do a partial string comparison, and only 5500 // set HitEnd if the available input matched. Ticket #6074 5501 fHitEnd = TRUE; 5502 } 5503 if (haveMatch) { 5504 fp->fInputIdx += len; // Match. Advance current input position. 5505 } else { 5506 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5507 } 5508 } 5509 break; 5510 5511 case URX_STO_INP_LOC: 5512 { 5513 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 5514 fp->fExtra[opValue] = fp->fInputIdx; 5515 } 5516 break; 5517 5518 case URX_JMPX: 5519 { 5520 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5521 fp->fPatIdx += 1; 5522 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 5523 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 5524 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; 5525 U_ASSERT(savedInputIdx <= fp->fInputIdx); 5526 if (savedInputIdx < fp->fInputIdx) { 5527 fp->fPatIdx = opValue; // JMP 5528 } else { 5529 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 5530 } 5531 } 5532 break; 5533 5534 case URX_LA_START: 5535 { 5536 // Entering a lookahead block. 5537 // Save Stack Ptr, Input Pos. 5538 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5539 fData[opValue] = fStack->size(); 5540 fData[opValue+1] = fp->fInputIdx; 5541 fActiveStart = fLookStart; // Set the match region change for 5542 fActiveLimit = fLookLimit; // transparent bounds. 5543 } 5544 break; 5545 5546 case URX_LA_END: 5547 { 5548 // Leaving a look-ahead block. 5549 // restore Stack Ptr, Input Pos to positions they had on entry to block. 5550 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5551 int32_t stackSize = fStack->size(); 5552 int32_t newStackSize = (int32_t)fData[opValue]; 5553 U_ASSERT(stackSize >= newStackSize); 5554 if (stackSize > newStackSize) { 5555 // Copy the current top frame back to the new (cut back) top frame. 5556 // This makes the capture groups from within the look-ahead 5557 // expression available. 5558 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5559 int32_t i; 5560 for (i=0; i<fFrameSize; i++) { 5561 newFP[i] = ((int64_t *)fp)[i]; 5562 } 5563 fp = (REStackFrame *)newFP; 5564 fStack->setSize(newStackSize); 5565 } 5566 fp->fInputIdx = fData[opValue+1]; 5567 5568 // Restore the active region bounds in the input string; they may have 5569 // been changed because of transparent bounds on a Region. 5570 fActiveStart = fRegionStart; 5571 fActiveLimit = fRegionLimit; 5572 } 5573 break; 5574 5575 case URX_ONECHAR_I: 5576 if (fp->fInputIdx < fActiveLimit) { 5577 UChar32 c; 5578 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5579 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5580 break; 5581 } 5582 } else { 5583 fHitEnd = TRUE; 5584 } 5585 5586 #ifdef REGEX_SMART_BACKTRACKING 5587 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5588 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5589 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5590 UBool success = FALSE; 5591 int64_t reverseIndex = fp->fInputIdx; 5592 UChar32 c; 5593 while (reverseIndex > backSearchIndex) { 5594 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5595 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5596 success = TRUE; 5597 break; 5598 } else if (c == U_SENTINEL) { 5599 break; 5600 } 5601 } 5602 if (success) { 5603 fHitEnd = FALSE; 5604 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5605 fp->fInputIdx = reverseIndex; 5606 if (fp->fInputIdx > backSearchIndex) { 5607 fp = StateSave(fp, fp->fPatIdx, status); 5608 } 5609 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5610 break; 5611 } 5612 } 5613 } 5614 #endif 5615 5616 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5617 break; 5618 5619 case URX_STRING_I: 5620 { 5621 // Test input against a literal string. 5622 // Strings require two slots in the compiled pattern, one for the 5623 // offset to the string text, and one for the length. 5624 const UCaseProps *csp = ucase_getSingleton(); 5625 { 5626 int32_t stringStartIdx, stringLen; 5627 stringStartIdx = opValue; 5628 5629 op = (int32_t)pat[fp->fPatIdx]; 5630 fp->fPatIdx++; 5631 opType = URX_TYPE(op); 5632 opValue = URX_VAL(op); 5633 U_ASSERT(opType == URX_STRING_LEN); 5634 stringLen = opValue; 5635 5636 const UChar *patternChars = litText+stringStartIdx; 5637 const UChar *patternEnd = patternChars+stringLen; 5638 5639 const UChar *foldChars = NULL; 5640 int32_t foldOffset, foldLength; 5641 UChar32 c; 5642 UBool c_is_valid = FALSE; 5643 5644 #ifdef REGEX_SMART_BACKTRACKING 5645 int32_t originalInputIdx = fp->fInputIdx; 5646 #endif 5647 UBool success = TRUE; 5648 5649 foldOffset = foldLength = 0; 5650 5651 while (patternChars < patternEnd && success) { 5652 if (fp->fInputIdx < fActiveLimit) { // don't read past end of string 5653 if(foldOffset < foldLength) { 5654 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5655 c_is_valid = TRUE; 5656 } else { 5657 // test pre-condition of U16_NEXT: i < length 5658 U_ASSERT(fp->fInputIdx < fActiveLimit); 5659 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5660 c_is_valid = TRUE; 5661 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 5662 if(foldLength >= 0) { 5663 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 5664 foldOffset = 0; 5665 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5666 } else { 5667 c = foldLength; 5668 foldLength = foldOffset; // to avoid reading chars from the folding buffer 5669 } 5670 } 5671 } 5672 } else { 5673 c_is_valid = FALSE; 5674 } 5675 5676 if (fp->fInputIdx <= fActiveLimit && c_is_valid) { 5677 if (U_IS_BMP(c)) { 5678 success = (*patternChars == c); 5679 patternChars += 1; 5680 } else if (patternChars+1 < patternEnd) { 5681 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 5682 patternChars += 2; 5683 } 5684 } else { 5685 success = FALSE; 5686 fHitEnd = TRUE; // TODO: See ticket 6074 5687 } 5688 } 5689 5690 if (!success) { 5691 #ifdef REGEX_SMART_BACKTRACKING 5692 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 5693 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5694 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5695 // Reset to last start point 5696 int64_t reverseIndex = originalInputIdx; 5697 patternChars = litText+stringStartIdx; 5698 5699 // Search backwards for a possible start 5700 do { 5701 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5702 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 5703 if(foldLength >= 0) { 5704 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 5705 foldOffset = 0; 5706 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5707 } else { 5708 c = foldLength; 5709 foldLength = foldOffset; // to avoid reading chars from the folding buffer 5710 } 5711 } 5712 5713 if ((U_IS_BMP(c) && *patternChars == c) || 5714 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 5715 success = TRUE; 5716 break; 5717 } 5718 } while (reverseIndex > backSearchIndex); 5719 5720 // And try again 5721 if (success) { 5722 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5723 fp->fInputIdx = reverseIndex; 5724 if (fp->fInputIdx > backSearchIndex) { 5725 fp = StateSave(fp, fp->fPatIdx, status); 5726 } 5727 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5728 break; 5729 } 5730 } 5731 } 5732 #endif 5733 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5734 } 5735 } 5736 } 5737 break; 5738 5739 case URX_LB_START: 5740 { 5741 // Entering a look-behind block. 5742 // Save Stack Ptr, Input Pos. 5743 // TODO: implement transparent bounds. Ticket #6067 5744 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5745 fData[opValue] = fStack->size(); 5746 fData[opValue+1] = fp->fInputIdx; 5747 // Init the variable containing the start index for attempted matches. 5748 fData[opValue+2] = -1; 5749 // Save input string length, then reset to pin any matches to end at 5750 // the current position. 5751 fData[opValue+3] = fActiveLimit; 5752 fActiveLimit = fp->fInputIdx; 5753 } 5754 break; 5755 5756 5757 case URX_LB_CONT: 5758 { 5759 // Positive Look-Behind, at top of loop checking for matches of LB expression 5760 // at all possible input starting positions. 5761 5762 // Fetch the min and max possible match lengths. They are the operands 5763 // of this op in the pattern. 5764 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5765 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5766 U_ASSERT(minML <= maxML); 5767 U_ASSERT(minML >= 0); 5768 5769 // Fetch (from data) the last input index where a match was attempted. 5770 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5771 int64_t *lbStartIdx = &fData[opValue+2]; 5772 if (*lbStartIdx < 0) { 5773 // First time through loop. 5774 *lbStartIdx = fp->fInputIdx - minML; 5775 } else { 5776 // 2nd through nth time through the loop. 5777 // Back up start position for match by one. 5778 if (*lbStartIdx == 0) { 5779 (*lbStartIdx)--; 5780 } else { 5781 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5782 } 5783 } 5784 5785 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5786 // We have tried all potential match starting points without 5787 // getting a match. Backtrack out, and out of the 5788 // Look Behind altogether. 5789 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5790 int64_t restoreInputLen = fData[opValue+3]; 5791 U_ASSERT(restoreInputLen >= fActiveLimit); 5792 U_ASSERT(restoreInputLen <= fInputLength); 5793 fActiveLimit = restoreInputLen; 5794 break; 5795 } 5796 5797 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5798 // (successful match will fall off the end of the loop.) 5799 fp = StateSave(fp, fp->fPatIdx-3, status); 5800 fp->fInputIdx = *lbStartIdx; 5801 } 5802 break; 5803 5804 case URX_LB_END: 5805 // End of a look-behind block, after a successful match. 5806 { 5807 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5808 if (fp->fInputIdx != fActiveLimit) { 5809 // The look-behind expression matched, but the match did not 5810 // extend all the way to the point that we are looking behind from. 5811 // FAIL out of here, which will take us back to the LB_CONT, which 5812 // will retry the match starting at another position or fail 5813 // the look-behind altogether, whichever is appropriate. 5814 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5815 break; 5816 } 5817 5818 // Look-behind match is good. Restore the orignal input string length, 5819 // which had been truncated to pin the end of the lookbehind match to the 5820 // position being looked-behind. 5821 int64_t originalInputLen = fData[opValue+3]; 5822 U_ASSERT(originalInputLen >= fActiveLimit); 5823 U_ASSERT(originalInputLen <= fInputLength); 5824 fActiveLimit = originalInputLen; 5825 } 5826 break; 5827 5828 5829 case URX_LBN_CONT: 5830 { 5831 // Negative Look-Behind, at top of loop checking for matches of LB expression 5832 // at all possible input starting positions. 5833 5834 // Fetch the extra parameters of this op. 5835 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5836 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5837 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 5838 continueLoc = URX_VAL(continueLoc); 5839 U_ASSERT(minML <= maxML); 5840 U_ASSERT(minML >= 0); 5841 U_ASSERT(continueLoc > fp->fPatIdx); 5842 5843 // Fetch (from data) the last input index where a match was attempted. 5844 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5845 int64_t *lbStartIdx = &fData[opValue+2]; 5846 if (*lbStartIdx < 0) { 5847 // First time through loop. 5848 *lbStartIdx = fp->fInputIdx - minML; 5849 } else { 5850 // 2nd through nth time through the loop. 5851 // Back up start position for match by one. 5852 if (*lbStartIdx == 0) { 5853 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. 5854 } else { 5855 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5856 } 5857 } 5858 5859 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5860 // We have tried all potential match starting points without 5861 // getting a match, which means that the negative lookbehind as 5862 // a whole has succeeded. Jump forward to the continue location 5863 int64_t restoreInputLen = fData[opValue+3]; 5864 U_ASSERT(restoreInputLen >= fActiveLimit); 5865 U_ASSERT(restoreInputLen <= fInputLength); 5866 fActiveLimit = restoreInputLen; 5867 fp->fPatIdx = continueLoc; 5868 break; 5869 } 5870 5871 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5872 // (successful match will cause a FAIL out of the loop altogether.) 5873 fp = StateSave(fp, fp->fPatIdx-4, status); 5874 fp->fInputIdx = *lbStartIdx; 5875 } 5876 break; 5877 5878 case URX_LBN_END: 5879 // End of a negative look-behind block, after a successful match. 5880 { 5881 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5882 if (fp->fInputIdx != fActiveLimit) { 5883 // The look-behind expression matched, but the match did not 5884 // extend all the way to the point that we are looking behind from. 5885 // FAIL out of here, which will take us back to the LB_CONT, which 5886 // will retry the match starting at another position or succeed 5887 // the look-behind altogether, whichever is appropriate. 5888 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5889 break; 5890 } 5891 5892 // Look-behind expression matched, which means look-behind test as 5893 // a whole Fails 5894 5895 // Restore the orignal input string length, which had been truncated 5896 // inorder to pin the end of the lookbehind match 5897 // to the position being looked-behind. 5898 int64_t originalInputLen = fData[opValue+3]; 5899 U_ASSERT(originalInputLen >= fActiveLimit); 5900 U_ASSERT(originalInputLen <= fInputLength); 5901 fActiveLimit = originalInputLen; 5902 5903 // Restore original stack position, discarding any state saved 5904 // by the successful pattern match. 5905 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5906 int32_t newStackSize = (int32_t)fData[opValue]; 5907 U_ASSERT(fStack->size() > newStackSize); 5908 fStack->setSize(newStackSize); 5909 5910 // FAIL, which will take control back to someplace 5911 // prior to entering the look-behind test. 5912 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5913 } 5914 break; 5915 5916 5917 case URX_LOOP_SR_I: 5918 // Loop Initialization for the optimized implementation of 5919 // [some character set]* 5920 // This op scans through all matching input. 5921 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5922 { 5923 U_ASSERT(opValue > 0 && opValue < sets->size()); 5924 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5925 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5926 5927 // Loop through input, until either the input is exhausted or 5928 // we reach a character that is not a member of the set. 5929 int32_t ix = (int32_t)fp->fInputIdx; 5930 for (;;) { 5931 if (ix >= fActiveLimit) { 5932 fHitEnd = TRUE; 5933 break; 5934 } 5935 UChar32 c; 5936 U16_NEXT(inputBuf, ix, fActiveLimit, c); 5937 if (c<256) { 5938 if (s8->contains(c) == FALSE) { 5939 U16_BACK_1(inputBuf, 0, ix); 5940 break; 5941 } 5942 } else { 5943 if (s->contains(c) == FALSE) { 5944 U16_BACK_1(inputBuf, 0, ix); 5945 break; 5946 } 5947 } 5948 } 5949 5950 // If there were no matching characters, skip over the loop altogether. 5951 // The loop doesn't run at all, a * op always succeeds. 5952 if (ix == fp->fInputIdx) { 5953 fp->fPatIdx++; // skip the URX_LOOP_C op. 5954 break; 5955 } 5956 5957 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5958 // must follow. It's operand is the stack location 5959 // that holds the starting input index for the match of this [set]* 5960 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5961 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5962 int32_t stackLoc = URX_VAL(loopcOp); 5963 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5964 fp->fExtra[stackLoc] = fp->fInputIdx; 5965 #ifdef REGEX_SMART_BACKTRACKING 5966 backSearchIndex = fp->fInputIdx; 5967 #endif 5968 fp->fInputIdx = ix; 5969 5970 // Save State to the URX_LOOP_C op that follows this one, 5971 // so that match failures in the following code will return to there. 5972 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5973 fp = StateSave(fp, fp->fPatIdx, status); 5974 fp->fPatIdx++; 5975 } 5976 break; 5977 5978 5979 case URX_LOOP_DOT_I: 5980 // Loop Initialization for the optimized implementation of .* 5981 // This op scans through all remaining input. 5982 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5983 { 5984 // Loop through input until the input is exhausted (we reach an end-of-line) 5985 // In DOTALL mode, we can just go straight to the end of the input. 5986 int32_t ix; 5987 if ((opValue & 1) == 1) { 5988 // Dot-matches-All mode. Jump straight to the end of the string. 5989 ix = (int32_t)fActiveLimit; 5990 fHitEnd = TRUE; 5991 } else { 5992 // NOT DOT ALL mode. Line endings do not match '.' 5993 // Scan forward until a line ending or end of input. 5994 ix = (int32_t)fp->fInputIdx; 5995 for (;;) { 5996 if (ix >= fActiveLimit) { 5997 fHitEnd = TRUE; 5998 break; 5999 } 6000 UChar32 c; 6001 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] 6002 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 6003 if ((c == 0x0a) || // 0x0a is newline in both modes. 6004 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 6005 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { 6006 // char is a line ending. Put the input pos back to the 6007 // line ending char, and exit the scanning loop. 6008 U16_BACK_1(inputBuf, 0, ix); 6009 break; 6010 } 6011 } 6012 } 6013 } 6014 6015 // If there were no matching characters, skip over the loop altogether. 6016 // The loop doesn't run at all, a * op always succeeds. 6017 if (ix == fp->fInputIdx) { 6018 fp->fPatIdx++; // skip the URX_LOOP_C op. 6019 break; 6020 } 6021 6022 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 6023 // must follow. It's operand is the stack location 6024 // that holds the starting input index for the match of this .* 6025 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 6026 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 6027 int32_t stackLoc = URX_VAL(loopcOp); 6028 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 6029 fp->fExtra[stackLoc] = fp->fInputIdx; 6030 #ifdef REGEX_SMART_BACKTRACKING 6031 backSearchIndex = fp->fInputIdx; 6032 #endif 6033 fp->fInputIdx = ix; 6034 6035 // Save State to the URX_LOOP_C op that follows this one, 6036 // so that match failures in the following code will return to there. 6037 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 6038 fp = StateSave(fp, fp->fPatIdx, status); 6039 fp->fPatIdx++; 6040 } 6041 break; 6042 6043 6044 case URX_LOOP_C: 6045 { 6046 U_ASSERT(opValue>=0 && opValue<fFrameSize); 6047 backSearchIndex = (int32_t)fp->fExtra[opValue]; 6048 U_ASSERT(backSearchIndex <= fp->fInputIdx); 6049 if (backSearchIndex == fp->fInputIdx) { 6050 // We've backed up the input idx to the point that the loop started. 6051 // The loop is done. Leave here without saving state. 6052 // Subsequent failures won't come back here. 6053 break; 6054 } 6055 // Set up for the next iteration of the loop, with input index 6056 // backed up by one from the last time through, 6057 // and a state save to this instruction in case the following code fails again. 6058 // (We're going backwards because this loop emulates stack unwinding, not 6059 // the initial scan forward.) 6060 U_ASSERT(fp->fInputIdx > 0); 6061 UChar32 prevC; 6062 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? 6063 6064 if (prevC == 0x0a && 6065 fp->fInputIdx > backSearchIndex && 6066 inputBuf[fp->fInputIdx-1] == 0x0d) { 6067 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 6068 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 6069 // .*, stepping back over CRLF pair. 6070 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 6071 } 6072 } 6073 6074 6075 fp = StateSave(fp, fp->fPatIdx-1, status); 6076 } 6077 break; 6078 6079 6080 6081 default: 6082 // Trouble. The compiled pattern contains an entry with an 6083 // unrecognized type tag. 6084 U_ASSERT(FALSE); 6085 } 6086 6087 if (U_FAILURE(status)) { 6088 isMatch = FALSE; 6089 break; 6090 } 6091 } 6092 6093 breakFromLoop: 6094 fMatch = isMatch; 6095 if (isMatch) { 6096 fLastMatchEnd = fMatchEnd; 6097 fMatchStart = startIdx; 6098 fMatchEnd = fp->fInputIdx; 6099 if (fTraceDebug) { 6100 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); 6101 } 6102 } 6103 else 6104 { 6105 if (fTraceDebug) { 6106 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 6107 } 6108 } 6109 6110 fFrame = fp; // The active stack frame when the engine stopped. 6111 // Contains the capture group results that we need to 6112 // access later. 6113 6114 return; 6115 } 6116 6117 6118 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 6119 6120 U_NAMESPACE_END 6121 6122 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 6123