1 /* 2 ************************************************************************** 3 * Copyright (C) 2002-2010 International Business Machines Corporation * 4 * and others. All rights reserved. * 5 ************************************************************************** 6 */ 7 // 8 // file: rematch.cpp 9 // 10 // Contains the implementation of class RegexMatcher, 11 // which is one of the main API classes for the ICU regular expression package. 12 // 13 14 #include "unicode/utypes.h" 15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 16 17 #include "unicode/regex.h" 18 #include "unicode/uniset.h" 19 #include "unicode/uchar.h" 20 #include "unicode/ustring.h" 21 #include "unicode/rbbi.h" 22 #include "uassert.h" 23 #include "cmemory.h" 24 #include "uvector.h" 25 #include "uvectr32.h" 26 #include "uvectr64.h" 27 #include "regeximp.h" 28 #include "regexst.h" 29 #include "regextxt.h" 30 #include "ucase.h" 31 32 // #include <malloc.h> // Needed for heapcheck testing 33 34 35 // Find progress callback 36 // ---------------------- 37 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call. 38 // 39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \ 40 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE) 41 42 43 // Smart Backtracking 44 // ------------------ 45 // When a failure would go back to a LOOP_C instruction, 46 // strings, characters, and setrefs scan backwards for a valid start 47 // character themselves, pop the stack, and save state, emulating the 48 // LOOP_C's effect but assured that the next character of input is a 49 // possible matching character. 50 // 51 // Good idea in theory; unfortunately it only helps out a few specific 52 // cases and slows the engine down a little in the rest. 53 54 //#define REGEX_SMART_BACKTRACKING 1 55 56 U_NAMESPACE_BEGIN 57 58 // Default limit for the size of the back track stack, to avoid system 59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. 60 // This value puts ICU's limits higher than most other regexp implementations, 61 // which use recursion rather than the heap, and take more storage per 62 // backtrack point. 63 // 64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 65 66 // Time limit counter constant. 67 // Time limits for expression evaluation are in terms of quanta of work by 68 // the engine, each of which is 10,000 state saves. 69 // This constant determines that state saves per tick number. 70 static const int32_t TIMER_INITIAL_VALUE = 10000; 71 72 //----------------------------------------------------------------------------- 73 // 74 // Constructor and Destructor 75 // 76 //----------------------------------------------------------------------------- 77 RegexMatcher::RegexMatcher(const RegexPattern *pat) { 78 fDeferredStatus = U_ZERO_ERROR; 79 init(fDeferredStatus); 80 if (U_FAILURE(fDeferredStatus)) { 81 return; 82 } 83 if (pat==NULL) { 84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 85 return; 86 } 87 fPattern = pat; 88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); 89 } 90 91 92 93 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 94 uint32_t flags, UErrorCode &status) { 95 init(status); 96 if (U_FAILURE(status)) { 97 return; 98 } 99 UParseError pe; 100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 101 fPattern = fPatternOwned; 102 103 UText inputText = UTEXT_INITIALIZER; 104 utext_openConstUnicodeString(&inputText, &input, &status); 105 init2(&inputText, status); 106 utext_close(&inputText); 107 108 fInputUniStrMaybeMutable = TRUE; 109 } 110 111 112 RegexMatcher::RegexMatcher(UText *regexp, UText *input, 113 uint32_t flags, UErrorCode &status) { 114 init(status); 115 if (U_FAILURE(status)) { 116 return; 117 } 118 UParseError pe; 119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 120 if (U_FAILURE(status)) { 121 return; 122 } 123 124 fPattern = fPatternOwned; 125 init2(input, status); 126 } 127 128 129 RegexMatcher::RegexMatcher(const UnicodeString ®exp, 130 uint32_t flags, UErrorCode &status) { 131 init(status); 132 if (U_FAILURE(status)) { 133 return; 134 } 135 UParseError pe; 136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 137 if (U_FAILURE(status)) { 138 return; 139 } 140 fPattern = fPatternOwned; 141 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 142 } 143 144 RegexMatcher::RegexMatcher(UText *regexp, 145 uint32_t flags, UErrorCode &status) { 146 init(status); 147 if (U_FAILURE(status)) { 148 return; 149 } 150 UParseError pe; 151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 152 if (U_FAILURE(status)) { 153 return; 154 } 155 156 fPattern = fPatternOwned; 157 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 158 } 159 160 161 162 163 RegexMatcher::~RegexMatcher() { 164 delete fStack; 165 if (fData != fSmallData) { 166 uprv_free(fData); 167 fData = NULL; 168 } 169 if (fPatternOwned) { 170 delete fPatternOwned; 171 fPatternOwned = NULL; 172 fPattern = NULL; 173 } 174 175 if (fInput) { 176 delete fInput; 177 } 178 if (fInputText) { 179 utext_close(fInputText); 180 } 181 if (fAltInputText) { 182 utext_close(fAltInputText); 183 } 184 185 #if UCONFIG_NO_BREAK_ITERATION==0 186 delete fWordBreakItr; 187 #endif 188 } 189 190 // 191 // init() common initialization for use by all constructors. 192 // Initialize all fields, get the object into a consistent state. 193 // This must be done even when the initial status shows an error, 194 // so that the object is initialized sufficiently well for the destructor 195 // to run safely. 196 // 197 void RegexMatcher::init(UErrorCode &status) { 198 fPattern = NULL; 199 fPatternOwned = NULL; 200 fFrameSize = 0; 201 fRegionStart = 0; 202 fRegionLimit = 0; 203 fAnchorStart = 0; 204 fAnchorLimit = 0; 205 fLookStart = 0; 206 fLookLimit = 0; 207 fActiveStart = 0; 208 fActiveLimit = 0; 209 fTransparentBounds = FALSE; 210 fAnchoringBounds = TRUE; 211 fMatch = FALSE; 212 fMatchStart = 0; 213 fMatchEnd = 0; 214 fLastMatchEnd = -1; 215 fAppendPosition = 0; 216 fHitEnd = FALSE; 217 fRequireEnd = FALSE; 218 fStack = NULL; 219 fFrame = NULL; 220 fTimeLimit = 0; 221 fTime = 0; 222 fTickCounter = 0; 223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 224 fCallbackFn = NULL; 225 fCallbackContext = NULL; 226 fFindProgressCallbackFn = NULL; 227 fFindProgressCallbackContext = NULL; 228 fTraceDebug = FALSE; 229 fDeferredStatus = status; 230 fData = fSmallData; 231 fWordBreakItr = NULL; 232 233 fStack = new UVector64(status); 234 fInputText = NULL; 235 fAltInputText = NULL; 236 fInput = NULL; 237 fInputLength = 0; 238 fInputUniStrMaybeMutable = FALSE; 239 240 if (U_FAILURE(status)) { 241 fDeferredStatus = status; 242 } 243 } 244 245 // 246 // init2() Common initialization for use by RegexMatcher constructors, part 2. 247 // This handles the common setup to be done after the Pattern is available. 248 // 249 void RegexMatcher::init2(UText *input, UErrorCode &status) { 250 if (U_FAILURE(status)) { 251 fDeferredStatus = status; 252 return; 253 } 254 255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { 256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 257 if (fData == NULL) { 258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 259 return; 260 } 261 } 262 263 reset(input); 264 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 265 if (U_FAILURE(status)) { 266 fDeferredStatus = status; 267 return; 268 } 269 } 270 271 272 static const UChar BACKSLASH = 0x5c; 273 static const UChar DOLLARSIGN = 0x24; 274 //-------------------------------------------------------------------------------- 275 // 276 // appendReplacement 277 // 278 //-------------------------------------------------------------------------------- 279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 280 const UnicodeString &replacement, 281 UErrorCode &status) { 282 UText replacementText = UTEXT_INITIALIZER; 283 284 utext_openConstUnicodeString(&replacementText, &replacement, &status); 285 if (U_SUCCESS(status)) { 286 UText resultText = UTEXT_INITIALIZER; 287 utext_openUnicodeString(&resultText, &dest, &status); 288 289 if (U_SUCCESS(status)) { 290 appendReplacement(&resultText, &replacementText, status); 291 utext_close(&resultText); 292 } 293 utext_close(&replacementText); 294 } 295 296 return *this; 297 } 298 299 // 300 // appendReplacement, UText mode 301 // 302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, 303 UText *replacement, 304 UErrorCode &status) { 305 if (U_FAILURE(status)) { 306 return *this; 307 } 308 if (U_FAILURE(fDeferredStatus)) { 309 status = fDeferredStatus; 310 return *this; 311 } 312 if (fMatch == FALSE) { 313 status = U_REGEX_INVALID_STATE; 314 return *this; 315 } 316 317 // Copy input string from the end of previous match to start of current match 318 int64_t destLen = utext_nativeLength(dest); 319 if (fMatchStart > fAppendPosition) { 320 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 321 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 322 (int32_t)(fMatchStart-fAppendPosition), &status); 323 } else { 324 int32_t len16; 325 if (UTEXT_USES_U16(fInputText)) { 326 len16 = (int32_t)(fMatchStart-fAppendPosition); 327 } else { 328 UErrorCode lengthStatus = U_ZERO_ERROR; 329 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); 330 } 331 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 332 if (inputChars == NULL) { 333 status = U_MEMORY_ALLOCATION_ERROR; 334 return *this; 335 } 336 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); 337 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); 338 uprv_free(inputChars); 339 } 340 } 341 fAppendPosition = fMatchEnd; 342 343 344 // scan the replacement text, looking for substitutions ($n) and \escapes. 345 // TODO: optimize this loop by efficiently scanning for '$' or '\', 346 // move entire ranges not containing substitutions. 347 UTEXT_SETNATIVEINDEX(replacement, 0); 348 UChar32 c = UTEXT_NEXT32(replacement); 349 while (c != U_SENTINEL) { 350 if (c == BACKSLASH) { 351 // Backslash Escape. Copy the following char out without further checks. 352 // Note: Surrogate pairs don't need any special handling 353 // The second half wont be a '$' or a '\', and 354 // will move to the dest normally on the next 355 // loop iteration. 356 c = UTEXT_CURRENT32(replacement); 357 if (c == U_SENTINEL) { 358 break; 359 } 360 361 if (c==0x55/*U*/ || c==0x75/*u*/) { 362 // We have a \udddd or \Udddddddd escape sequence. 363 int32_t offset = 0; 364 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); 365 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 366 if (escapedChar != (UChar32)0xFFFFFFFF) { 367 if (U_IS_BMP(escapedChar)) { 368 UChar c16 = (UChar)escapedChar; 369 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 370 } else { 371 UChar surrogate[2]; 372 surrogate[0] = U16_LEAD(escapedChar); 373 surrogate[1] = U16_TRAIL(escapedChar); 374 if (U_SUCCESS(status)) { 375 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 376 } 377 } 378 // TODO: Report errors for mal-formed \u escapes? 379 // As this is, the original sequence is output, which may be OK. 380 if (context.lastOffset == offset) { 381 UTEXT_PREVIOUS32(replacement); 382 } else if (context.lastOffset != offset-1) { 383 utext_moveIndex32(replacement, offset - context.lastOffset - 1); 384 } 385 } 386 } else { 387 UTEXT_NEXT32(replacement); 388 // Plain backslash escape. Just put out the escaped character. 389 if (U_IS_BMP(c)) { 390 UChar c16 = (UChar)c; 391 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 392 } else { 393 UChar surrogate[2]; 394 surrogate[0] = U16_LEAD(c); 395 surrogate[1] = U16_TRAIL(c); 396 if (U_SUCCESS(status)) { 397 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 398 } 399 } 400 } 401 } else if (c != DOLLARSIGN) { 402 // Normal char, not a $. Copy it out without further checks. 403 if (U_IS_BMP(c)) { 404 UChar c16 = (UChar)c; 405 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 406 } else { 407 UChar surrogate[2]; 408 surrogate[0] = U16_LEAD(c); 409 surrogate[1] = U16_TRAIL(c); 410 if (U_SUCCESS(status)) { 411 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 412 } 413 } 414 } else { 415 // We've got a $. Pick up a capture group number if one follows. 416 // Consume at most the number of digits necessary for the largest capture 417 // number that is valid for this pattern. 418 419 int32_t numDigits = 0; 420 int32_t groupNum = 0; 421 UChar32 digitC; 422 for (;;) { 423 digitC = UTEXT_CURRENT32(replacement); 424 if (digitC == U_SENTINEL) { 425 break; 426 } 427 if (u_isdigit(digitC) == FALSE) { 428 break; 429 } 430 UTEXT_NEXT32(replacement); 431 groupNum=groupNum*10 + u_charDigitValue(digitC); 432 numDigits++; 433 if (numDigits >= fPattern->fMaxCaptureDigits) { 434 break; 435 } 436 } 437 438 439 if (numDigits == 0) { 440 // The $ didn't introduce a group number at all. 441 // Treat it as just part of the substitution text. 442 UChar c16 = DOLLARSIGN; 443 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 444 } else { 445 // Finally, append the capture group data to the destination. 446 destLen += appendGroup(groupNum, dest, status); 447 if (U_FAILURE(status)) { 448 // Can fail if group number is out of range. 449 break; 450 } 451 } 452 } 453 454 if (U_FAILURE(status)) { 455 break; 456 } else { 457 c = UTEXT_NEXT32(replacement); 458 } 459 } 460 461 return *this; 462 } 463 464 465 466 //-------------------------------------------------------------------------------- 467 // 468 // appendTail Intended to be used in conjunction with appendReplacement() 469 // To the destination string, append everything following 470 // the last match position from the input string. 471 // 472 // Note: Match ranges do not affect appendTail or appendReplacement 473 // 474 //-------------------------------------------------------------------------------- 475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 476 UErrorCode status = U_ZERO_ERROR; 477 UText resultText = UTEXT_INITIALIZER; 478 utext_openUnicodeString(&resultText, &dest, &status); 479 480 if (U_SUCCESS(status)) { 481 appendTail(&resultText, status); 482 utext_close(&resultText); 483 } 484 485 return dest; 486 } 487 488 // 489 // appendTail, UText mode 490 // 491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { 492 UBool bailOut = FALSE; 493 if (U_FAILURE(status)) { 494 bailOut = TRUE; 495 } 496 if (U_FAILURE(fDeferredStatus)) { 497 status = fDeferredStatus; 498 bailOut = TRUE; 499 } 500 501 if (bailOut) { 502 // dest must not be NULL 503 if (dest) { 504 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status); 505 return dest; 506 } 507 } 508 509 if (fInputLength > fAppendPosition) { 510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 511 int64_t destLen = utext_nativeLength(dest); 512 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 513 (int32_t)(fInputLength-fAppendPosition), &status); 514 } else { 515 int32_t len16; 516 if (UTEXT_USES_U16(fInputText)) { 517 len16 = (int32_t)(fInputLength-fAppendPosition); 518 } else { 519 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); 520 status = U_ZERO_ERROR; // buffer overflow 521 } 522 523 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); 524 if (inputChars == NULL) { 525 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 526 } else { 527 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 528 int64_t destLen = utext_nativeLength(dest); 529 utext_replace(dest, destLen, destLen, inputChars, len16, &status); 530 uprv_free(inputChars); 531 } 532 } 533 } 534 return dest; 535 } 536 537 538 539 //-------------------------------------------------------------------------------- 540 // 541 // end 542 // 543 //-------------------------------------------------------------------------------- 544 int32_t RegexMatcher::end(UErrorCode &err) const { 545 return end(0, err); 546 } 547 548 int64_t RegexMatcher::end64(UErrorCode &err) const { 549 return end64(0, err); 550 } 551 552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { 553 if (U_FAILURE(err)) { 554 return -1; 555 } 556 if (fMatch == FALSE) { 557 err = U_REGEX_INVALID_STATE; 558 return -1; 559 } 560 if (group < 0 || group > fPattern->fGroupMap->size()) { 561 err = U_INDEX_OUTOFBOUNDS_ERROR; 562 return -1; 563 } 564 int64_t e = -1; 565 if (group == 0) { 566 e = fMatchEnd; 567 } else { 568 // Get the position within the stack frame of the variables for 569 // this capture group. 570 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 571 U_ASSERT(groupOffset < fPattern->fFrameSize); 572 U_ASSERT(groupOffset >= 0); 573 e = fFrame->fExtra[groupOffset + 1]; 574 } 575 576 return e; 577 } 578 579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 580 return (int32_t)end64(group, err); 581 } 582 583 584 //-------------------------------------------------------------------------------- 585 // 586 // find() 587 // 588 //-------------------------------------------------------------------------------- 589 UBool RegexMatcher::find() { 590 // Start at the position of the last match end. (Will be zero if the 591 // matcher has been reset.) 592 // 593 if (U_FAILURE(fDeferredStatus)) { 594 return FALSE; 595 } 596 597 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 598 return findUsingChunk(); 599 } 600 601 int64_t startPos = fMatchEnd; 602 if (startPos==0) { 603 startPos = fActiveStart; 604 } 605 606 if (fMatch) { 607 // Save the position of any previous successful match. 608 fLastMatchEnd = fMatchEnd; 609 610 if (fMatchStart == fMatchEnd) { 611 // Previous match had zero length. Move start position up one position 612 // to avoid sending find() into a loop on zero-length matches. 613 if (startPos >= fActiveLimit) { 614 fMatch = FALSE; 615 fHitEnd = TRUE; 616 return FALSE; 617 } 618 UTEXT_SETNATIVEINDEX(fInputText, startPos); 619 UTEXT_NEXT32(fInputText); 620 startPos = UTEXT_GETNATIVEINDEX(fInputText); 621 } 622 } else { 623 if (fLastMatchEnd >= 0) { 624 // A previous find() failed to match. Don't try again. 625 // (without this test, a pattern with a zero-length match 626 // could match again at the end of an input string.) 627 fHitEnd = TRUE; 628 return FALSE; 629 } 630 } 631 632 633 // Compute the position in the input string beyond which a match can not begin, because 634 // the minimum length match would extend past the end of the input. 635 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 636 // Be aware of possible overflows if making changes here. 637 int64_t testStartLimit; 638 if (UTEXT_USES_U16(fInputText)) { 639 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; 640 if (startPos > testStartLimit) { 641 fMatch = FALSE; 642 fHitEnd = TRUE; 643 return FALSE; 644 } 645 } else { 646 // For now, let the matcher discover that it can't match on its own 647 // We don't know how long the match len is in native characters 648 testStartLimit = fActiveLimit; 649 } 650 651 UChar32 c; 652 U_ASSERT(startPos >= 0); 653 654 switch (fPattern->fStartType) { 655 case START_NO_INFO: 656 // No optimization was found. 657 // Try a match at each input position. 658 for (;;) { 659 MatchAt(startPos, FALSE, fDeferredStatus); 660 if (U_FAILURE(fDeferredStatus)) { 661 return FALSE; 662 } 663 if (fMatch) { 664 return TRUE; 665 } 666 if (startPos >= testStartLimit) { 667 fHitEnd = TRUE; 668 return FALSE; 669 } 670 UTEXT_SETNATIVEINDEX(fInputText, startPos); 671 UTEXT_NEXT32(fInputText); 672 startPos = UTEXT_GETNATIVEINDEX(fInputText); 673 // Note that it's perfectly OK for a pattern to have a zero-length 674 // match at the end of a string, so we must make sure that the loop 675 // runs with startPos == testStartLimit the last time through. 676 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 677 return FALSE; 678 } 679 U_ASSERT(FALSE); 680 681 case START_START: 682 // Matches are only possible at the start of the input string 683 // (pattern begins with ^ or \A) 684 if (startPos > fActiveStart) { 685 fMatch = FALSE; 686 return FALSE; 687 } 688 MatchAt(startPos, FALSE, fDeferredStatus); 689 if (U_FAILURE(fDeferredStatus)) { 690 return FALSE; 691 } 692 return fMatch; 693 694 695 case START_SET: 696 { 697 // Match may start on any char from a pre-computed set. 698 U_ASSERT(fPattern->fMinMatchLen > 0); 699 int64_t pos; 700 UTEXT_SETNATIVEINDEX(fInputText, startPos); 701 for (;;) { 702 c = UTEXT_NEXT32(fInputText); 703 pos = UTEXT_GETNATIVEINDEX(fInputText); 704 // c will be -1 (U_SENTINEL) at end of text, in which case we 705 // skip this next block (so we don't have a negative array index) 706 // and handle end of text in the following block. 707 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || 708 (c>=256 && fPattern->fInitialChars->contains(c)))) { 709 MatchAt(startPos, FALSE, fDeferredStatus); 710 if (U_FAILURE(fDeferredStatus)) { 711 return FALSE; 712 } 713 if (fMatch) { 714 return TRUE; 715 } 716 UTEXT_SETNATIVEINDEX(fInputText, pos); 717 } 718 if (startPos >= testStartLimit) { 719 fMatch = FALSE; 720 fHitEnd = TRUE; 721 return FALSE; 722 } 723 startPos = pos; 724 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 725 return FALSE; 726 } 727 } 728 U_ASSERT(FALSE); 729 730 case START_STRING: 731 case START_CHAR: 732 { 733 // Match starts on exactly one char. 734 U_ASSERT(fPattern->fMinMatchLen > 0); 735 UChar32 theChar = fPattern->fInitialChar; 736 int64_t pos; 737 UTEXT_SETNATIVEINDEX(fInputText, startPos); 738 for (;;) { 739 c = UTEXT_NEXT32(fInputText); 740 pos = UTEXT_GETNATIVEINDEX(fInputText); 741 if (c == theChar) { 742 MatchAt(startPos, FALSE, fDeferredStatus); 743 if (U_FAILURE(fDeferredStatus)) { 744 return FALSE; 745 } 746 if (fMatch) { 747 return TRUE; 748 } 749 UTEXT_SETNATIVEINDEX(fInputText, pos); 750 } 751 if (startPos >= testStartLimit) { 752 fMatch = FALSE; 753 fHitEnd = TRUE; 754 return FALSE; 755 } 756 startPos = pos; 757 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 758 return FALSE; 759 } 760 } 761 U_ASSERT(FALSE); 762 763 case START_LINE: 764 { 765 UChar32 c; 766 if (startPos == fAnchorStart) { 767 MatchAt(startPos, FALSE, fDeferredStatus); 768 if (U_FAILURE(fDeferredStatus)) { 769 return FALSE; 770 } 771 if (fMatch) { 772 return TRUE; 773 } 774 UTEXT_SETNATIVEINDEX(fInputText, startPos); 775 c = UTEXT_NEXT32(fInputText); 776 startPos = UTEXT_GETNATIVEINDEX(fInputText); 777 } else { 778 UTEXT_SETNATIVEINDEX(fInputText, startPos); 779 c = UTEXT_PREVIOUS32(fInputText); 780 UTEXT_SETNATIVEINDEX(fInputText, startPos); 781 } 782 783 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 784 for (;;) { 785 if (c == 0x0a) { 786 MatchAt(startPos, FALSE, fDeferredStatus); 787 if (U_FAILURE(fDeferredStatus)) { 788 return FALSE; 789 } 790 if (fMatch) { 791 return TRUE; 792 } 793 UTEXT_SETNATIVEINDEX(fInputText, startPos); 794 } 795 if (startPos >= testStartLimit) { 796 fMatch = FALSE; 797 fHitEnd = TRUE; 798 return FALSE; 799 } 800 c = UTEXT_NEXT32(fInputText); 801 startPos = UTEXT_GETNATIVEINDEX(fInputText); 802 // Note that it's perfectly OK for a pattern to have a zero-length 803 // match at the end of a string, so we must make sure that the loop 804 // runs with startPos == testStartLimit the last time through. 805 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 806 return FALSE; 807 } 808 } else { 809 for (;;) { 810 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 811 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 812 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 813 UTEXT_NEXT32(fInputText); 814 startPos = UTEXT_GETNATIVEINDEX(fInputText); 815 } 816 MatchAt(startPos, FALSE, fDeferredStatus); 817 if (U_FAILURE(fDeferredStatus)) { 818 return FALSE; 819 } 820 if (fMatch) { 821 return TRUE; 822 } 823 UTEXT_SETNATIVEINDEX(fInputText, startPos); 824 } 825 if (startPos >= testStartLimit) { 826 fMatch = FALSE; 827 fHitEnd = TRUE; 828 return FALSE; 829 } 830 c = UTEXT_NEXT32(fInputText); 831 startPos = UTEXT_GETNATIVEINDEX(fInputText); 832 // Note that it's perfectly OK for a pattern to have a zero-length 833 // match at the end of a string, so we must make sure that the loop 834 // runs with startPos == testStartLimit the last time through. 835 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 836 return FALSE; 837 } 838 } 839 } 840 841 default: 842 U_ASSERT(FALSE); 843 } 844 845 U_ASSERT(FALSE); 846 return FALSE; 847 } 848 849 850 851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { 852 if (U_FAILURE(status)) { 853 return FALSE; 854 } 855 if (U_FAILURE(fDeferredStatus)) { 856 status = fDeferredStatus; 857 return FALSE; 858 } 859 this->reset(); // Note: Reset() is specified by Java Matcher documentation. 860 // This will reset the region to be the full input length. 861 if (start < 0) { 862 status = U_INDEX_OUTOFBOUNDS_ERROR; 863 return FALSE; 864 } 865 866 int64_t nativeStart = start; 867 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 868 status = U_INDEX_OUTOFBOUNDS_ERROR; 869 return FALSE; 870 } 871 fMatchEnd = nativeStart; 872 return find(); 873 } 874 875 876 //-------------------------------------------------------------------------------- 877 // 878 // findUsingChunk() -- like find(), but with the advance knowledge that the 879 // entire string is available in the UText's chunk buffer. 880 // 881 //-------------------------------------------------------------------------------- 882 UBool RegexMatcher::findUsingChunk() { 883 // Start at the position of the last match end. (Will be zero if the 884 // matcher has been reset. 885 // 886 887 int32_t startPos = (int32_t)fMatchEnd; 888 if (startPos==0) { 889 startPos = (int32_t)fActiveStart; 890 } 891 892 const UChar *inputBuf = fInputText->chunkContents; 893 894 if (fMatch) { 895 // Save the position of any previous successful match. 896 fLastMatchEnd = fMatchEnd; 897 898 if (fMatchStart == fMatchEnd) { 899 // Previous match had zero length. Move start position up one position 900 // to avoid sending find() into a loop on zero-length matches. 901 if (startPos >= fActiveLimit) { 902 fMatch = FALSE; 903 fHitEnd = TRUE; 904 return FALSE; 905 } 906 U16_FWD_1(inputBuf, startPos, fInputLength); 907 } 908 } else { 909 if (fLastMatchEnd >= 0) { 910 // A previous find() failed to match. Don't try again. 911 // (without this test, a pattern with a zero-length match 912 // could match again at the end of an input string.) 913 fHitEnd = TRUE; 914 return FALSE; 915 } 916 } 917 918 919 // Compute the position in the input string beyond which a match can not begin, because 920 // the minimum length match would extend past the end of the input. 921 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 922 // Be aware of possible overflows if making changes here. 923 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); 924 if (startPos > testLen) { 925 fMatch = FALSE; 926 fHitEnd = TRUE; 927 return FALSE; 928 } 929 930 UChar32 c; 931 U_ASSERT(startPos >= 0); 932 933 switch (fPattern->fStartType) { 934 case START_NO_INFO: 935 // No optimization was found. 936 // Try a match at each input position. 937 for (;;) { 938 MatchChunkAt(startPos, FALSE, fDeferredStatus); 939 if (U_FAILURE(fDeferredStatus)) { 940 return FALSE; 941 } 942 if (fMatch) { 943 return TRUE; 944 } 945 if (startPos >= testLen) { 946 fHitEnd = TRUE; 947 return FALSE; 948 } 949 U16_FWD_1(inputBuf, startPos, fActiveLimit); 950 // Note that it's perfectly OK for a pattern to have a zero-length 951 // match at the end of a string, so we must make sure that the loop 952 // runs with startPos == testLen the last time through. 953 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 954 return FALSE; 955 } 956 U_ASSERT(FALSE); 957 958 case START_START: 959 // Matches are only possible at the start of the input string 960 // (pattern begins with ^ or \A) 961 if (startPos > fActiveStart) { 962 fMatch = FALSE; 963 return FALSE; 964 } 965 MatchChunkAt(startPos, FALSE, fDeferredStatus); 966 if (U_FAILURE(fDeferredStatus)) { 967 return FALSE; 968 } 969 return fMatch; 970 971 972 case START_SET: 973 { 974 // Match may start on any char from a pre-computed set. 975 U_ASSERT(fPattern->fMinMatchLen > 0); 976 for (;;) { 977 int32_t pos = startPos; 978 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 979 if ((c<256 && fPattern->fInitialChars8->contains(c)) || 980 (c>=256 && fPattern->fInitialChars->contains(c))) { 981 MatchChunkAt(pos, FALSE, fDeferredStatus); 982 if (U_FAILURE(fDeferredStatus)) { 983 return FALSE; 984 } 985 if (fMatch) { 986 return TRUE; 987 } 988 } 989 if (pos >= testLen) { 990 fMatch = FALSE; 991 fHitEnd = TRUE; 992 return FALSE; 993 } 994 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 995 return FALSE; 996 } 997 } 998 U_ASSERT(FALSE); 999 1000 case START_STRING: 1001 case START_CHAR: 1002 { 1003 // Match starts on exactly one char. 1004 U_ASSERT(fPattern->fMinMatchLen > 0); 1005 UChar32 theChar = fPattern->fInitialChar; 1006 for (;;) { 1007 int32_t pos = startPos; 1008 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1009 if (c == theChar) { 1010 MatchChunkAt(pos, FALSE, fDeferredStatus); 1011 if (U_FAILURE(fDeferredStatus)) { 1012 return FALSE; 1013 } 1014 if (fMatch) { 1015 return TRUE; 1016 } 1017 } 1018 if (pos >= testLen) { 1019 fMatch = FALSE; 1020 fHitEnd = TRUE; 1021 return FALSE; 1022 } 1023 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1024 return FALSE; 1025 } 1026 } 1027 U_ASSERT(FALSE); 1028 1029 case START_LINE: 1030 { 1031 UChar32 c; 1032 if (startPos == fAnchorStart) { 1033 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1034 if (U_FAILURE(fDeferredStatus)) { 1035 return FALSE; 1036 } 1037 if (fMatch) { 1038 return TRUE; 1039 } 1040 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1041 } 1042 1043 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1044 for (;;) { 1045 c = inputBuf[startPos-1]; 1046 if (c == 0x0a) { 1047 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1048 if (U_FAILURE(fDeferredStatus)) { 1049 return FALSE; 1050 } 1051 if (fMatch) { 1052 return TRUE; 1053 } 1054 } 1055 if (startPos >= testLen) { 1056 fMatch = FALSE; 1057 fHitEnd = TRUE; 1058 return FALSE; 1059 } 1060 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1061 // Note that it's perfectly OK for a pattern to have a zero-length 1062 // match at the end of a string, so we must make sure that the loop 1063 // runs with startPos == testLen the last time through. 1064 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1065 return FALSE; 1066 } 1067 } else { 1068 for (;;) { 1069 c = inputBuf[startPos-1]; 1070 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1071 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 1072 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { 1073 startPos++; 1074 } 1075 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1076 if (U_FAILURE(fDeferredStatus)) { 1077 return FALSE; 1078 } 1079 if (fMatch) { 1080 return TRUE; 1081 } 1082 } 1083 if (startPos >= testLen) { 1084 fMatch = FALSE; 1085 fHitEnd = TRUE; 1086 return FALSE; 1087 } 1088 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1089 // Note that it's perfectly OK for a pattern to have a zero-length 1090 // match at the end of a string, so we must make sure that the loop 1091 // runs with startPos == testLen the last time through. 1092 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1093 return FALSE; 1094 } 1095 } 1096 } 1097 1098 default: 1099 U_ASSERT(FALSE); 1100 } 1101 1102 U_ASSERT(FALSE); 1103 return FALSE; 1104 } 1105 1106 1107 1108 //-------------------------------------------------------------------------------- 1109 // 1110 // group() 1111 // 1112 //-------------------------------------------------------------------------------- 1113 UnicodeString RegexMatcher::group(UErrorCode &status) const { 1114 return group(0, status); 1115 } 1116 1117 // Return immutable shallow clone 1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { 1119 return group(0, dest, group_len, status); 1120 } 1121 1122 // Return immutable shallow clone 1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { 1124 group_len = 0; 1125 UBool bailOut = FALSE; 1126 if (U_FAILURE(status)) { 1127 return dest; 1128 } 1129 if (U_FAILURE(fDeferredStatus)) { 1130 status = fDeferredStatus; 1131 bailOut = TRUE; 1132 } 1133 if (fMatch == FALSE) { 1134 status = U_REGEX_INVALID_STATE; 1135 bailOut = TRUE; 1136 } 1137 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1138 status = U_INDEX_OUTOFBOUNDS_ERROR; 1139 bailOut = TRUE; 1140 } 1141 1142 if (bailOut) { 1143 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status); 1144 } 1145 1146 int64_t s, e; 1147 if (groupNum == 0) { 1148 s = fMatchStart; 1149 e = fMatchEnd; 1150 } else { 1151 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1152 U_ASSERT(groupOffset < fPattern->fFrameSize); 1153 U_ASSERT(groupOffset >= 0); 1154 s = fFrame->fExtra[groupOffset]; 1155 e = fFrame->fExtra[groupOffset+1]; 1156 } 1157 1158 if (s < 0) { 1159 // A capture group wasn't part of the match 1160 return utext_clone(dest, fInputText, FALSE, TRUE, &status); 1161 } 1162 U_ASSERT(s <= e); 1163 group_len = e - s; 1164 1165 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); 1166 if (dest) 1167 UTEXT_SETNATIVEINDEX(dest, s); 1168 return dest; 1169 } 1170 1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1172 UnicodeString result; 1173 if (U_FAILURE(status)) { 1174 return result; 1175 } 1176 UText resultText = UTEXT_INITIALIZER; 1177 utext_openUnicodeString(&resultText, &result, &status); 1178 group(groupNum, &resultText, status); 1179 utext_close(&resultText); 1180 return result; 1181 } 1182 1183 1184 // Return deep (mutable) clone 1185 // Technology Preview (as an API), but note that the UnicodeString API is implemented 1186 // using this function. 1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { 1188 UBool bailOut = FALSE; 1189 if (U_FAILURE(status)) { 1190 return dest; 1191 } 1192 if (U_FAILURE(fDeferredStatus)) { 1193 status = fDeferredStatus; 1194 bailOut = TRUE; 1195 } 1196 1197 if (fMatch == FALSE) { 1198 status = U_REGEX_INVALID_STATE; 1199 bailOut = TRUE; 1200 } 1201 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1202 status = U_INDEX_OUTOFBOUNDS_ERROR; 1203 bailOut = TRUE; 1204 } 1205 1206 if (bailOut) { 1207 if (dest) { 1208 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1209 return dest; 1210 } else { 1211 return utext_openUChars(NULL, NULL, 0, &status); 1212 } 1213 } 1214 1215 int64_t s, e; 1216 if (groupNum == 0) { 1217 s = fMatchStart; 1218 e = fMatchEnd; 1219 } else { 1220 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1221 U_ASSERT(groupOffset < fPattern->fFrameSize); 1222 U_ASSERT(groupOffset >= 0); 1223 s = fFrame->fExtra[groupOffset]; 1224 e = fFrame->fExtra[groupOffset+1]; 1225 } 1226 1227 if (s < 0) { 1228 // A capture group wasn't part of the match 1229 if (dest) { 1230 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1231 return dest; 1232 } else { 1233 return utext_openUChars(NULL, NULL, 0, &status); 1234 } 1235 } 1236 U_ASSERT(s <= e); 1237 1238 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1239 U_ASSERT(e <= fInputLength); 1240 if (dest) { 1241 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status); 1242 } else { 1243 UText groupText = UTEXT_INITIALIZER; 1244 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); 1245 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1246 utext_close(&groupText); 1247 } 1248 } else { 1249 int32_t len16; 1250 if (UTEXT_USES_U16(fInputText)) { 1251 len16 = (int32_t)(e-s); 1252 } else { 1253 UErrorCode lengthStatus = U_ZERO_ERROR; 1254 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1255 } 1256 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1257 if (groupChars == NULL) { 1258 status = U_MEMORY_ALLOCATION_ERROR; 1259 return dest; 1260 } 1261 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1262 1263 if (dest) { 1264 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); 1265 } else { 1266 UText groupText = UTEXT_INITIALIZER; 1267 utext_openUChars(&groupText, groupChars, len16, &status); 1268 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1269 utext_close(&groupText); 1270 } 1271 1272 uprv_free(groupChars); 1273 } 1274 return dest; 1275 } 1276 1277 //-------------------------------------------------------------------------------- 1278 // 1279 // appendGroup() -- currently internal only, appends a group to a UText rather 1280 // than replacing its contents 1281 // 1282 //-------------------------------------------------------------------------------- 1283 1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { 1285 if (U_FAILURE(status)) { 1286 return 0; 1287 } 1288 if (U_FAILURE(fDeferredStatus)) { 1289 status = fDeferredStatus; 1290 return 0; 1291 } 1292 int64_t destLen = utext_nativeLength(dest); 1293 1294 if (fMatch == FALSE) { 1295 status = U_REGEX_INVALID_STATE; 1296 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1297 } 1298 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1299 status = U_INDEX_OUTOFBOUNDS_ERROR; 1300 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1301 } 1302 1303 int64_t s, e; 1304 if (groupNum == 0) { 1305 s = fMatchStart; 1306 e = fMatchEnd; 1307 } else { 1308 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1309 U_ASSERT(groupOffset < fPattern->fFrameSize); 1310 U_ASSERT(groupOffset >= 0); 1311 s = fFrame->fExtra[groupOffset]; 1312 e = fFrame->fExtra[groupOffset+1]; 1313 } 1314 1315 if (s < 0) { 1316 // A capture group wasn't part of the match 1317 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1318 } 1319 U_ASSERT(s <= e); 1320 1321 int64_t deltaLen; 1322 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1323 U_ASSERT(e <= fInputLength); 1324 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); 1325 } else { 1326 int32_t len16; 1327 if (UTEXT_USES_U16(fInputText)) { 1328 len16 = (int32_t)(e-s); 1329 } else { 1330 UErrorCode lengthStatus = U_ZERO_ERROR; 1331 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1332 } 1333 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1334 if (groupChars == NULL) { 1335 status = U_MEMORY_ALLOCATION_ERROR; 1336 return 0; 1337 } 1338 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1339 1340 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); 1341 uprv_free(groupChars); 1342 } 1343 return deltaLen; 1344 } 1345 1346 1347 1348 //-------------------------------------------------------------------------------- 1349 // 1350 // groupCount() 1351 // 1352 //-------------------------------------------------------------------------------- 1353 int32_t RegexMatcher::groupCount() const { 1354 return fPattern->fGroupMap->size(); 1355 } 1356 1357 1358 1359 //-------------------------------------------------------------------------------- 1360 // 1361 // hasAnchoringBounds() 1362 // 1363 //-------------------------------------------------------------------------------- 1364 UBool RegexMatcher::hasAnchoringBounds() const { 1365 return fAnchoringBounds; 1366 } 1367 1368 1369 //-------------------------------------------------------------------------------- 1370 // 1371 // hasTransparentBounds() 1372 // 1373 //-------------------------------------------------------------------------------- 1374 UBool RegexMatcher::hasTransparentBounds() const { 1375 return fTransparentBounds; 1376 } 1377 1378 1379 1380 //-------------------------------------------------------------------------------- 1381 // 1382 // hitEnd() 1383 // 1384 //-------------------------------------------------------------------------------- 1385 UBool RegexMatcher::hitEnd() const { 1386 return fHitEnd; 1387 } 1388 1389 1390 //-------------------------------------------------------------------------------- 1391 // 1392 // input() 1393 // 1394 //-------------------------------------------------------------------------------- 1395 const UnicodeString &RegexMatcher::input() const { 1396 if (!fInput) { 1397 UErrorCode status = U_ZERO_ERROR; 1398 int32_t len16; 1399 if (UTEXT_USES_U16(fInputText)) { 1400 len16 = (int32_t)fInputLength; 1401 } else { 1402 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); 1403 status = U_ZERO_ERROR; // overflow, length status 1404 } 1405 UnicodeString *result = new UnicodeString(len16, 0, 0); 1406 1407 UChar *inputChars = result->getBuffer(len16); 1408 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning 1409 result->releaseBuffer(len16); 1410 1411 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= 1412 } 1413 1414 return *fInput; 1415 } 1416 1417 //-------------------------------------------------------------------------------- 1418 // 1419 // inputText() 1420 // 1421 //-------------------------------------------------------------------------------- 1422 UText *RegexMatcher::inputText() const { 1423 return fInputText; 1424 } 1425 1426 1427 //-------------------------------------------------------------------------------- 1428 // 1429 // getInput() -- like inputText(), but makes a clone or copies into another UText 1430 // 1431 //-------------------------------------------------------------------------------- 1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { 1433 UBool bailOut = FALSE; 1434 if (U_FAILURE(status)) { 1435 return dest; 1436 } 1437 if (U_FAILURE(fDeferredStatus)) { 1438 status = fDeferredStatus; 1439 bailOut = TRUE; 1440 } 1441 1442 if (bailOut) { 1443 if (dest) { 1444 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1445 return dest; 1446 } else { 1447 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1448 } 1449 } 1450 1451 if (dest) { 1452 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1453 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); 1454 } else { 1455 int32_t input16Len; 1456 if (UTEXT_USES_U16(fInputText)) { 1457 input16Len = (int32_t)fInputLength; 1458 } else { 1459 UErrorCode lengthStatus = U_ZERO_ERROR; 1460 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error 1461 } 1462 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); 1463 if (inputChars == NULL) { 1464 return dest; 1465 } 1466 1467 status = U_ZERO_ERROR; 1468 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning 1469 status = U_ZERO_ERROR; 1470 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); 1471 1472 uprv_free(inputChars); 1473 } 1474 return dest; 1475 } else { 1476 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1477 } 1478 } 1479 1480 1481 static UBool compat_SyncMutableUTextContents(UText *ut); 1482 static UBool compat_SyncMutableUTextContents(UText *ut) { 1483 UBool retVal = FALSE; 1484 1485 // In the following test, we're really only interested in whether the UText should switch 1486 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents 1487 // will still point to the correct data. 1488 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { 1489 UnicodeString *us=(UnicodeString *)ut->context; 1490 1491 // Update to the latest length. 1492 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). 1493 int32_t newLength = us->length(); 1494 1495 // Update the chunk description. 1496 // The buffer may have switched between stack- and heap-based. 1497 ut->chunkContents = us->getBuffer(); 1498 ut->chunkLength = newLength; 1499 ut->chunkNativeLimit = newLength; 1500 ut->nativeIndexingLimit = newLength; 1501 retVal = TRUE; 1502 } 1503 1504 return retVal; 1505 } 1506 1507 //-------------------------------------------------------------------------------- 1508 // 1509 // lookingAt() 1510 // 1511 //-------------------------------------------------------------------------------- 1512 UBool RegexMatcher::lookingAt(UErrorCode &status) { 1513 if (U_FAILURE(status)) { 1514 return FALSE; 1515 } 1516 if (U_FAILURE(fDeferredStatus)) { 1517 status = fDeferredStatus; 1518 return FALSE; 1519 } 1520 1521 if (fInputUniStrMaybeMutable) { 1522 if (compat_SyncMutableUTextContents(fInputText)) { 1523 fInputLength = utext_nativeLength(fInputText); 1524 reset(); 1525 } 1526 } 1527 else { 1528 resetPreserveRegion(); 1529 } 1530 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1531 MatchChunkAt((int32_t)fActiveStart, FALSE, status); 1532 } else { 1533 MatchAt(fActiveStart, FALSE, status); 1534 } 1535 return fMatch; 1536 } 1537 1538 1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { 1540 if (U_FAILURE(status)) { 1541 return FALSE; 1542 } 1543 if (U_FAILURE(fDeferredStatus)) { 1544 status = fDeferredStatus; 1545 return FALSE; 1546 } 1547 reset(); 1548 1549 if (start < 0) { 1550 status = U_INDEX_OUTOFBOUNDS_ERROR; 1551 return FALSE; 1552 } 1553 1554 if (fInputUniStrMaybeMutable) { 1555 if (compat_SyncMutableUTextContents(fInputText)) { 1556 fInputLength = utext_nativeLength(fInputText); 1557 reset(); 1558 } 1559 } 1560 1561 int64_t nativeStart; 1562 nativeStart = start; 1563 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1564 status = U_INDEX_OUTOFBOUNDS_ERROR; 1565 return FALSE; 1566 } 1567 1568 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1569 MatchChunkAt((int32_t)nativeStart, FALSE, status); 1570 } else { 1571 MatchAt(nativeStart, FALSE, status); 1572 } 1573 return fMatch; 1574 } 1575 1576 1577 1578 //-------------------------------------------------------------------------------- 1579 // 1580 // matches() 1581 // 1582 //-------------------------------------------------------------------------------- 1583 UBool RegexMatcher::matches(UErrorCode &status) { 1584 if (U_FAILURE(status)) { 1585 return FALSE; 1586 } 1587 if (U_FAILURE(fDeferredStatus)) { 1588 status = fDeferredStatus; 1589 return FALSE; 1590 } 1591 1592 if (fInputUniStrMaybeMutable) { 1593 if (compat_SyncMutableUTextContents(fInputText)) { 1594 fInputLength = utext_nativeLength(fInputText); 1595 reset(); 1596 } 1597 } 1598 else { 1599 resetPreserveRegion(); 1600 } 1601 1602 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1603 MatchChunkAt((int32_t)fActiveStart, TRUE, status); 1604 } else { 1605 MatchAt(fActiveStart, TRUE, status); 1606 } 1607 return fMatch; 1608 } 1609 1610 1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { 1612 if (U_FAILURE(status)) { 1613 return FALSE; 1614 } 1615 if (U_FAILURE(fDeferredStatus)) { 1616 status = fDeferredStatus; 1617 return FALSE; 1618 } 1619 reset(); 1620 1621 if (start < 0) { 1622 status = U_INDEX_OUTOFBOUNDS_ERROR; 1623 return FALSE; 1624 } 1625 1626 if (fInputUniStrMaybeMutable) { 1627 if (compat_SyncMutableUTextContents(fInputText)) { 1628 fInputLength = utext_nativeLength(fInputText); 1629 reset(); 1630 } 1631 } 1632 1633 int64_t nativeStart; 1634 nativeStart = start; 1635 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1636 status = U_INDEX_OUTOFBOUNDS_ERROR; 1637 return FALSE; 1638 } 1639 1640 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1641 MatchChunkAt((int32_t)nativeStart, TRUE, status); 1642 } else { 1643 MatchAt(nativeStart, TRUE, status); 1644 } 1645 return fMatch; 1646 } 1647 1648 1649 1650 //-------------------------------------------------------------------------------- 1651 // 1652 // pattern 1653 // 1654 //-------------------------------------------------------------------------------- 1655 const RegexPattern &RegexMatcher::pattern() const { 1656 return *fPattern; 1657 } 1658 1659 1660 1661 //-------------------------------------------------------------------------------- 1662 // 1663 // region 1664 // 1665 //-------------------------------------------------------------------------------- 1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { 1667 if (U_FAILURE(status)) { 1668 return *this; 1669 } 1670 1671 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { 1672 status = U_ILLEGAL_ARGUMENT_ERROR; 1673 } 1674 1675 int64_t nativeStart = regionStart; 1676 int64_t nativeLimit = regionLimit; 1677 if (nativeStart > fInputLength || nativeLimit > fInputLength) { 1678 status = U_ILLEGAL_ARGUMENT_ERROR; 1679 } 1680 1681 if (startIndex == -1) 1682 this->reset(); 1683 else 1684 resetPreserveRegion(); 1685 1686 fRegionStart = nativeStart; 1687 fRegionLimit = nativeLimit; 1688 fActiveStart = nativeStart; 1689 fActiveLimit = nativeLimit; 1690 1691 if (startIndex != -1) { 1692 if (startIndex < fActiveStart || startIndex > fActiveLimit) { 1693 status = U_INDEX_OUTOFBOUNDS_ERROR; 1694 } 1695 fMatchEnd = startIndex; 1696 } 1697 1698 if (!fTransparentBounds) { 1699 fLookStart = nativeStart; 1700 fLookLimit = nativeLimit; 1701 } 1702 if (fAnchoringBounds) { 1703 fAnchorStart = nativeStart; 1704 fAnchorLimit = nativeLimit; 1705 } 1706 return *this; 1707 } 1708 1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { 1710 return region(start, limit, -1, status); 1711 } 1712 1713 //-------------------------------------------------------------------------------- 1714 // 1715 // regionEnd 1716 // 1717 //-------------------------------------------------------------------------------- 1718 int32_t RegexMatcher::regionEnd() const { 1719 return (int32_t)fRegionLimit; 1720 } 1721 1722 int64_t RegexMatcher::regionEnd64() const { 1723 return fRegionLimit; 1724 } 1725 1726 //-------------------------------------------------------------------------------- 1727 // 1728 // regionStart 1729 // 1730 //-------------------------------------------------------------------------------- 1731 int32_t RegexMatcher::regionStart() const { 1732 return (int32_t)fRegionStart; 1733 } 1734 1735 int64_t RegexMatcher::regionStart64() const { 1736 return fRegionStart; 1737 } 1738 1739 1740 //-------------------------------------------------------------------------------- 1741 // 1742 // replaceAll 1743 // 1744 //-------------------------------------------------------------------------------- 1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { 1746 UText replacementText = UTEXT_INITIALIZER; 1747 UText resultText = UTEXT_INITIALIZER; 1748 UnicodeString resultString; 1749 if (U_FAILURE(status)) { 1750 return resultString; 1751 } 1752 1753 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1754 utext_openUnicodeString(&resultText, &resultString, &status); 1755 1756 replaceAll(&replacementText, &resultText, status); 1757 1758 utext_close(&resultText); 1759 utext_close(&replacementText); 1760 1761 return resultString; 1762 } 1763 1764 1765 // 1766 // replaceAll, UText mode 1767 // 1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { 1769 if (U_FAILURE(status)) { 1770 return dest; 1771 } 1772 if (U_FAILURE(fDeferredStatus)) { 1773 status = fDeferredStatus; 1774 return dest; 1775 } 1776 1777 if (dest == NULL) { 1778 UnicodeString emptyString; 1779 UText empty = UTEXT_INITIALIZER; 1780 1781 utext_openUnicodeString(&empty, &emptyString, &status); 1782 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1783 utext_close(&empty); 1784 } 1785 1786 if (U_SUCCESS(status)) { 1787 reset(); 1788 while (find()) { 1789 appendReplacement(dest, replacement, status); 1790 if (U_FAILURE(status)) { 1791 break; 1792 } 1793 } 1794 appendTail(dest, status); 1795 } 1796 1797 return dest; 1798 } 1799 1800 1801 //-------------------------------------------------------------------------------- 1802 // 1803 // replaceFirst 1804 // 1805 //-------------------------------------------------------------------------------- 1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { 1807 UText replacementText = UTEXT_INITIALIZER; 1808 UText resultText = UTEXT_INITIALIZER; 1809 UnicodeString resultString; 1810 1811 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1812 utext_openUnicodeString(&resultText, &resultString, &status); 1813 1814 replaceFirst(&replacementText, &resultText, status); 1815 1816 utext_close(&resultText); 1817 utext_close(&replacementText); 1818 1819 return resultString; 1820 } 1821 1822 // 1823 // replaceFirst, UText mode 1824 // 1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { 1826 if (U_FAILURE(status)) { 1827 return dest; 1828 } 1829 if (U_FAILURE(fDeferredStatus)) { 1830 status = fDeferredStatus; 1831 return dest; 1832 } 1833 1834 reset(); 1835 if (!find()) { 1836 return getInput(dest, status); 1837 } 1838 1839 if (dest == NULL) { 1840 UnicodeString emptyString; 1841 UText empty = UTEXT_INITIALIZER; 1842 1843 utext_openUnicodeString(&empty, &emptyString, &status); 1844 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1845 utext_close(&empty); 1846 } 1847 1848 appendReplacement(dest, replacement, status); 1849 appendTail(dest, status); 1850 1851 return dest; 1852 } 1853 1854 1855 //-------------------------------------------------------------------------------- 1856 // 1857 // requireEnd 1858 // 1859 //-------------------------------------------------------------------------------- 1860 UBool RegexMatcher::requireEnd() const { 1861 return fRequireEnd; 1862 } 1863 1864 1865 //-------------------------------------------------------------------------------- 1866 // 1867 // reset 1868 // 1869 //-------------------------------------------------------------------------------- 1870 RegexMatcher &RegexMatcher::reset() { 1871 fRegionStart = 0; 1872 fRegionLimit = fInputLength; 1873 fActiveStart = 0; 1874 fActiveLimit = fInputLength; 1875 fAnchorStart = 0; 1876 fAnchorLimit = fInputLength; 1877 fLookStart = 0; 1878 fLookLimit = fInputLength; 1879 resetPreserveRegion(); 1880 return *this; 1881 } 1882 1883 1884 1885 void RegexMatcher::resetPreserveRegion() { 1886 fMatchStart = 0; 1887 fMatchEnd = 0; 1888 fLastMatchEnd = -1; 1889 fAppendPosition = 0; 1890 fMatch = FALSE; 1891 fHitEnd = FALSE; 1892 fRequireEnd = FALSE; 1893 fTime = 0; 1894 fTickCounter = TIMER_INITIAL_VALUE; 1895 //resetStack(); // more expensive than it looks... 1896 } 1897 1898 1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1900 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); 1901 if (fPattern->fNeedsAltInput) { 1902 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1903 } 1904 fInputLength = utext_nativeLength(fInputText); 1905 1906 reset(); 1907 delete fInput; 1908 fInput = NULL; 1909 1910 // Do the following for any UnicodeString. 1911 // This is for compatibility for those clients who modify the input string "live" during regex operations. 1912 fInputUniStrMaybeMutable = TRUE; 1913 1914 if (fWordBreakItr != NULL) { 1915 #if UCONFIG_NO_BREAK_ITERATION==0 1916 UErrorCode status = U_ZERO_ERROR; 1917 fWordBreakItr->setText(fInputText, status); 1918 #endif 1919 } 1920 return *this; 1921 } 1922 1923 1924 RegexMatcher &RegexMatcher::reset(UText *input) { 1925 if (fInputText != input) { 1926 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); 1927 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1928 fInputLength = utext_nativeLength(fInputText); 1929 1930 delete fInput; 1931 fInput = NULL; 1932 1933 if (fWordBreakItr != NULL) { 1934 #if UCONFIG_NO_BREAK_ITERATION==0 1935 UErrorCode status = U_ZERO_ERROR; 1936 fWordBreakItr->setText(input, status); 1937 #endif 1938 } 1939 } 1940 reset(); 1941 fInputUniStrMaybeMutable = FALSE; 1942 1943 return *this; 1944 } 1945 1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) { 1947 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 1948 return *this; 1949 }*/ 1950 1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { 1952 if (U_FAILURE(status)) { 1953 return *this; 1954 } 1955 reset(); // Reset also resets the region to be the entire string. 1956 1957 if (position < 0 || position > fActiveLimit) { 1958 status = U_INDEX_OUTOFBOUNDS_ERROR; 1959 return *this; 1960 } 1961 fMatchEnd = position; 1962 return *this; 1963 } 1964 1965 1966 1967 1968 1969 //-------------------------------------------------------------------------------- 1970 // 1971 // setTrace 1972 // 1973 //-------------------------------------------------------------------------------- 1974 void RegexMatcher::setTrace(UBool state) { 1975 fTraceDebug = state; 1976 } 1977 1978 1979 1980 //--------------------------------------------------------------------- 1981 // 1982 // split 1983 // 1984 //--------------------------------------------------------------------- 1985 int32_t RegexMatcher::split(const UnicodeString &input, 1986 UnicodeString dest[], 1987 int32_t destCapacity, 1988 UErrorCode &status) 1989 { 1990 UText inputText = UTEXT_INITIALIZER; 1991 utext_openConstUnicodeString(&inputText, &input, &status); 1992 if (U_FAILURE(status)) { 1993 return 0; 1994 } 1995 1996 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); 1997 if (destText == NULL) { 1998 status = U_MEMORY_ALLOCATION_ERROR; 1999 return 0; 2000 } 2001 int32_t i; 2002 for (i = 0; i < destCapacity; i++) { 2003 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); 2004 } 2005 2006 int32_t fieldCount = split(&inputText, destText, destCapacity, status); 2007 2008 for (i = 0; i < destCapacity; i++) { 2009 utext_close(destText[i]); 2010 } 2011 2012 uprv_free(destText); 2013 utext_close(&inputText); 2014 return fieldCount; 2015 } 2016 2017 // 2018 // split, UText mode 2019 // 2020 int32_t RegexMatcher::split(UText *input, 2021 UText *dest[], 2022 int32_t destCapacity, 2023 UErrorCode &status) 2024 { 2025 // 2026 // Check arguements for validity 2027 // 2028 if (U_FAILURE(status)) { 2029 return 0; 2030 }; 2031 2032 if (destCapacity < 1) { 2033 status = U_ILLEGAL_ARGUMENT_ERROR; 2034 return 0; 2035 } 2036 2037 // 2038 // Reset for the input text 2039 // 2040 reset(input); 2041 int64_t nextOutputStringStart = 0; 2042 if (fActiveLimit == 0) { 2043 return 0; 2044 } 2045 2046 // 2047 // Loop through the input text, searching for the delimiter pattern 2048 // 2049 int32_t i; 2050 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 2051 for (i=0; ; i++) { 2052 if (i>=destCapacity-1) { 2053 // There is one or zero output string left. 2054 // Fill the last output string with whatever is left from the input, then exit the loop. 2055 // ( i will be == destCapacity if we filled the output array while processing 2056 // capture groups of the delimiter expression, in which case we will discard the 2057 // last capture group saved in favor of the unprocessed remainder of the 2058 // input string.) 2059 i = destCapacity-1; 2060 if (fActiveLimit > nextOutputStringStart) { 2061 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2062 if (dest[i]) { 2063 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2064 input->chunkContents+nextOutputStringStart, 2065 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2066 } else { 2067 UText remainingText = UTEXT_INITIALIZER; 2068 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2069 fActiveLimit-nextOutputStringStart, &status); 2070 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2071 utext_close(&remainingText); 2072 } 2073 } else { 2074 UErrorCode lengthStatus = U_ZERO_ERROR; 2075 int32_t remaining16Length = 2076 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2077 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2078 if (remainingChars == NULL) { 2079 status = U_MEMORY_ALLOCATION_ERROR; 2080 break; 2081 } 2082 2083 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2084 if (dest[i]) { 2085 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2086 } else { 2087 UText remainingText = UTEXT_INITIALIZER; 2088 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2089 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2090 utext_close(&remainingText); 2091 } 2092 2093 uprv_free(remainingChars); 2094 } 2095 } 2096 break; 2097 } 2098 if (find()) { 2099 // We found another delimiter. Move everything from where we started looking 2100 // up until the start of the delimiter into the next output string. 2101 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2102 if (dest[i]) { 2103 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2104 input->chunkContents+nextOutputStringStart, 2105 (int32_t)(fMatchStart-nextOutputStringStart), &status); 2106 } else { 2107 UText remainingText = UTEXT_INITIALIZER; 2108 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2109 fMatchStart-nextOutputStringStart, &status); 2110 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2111 utext_close(&remainingText); 2112 } 2113 } else { 2114 UErrorCode lengthStatus = U_ZERO_ERROR; 2115 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); 2116 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2117 if (remainingChars == NULL) { 2118 status = U_MEMORY_ALLOCATION_ERROR; 2119 break; 2120 } 2121 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); 2122 if (dest[i]) { 2123 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2124 } else { 2125 UText remainingText = UTEXT_INITIALIZER; 2126 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2127 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2128 utext_close(&remainingText); 2129 } 2130 2131 uprv_free(remainingChars); 2132 } 2133 nextOutputStringStart = fMatchEnd; 2134 2135 // If the delimiter pattern has capturing parentheses, the captured 2136 // text goes out into the next n destination strings. 2137 int32_t groupNum; 2138 UBool lastGroupWasNullUText = FALSE; 2139 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 2140 if (i==destCapacity-1) { 2141 break; 2142 } 2143 i++; 2144 lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE); 2145 dest[i] = group(groupNum, dest[i], status); 2146 } 2147 2148 if (nextOutputStringStart == fActiveLimit) { 2149 // The delimiter was at the end of the string. We're done. 2150 break; 2151 } else if (i == destCapacity-1) { 2152 // We're out of capture groups, and the rest of the string is more important 2153 if (lastGroupWasNullUText) { 2154 utext_close(dest[i]); 2155 dest[i] = NULL; 2156 } 2157 } 2158 2159 } 2160 else 2161 { 2162 // We ran off the end of the input while looking for the next delimiter. 2163 // All the remaining text goes into the current output string. 2164 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2165 if (dest[i]) { 2166 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2167 input->chunkContents+nextOutputStringStart, 2168 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2169 } else { 2170 UText remainingText = UTEXT_INITIALIZER; 2171 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2172 fActiveLimit-nextOutputStringStart, &status); 2173 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2174 utext_close(&remainingText); 2175 } 2176 } else { 2177 UErrorCode lengthStatus = U_ZERO_ERROR; 2178 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2179 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2180 if (remainingChars == NULL) { 2181 status = U_MEMORY_ALLOCATION_ERROR; 2182 break; 2183 } 2184 2185 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2186 if (dest[i]) { 2187 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2188 } else { 2189 UText remainingText = UTEXT_INITIALIZER; 2190 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2191 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2192 utext_close(&remainingText); 2193 } 2194 2195 uprv_free(remainingChars); 2196 } 2197 break; 2198 } 2199 if (U_FAILURE(status)) { 2200 break; 2201 } 2202 } // end of for loop 2203 return i+1; 2204 } 2205 2206 2207 //-------------------------------------------------------------------------------- 2208 // 2209 // start 2210 // 2211 //-------------------------------------------------------------------------------- 2212 int32_t RegexMatcher::start(UErrorCode &status) const { 2213 return start(0, status); 2214 } 2215 2216 int64_t RegexMatcher::start64(UErrorCode &status) const { 2217 return start64(0, status); 2218 } 2219 2220 //-------------------------------------------------------------------------------- 2221 // 2222 // start(int32_t group, UErrorCode &status) 2223 // 2224 //-------------------------------------------------------------------------------- 2225 2226 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { 2227 if (U_FAILURE(status)) { 2228 return -1; 2229 } 2230 if (U_FAILURE(fDeferredStatus)) { 2231 status = fDeferredStatus; 2232 return -1; 2233 } 2234 if (fMatch == FALSE) { 2235 status = U_REGEX_INVALID_STATE; 2236 return -1; 2237 } 2238 if (group < 0 || group > fPattern->fGroupMap->size()) { 2239 status = U_INDEX_OUTOFBOUNDS_ERROR; 2240 return -1; 2241 } 2242 int64_t s; 2243 if (group == 0) { 2244 s = fMatchStart; 2245 } else { 2246 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 2247 U_ASSERT(groupOffset < fPattern->fFrameSize); 2248 U_ASSERT(groupOffset >= 0); 2249 s = fFrame->fExtra[groupOffset]; 2250 } 2251 2252 return s; 2253 } 2254 2255 2256 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { 2257 return (int32_t)start64(group, status); 2258 } 2259 2260 //-------------------------------------------------------------------------------- 2261 // 2262 // useAnchoringBounds 2263 // 2264 //-------------------------------------------------------------------------------- 2265 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { 2266 fAnchoringBounds = b; 2267 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); 2268 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); 2269 return *this; 2270 } 2271 2272 2273 //-------------------------------------------------------------------------------- 2274 // 2275 // useTransparentBounds 2276 // 2277 //-------------------------------------------------------------------------------- 2278 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { 2279 fTransparentBounds = b; 2280 fLookStart = (fTransparentBounds ? 0 : fRegionStart); 2281 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); 2282 return *this; 2283 } 2284 2285 //-------------------------------------------------------------------------------- 2286 // 2287 // setTimeLimit 2288 // 2289 //-------------------------------------------------------------------------------- 2290 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { 2291 if (U_FAILURE(status)) { 2292 return; 2293 } 2294 if (U_FAILURE(fDeferredStatus)) { 2295 status = fDeferredStatus; 2296 return; 2297 } 2298 if (limit < 0) { 2299 status = U_ILLEGAL_ARGUMENT_ERROR; 2300 return; 2301 } 2302 fTimeLimit = limit; 2303 } 2304 2305 2306 //-------------------------------------------------------------------------------- 2307 // 2308 // getTimeLimit 2309 // 2310 //-------------------------------------------------------------------------------- 2311 int32_t RegexMatcher::getTimeLimit() const { 2312 return fTimeLimit; 2313 } 2314 2315 2316 //-------------------------------------------------------------------------------- 2317 // 2318 // setStackLimit 2319 // 2320 //-------------------------------------------------------------------------------- 2321 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { 2322 if (U_FAILURE(status)) { 2323 return; 2324 } 2325 if (U_FAILURE(fDeferredStatus)) { 2326 status = fDeferredStatus; 2327 return; 2328 } 2329 if (limit < 0) { 2330 status = U_ILLEGAL_ARGUMENT_ERROR; 2331 return; 2332 } 2333 2334 // Reset the matcher. This is needed here in case there is a current match 2335 // whose final stack frame (containing the match results, pointed to by fFrame) 2336 // would be lost by resizing to a smaller stack size. 2337 reset(); 2338 2339 if (limit == 0) { 2340 // Unlimited stack expansion 2341 fStack->setMaxCapacity(0); 2342 } else { 2343 // Change the units of the limit from bytes to ints, and bump the size up 2344 // to be big enough to hold at least one stack frame for the pattern, 2345 // if it isn't there already. 2346 int32_t adjustedLimit = limit / sizeof(int32_t); 2347 if (adjustedLimit < fPattern->fFrameSize) { 2348 adjustedLimit = fPattern->fFrameSize; 2349 } 2350 fStack->setMaxCapacity(adjustedLimit); 2351 } 2352 fStackLimit = limit; 2353 } 2354 2355 2356 //-------------------------------------------------------------------------------- 2357 // 2358 // getStackLimit 2359 // 2360 //-------------------------------------------------------------------------------- 2361 int32_t RegexMatcher::getStackLimit() const { 2362 return fStackLimit; 2363 } 2364 2365 2366 //-------------------------------------------------------------------------------- 2367 // 2368 // setMatchCallback 2369 // 2370 //-------------------------------------------------------------------------------- 2371 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, 2372 const void *context, 2373 UErrorCode &status) { 2374 if (U_FAILURE(status)) { 2375 return; 2376 } 2377 fCallbackFn = callback; 2378 fCallbackContext = context; 2379 } 2380 2381 2382 //-------------------------------------------------------------------------------- 2383 // 2384 // getMatchCallback 2385 // 2386 //-------------------------------------------------------------------------------- 2387 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, 2388 const void *&context, 2389 UErrorCode &status) { 2390 if (U_FAILURE(status)) { 2391 return; 2392 } 2393 callback = fCallbackFn; 2394 context = fCallbackContext; 2395 } 2396 2397 2398 //-------------------------------------------------------------------------------- 2399 // 2400 // setMatchCallback 2401 // 2402 //-------------------------------------------------------------------------------- 2403 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, 2404 const void *context, 2405 UErrorCode &status) { 2406 if (U_FAILURE(status)) { 2407 return; 2408 } 2409 fFindProgressCallbackFn = callback; 2410 fFindProgressCallbackContext = context; 2411 } 2412 2413 2414 //-------------------------------------------------------------------------------- 2415 // 2416 // getMatchCallback 2417 // 2418 //-------------------------------------------------------------------------------- 2419 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, 2420 const void *&context, 2421 UErrorCode &status) { 2422 if (U_FAILURE(status)) { 2423 return; 2424 } 2425 callback = fFindProgressCallbackFn; 2426 context = fFindProgressCallbackContext; 2427 } 2428 2429 2430 //================================================================================ 2431 // 2432 // Code following this point in this file is the internal 2433 // Match Engine Implementation. 2434 // 2435 //================================================================================ 2436 2437 2438 //-------------------------------------------------------------------------------- 2439 // 2440 // resetStack 2441 // Discard any previous contents of the state save stack, and initialize a 2442 // new stack frame to all -1. The -1s are needed for capture group limits, 2443 // where they indicate that a group has not yet matched anything. 2444 //-------------------------------------------------------------------------------- 2445 REStackFrame *RegexMatcher::resetStack() { 2446 // Discard any previous contents of the state save stack, and initialize a 2447 // new stack frame with all -1 data. The -1s are needed for capture group limits, 2448 // where they indicate that a group has not yet matched anything. 2449 fStack->removeAllElements(); 2450 2451 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); 2452 int32_t i; 2453 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 2454 iFrame->fExtra[i] = -1; 2455 } 2456 return iFrame; 2457 } 2458 2459 2460 2461 //-------------------------------------------------------------------------------- 2462 // 2463 // isWordBoundary 2464 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 2465 // For us, 2466 // If the current char is a combining mark, 2467 // \b is FALSE. 2468 // Else Scan backwards to the first non-combining char. 2469 // We are at a boundary if the this char and the original chars are 2470 // opposite in membership in \w set 2471 // 2472 // parameters: pos - the current position in the input buffer 2473 // 2474 // TODO: double-check edge cases at region boundaries. 2475 // 2476 //-------------------------------------------------------------------------------- 2477 UBool RegexMatcher::isWordBoundary(int64_t pos) { 2478 UBool isBoundary = FALSE; 2479 UBool cIsWord = FALSE; 2480 2481 if (pos >= fLookLimit) { 2482 fHitEnd = TRUE; 2483 } else { 2484 // Determine whether char c at current position is a member of the word set of chars. 2485 // If we're off the end of the string, behave as though we're not at a word char. 2486 UTEXT_SETNATIVEINDEX(fInputText, pos); 2487 UChar32 c = UTEXT_CURRENT32(fInputText); 2488 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2489 // Current char is a combining one. Not a boundary. 2490 return FALSE; 2491 } 2492 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2493 } 2494 2495 // Back up until we come to a non-combining char, determine whether 2496 // that char is a word char. 2497 UBool prevCIsWord = FALSE; 2498 for (;;) { 2499 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { 2500 break; 2501 } 2502 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); 2503 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2504 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2505 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2506 break; 2507 } 2508 } 2509 isBoundary = cIsWord ^ prevCIsWord; 2510 return isBoundary; 2511 } 2512 2513 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { 2514 UBool isBoundary = FALSE; 2515 UBool cIsWord = FALSE; 2516 2517 const UChar *inputBuf = fInputText->chunkContents; 2518 2519 if (pos >= fLookLimit) { 2520 fHitEnd = TRUE; 2521 } else { 2522 // Determine whether char c at current position is a member of the word set of chars. 2523 // If we're off the end of the string, behave as though we're not at a word char. 2524 UChar32 c; 2525 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); 2526 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2527 // Current char is a combining one. Not a boundary. 2528 return FALSE; 2529 } 2530 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2531 } 2532 2533 // Back up until we come to a non-combining char, determine whether 2534 // that char is a word char. 2535 UBool prevCIsWord = FALSE; 2536 for (;;) { 2537 if (pos <= fLookStart) { 2538 break; 2539 } 2540 UChar32 prevChar; 2541 U16_PREV(inputBuf, fLookStart, pos, prevChar); 2542 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2543 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2544 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2545 break; 2546 } 2547 } 2548 isBoundary = cIsWord ^ prevCIsWord; 2549 return isBoundary; 2550 } 2551 2552 //-------------------------------------------------------------------------------- 2553 // 2554 // isUWordBoundary 2555 // 2556 // Test for a word boundary using RBBI word break. 2557 // 2558 // parameters: pos - the current position in the input buffer 2559 // 2560 //-------------------------------------------------------------------------------- 2561 UBool RegexMatcher::isUWordBoundary(int64_t pos) { 2562 UBool returnVal = FALSE; 2563 #if UCONFIG_NO_BREAK_ITERATION==0 2564 2565 // If we haven't yet created a break iterator for this matcher, do it now. 2566 if (fWordBreakItr == NULL) { 2567 fWordBreakItr = 2568 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); 2569 if (U_FAILURE(fDeferredStatus)) { 2570 return FALSE; 2571 } 2572 fWordBreakItr->setText(fInputText, fDeferredStatus); 2573 } 2574 2575 if (pos >= fLookLimit) { 2576 fHitEnd = TRUE; 2577 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" 2578 // words are not boundaries. All non-word chars stand by themselves, 2579 // with word boundaries on both sides. 2580 } else { 2581 if (!UTEXT_USES_U16(fInputText)) { 2582 // !!!: Would like a better way to do this! 2583 UErrorCode status = U_ZERO_ERROR; 2584 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); 2585 } 2586 returnVal = fWordBreakItr->isBoundary((int32_t)pos); 2587 } 2588 #endif 2589 return returnVal; 2590 } 2591 2592 //-------------------------------------------------------------------------------- 2593 // 2594 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state 2595 // saves. Increment the "time" counter, and call the 2596 // user callback function if there is one installed. 2597 // 2598 // If the match operation needs to be aborted, either for a time-out 2599 // or because the user callback asked for it, just set an error status. 2600 // The engine will pick that up and stop in its outer loop. 2601 // 2602 //-------------------------------------------------------------------------------- 2603 void RegexMatcher::IncrementTime(UErrorCode &status) { 2604 fTickCounter = TIMER_INITIAL_VALUE; 2605 fTime++; 2606 if (fCallbackFn != NULL) { 2607 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { 2608 status = U_REGEX_STOPPED_BY_CALLER; 2609 return; 2610 } 2611 } 2612 if (fTimeLimit > 0 && fTime >= fTimeLimit) { 2613 status = U_REGEX_TIME_OUT; 2614 } 2615 } 2616 2617 //-------------------------------------------------------------------------------- 2618 // 2619 // ReportFindProgress This function is called once for each advance in the target 2620 // string from the find() function, and calls the user progress callback 2621 // function if there is one installed. 2622 // 2623 // NOTE: 2624 // 2625 // If the match operation needs to be aborted because the user 2626 // callback asked for it, just set an error status. 2627 // The engine will pick that up and stop in its outer loop. 2628 // 2629 //-------------------------------------------------------------------------------- 2630 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) { 2631 if (fFindProgressCallbackFn != NULL) { 2632 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) { 2633 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/; 2634 return FALSE; 2635 } 2636 } 2637 return TRUE; 2638 } 2639 2640 //-------------------------------------------------------------------------------- 2641 // 2642 // StateSave 2643 // Make a new stack frame, initialized as a copy of the current stack frame. 2644 // Set the pattern index in the original stack frame from the operand value 2645 // in the opcode. Execution of the engine continues with the state in 2646 // the newly created stack frame 2647 // 2648 // Note that reserveBlock() may grow the stack, resulting in the 2649 // whole thing being relocated in memory. 2650 // 2651 // Parameters: 2652 // fp The top frame pointer when called. At return, a new 2653 // fame will be present 2654 // savePatIdx An index into the compiled pattern. Goes into the original 2655 // (not new) frame. If execution ever back-tracks out of the 2656 // new frame, this will be where we continue from in the pattern. 2657 // Return 2658 // The new frame pointer. 2659 // 2660 //-------------------------------------------------------------------------------- 2661 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { 2662 // push storage for a new frame. 2663 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); 2664 if (newFP == NULL) { 2665 // Failure on attempted stack expansion. 2666 // Stack function set some other error code, change it to a more 2667 // specific one for regular expressions. 2668 status = U_REGEX_STACK_OVERFLOW; 2669 // We need to return a writable stack frame, so just return the 2670 // previous frame. The match operation will stop quickly 2671 // because of the error status, after which the frame will never 2672 // be looked at again. 2673 return fp; 2674 } 2675 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. 2676 2677 // New stack frame = copy of old top frame. 2678 int64_t *source = (int64_t *)fp; 2679 int64_t *dest = newFP; 2680 for (;;) { 2681 *dest++ = *source++; 2682 if (source == newFP) { 2683 break; 2684 } 2685 } 2686 2687 fTickCounter--; 2688 if (fTickCounter <= 0) { 2689 IncrementTime(status); // Re-initializes fTickCounter 2690 } 2691 fp->fPatIdx = savePatIdx; 2692 return (REStackFrame *)newFP; 2693 } 2694 2695 2696 //-------------------------------------------------------------------------------- 2697 // 2698 // MatchAt This is the actual matching engine. 2699 // 2700 // startIdx: begin matching a this index. 2701 // toEnd: if true, match must extend to end of the input region 2702 // 2703 //-------------------------------------------------------------------------------- 2704 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { 2705 UBool isMatch = FALSE; // True if the we have a match. 2706 2707 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards 2708 2709 int32_t op; // Operation from the compiled pattern, split into 2710 int32_t opType; // the opcode 2711 int32_t opValue; // and the operand value. 2712 2713 #ifdef REGEX_RUN_DEBUG 2714 if (fTraceDebug) 2715 { 2716 printf("MatchAt(startIdx=%ld)\n", startIdx); 2717 printf("Original Pattern: "); 2718 UChar32 c = utext_next32From(fPattern->fPattern, 0); 2719 while (c != U_SENTINEL) { 2720 if (c<32 || c>256) { 2721 c = '.'; 2722 } 2723 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 2724 2725 c = UTEXT_NEXT32(fPattern->fPattern); 2726 } 2727 printf("\n"); 2728 printf("Input String: "); 2729 c = utext_next32From(fInputText, 0); 2730 while (c != U_SENTINEL) { 2731 if (c<32 || c>256) { 2732 c = '.'; 2733 } 2734 printf("%c", c); 2735 2736 c = UTEXT_NEXT32(fInputText); 2737 } 2738 printf("\n"); 2739 printf("\n"); 2740 } 2741 #endif 2742 2743 if (U_FAILURE(status)) { 2744 return; 2745 } 2746 2747 // Cache frequently referenced items from the compiled pattern 2748 // 2749 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 2750 2751 const UChar *litText = fPattern->fLiteralText.getBuffer(); 2752 UVector *sets = fPattern->fSets; 2753 2754 fFrameSize = fPattern->fFrameSize; 2755 REStackFrame *fp = resetStack(); 2756 2757 fp->fPatIdx = 0; 2758 fp->fInputIdx = startIdx; 2759 2760 // Zero out the pattern's static data 2761 int32_t i; 2762 for (i = 0; i<fPattern->fDataSize; i++) { 2763 fData[i] = 0; 2764 } 2765 2766 // 2767 // Main loop for interpreting the compiled pattern. 2768 // One iteration of the loop per pattern operation performed. 2769 // 2770 for (;;) { 2771 #if 0 2772 if (_heapchk() != _HEAPOK) { 2773 fprintf(stderr, "Heap Trouble\n"); 2774 } 2775 #endif 2776 2777 op = (int32_t)pat[fp->fPatIdx]; 2778 opType = URX_TYPE(op); 2779 opValue = URX_VAL(op); 2780 #ifdef REGEX_RUN_DEBUG 2781 if (fTraceDebug) { 2782 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2783 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, 2784 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 2785 fPattern->dumpOp(fp->fPatIdx); 2786 } 2787 #endif 2788 fp->fPatIdx++; 2789 2790 switch (opType) { 2791 2792 2793 case URX_NOP: 2794 break; 2795 2796 2797 case URX_BACKTRACK: 2798 // Force a backtrack. In some circumstances, the pattern compiler 2799 // will notice that the pattern can't possibly match anything, and will 2800 // emit one of these at that point. 2801 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2802 break; 2803 2804 2805 case URX_ONECHAR: 2806 if (fp->fInputIdx < fActiveLimit) { 2807 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2808 UChar32 c = UTEXT_NEXT32(fInputText); 2809 if (c == opValue) { 2810 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2811 break; 2812 } 2813 } else { 2814 fHitEnd = TRUE; 2815 } 2816 2817 #ifdef REGEX_SMART_BACKTRACKING 2818 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 2819 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 2820 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 2821 UBool success = FALSE; 2822 UChar32 c = UTEXT_PREVIOUS32(fInputText); 2823 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { 2824 if (c == opValue) { 2825 success = TRUE; 2826 break; 2827 } else if (c == U_SENTINEL) { 2828 break; 2829 } 2830 c = UTEXT_PREVIOUS32(fInputText); 2831 } 2832 if (success) { 2833 fHitEnd = FALSE; 2834 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2835 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2836 if (fp->fInputIdx > backSearchIndex) { 2837 fp = StateSave(fp, fp->fPatIdx, status); 2838 } 2839 fp->fPatIdx++; // Skip the LOOP_C, we just did that 2840 break; 2841 } 2842 } 2843 } 2844 #endif 2845 2846 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2847 break; 2848 2849 2850 case URX_STRING: 2851 { 2852 // Test input against a literal string. 2853 // Strings require two slots in the compiled pattern, one for the 2854 // offset to the string text, and one for the length. 2855 int32_t stringStartIdx = opValue; 2856 int32_t stringLen; 2857 2858 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 2859 fp->fPatIdx++; 2860 opType = URX_TYPE(op); 2861 stringLen = URX_VAL(op); 2862 U_ASSERT(opType == URX_STRING_LEN); 2863 U_ASSERT(stringLen >= 2); 2864 2865 const UChar *patternChars = litText+stringStartIdx; 2866 const UChar *patternEnd = patternChars+stringLen; 2867 2868 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2869 UChar32 c; 2870 UBool success = TRUE; 2871 2872 while (patternChars < patternEnd && success) { 2873 c = UTEXT_NEXT32(fInputText); 2874 2875 if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) { 2876 if (U_IS_BMP(c)) { 2877 success = (*patternChars == c); 2878 patternChars += 1; 2879 } else if (patternChars+1 < patternEnd) { 2880 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 2881 patternChars += 2; 2882 } 2883 } else { 2884 success = FALSE; 2885 fHitEnd = TRUE; // TODO: See ticket 6074 2886 } 2887 } 2888 2889 if (success) { 2890 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2891 } else { 2892 #ifdef REGEX_SMART_BACKTRACKING 2893 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 2894 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 2895 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 2896 // Reset to last start point 2897 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2898 patternChars = litText+stringStartIdx; 2899 2900 // Search backwards for a possible start 2901 do { 2902 c = UTEXT_PREVIOUS32(fInputText); 2903 if (c == U_SENTINEL) { 2904 break; 2905 } else if ((U_IS_BMP(c) && *patternChars == c) || 2906 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 2907 success = TRUE; 2908 break; 2909 } 2910 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 2911 2912 // And try again 2913 if (success) { 2914 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2915 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2916 if (fp->fInputIdx > backSearchIndex) { 2917 fp = StateSave(fp, fp->fPatIdx, status); 2918 } 2919 fp->fPatIdx++; // Skip the LOOP_C, we just did that 2920 break; 2921 } 2922 } 2923 } 2924 #endif 2925 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2926 } 2927 } 2928 break; 2929 2930 2931 case URX_STATE_SAVE: 2932 fp = StateSave(fp, opValue, status); 2933 break; 2934 2935 2936 case URX_END: 2937 // The match loop will exit via this path on a successful match, 2938 // when we reach the end of the pattern. 2939 if (toEnd && fp->fInputIdx != fActiveLimit) { 2940 // The pattern matched, but not to the end of input. Try some more. 2941 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2942 break; 2943 } 2944 isMatch = TRUE; 2945 goto breakFromLoop; 2946 2947 // Start and End Capture stack frame variables are laid out out like this: 2948 // fp->fExtra[opValue] - The start of a completed capture group 2949 // opValue+1 - The end of a completed capture group 2950 // opValue+2 - the start of a capture group whose end 2951 // has not yet been reached (and might not ever be). 2952 case URX_START_CAPTURE: 2953 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2954 fp->fExtra[opValue+2] = fp->fInputIdx; 2955 break; 2956 2957 2958 case URX_END_CAPTURE: 2959 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2960 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 2961 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 2962 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 2963 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 2964 break; 2965 2966 2967 case URX_DOLLAR: // $, test for End of line 2968 // or for position before new line at end of input 2969 { 2970 if (fp->fInputIdx >= fAnchorLimit) { 2971 // We really are at the end of input. Success. 2972 fHitEnd = TRUE; 2973 fRequireEnd = TRUE; 2974 break; 2975 } 2976 2977 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2978 2979 // If we are positioned just before a new-line that is located at the 2980 // end of input, succeed. 2981 UChar32 c = UTEXT_NEXT32(fInputText); 2982 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2983 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 2984 // If not in the middle of a CR/LF sequence 2985 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 2986 // At new-line at end of input. Success 2987 fHitEnd = TRUE; 2988 fRequireEnd = TRUE; 2989 2990 break; 2991 } 2992 } 2993 } else { 2994 UChar32 nextC = UTEXT_NEXT32(fInputText); 2995 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2996 fHitEnd = TRUE; 2997 fRequireEnd = TRUE; 2998 break; // At CR/LF at end of input. Success 2999 } 3000 } 3001 3002 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3003 } 3004 break; 3005 3006 3007 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 3008 if (fp->fInputIdx >= fAnchorLimit) { 3009 // Off the end of input. Success. 3010 fHitEnd = TRUE; 3011 fRequireEnd = TRUE; 3012 break; 3013 } else { 3014 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3015 UChar32 c = UTEXT_NEXT32(fInputText); 3016 // Either at the last character of input, or off the end. 3017 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { 3018 fHitEnd = TRUE; 3019 fRequireEnd = TRUE; 3020 break; 3021 } 3022 } 3023 3024 // Not at end of input. Back-track out. 3025 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3026 break; 3027 3028 3029 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 3030 { 3031 if (fp->fInputIdx >= fAnchorLimit) { 3032 // We really are at the end of input. Success. 3033 fHitEnd = TRUE; 3034 fRequireEnd = TRUE; 3035 break; 3036 } 3037 // If we are positioned just before a new-line, succeed. 3038 // It makes no difference where the new-line is within the input. 3039 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3040 UChar32 c = UTEXT_CURRENT32(fInputText); 3041 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 3042 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 3043 // In multi-line mode, hitting a new-line just before the end of input does not 3044 // set the hitEnd or requireEnd flags 3045 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { 3046 break; 3047 } 3048 } 3049 // not at a new line. Fail. 3050 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3051 } 3052 break; 3053 3054 3055 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 3056 { 3057 if (fp->fInputIdx >= fAnchorLimit) { 3058 // We really are at the end of input. Success. 3059 fHitEnd = TRUE; 3060 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 3061 break; // adding a new-line would not lose the match. 3062 } 3063 // If we are not positioned just before a new-line, the test fails; backtrack out. 3064 // It makes no difference where the new-line is within the input. 3065 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3066 if (UTEXT_CURRENT32(fInputText) != 0x0a) { 3067 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3068 } 3069 } 3070 break; 3071 3072 3073 case URX_CARET: // ^, test for start of line 3074 if (fp->fInputIdx != fAnchorStart) { 3075 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3076 } 3077 break; 3078 3079 3080 case URX_CARET_M: // ^, test for start of line in mulit-line mode 3081 { 3082 if (fp->fInputIdx == fAnchorStart) { 3083 // We are at the start input. Success. 3084 break; 3085 } 3086 // Check whether character just before the current pos is a new-line 3087 // unless we are at the end of input 3088 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3089 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3090 if ((fp->fInputIdx < fAnchorLimit) && 3091 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3092 // It's a new-line. ^ is true. Success. 3093 // TODO: what should be done with positions between a CR and LF? 3094 break; 3095 } 3096 // Not at the start of a line. Fail. 3097 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3098 } 3099 break; 3100 3101 3102 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 3103 { 3104 U_ASSERT(fp->fInputIdx >= fAnchorStart); 3105 if (fp->fInputIdx <= fAnchorStart) { 3106 // We are at the start input. Success. 3107 break; 3108 } 3109 // Check whether character just before the current pos is a new-line 3110 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 3111 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3112 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3113 if (c != 0x0a) { 3114 // Not at the start of a line. Back-track out. 3115 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3116 } 3117 } 3118 break; 3119 3120 case URX_BACKSLASH_B: // Test for word boundaries 3121 { 3122 UBool success = isWordBoundary(fp->fInputIdx); 3123 success ^= (opValue != 0); // flip sense for \B 3124 if (!success) { 3125 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3126 } 3127 } 3128 break; 3129 3130 3131 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 3132 { 3133 UBool success = isUWordBoundary(fp->fInputIdx); 3134 success ^= (opValue != 0); // flip sense for \B 3135 if (!success) { 3136 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3137 } 3138 } 3139 break; 3140 3141 3142 case URX_BACKSLASH_D: // Test for decimal digit 3143 { 3144 if (fp->fInputIdx >= fActiveLimit) { 3145 fHitEnd = TRUE; 3146 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3147 break; 3148 } 3149 3150 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3151 3152 UChar32 c = UTEXT_NEXT32(fInputText); 3153 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 3154 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 3155 success ^= (opValue != 0); // flip sense for \D 3156 if (success) { 3157 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3158 } else { 3159 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3160 } 3161 } 3162 break; 3163 3164 3165 case URX_BACKSLASH_G: // Test for position at end of previous match 3166 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 3167 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3168 } 3169 break; 3170 3171 3172 case URX_BACKSLASH_X: 3173 // Match a Grapheme, as defined by Unicode TR 29. 3174 // Differs slightly from Perl, which consumes combining marks independently 3175 // of context. 3176 { 3177 3178 // Fail if at end of input 3179 if (fp->fInputIdx >= fActiveLimit) { 3180 fHitEnd = TRUE; 3181 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3182 break; 3183 } 3184 3185 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3186 3187 // Examine (and consume) the current char. 3188 // Dispatch into a little state machine, based on the char. 3189 UChar32 c; 3190 c = UTEXT_NEXT32(fInputText); 3191 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3192 UnicodeSet **sets = fPattern->fStaticSets; 3193 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 3194 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 3195 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3196 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3197 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3198 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3199 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3200 goto GC_Extend; 3201 3202 3203 3204 GC_L: 3205 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3206 c = UTEXT_NEXT32(fInputText); 3207 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3208 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3209 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3210 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3211 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3212 UTEXT_PREVIOUS32(fInputText); 3213 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3214 goto GC_Extend; 3215 3216 GC_V: 3217 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3218 c = UTEXT_NEXT32(fInputText); 3219 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3220 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3221 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3222 UTEXT_PREVIOUS32(fInputText); 3223 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3224 goto GC_Extend; 3225 3226 GC_T: 3227 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3228 c = UTEXT_NEXT32(fInputText); 3229 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3230 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3231 UTEXT_PREVIOUS32(fInputText); 3232 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3233 goto GC_Extend; 3234 3235 GC_Extend: 3236 // Combining characters are consumed here 3237 for (;;) { 3238 if (fp->fInputIdx >= fActiveLimit) { 3239 break; 3240 } 3241 c = UTEXT_CURRENT32(fInputText); 3242 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 3243 break; 3244 } 3245 UTEXT_NEXT32(fInputText); 3246 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3247 } 3248 goto GC_Done; 3249 3250 GC_Control: 3251 // Most control chars stand alone (don't combine with combining chars), 3252 // except for that CR/LF sequence is a single grapheme cluster. 3253 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 3254 c = UTEXT_NEXT32(fInputText); 3255 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3256 } 3257 3258 GC_Done: 3259 if (fp->fInputIdx >= fActiveLimit) { 3260 fHitEnd = TRUE; 3261 } 3262 break; 3263 } 3264 3265 3266 3267 3268 case URX_BACKSLASH_Z: // Test for end of Input 3269 if (fp->fInputIdx < fAnchorLimit) { 3270 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3271 } else { 3272 fHitEnd = TRUE; 3273 fRequireEnd = TRUE; 3274 } 3275 break; 3276 3277 3278 3279 case URX_STATIC_SETREF: 3280 { 3281 // Test input character against one of the predefined sets 3282 // (Word Characters, for example) 3283 // The high bit of the op value is a flag for the match polarity. 3284 // 0: success if input char is in set. 3285 // 1: success if input char is not in set. 3286 if (fp->fInputIdx >= fActiveLimit) { 3287 fHitEnd = TRUE; 3288 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3289 break; 3290 } 3291 3292 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 3293 opValue &= ~URX_NEG_SET; 3294 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3295 3296 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3297 UChar32 c = UTEXT_NEXT32(fInputText); 3298 if (c < 256) { 3299 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3300 if (s8->contains(c)) { 3301 success = !success; 3302 } 3303 } else { 3304 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3305 if (s->contains(c)) { 3306 success = !success; 3307 } 3308 } 3309 if (success) { 3310 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3311 } else { 3312 // the character wasn't in the set. 3313 #ifdef REGEX_SMART_BACKTRACKING 3314 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3315 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3316 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3317 // Try to find it, backwards 3318 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3319 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset 3320 do { 3321 c = UTEXT_PREVIOUS32(fInputText); 3322 if (c == U_SENTINEL) { 3323 break; 3324 } else if (c < 256) { 3325 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3326 if (s8->contains(c)) { 3327 success = !success; 3328 } 3329 } else { 3330 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3331 if (s->contains(c)) { 3332 success = !success; 3333 } 3334 } 3335 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success); 3336 3337 if (success && c != U_SENTINEL) { 3338 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3339 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3340 if (fp->fInputIdx > backSearchIndex) { 3341 fp = StateSave(fp, fp->fPatIdx, status); 3342 } 3343 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3344 break; 3345 } 3346 } 3347 } 3348 #endif 3349 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3350 } 3351 } 3352 break; 3353 3354 3355 case URX_STAT_SETREF_N: 3356 { 3357 // Test input character for NOT being a member of one of 3358 // the predefined sets (Word Characters, for example) 3359 if (fp->fInputIdx >= fActiveLimit) { 3360 fHitEnd = TRUE; 3361 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3362 break; 3363 } 3364 3365 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3366 3367 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3368 3369 UChar32 c = UTEXT_NEXT32(fInputText); 3370 if (c < 256) { 3371 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3372 if (s8->contains(c) == FALSE) { 3373 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3374 break; 3375 } 3376 } else { 3377 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3378 if (s->contains(c) == FALSE) { 3379 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3380 break; 3381 } 3382 } 3383 // the character wasn't in the set. 3384 #ifdef REGEX_SMART_BACKTRACKING 3385 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3386 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3387 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3388 // Try to find it, backwards 3389 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3390 UBool success = FALSE; 3391 do { 3392 c = UTEXT_PREVIOUS32(fInputText); 3393 if (c == U_SENTINEL) { 3394 break; 3395 } else if (c < 256) { 3396 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3397 if (s8->contains(c) == FALSE) { 3398 success = TRUE; 3399 break; 3400 } 3401 } else { 3402 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3403 if (s->contains(c) == FALSE) { 3404 success = TRUE; 3405 break; 3406 } 3407 } 3408 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 3409 3410 if (success) { 3411 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3412 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3413 if (fp->fInputIdx > backSearchIndex) { 3414 fp = StateSave(fp, fp->fPatIdx, status); 3415 } 3416 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3417 break; 3418 } 3419 } 3420 } 3421 #endif 3422 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3423 } 3424 break; 3425 3426 3427 case URX_SETREF: 3428 if (fp->fInputIdx >= fActiveLimit) { 3429 fHitEnd = TRUE; 3430 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3431 break; 3432 } else { 3433 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3434 3435 // There is input left. Pick up one char and test it for set membership. 3436 UChar32 c = UTEXT_NEXT32(fInputText); 3437 U_ASSERT(opValue > 0 && opValue < sets->size()); 3438 if (c<256) { 3439 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3440 if (s8->contains(c)) { 3441 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3442 break; 3443 } 3444 } else { 3445 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3446 if (s->contains(c)) { 3447 // The character is in the set. A Match. 3448 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3449 break; 3450 } 3451 } 3452 3453 // the character wasn't in the set. 3454 #ifdef REGEX_SMART_BACKTRACKING 3455 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3456 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3457 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3458 // Try to find it, backwards 3459 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3460 UBool success = FALSE; 3461 do { 3462 c = UTEXT_PREVIOUS32(fInputText); 3463 if (c == U_SENTINEL) { 3464 break; 3465 } else if (c < 256) { 3466 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3467 if (s8->contains(c)) { 3468 success = TRUE; 3469 break; 3470 } 3471 } else { 3472 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3473 if (s->contains(c)) { 3474 success = TRUE; 3475 break; 3476 } 3477 } 3478 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 3479 3480 if (success) { 3481 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3482 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3483 if (fp->fInputIdx > backSearchIndex) { 3484 fp = StateSave(fp, fp->fPatIdx, status); 3485 } 3486 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3487 break; 3488 } 3489 } 3490 } 3491 #endif 3492 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3493 } 3494 break; 3495 3496 3497 case URX_DOTANY: 3498 { 3499 // . matches anything, but stops at end-of-line. 3500 if (fp->fInputIdx >= fActiveLimit) { 3501 // At end of input. Match failed. Backtrack out. 3502 fHitEnd = TRUE; 3503 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3504 break; 3505 } 3506 3507 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3508 3509 // There is input left. Advance over one char, unless we've hit end-of-line 3510 UChar32 c = UTEXT_NEXT32(fInputText); 3511 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 3512 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3513 // End of line in normal mode. . does not match. 3514 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3515 break; 3516 } 3517 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3518 } 3519 break; 3520 3521 3522 case URX_DOTANY_ALL: 3523 { 3524 // ., in dot-matches-all (including new lines) mode 3525 if (fp->fInputIdx >= fActiveLimit) { 3526 // At end of input. Match failed. Backtrack out. 3527 fHitEnd = TRUE; 3528 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3529 break; 3530 } 3531 3532 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3533 3534 // There is input left. Advance over one char, except if we are 3535 // at a cr/lf, advance over both of them. 3536 UChar32 c; 3537 c = UTEXT_NEXT32(fInputText); 3538 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3539 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 3540 // In the case of a CR/LF, we need to advance over both. 3541 UChar32 nextc = UTEXT_CURRENT32(fInputText); 3542 if (nextc == 0x0a) { 3543 UTEXT_NEXT32(fInputText); 3544 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3545 } 3546 } 3547 } 3548 break; 3549 3550 3551 case URX_DOTANY_UNIX: 3552 { 3553 // '.' operator, matches all, but stops at end-of-line. 3554 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 3555 if (fp->fInputIdx >= fActiveLimit) { 3556 // At end of input. Match failed. Backtrack out. 3557 fHitEnd = TRUE; 3558 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3559 break; 3560 } 3561 3562 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3563 3564 // There is input left. Advance over one char, unless we've hit end-of-line 3565 UChar32 c = UTEXT_NEXT32(fInputText); 3566 if (c == 0x0a) { 3567 // End of line in normal mode. '.' does not match the \n 3568 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3569 } else { 3570 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3571 } 3572 } 3573 break; 3574 3575 3576 case URX_JMP: 3577 fp->fPatIdx = opValue; 3578 break; 3579 3580 case URX_FAIL: 3581 isMatch = FALSE; 3582 goto breakFromLoop; 3583 3584 case URX_JMP_SAV: 3585 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 3586 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3587 fp->fPatIdx = opValue; // Then JMP. 3588 break; 3589 3590 case URX_JMP_SAV_X: 3591 // This opcode is used with (x)+, when x can match a zero length string. 3592 // Same as JMP_SAV, except conditional on the match having made forward progress. 3593 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 3594 // data address of the input position at the start of the loop. 3595 { 3596 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 3597 int32_t stoOp = (int32_t)pat[opValue-1]; 3598 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 3599 int32_t frameLoc = URX_VAL(stoOp); 3600 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 3601 int64_t prevInputIdx = fp->fExtra[frameLoc]; 3602 U_ASSERT(prevInputIdx <= fp->fInputIdx); 3603 if (prevInputIdx < fp->fInputIdx) { 3604 // The match did make progress. Repeat the loop. 3605 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3606 fp->fPatIdx = opValue; 3607 fp->fExtra[frameLoc] = fp->fInputIdx; 3608 } 3609 // If the input position did not advance, we do nothing here, 3610 // execution will fall out of the loop. 3611 } 3612 break; 3613 3614 case URX_CTR_INIT: 3615 { 3616 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3617 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3618 3619 // Pick up the three extra operands that CTR_INIT has, and 3620 // skip the pattern location counter past 3621 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3622 fp->fPatIdx += 3; 3623 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3624 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3625 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3626 U_ASSERT(minCount>=0); 3627 U_ASSERT(maxCount>=minCount || maxCount==-1); 3628 U_ASSERT(loopLoc>fp->fPatIdx); 3629 3630 if (minCount == 0) { 3631 fp = StateSave(fp, loopLoc+1, status); 3632 } 3633 if (maxCount == 0) { 3634 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3635 } 3636 } 3637 break; 3638 3639 case URX_CTR_LOOP: 3640 { 3641 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3642 int32_t initOp = (int32_t)pat[opValue]; 3643 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 3644 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3645 int32_t minCount = (int32_t)pat[opValue+2]; 3646 int32_t maxCount = (int32_t)pat[opValue+3]; 3647 // Increment the counter. Note: we DIDN'T worry about counter 3648 // overflow, since the data comes from UnicodeStrings, which 3649 // stores its length in an int32_t. Do we have to think about 3650 // this now that we're using UText? Probably not, since the length 3651 // in UChar32s is still an int32_t. 3652 (*pCounter)++; 3653 U_ASSERT(*pCounter > 0); 3654 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 3655 U_ASSERT(*pCounter == maxCount || maxCount == -1); 3656 break; 3657 } 3658 if (*pCounter >= minCount) { 3659 fp = StateSave(fp, fp->fPatIdx, status); 3660 } 3661 fp->fPatIdx = opValue + 4; // Loop back. 3662 } 3663 break; 3664 3665 case URX_CTR_INIT_NG: 3666 { 3667 // Initialize a non-greedy loop 3668 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3669 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3670 3671 // Pick up the three extra operands that CTR_INIT has, and 3672 // skip the pattern location counter past 3673 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3674 fp->fPatIdx += 3; 3675 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3676 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3677 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3678 U_ASSERT(minCount>=0); 3679 U_ASSERT(maxCount>=minCount || maxCount==-1); 3680 U_ASSERT(loopLoc>fp->fPatIdx); 3681 3682 if (minCount == 0) { 3683 if (maxCount != 0) { 3684 fp = StateSave(fp, fp->fPatIdx, status); 3685 } 3686 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 3687 } 3688 } 3689 break; 3690 3691 case URX_CTR_LOOP_NG: 3692 { 3693 // Non-greedy {min, max} loops 3694 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3695 int32_t initOp = (int32_t)pat[opValue]; 3696 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 3697 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3698 int32_t minCount = (int32_t)pat[opValue+2]; 3699 int32_t maxCount = (int32_t)pat[opValue+3]; 3700 // Increment the counter. Note: we DIDN'T worry about counter 3701 // overflow, since the data comes from UnicodeStrings, which 3702 // stores its length in an int32_t. Do we have to think about 3703 // this now that we're using UText? Probably not, since the length 3704 // in UChar32s is still an int32_t. 3705 (*pCounter)++; 3706 U_ASSERT(*pCounter > 0); 3707 3708 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 3709 // The loop has matched the maximum permitted number of times. 3710 // Break out of here with no action. Matching will 3711 // continue with the following pattern. 3712 U_ASSERT(*pCounter == maxCount || maxCount == -1); 3713 break; 3714 } 3715 3716 if (*pCounter < minCount) { 3717 // We haven't met the minimum number of matches yet. 3718 // Loop back for another one. 3719 fp->fPatIdx = opValue + 4; // Loop back. 3720 } else { 3721 // We do have the minimum number of matches. 3722 // Fall into the following pattern, but first do 3723 // a state save to the top of the loop, so that a failure 3724 // in the following pattern will try another iteration of the loop. 3725 fp = StateSave(fp, opValue + 4, status); 3726 } 3727 } 3728 break; 3729 3730 case URX_STO_SP: 3731 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3732 fData[opValue] = fStack->size(); 3733 break; 3734 3735 case URX_LD_SP: 3736 { 3737 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3738 int32_t newStackSize = (int32_t)fData[opValue]; 3739 U_ASSERT(newStackSize <= fStack->size()); 3740 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3741 if (newFP == (int64_t *)fp) { 3742 break; 3743 } 3744 int32_t i; 3745 for (i=0; i<fFrameSize; i++) { 3746 newFP[i] = ((int64_t *)fp)[i]; 3747 } 3748 fp = (REStackFrame *)newFP; 3749 fStack->setSize(newStackSize); 3750 } 3751 break; 3752 3753 case URX_BACKREF: 3754 case URX_BACKREF_I: 3755 { 3756 U_ASSERT(opValue < fFrameSize); 3757 int64_t groupStartIdx = fp->fExtra[opValue]; 3758 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3759 U_ASSERT(groupStartIdx <= groupEndIdx); 3760 if (groupStartIdx < 0) { 3761 // This capture group has not participated in the match thus far, 3762 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3763 } 3764 3765 if (groupEndIdx == groupStartIdx) { 3766 // The capture group match was of an empty string. 3767 // Verified by testing: Perl matches succeed in this case, so 3768 // we do too. 3769 break; 3770 } 3771 3772 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); 3773 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3774 3775 UBool haveMatch = (opType == URX_BACKREF ? 3776 (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) : 3777 (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status))); 3778 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3779 3780 if (fp->fInputIdx > fActiveLimit) { 3781 fHitEnd = TRUE; 3782 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3783 } else if (!haveMatch) { 3784 if (fp->fInputIdx == fActiveLimit) { 3785 fHitEnd = TRUE; 3786 } 3787 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3788 } 3789 } 3790 break; 3791 3792 case URX_STO_INP_LOC: 3793 { 3794 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 3795 fp->fExtra[opValue] = fp->fInputIdx; 3796 } 3797 break; 3798 3799 case URX_JMPX: 3800 { 3801 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3802 fp->fPatIdx += 1; 3803 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 3804 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 3805 int64_t savedInputIdx = fp->fExtra[dataLoc]; 3806 U_ASSERT(savedInputIdx <= fp->fInputIdx); 3807 if (savedInputIdx < fp->fInputIdx) { 3808 fp->fPatIdx = opValue; // JMP 3809 } else { 3810 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 3811 } 3812 } 3813 break; 3814 3815 case URX_LA_START: 3816 { 3817 // Entering a lookahead block. 3818 // Save Stack Ptr, Input Pos. 3819 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3820 fData[opValue] = fStack->size(); 3821 fData[opValue+1] = fp->fInputIdx; 3822 fActiveStart = fLookStart; // Set the match region change for 3823 fActiveLimit = fLookLimit; // transparent bounds. 3824 } 3825 break; 3826 3827 case URX_LA_END: 3828 { 3829 // Leaving a look-ahead block. 3830 // restore Stack Ptr, Input Pos to positions they had on entry to block. 3831 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3832 int32_t stackSize = fStack->size(); 3833 int32_t newStackSize =(int32_t)fData[opValue]; 3834 U_ASSERT(stackSize >= newStackSize); 3835 if (stackSize > newStackSize) { 3836 // Copy the current top frame back to the new (cut back) top frame. 3837 // This makes the capture groups from within the look-ahead 3838 // expression available. 3839 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3840 int32_t i; 3841 for (i=0; i<fFrameSize; i++) { 3842 newFP[i] = ((int64_t *)fp)[i]; 3843 } 3844 fp = (REStackFrame *)newFP; 3845 fStack->setSize(newStackSize); 3846 } 3847 fp->fInputIdx = fData[opValue+1]; 3848 3849 // Restore the active region bounds in the input string; they may have 3850 // been changed because of transparent bounds on a Region. 3851 fActiveStart = fRegionStart; 3852 fActiveLimit = fRegionLimit; 3853 } 3854 break; 3855 3856 case URX_ONECHAR_I: 3857 if (fp->fInputIdx < fActiveLimit) { 3858 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3859 3860 UChar32 c = UTEXT_NEXT32(fInputText); 3861 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3862 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3863 break; 3864 } 3865 } else { 3866 fHitEnd = TRUE; 3867 } 3868 3869 #ifdef REGEX_SMART_BACKTRACKING 3870 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3871 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3872 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3873 UBool success = FALSE; 3874 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3875 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { 3876 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3877 success = TRUE; 3878 break; 3879 } else if (c == U_SENTINEL) { 3880 break; 3881 } 3882 c = UTEXT_PREVIOUS32(fInputText); 3883 } 3884 if (success) { 3885 fHitEnd = FALSE; 3886 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3887 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3888 if (fp->fInputIdx > backSearchIndex) { 3889 fp = StateSave(fp, fp->fPatIdx, status); 3890 } 3891 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3892 break; 3893 } 3894 } 3895 } 3896 #endif 3897 3898 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3899 break; 3900 3901 case URX_STRING_I: 3902 { 3903 // Test input against a literal string. 3904 // Strings require two slots in the compiled pattern, one for the 3905 // offset to the string text, and one for the length. 3906 const UCaseProps *csp = ucase_getSingleton(); 3907 { 3908 int32_t stringStartIdx, stringLen; 3909 stringStartIdx = opValue; 3910 3911 op = (int32_t)pat[fp->fPatIdx]; 3912 fp->fPatIdx++; 3913 opType = URX_TYPE(op); 3914 opValue = URX_VAL(op); 3915 U_ASSERT(opType == URX_STRING_LEN); 3916 stringLen = opValue; 3917 3918 const UChar *patternChars = litText+stringStartIdx; 3919 const UChar *patternEnd = patternChars+stringLen; 3920 3921 const UChar *foldChars = NULL; 3922 int32_t foldOffset, foldLength; 3923 UChar32 c; 3924 3925 foldOffset = foldLength = 0; 3926 UBool success = TRUE; 3927 3928 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3929 while (patternChars < patternEnd && success) { 3930 if(foldOffset < foldLength) { 3931 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 3932 } else { 3933 c = UTEXT_NEXT32(fInputText); 3934 if (c != U_SENTINEL) { 3935 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 3936 if(foldLength >= 0) { 3937 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 3938 foldOffset = 0; 3939 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 3940 } else { 3941 c = foldLength; 3942 foldLength = foldOffset; // to avoid reading chars from the folding buffer 3943 } 3944 } 3945 } 3946 3947 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3948 } 3949 3950 success = FALSE; 3951 if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) { 3952 if (U_IS_BMP(c)) { 3953 success = (*patternChars == c); 3954 patternChars += 1; 3955 } else if (patternChars+1 < patternEnd) { 3956 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 3957 patternChars += 2; 3958 } 3959 } else { 3960 fHitEnd = TRUE; // TODO: See ticket 6074 3961 } 3962 } 3963 3964 if (!success) { 3965 #ifdef REGEX_SMART_BACKTRACKING 3966 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 3967 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3968 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3969 // Reset to last start point 3970 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3971 patternChars = litText+stringStartIdx; 3972 3973 // Search backwards for a possible start 3974 do { 3975 c = UTEXT_PREVIOUS32(fInputText); 3976 if (c == U_SENTINEL) { 3977 break; 3978 } else { 3979 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 3980 if(foldLength >= 0) { 3981 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 3982 foldOffset = 0; 3983 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 3984 } else { 3985 c = foldLength; 3986 foldLength = foldOffset; // to avoid reading chars from the folding buffer 3987 } 3988 } 3989 3990 if ((U_IS_BMP(c) && *patternChars == c) || 3991 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 3992 success = TRUE; 3993 break; 3994 } 3995 } 3996 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 3997 3998 // And try again 3999 if (success) { 4000 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4001 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4002 if (fp->fInputIdx > backSearchIndex) { 4003 fp = StateSave(fp, fp->fPatIdx, status); 4004 } 4005 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4006 break; 4007 } 4008 } 4009 } 4010 #endif 4011 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4012 } 4013 } 4014 } 4015 break; 4016 4017 case URX_LB_START: 4018 { 4019 // Entering a look-behind block. 4020 // Save Stack Ptr, Input Pos. 4021 // TODO: implement transparent bounds. Ticket #6067 4022 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4023 fData[opValue] = fStack->size(); 4024 fData[opValue+1] = fp->fInputIdx; 4025 // Init the variable containing the start index for attempted matches. 4026 fData[opValue+2] = -1; 4027 // Save input string length, then reset to pin any matches to end at 4028 // the current position. 4029 fData[opValue+3] = fActiveLimit; 4030 fActiveLimit = fp->fInputIdx; 4031 } 4032 break; 4033 4034 4035 case URX_LB_CONT: 4036 { 4037 // Positive Look-Behind, at top of loop checking for matches of LB expression 4038 // at all possible input starting positions. 4039 4040 // Fetch the min and max possible match lengths. They are the operands 4041 // of this op in the pattern. 4042 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 4043 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 4044 U_ASSERT(minML <= maxML); 4045 U_ASSERT(minML >= 0); 4046 4047 // Fetch (from data) the last input index where a match was attempted. 4048 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4049 int64_t *lbStartIdx = &fData[opValue+2]; 4050 if (*lbStartIdx < 0) { 4051 // First time through loop. 4052 *lbStartIdx = fp->fInputIdx - minML; 4053 } else { 4054 // 2nd through nth time through the loop. 4055 // Back up start position for match by one. 4056 if (*lbStartIdx == 0) { 4057 (*lbStartIdx)--; 4058 } else { 4059 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 4060 UTEXT_PREVIOUS32(fInputText); 4061 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4062 } 4063 } 4064 4065 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 4066 // We have tried all potential match starting points without 4067 // getting a match. Backtrack out, and out of the 4068 // Look Behind altogether. 4069 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4070 int64_t restoreInputLen = fData[opValue+3]; 4071 U_ASSERT(restoreInputLen >= fActiveLimit); 4072 U_ASSERT(restoreInputLen <= fInputLength); 4073 fActiveLimit = restoreInputLen; 4074 break; 4075 } 4076 4077 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 4078 // (successful match will fall off the end of the loop.) 4079 fp = StateSave(fp, fp->fPatIdx-3, status); 4080 fp->fInputIdx = *lbStartIdx; 4081 } 4082 break; 4083 4084 case URX_LB_END: 4085 // End of a look-behind block, after a successful match. 4086 { 4087 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4088 if (fp->fInputIdx != fActiveLimit) { 4089 // The look-behind expression matched, but the match did not 4090 // extend all the way to the point that we are looking behind from. 4091 // FAIL out of here, which will take us back to the LB_CONT, which 4092 // will retry the match starting at another position or fail 4093 // the look-behind altogether, whichever is appropriate. 4094 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4095 break; 4096 } 4097 4098 // Look-behind match is good. Restore the orignal input string length, 4099 // which had been truncated to pin the end of the lookbehind match to the 4100 // position being looked-behind. 4101 int64_t originalInputLen = fData[opValue+3]; 4102 U_ASSERT(originalInputLen >= fActiveLimit); 4103 U_ASSERT(originalInputLen <= fInputLength); 4104 fActiveLimit = originalInputLen; 4105 } 4106 break; 4107 4108 4109 case URX_LBN_CONT: 4110 { 4111 // Negative Look-Behind, at top of loop checking for matches of LB expression 4112 // at all possible input starting positions. 4113 4114 // Fetch the extra parameters of this op. 4115 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 4116 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 4117 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 4118 continueLoc = URX_VAL(continueLoc); 4119 U_ASSERT(minML <= maxML); 4120 U_ASSERT(minML >= 0); 4121 U_ASSERT(continueLoc > fp->fPatIdx); 4122 4123 // Fetch (from data) the last input index where a match was attempted. 4124 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4125 int64_t *lbStartIdx = &fData[opValue+2]; 4126 if (*lbStartIdx < 0) { 4127 // First time through loop. 4128 *lbStartIdx = fp->fInputIdx - minML; 4129 } else { 4130 // 2nd through nth time through the loop. 4131 // Back up start position for match by one. 4132 if (*lbStartIdx == 0) { 4133 (*lbStartIdx)--; 4134 } else { 4135 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 4136 UTEXT_PREVIOUS32(fInputText); 4137 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4138 } 4139 } 4140 4141 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 4142 // We have tried all potential match starting points without 4143 // getting a match, which means that the negative lookbehind as 4144 // a whole has succeeded. Jump forward to the continue location 4145 int64_t restoreInputLen = fData[opValue+3]; 4146 U_ASSERT(restoreInputLen >= fActiveLimit); 4147 U_ASSERT(restoreInputLen <= fInputLength); 4148 fActiveLimit = restoreInputLen; 4149 fp->fPatIdx = continueLoc; 4150 break; 4151 } 4152 4153 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 4154 // (successful match will cause a FAIL out of the loop altogether.) 4155 fp = StateSave(fp, fp->fPatIdx-4, status); 4156 fp->fInputIdx = *lbStartIdx; 4157 } 4158 break; 4159 4160 case URX_LBN_END: 4161 // End of a negative look-behind block, after a successful match. 4162 { 4163 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4164 if (fp->fInputIdx != fActiveLimit) { 4165 // The look-behind expression matched, but the match did not 4166 // extend all the way to the point that we are looking behind from. 4167 // FAIL out of here, which will take us back to the LB_CONT, which 4168 // will retry the match starting at another position or succeed 4169 // the look-behind altogether, whichever is appropriate. 4170 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4171 break; 4172 } 4173 4174 // Look-behind expression matched, which means look-behind test as 4175 // a whole Fails 4176 4177 // Restore the orignal input string length, which had been truncated 4178 // inorder to pin the end of the lookbehind match 4179 // to the position being looked-behind. 4180 int64_t originalInputLen = fData[opValue+3]; 4181 U_ASSERT(originalInputLen >= fActiveLimit); 4182 U_ASSERT(originalInputLen <= fInputLength); 4183 fActiveLimit = originalInputLen; 4184 4185 // Restore original stack position, discarding any state saved 4186 // by the successful pattern match. 4187 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4188 int32_t newStackSize = (int32_t)fData[opValue]; 4189 U_ASSERT(fStack->size() > newStackSize); 4190 fStack->setSize(newStackSize); 4191 4192 // FAIL, which will take control back to someplace 4193 // prior to entering the look-behind test. 4194 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4195 } 4196 break; 4197 4198 4199 case URX_LOOP_SR_I: 4200 // Loop Initialization for the optimized implementation of 4201 // [some character set]* 4202 // This op scans through all matching input. 4203 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4204 { 4205 U_ASSERT(opValue > 0 && opValue < sets->size()); 4206 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4207 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 4208 4209 // Loop through input, until either the input is exhausted or 4210 // we reach a character that is not a member of the set. 4211 int64_t ix = fp->fInputIdx; 4212 UTEXT_SETNATIVEINDEX(fInputText, ix); 4213 for (;;) { 4214 if (ix >= fActiveLimit) { 4215 fHitEnd = TRUE; 4216 break; 4217 } 4218 UChar32 c = UTEXT_NEXT32(fInputText); 4219 if (c<256) { 4220 if (s8->contains(c) == FALSE) { 4221 break; 4222 } 4223 } else { 4224 if (s->contains(c) == FALSE) { 4225 break; 4226 } 4227 } 4228 ix = UTEXT_GETNATIVEINDEX(fInputText); 4229 } 4230 4231 // If there were no matching characters, skip over the loop altogether. 4232 // The loop doesn't run at all, a * op always succeeds. 4233 if (ix == fp->fInputIdx) { 4234 fp->fPatIdx++; // skip the URX_LOOP_C op. 4235 break; 4236 } 4237 4238 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4239 // must follow. It's operand is the stack location 4240 // that holds the starting input index for the match of this [set]* 4241 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4242 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4243 int32_t stackLoc = URX_VAL(loopcOp); 4244 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4245 fp->fExtra[stackLoc] = fp->fInputIdx; 4246 #ifdef REGEX_SMART_BACKTRACKING 4247 backSearchIndex = fp->fInputIdx; 4248 #endif 4249 fp->fInputIdx = ix; 4250 4251 // Save State to the URX_LOOP_C op that follows this one, 4252 // so that match failures in the following code will return to there. 4253 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4254 fp = StateSave(fp, fp->fPatIdx, status); 4255 fp->fPatIdx++; 4256 } 4257 break; 4258 4259 4260 case URX_LOOP_DOT_I: 4261 // Loop Initialization for the optimized implementation of .* 4262 // This op scans through all remaining input. 4263 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4264 { 4265 // Loop through input until the input is exhausted (we reach an end-of-line) 4266 // In DOTALL mode, we can just go straight to the end of the input. 4267 int64_t ix; 4268 if ((opValue & 1) == 1) { 4269 // Dot-matches-All mode. Jump straight to the end of the string. 4270 ix = fActiveLimit; 4271 fHitEnd = TRUE; 4272 } else { 4273 // NOT DOT ALL mode. Line endings do not match '.' 4274 // Scan forward until a line ending or end of input. 4275 ix = fp->fInputIdx; 4276 UTEXT_SETNATIVEINDEX(fInputText, ix); 4277 for (;;) { 4278 if (ix >= fActiveLimit) { 4279 fHitEnd = TRUE; 4280 break; 4281 } 4282 UChar32 c = UTEXT_NEXT32(fInputText); 4283 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 4284 if ((c == 0x0a) || // 0x0a is newline in both modes. 4285 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 4286 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) { 4287 // char is a line ending. Exit the scanning loop. 4288 break; 4289 } 4290 } 4291 ix = UTEXT_GETNATIVEINDEX(fInputText); 4292 } 4293 } 4294 4295 // If there were no matching characters, skip over the loop altogether. 4296 // The loop doesn't run at all, a * op always succeeds. 4297 if (ix == fp->fInputIdx) { 4298 fp->fPatIdx++; // skip the URX_LOOP_C op. 4299 break; 4300 } 4301 4302 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4303 // must follow. It's operand is the stack location 4304 // that holds the starting input index for the match of this .* 4305 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4306 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4307 int32_t stackLoc = URX_VAL(loopcOp); 4308 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4309 fp->fExtra[stackLoc] = fp->fInputIdx; 4310 #ifdef REGEX_SMART_BACKTRACKING 4311 backSearchIndex = fp->fInputIdx; 4312 #endif 4313 fp->fInputIdx = ix; 4314 4315 // Save State to the URX_LOOP_C op that follows this one, 4316 // so that match failures in the following code will return to there. 4317 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4318 fp = StateSave(fp, fp->fPatIdx, status); 4319 fp->fPatIdx++; 4320 } 4321 break; 4322 4323 4324 case URX_LOOP_C: 4325 { 4326 U_ASSERT(opValue>=0 && opValue<fFrameSize); 4327 backSearchIndex = fp->fExtra[opValue]; 4328 U_ASSERT(backSearchIndex <= fp->fInputIdx); 4329 if (backSearchIndex == fp->fInputIdx) { 4330 // We've backed up the input idx to the point that the loop started. 4331 // The loop is done. Leave here without saving state. 4332 // Subsequent failures won't come back here. 4333 break; 4334 } 4335 // Set up for the next iteration of the loop, with input index 4336 // backed up by one from the last time through, 4337 // and a state save to this instruction in case the following code fails again. 4338 // (We're going backwards because this loop emulates stack unwinding, not 4339 // the initial scan forward.) 4340 U_ASSERT(fp->fInputIdx > 0); 4341 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4342 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); 4343 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4344 4345 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); 4346 if (prevC == 0x0a && 4347 fp->fInputIdx > backSearchIndex && 4348 twoPrevC == 0x0d) { 4349 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 4350 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 4351 // .*, stepping back over CRLF pair. 4352 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4353 } 4354 } 4355 4356 4357 fp = StateSave(fp, fp->fPatIdx-1, status); 4358 } 4359 break; 4360 4361 4362 4363 default: 4364 // Trouble. The compiled pattern contains an entry with an 4365 // unrecognized type tag. 4366 U_ASSERT(FALSE); 4367 } 4368 4369 if (U_FAILURE(status)) { 4370 isMatch = FALSE; 4371 break; 4372 } 4373 } 4374 4375 breakFromLoop: 4376 fMatch = isMatch; 4377 if (isMatch) { 4378 fLastMatchEnd = fMatchEnd; 4379 fMatchStart = startIdx; 4380 fMatchEnd = fp->fInputIdx; 4381 if (fTraceDebug) { 4382 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); 4383 } 4384 } 4385 else 4386 { 4387 if (fTraceDebug) { 4388 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 4389 } 4390 } 4391 4392 fFrame = fp; // The active stack frame when the engine stopped. 4393 // Contains the capture group results that we need to 4394 // access later. 4395 return; 4396 } 4397 4398 4399 //-------------------------------------------------------------------------------- 4400 // 4401 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the 4402 // assumption that the entire string is available in the UText's 4403 // chunk buffer. For now, that means we can use int32_t indexes, 4404 // except for anything that needs to be saved (like group starts 4405 // and ends). 4406 // 4407 // startIdx: begin matching a this index. 4408 // toEnd: if true, match must extend to end of the input region 4409 // 4410 //-------------------------------------------------------------------------------- 4411 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { 4412 UBool isMatch = FALSE; // True if the we have a match. 4413 4414 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards 4415 4416 int32_t op; // Operation from the compiled pattern, split into 4417 int32_t opType; // the opcode 4418 int32_t opValue; // and the operand value. 4419 4420 #ifdef REGEX_RUN_DEBUG 4421 if (fTraceDebug) 4422 { 4423 printf("MatchAt(startIdx=%ld)\n", startIdx); 4424 printf("Original Pattern: "); 4425 UChar32 c = utext_next32From(fPattern->fPattern, 0); 4426 while (c != U_SENTINEL) { 4427 if (c<32 || c>256) { 4428 c = '.'; 4429 } 4430 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 4431 4432 c = UTEXT_NEXT32(fPattern->fPattern); 4433 } 4434 printf("\n"); 4435 printf("Input String: "); 4436 c = utext_next32From(fInputText, 0); 4437 while (c != U_SENTINEL) { 4438 if (c<32 || c>256) { 4439 c = '.'; 4440 } 4441 printf("%c", c); 4442 4443 c = UTEXT_NEXT32(fInputText); 4444 } 4445 printf("\n"); 4446 printf("\n"); 4447 } 4448 #endif 4449 4450 if (U_FAILURE(status)) { 4451 return; 4452 } 4453 4454 // Cache frequently referenced items from the compiled pattern 4455 // 4456 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 4457 4458 const UChar *litText = fPattern->fLiteralText.getBuffer(); 4459 UVector *sets = fPattern->fSets; 4460 4461 const UChar *inputBuf = fInputText->chunkContents; 4462 4463 fFrameSize = fPattern->fFrameSize; 4464 REStackFrame *fp = resetStack(); 4465 4466 fp->fPatIdx = 0; 4467 fp->fInputIdx = startIdx; 4468 4469 // Zero out the pattern's static data 4470 int32_t i; 4471 for (i = 0; i<fPattern->fDataSize; i++) { 4472 fData[i] = 0; 4473 } 4474 4475 // 4476 // Main loop for interpreting the compiled pattern. 4477 // One iteration of the loop per pattern operation performed. 4478 // 4479 for (;;) { 4480 #if 0 4481 if (_heapchk() != _HEAPOK) { 4482 fprintf(stderr, "Heap Trouble\n"); 4483 } 4484 #endif 4485 4486 op = (int32_t)pat[fp->fPatIdx]; 4487 opType = URX_TYPE(op); 4488 opValue = URX_VAL(op); 4489 #ifdef REGEX_RUN_DEBUG 4490 if (fTraceDebug) { 4491 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4492 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, 4493 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 4494 fPattern->dumpOp(fp->fPatIdx); 4495 } 4496 #endif 4497 fp->fPatIdx++; 4498 4499 switch (opType) { 4500 4501 4502 case URX_NOP: 4503 break; 4504 4505 4506 case URX_BACKTRACK: 4507 // Force a backtrack. In some circumstances, the pattern compiler 4508 // will notice that the pattern can't possibly match anything, and will 4509 // emit one of these at that point. 4510 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4511 break; 4512 4513 4514 case URX_ONECHAR: 4515 if (fp->fInputIdx < fActiveLimit) { 4516 UChar32 c; 4517 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4518 if (c == opValue) { 4519 break; 4520 } 4521 } else { 4522 fHitEnd = TRUE; 4523 } 4524 4525 #ifdef REGEX_SMART_BACKTRACKING 4526 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 4527 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4528 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4529 int64_t reverseIndex = fp->fInputIdx; 4530 UChar32 c; 4531 do { 4532 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 4533 if (c == opValue) { 4534 break; 4535 } 4536 } while (reverseIndex > backSearchIndex); 4537 if (c == opValue) { 4538 fHitEnd = FALSE; 4539 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4540 fp->fInputIdx = reverseIndex; 4541 if (fp->fInputIdx > backSearchIndex) { 4542 fp = StateSave(fp, fp->fPatIdx, status); 4543 } 4544 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4545 break; 4546 } 4547 } 4548 } 4549 #endif 4550 4551 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4552 break; 4553 4554 4555 case URX_STRING: 4556 { 4557 // Test input against a literal string. 4558 // Strings require two slots in the compiled pattern, one for the 4559 // offset to the string text, and one for the length. 4560 int32_t stringStartIdx = opValue; 4561 int32_t stringLen; 4562 4563 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 4564 fp->fPatIdx++; 4565 opType = URX_TYPE(op); 4566 stringLen = URX_VAL(op); 4567 U_ASSERT(opType == URX_STRING_LEN); 4568 U_ASSERT(stringLen >= 2); 4569 4570 if (fp->fInputIdx + stringLen > fActiveLimit) { 4571 // No match. String is longer than the remaining input text. 4572 fHitEnd = TRUE; // TODO: See ticket 6074 4573 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4574 break; 4575 } 4576 4577 const UChar * pInp = inputBuf + fp->fInputIdx; 4578 const UChar * pPat = litText+stringStartIdx; 4579 const UChar * pEnd = pInp + stringLen; 4580 UBool success = FALSE; 4581 for(;;) { 4582 if (*pInp == *pPat) { 4583 pInp++; 4584 pPat++; 4585 if (pInp == pEnd) { 4586 // Successful Match. 4587 success = TRUE; 4588 break; 4589 } 4590 } else { 4591 // Match failed. 4592 break; 4593 } 4594 } 4595 4596 if (success) { 4597 fp->fInputIdx += stringLen; 4598 } else { 4599 #ifdef REGEX_SMART_BACKTRACKING 4600 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 4601 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4602 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4603 // Reset to last start point 4604 int64_t reverseIndex = fp->fInputIdx; 4605 UChar32 c; 4606 pPat = litText+stringStartIdx; 4607 4608 // Search backwards for a possible start 4609 do { 4610 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 4611 if ((U_IS_BMP(c) && *pPat == c) || 4612 (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) { 4613 success = TRUE; 4614 break; 4615 } 4616 } while (reverseIndex > backSearchIndex); 4617 4618 // And try again 4619 if (success) { 4620 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4621 fp->fInputIdx = reverseIndex; 4622 if (fp->fInputIdx > backSearchIndex) { 4623 fp = StateSave(fp, fp->fPatIdx, status); 4624 } 4625 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4626 break; 4627 } 4628 } 4629 } 4630 #endif 4631 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4632 } 4633 } 4634 break; 4635 4636 4637 case URX_STATE_SAVE: 4638 fp = StateSave(fp, opValue, status); 4639 break; 4640 4641 4642 case URX_END: 4643 // The match loop will exit via this path on a successful match, 4644 // when we reach the end of the pattern. 4645 if (toEnd && fp->fInputIdx != fActiveLimit) { 4646 // The pattern matched, but not to the end of input. Try some more. 4647 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4648 break; 4649 } 4650 isMatch = TRUE; 4651 goto breakFromLoop; 4652 4653 // Start and End Capture stack frame variables are laid out out like this: 4654 // fp->fExtra[opValue] - The start of a completed capture group 4655 // opValue+1 - The end of a completed capture group 4656 // opValue+2 - the start of a capture group whose end 4657 // has not yet been reached (and might not ever be). 4658 case URX_START_CAPTURE: 4659 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4660 fp->fExtra[opValue+2] = fp->fInputIdx; 4661 break; 4662 4663 4664 case URX_END_CAPTURE: 4665 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4666 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 4667 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 4668 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 4669 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 4670 break; 4671 4672 4673 case URX_DOLLAR: // $, test for End of line 4674 // or for position before new line at end of input 4675 if (fp->fInputIdx < fAnchorLimit-2) { 4676 // We are no where near the end of input. Fail. 4677 // This is the common case. Keep it first. 4678 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4679 break; 4680 } 4681 if (fp->fInputIdx >= fAnchorLimit) { 4682 // We really are at the end of input. Success. 4683 fHitEnd = TRUE; 4684 fRequireEnd = TRUE; 4685 break; 4686 } 4687 4688 // If we are positioned just before a new-line that is located at the 4689 // end of input, succeed. 4690 if (fp->fInputIdx == fAnchorLimit-1) { 4691 UChar32 c; 4692 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 4693 4694 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 4695 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4696 // At new-line at end of input. Success 4697 fHitEnd = TRUE; 4698 fRequireEnd = TRUE; 4699 break; 4700 } 4701 } 4702 } else if (fp->fInputIdx == fAnchorLimit-2 && 4703 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { 4704 fHitEnd = TRUE; 4705 fRequireEnd = TRUE; 4706 break; // At CR/LF at end of input. Success 4707 } 4708 4709 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4710 4711 break; 4712 4713 4714 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 4715 if (fp->fInputIdx >= fAnchorLimit-1) { 4716 // Either at the last character of input, or off the end. 4717 if (fp->fInputIdx == fAnchorLimit-1) { 4718 // At last char of input. Success if it's a new line. 4719 if (inputBuf[fp->fInputIdx] == 0x0a) { 4720 fHitEnd = TRUE; 4721 fRequireEnd = TRUE; 4722 break; 4723 } 4724 } else { 4725 // Off the end of input. Success. 4726 fHitEnd = TRUE; 4727 fRequireEnd = TRUE; 4728 break; 4729 } 4730 } 4731 4732 // Not at end of input. Back-track out. 4733 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4734 break; 4735 4736 4737 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 4738 { 4739 if (fp->fInputIdx >= fAnchorLimit) { 4740 // We really are at the end of input. Success. 4741 fHitEnd = TRUE; 4742 fRequireEnd = TRUE; 4743 break; 4744 } 4745 // If we are positioned just before a new-line, succeed. 4746 // It makes no difference where the new-line is within the input. 4747 UChar32 c = inputBuf[fp->fInputIdx]; 4748 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 4749 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 4750 // In multi-line mode, hitting a new-line just before the end of input does not 4751 // set the hitEnd or requireEnd flags 4752 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4753 break; 4754 } 4755 } 4756 // not at a new line. Fail. 4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4758 } 4759 break; 4760 4761 4762 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 4763 { 4764 if (fp->fInputIdx >= fAnchorLimit) { 4765 // We really are at the end of input. Success. 4766 fHitEnd = TRUE; 4767 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 4768 break; // adding a new-line would not lose the match. 4769 } 4770 // If we are not positioned just before a new-line, the test fails; backtrack out. 4771 // It makes no difference where the new-line is within the input. 4772 if (inputBuf[fp->fInputIdx] != 0x0a) { 4773 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4774 } 4775 } 4776 break; 4777 4778 4779 case URX_CARET: // ^, test for start of line 4780 if (fp->fInputIdx != fAnchorStart) { 4781 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4782 } 4783 break; 4784 4785 4786 case URX_CARET_M: // ^, test for start of line in mulit-line mode 4787 { 4788 if (fp->fInputIdx == fAnchorStart) { 4789 // We are at the start input. Success. 4790 break; 4791 } 4792 // Check whether character just before the current pos is a new-line 4793 // unless we are at the end of input 4794 UChar c = inputBuf[fp->fInputIdx - 1]; 4795 if ((fp->fInputIdx < fAnchorLimit) && 4796 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 4797 // It's a new-line. ^ is true. Success. 4798 // TODO: what should be done with positions between a CR and LF? 4799 break; 4800 } 4801 // Not at the start of a line. Fail. 4802 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4803 } 4804 break; 4805 4806 4807 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 4808 { 4809 U_ASSERT(fp->fInputIdx >= fAnchorStart); 4810 if (fp->fInputIdx <= fAnchorStart) { 4811 // We are at the start input. Success. 4812 break; 4813 } 4814 // Check whether character just before the current pos is a new-line 4815 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 4816 UChar c = inputBuf[fp->fInputIdx - 1]; 4817 if (c != 0x0a) { 4818 // Not at the start of a line. Back-track out. 4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4820 } 4821 } 4822 break; 4823 4824 case URX_BACKSLASH_B: // Test for word boundaries 4825 { 4826 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); 4827 success ^= (opValue != 0); // flip sense for \B 4828 if (!success) { 4829 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4830 } 4831 } 4832 break; 4833 4834 4835 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 4836 { 4837 UBool success = isUWordBoundary(fp->fInputIdx); 4838 success ^= (opValue != 0); // flip sense for \B 4839 if (!success) { 4840 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4841 } 4842 } 4843 break; 4844 4845 4846 case URX_BACKSLASH_D: // Test for decimal digit 4847 { 4848 if (fp->fInputIdx >= fActiveLimit) { 4849 fHitEnd = TRUE; 4850 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4851 break; 4852 } 4853 4854 UChar32 c; 4855 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4856 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 4857 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 4858 success ^= (opValue != 0); // flip sense for \D 4859 if (!success) { 4860 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4861 } 4862 } 4863 break; 4864 4865 4866 case URX_BACKSLASH_G: // Test for position at end of previous match 4867 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 4868 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4869 } 4870 break; 4871 4872 4873 case URX_BACKSLASH_X: 4874 // Match a Grapheme, as defined by Unicode TR 29. 4875 // Differs slightly from Perl, which consumes combining marks independently 4876 // of context. 4877 { 4878 4879 // Fail if at end of input 4880 if (fp->fInputIdx >= fActiveLimit) { 4881 fHitEnd = TRUE; 4882 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4883 break; 4884 } 4885 4886 // Examine (and consume) the current char. 4887 // Dispatch into a little state machine, based on the char. 4888 UChar32 c; 4889 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4890 UnicodeSet **sets = fPattern->fStaticSets; 4891 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 4892 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 4893 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4894 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4895 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4896 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4897 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4898 goto GC_Extend; 4899 4900 4901 4902 GC_L: 4903 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4904 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4905 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4906 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4907 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4908 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4909 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4910 goto GC_Extend; 4911 4912 GC_V: 4913 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4914 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4915 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4916 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4917 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4918 goto GC_Extend; 4919 4920 GC_T: 4921 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4922 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4923 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4924 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4925 goto GC_Extend; 4926 4927 GC_Extend: 4928 // Combining characters are consumed here 4929 for (;;) { 4930 if (fp->fInputIdx >= fActiveLimit) { 4931 break; 4932 } 4933 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4934 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 4935 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 4936 break; 4937 } 4938 } 4939 goto GC_Done; 4940 4941 GC_Control: 4942 // Most control chars stand alone (don't combine with combining chars), 4943 // except for that CR/LF sequence is a single grapheme cluster. 4944 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { 4945 fp->fInputIdx++; 4946 } 4947 4948 GC_Done: 4949 if (fp->fInputIdx >= fActiveLimit) { 4950 fHitEnd = TRUE; 4951 } 4952 break; 4953 } 4954 4955 4956 4957 4958 case URX_BACKSLASH_Z: // Test for end of Input 4959 if (fp->fInputIdx < fAnchorLimit) { 4960 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4961 } else { 4962 fHitEnd = TRUE; 4963 fRequireEnd = TRUE; 4964 } 4965 break; 4966 4967 4968 4969 case URX_STATIC_SETREF: 4970 { 4971 // Test input character against one of the predefined sets 4972 // (Word Characters, for example) 4973 // The high bit of the op value is a flag for the match polarity. 4974 // 0: success if input char is in set. 4975 // 1: success if input char is not in set. 4976 if (fp->fInputIdx >= fActiveLimit) { 4977 fHitEnd = TRUE; 4978 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4979 break; 4980 } 4981 4982 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 4983 opValue &= ~URX_NEG_SET; 4984 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4985 4986 UChar32 c; 4987 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4988 if (c < 256) { 4989 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4990 if (s8->contains(c)) { 4991 success = !success; 4992 } 4993 } else { 4994 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4995 if (s->contains(c)) { 4996 success = !success; 4997 } 4998 } 4999 if (!success) { 5000 #ifdef REGEX_SMART_BACKTRACKING 5001 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5002 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5003 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5004 // Try to find it, backwards 5005 int64_t reverseIndex = fp->fInputIdx; 5006 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 5007 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset 5008 do { 5009 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5010 if (c < 256) { 5011 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 5012 if (s8->contains(c)) { 5013 success = !success; 5014 } 5015 } else { 5016 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5017 if (s->contains(c)) { 5018 success = !success; 5019 } 5020 } 5021 } while (reverseIndex > backSearchIndex && !success); 5022 5023 if (success) { 5024 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5025 fp->fInputIdx = reverseIndex; 5026 if (fp->fInputIdx > backSearchIndex) { 5027 fp = StateSave(fp, fp->fPatIdx, status); 5028 } 5029 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5030 break; 5031 } 5032 } 5033 } 5034 #endif 5035 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5036 } 5037 } 5038 break; 5039 5040 5041 case URX_STAT_SETREF_N: 5042 { 5043 // Test input character for NOT being a member of one of 5044 // the predefined sets (Word Characters, for example) 5045 if (fp->fInputIdx >= fActiveLimit) { 5046 fHitEnd = TRUE; 5047 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5048 break; 5049 } 5050 5051 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 5052 5053 UChar32 c; 5054 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5055 if (c < 256) { 5056 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 5057 if (s8->contains(c) == FALSE) { 5058 break; 5059 } 5060 } else { 5061 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5062 if (s->contains(c) == FALSE) { 5063 break; 5064 } 5065 } 5066 5067 #ifdef REGEX_SMART_BACKTRACKING 5068 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5069 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5070 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5071 // Try to find it, backwards 5072 int64_t reverseIndex = fp->fInputIdx; 5073 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 5074 UBool success = FALSE; 5075 do { 5076 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5077 if (c < 256) { 5078 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 5079 if (s8->contains(c) == FALSE) { 5080 success = TRUE; 5081 break; 5082 } 5083 } else { 5084 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5085 if (s->contains(c) == FALSE) { 5086 success = TRUE; 5087 break; 5088 } 5089 } 5090 } while (reverseIndex > backSearchIndex); 5091 5092 if (success) { 5093 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5094 fp->fInputIdx = reverseIndex; 5095 if (fp->fInputIdx > backSearchIndex) { 5096 fp = StateSave(fp, fp->fPatIdx, status); 5097 } 5098 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5099 break; 5100 } 5101 } 5102 } 5103 #endif 5104 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5105 } 5106 break; 5107 5108 5109 case URX_SETREF: 5110 { 5111 if (fp->fInputIdx >= fActiveLimit) { 5112 fHitEnd = TRUE; 5113 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5114 break; 5115 } 5116 5117 U_ASSERT(opValue > 0 && opValue < sets->size()); 5118 5119 // There is input left. Pick up one char and test it for set membership. 5120 UChar32 c; 5121 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5122 if (c<256) { 5123 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5124 if (s8->contains(c)) { 5125 // The character is in the set. A Match. 5126 break; 5127 } 5128 } else { 5129 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5130 if (s->contains(c)) { 5131 // The character is in the set. A Match. 5132 break; 5133 } 5134 } 5135 5136 // the character wasn't in the set. 5137 #ifdef REGEX_SMART_BACKTRACKING 5138 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5139 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5140 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5141 // Try to find it, backwards 5142 int64_t reverseIndex = fp->fInputIdx; 5143 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 5144 UBool success = FALSE; 5145 do { 5146 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5147 if (c < 256) { 5148 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5149 if (s8->contains(c)) { 5150 success = TRUE; 5151 break; 5152 } 5153 } else { 5154 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5155 if (s->contains(c)) { 5156 success = TRUE; 5157 break; 5158 } 5159 } 5160 } while (reverseIndex > backSearchIndex); 5161 5162 if (success) { 5163 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5164 fp->fInputIdx = reverseIndex; 5165 if (fp->fInputIdx > reverseIndex) { 5166 fp = StateSave(fp, fp->fPatIdx, status); 5167 } 5168 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5169 break; 5170 } 5171 } 5172 } 5173 #endif 5174 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5175 } 5176 break; 5177 5178 5179 case URX_DOTANY: 5180 { 5181 // . matches anything, but stops at end-of-line. 5182 if (fp->fInputIdx >= fActiveLimit) { 5183 // At end of input. Match failed. Backtrack out. 5184 fHitEnd = TRUE; 5185 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5186 break; 5187 } 5188 5189 // There is input left. Advance over one char, unless we've hit end-of-line 5190 UChar32 c; 5191 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5192 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 5193 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 5194 // End of line in normal mode. . does not match. 5195 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5196 break; 5197 } 5198 } 5199 break; 5200 5201 5202 case URX_DOTANY_ALL: 5203 { 5204 // . in dot-matches-all (including new lines) mode 5205 if (fp->fInputIdx >= fActiveLimit) { 5206 // At end of input. Match failed. Backtrack out. 5207 fHitEnd = TRUE; 5208 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5209 break; 5210 } 5211 5212 // There is input left. Advance over one char, except if we are 5213 // at a cr/lf, advance over both of them. 5214 UChar32 c; 5215 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5216 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 5217 // In the case of a CR/LF, we need to advance over both. 5218 if (inputBuf[fp->fInputIdx] == 0x0a) { 5219 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 5220 } 5221 } 5222 } 5223 break; 5224 5225 5226 case URX_DOTANY_UNIX: 5227 { 5228 // '.' operator, matches all, but stops at end-of-line. 5229 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 5230 if (fp->fInputIdx >= fActiveLimit) { 5231 // At end of input. Match failed. Backtrack out. 5232 fHitEnd = TRUE; 5233 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5234 break; 5235 } 5236 5237 // There is input left. Advance over one char, unless we've hit end-of-line 5238 UChar32 c; 5239 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5240 if (c == 0x0a) { 5241 // End of line in normal mode. '.' does not match the \n 5242 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5243 } 5244 } 5245 break; 5246 5247 5248 case URX_JMP: 5249 fp->fPatIdx = opValue; 5250 break; 5251 5252 case URX_FAIL: 5253 isMatch = FALSE; 5254 goto breakFromLoop; 5255 5256 case URX_JMP_SAV: 5257 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 5258 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5259 fp->fPatIdx = opValue; // Then JMP. 5260 break; 5261 5262 case URX_JMP_SAV_X: 5263 // This opcode is used with (x)+, when x can match a zero length string. 5264 // Same as JMP_SAV, except conditional on the match having made forward progress. 5265 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 5266 // data address of the input position at the start of the loop. 5267 { 5268 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 5269 int32_t stoOp = (int32_t)pat[opValue-1]; 5270 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 5271 int32_t frameLoc = URX_VAL(stoOp); 5272 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 5273 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; 5274 U_ASSERT(prevInputIdx <= fp->fInputIdx); 5275 if (prevInputIdx < fp->fInputIdx) { 5276 // The match did make progress. Repeat the loop. 5277 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5278 fp->fPatIdx = opValue; 5279 fp->fExtra[frameLoc] = fp->fInputIdx; 5280 } 5281 // If the input position did not advance, we do nothing here, 5282 // execution will fall out of the loop. 5283 } 5284 break; 5285 5286 case URX_CTR_INIT: 5287 { 5288 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5289 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5290 5291 // Pick up the three extra operands that CTR_INIT has, and 5292 // skip the pattern location counter past 5293 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5294 fp->fPatIdx += 3; 5295 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5296 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5297 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5298 U_ASSERT(minCount>=0); 5299 U_ASSERT(maxCount>=minCount || maxCount==-1); 5300 U_ASSERT(loopLoc>fp->fPatIdx); 5301 5302 if (minCount == 0) { 5303 fp = StateSave(fp, loopLoc+1, status); 5304 } 5305 if (maxCount == 0) { 5306 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5307 } 5308 } 5309 break; 5310 5311 case URX_CTR_LOOP: 5312 { 5313 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5314 int32_t initOp = (int32_t)pat[opValue]; 5315 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 5316 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5317 int32_t minCount = (int32_t)pat[opValue+2]; 5318 int32_t maxCount = (int32_t)pat[opValue+3]; 5319 // Increment the counter. Note: we DIDN'T worry about counter 5320 // overflow, since the data comes from UnicodeStrings, which 5321 // stores its length in an int32_t. Do we have to think about 5322 // this now that we're using UText? Probably not, since the length 5323 // in UChar32s is still an int32_t. 5324 (*pCounter)++; 5325 U_ASSERT(*pCounter > 0); 5326 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 5327 U_ASSERT(*pCounter == maxCount || maxCount == -1); 5328 break; 5329 } 5330 if (*pCounter >= minCount) { 5331 fp = StateSave(fp, fp->fPatIdx, status); 5332 } 5333 fp->fPatIdx = opValue + 4; // Loop back. 5334 } 5335 break; 5336 5337 case URX_CTR_INIT_NG: 5338 { 5339 // Initialize a non-greedy loop 5340 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5341 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5342 5343 // Pick up the three extra operands that CTR_INIT has, and 5344 // skip the pattern location counter past 5345 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5346 fp->fPatIdx += 3; 5347 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5348 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5349 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5350 U_ASSERT(minCount>=0); 5351 U_ASSERT(maxCount>=minCount || maxCount==-1); 5352 U_ASSERT(loopLoc>fp->fPatIdx); 5353 5354 if (minCount == 0) { 5355 if (maxCount != 0) { 5356 fp = StateSave(fp, fp->fPatIdx, status); 5357 } 5358 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 5359 } 5360 } 5361 break; 5362 5363 case URX_CTR_LOOP_NG: 5364 { 5365 // Non-greedy {min, max} loops 5366 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5367 int32_t initOp = (int32_t)pat[opValue]; 5368 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 5369 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5370 int32_t minCount = (int32_t)pat[opValue+2]; 5371 int32_t maxCount = (int32_t)pat[opValue+3]; 5372 // Increment the counter. Note: we DIDN'T worry about counter 5373 // overflow, since the data comes from UnicodeStrings, which 5374 // stores its length in an int32_t. Do we have to think about 5375 // this now that we're using UText? Probably not, since the length 5376 // in UChar32s is still an int32_t. 5377 (*pCounter)++; 5378 U_ASSERT(*pCounter > 0); 5379 5380 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 5381 // The loop has matched the maximum permitted number of times. 5382 // Break out of here with no action. Matching will 5383 // continue with the following pattern. 5384 U_ASSERT(*pCounter == maxCount || maxCount == -1); 5385 break; 5386 } 5387 5388 if (*pCounter < minCount) { 5389 // We haven't met the minimum number of matches yet. 5390 // Loop back for another one. 5391 fp->fPatIdx = opValue + 4; // Loop back. 5392 } else { 5393 // We do have the minimum number of matches. 5394 // Fall into the following pattern, but first do 5395 // a state save to the top of the loop, so that a failure 5396 // in the following pattern will try another iteration of the loop. 5397 fp = StateSave(fp, opValue + 4, status); 5398 } 5399 } 5400 break; 5401 5402 case URX_STO_SP: 5403 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5404 fData[opValue] = fStack->size(); 5405 break; 5406 5407 case URX_LD_SP: 5408 { 5409 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5410 int32_t newStackSize = (int32_t)fData[opValue]; 5411 U_ASSERT(newStackSize <= fStack->size()); 5412 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5413 if (newFP == (int64_t *)fp) { 5414 break; 5415 } 5416 int32_t i; 5417 for (i=0; i<fFrameSize; i++) { 5418 newFP[i] = ((int64_t *)fp)[i]; 5419 } 5420 fp = (REStackFrame *)newFP; 5421 fStack->setSize(newStackSize); 5422 } 5423 break; 5424 5425 case URX_BACKREF: 5426 case URX_BACKREF_I: 5427 { 5428 U_ASSERT(opValue < fFrameSize); 5429 int64_t groupStartIdx = fp->fExtra[opValue]; 5430 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5431 U_ASSERT(groupStartIdx <= groupEndIdx); 5432 int64_t len = groupEndIdx-groupStartIdx; 5433 if (groupStartIdx < 0) { 5434 // This capture group has not participated in the match thus far, 5435 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5436 } 5437 5438 if (len == 0) { 5439 // The capture group match was of an empty string. 5440 // Verified by testing: Perl matches succeed in this case, so 5441 // we do too. 5442 break; 5443 } 5444 5445 UBool haveMatch = FALSE; 5446 if (fp->fInputIdx + len <= fActiveLimit) { 5447 if (opType == URX_BACKREF) { 5448 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) { 5449 haveMatch = TRUE; 5450 } 5451 } else { 5452 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, 5453 (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) { 5454 haveMatch = TRUE; 5455 } 5456 } 5457 } else { 5458 // TODO: probably need to do a partial string comparison, and only 5459 // set HitEnd if the available input matched. Ticket #6074 5460 fHitEnd = TRUE; 5461 } 5462 if (haveMatch) { 5463 fp->fInputIdx += len; // Match. Advance current input position. 5464 } else { 5465 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5466 } 5467 } 5468 break; 5469 5470 case URX_STO_INP_LOC: 5471 { 5472 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 5473 fp->fExtra[opValue] = fp->fInputIdx; 5474 } 5475 break; 5476 5477 case URX_JMPX: 5478 { 5479 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5480 fp->fPatIdx += 1; 5481 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 5482 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 5483 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; 5484 U_ASSERT(savedInputIdx <= fp->fInputIdx); 5485 if (savedInputIdx < fp->fInputIdx) { 5486 fp->fPatIdx = opValue; // JMP 5487 } else { 5488 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 5489 } 5490 } 5491 break; 5492 5493 case URX_LA_START: 5494 { 5495 // Entering a lookahead block. 5496 // Save Stack Ptr, Input Pos. 5497 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5498 fData[opValue] = fStack->size(); 5499 fData[opValue+1] = fp->fInputIdx; 5500 fActiveStart = fLookStart; // Set the match region change for 5501 fActiveLimit = fLookLimit; // transparent bounds. 5502 } 5503 break; 5504 5505 case URX_LA_END: 5506 { 5507 // Leaving a look-ahead block. 5508 // restore Stack Ptr, Input Pos to positions they had on entry to block. 5509 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5510 int32_t stackSize = fStack->size(); 5511 int32_t newStackSize = (int32_t)fData[opValue]; 5512 U_ASSERT(stackSize >= newStackSize); 5513 if (stackSize > newStackSize) { 5514 // Copy the current top frame back to the new (cut back) top frame. 5515 // This makes the capture groups from within the look-ahead 5516 // expression available. 5517 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5518 int32_t i; 5519 for (i=0; i<fFrameSize; i++) { 5520 newFP[i] = ((int64_t *)fp)[i]; 5521 } 5522 fp = (REStackFrame *)newFP; 5523 fStack->setSize(newStackSize); 5524 } 5525 fp->fInputIdx = fData[opValue+1]; 5526 5527 // Restore the active region bounds in the input string; they may have 5528 // been changed because of transparent bounds on a Region. 5529 fActiveStart = fRegionStart; 5530 fActiveLimit = fRegionLimit; 5531 } 5532 break; 5533 5534 case URX_ONECHAR_I: 5535 if (fp->fInputIdx < fActiveLimit) { 5536 UChar32 c; 5537 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5538 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5539 break; 5540 } 5541 } else { 5542 fHitEnd = TRUE; 5543 } 5544 5545 #ifdef REGEX_SMART_BACKTRACKING 5546 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5547 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5548 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5549 UBool success = FALSE; 5550 int64_t reverseIndex = fp->fInputIdx; 5551 UChar32 c; 5552 while (reverseIndex > backSearchIndex) { 5553 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5554 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5555 success = TRUE; 5556 break; 5557 } else if (c == U_SENTINEL) { 5558 break; 5559 } 5560 } 5561 if (success) { 5562 fHitEnd = FALSE; 5563 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5564 fp->fInputIdx = reverseIndex; 5565 if (fp->fInputIdx > backSearchIndex) { 5566 fp = StateSave(fp, fp->fPatIdx, status); 5567 } 5568 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5569 break; 5570 } 5571 } 5572 } 5573 #endif 5574 5575 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5576 break; 5577 5578 case URX_STRING_I: 5579 { 5580 // Test input against a literal string. 5581 // Strings require two slots in the compiled pattern, one for the 5582 // offset to the string text, and one for the length. 5583 const UCaseProps *csp = ucase_getSingleton(); 5584 { 5585 int32_t stringStartIdx, stringLen; 5586 stringStartIdx = opValue; 5587 5588 op = (int32_t)pat[fp->fPatIdx]; 5589 fp->fPatIdx++; 5590 opType = URX_TYPE(op); 5591 opValue = URX_VAL(op); 5592 U_ASSERT(opType == URX_STRING_LEN); 5593 stringLen = opValue; 5594 5595 const UChar *patternChars = litText+stringStartIdx; 5596 const UChar *patternEnd = patternChars+stringLen; 5597 5598 const UChar *foldChars = NULL; 5599 int32_t foldOffset, foldLength; 5600 UChar32 c; 5601 UBool c_is_valid = FALSE; 5602 5603 #ifdef REGEX_SMART_BACKTRACKING 5604 int32_t originalInputIdx = fp->fInputIdx; 5605 #endif 5606 UBool success = TRUE; 5607 5608 foldOffset = foldLength = 0; 5609 5610 while (patternChars < patternEnd && success) { 5611 if (fp->fInputIdx < fActiveLimit) { // don't read past end of string 5612 if(foldOffset < foldLength) { 5613 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5614 c_is_valid = TRUE; 5615 } else { 5616 // test pre-condition of U16_NEXT: i < length 5617 U_ASSERT(fp->fInputIdx < fActiveLimit); 5618 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5619 c_is_valid = TRUE; 5620 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 5621 if(foldLength >= 0) { 5622 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 5623 foldOffset = 0; 5624 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5625 } else { 5626 c = foldLength; 5627 foldLength = foldOffset; // to avoid reading chars from the folding buffer 5628 } 5629 } 5630 } 5631 } else { 5632 c_is_valid = FALSE; 5633 } 5634 5635 if (fp->fInputIdx <= fActiveLimit && c_is_valid) { 5636 if (U_IS_BMP(c)) { 5637 success = (*patternChars == c); 5638 patternChars += 1; 5639 } else if (patternChars+1 < patternEnd) { 5640 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 5641 patternChars += 2; 5642 } 5643 } else { 5644 success = FALSE; 5645 fHitEnd = TRUE; // TODO: See ticket 6074 5646 } 5647 } 5648 5649 if (!success) { 5650 #ifdef REGEX_SMART_BACKTRACKING 5651 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 5652 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5653 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5654 // Reset to last start point 5655 int64_t reverseIndex = originalInputIdx; 5656 patternChars = litText+stringStartIdx; 5657 5658 // Search backwards for a possible start 5659 do { 5660 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5661 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 5662 if(foldLength >= 0) { 5663 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 5664 foldOffset = 0; 5665 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5666 } else { 5667 c = foldLength; 5668 foldLength = foldOffset; // to avoid reading chars from the folding buffer 5669 } 5670 } 5671 5672 if ((U_IS_BMP(c) && *patternChars == c) || 5673 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 5674 success = TRUE; 5675 break; 5676 } 5677 } while (reverseIndex > backSearchIndex); 5678 5679 // And try again 5680 if (success) { 5681 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5682 fp->fInputIdx = reverseIndex; 5683 if (fp->fInputIdx > backSearchIndex) { 5684 fp = StateSave(fp, fp->fPatIdx, status); 5685 } 5686 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5687 break; 5688 } 5689 } 5690 } 5691 #endif 5692 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5693 } 5694 } 5695 } 5696 break; 5697 5698 case URX_LB_START: 5699 { 5700 // Entering a look-behind block. 5701 // Save Stack Ptr, Input Pos. 5702 // TODO: implement transparent bounds. Ticket #6067 5703 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5704 fData[opValue] = fStack->size(); 5705 fData[opValue+1] = fp->fInputIdx; 5706 // Init the variable containing the start index for attempted matches. 5707 fData[opValue+2] = -1; 5708 // Save input string length, then reset to pin any matches to end at 5709 // the current position. 5710 fData[opValue+3] = fActiveLimit; 5711 fActiveLimit = fp->fInputIdx; 5712 } 5713 break; 5714 5715 5716 case URX_LB_CONT: 5717 { 5718 // Positive Look-Behind, at top of loop checking for matches of LB expression 5719 // at all possible input starting positions. 5720 5721 // Fetch the min and max possible match lengths. They are the operands 5722 // of this op in the pattern. 5723 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5724 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5725 U_ASSERT(minML <= maxML); 5726 U_ASSERT(minML >= 0); 5727 5728 // Fetch (from data) the last input index where a match was attempted. 5729 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5730 int64_t *lbStartIdx = &fData[opValue+2]; 5731 if (*lbStartIdx < 0) { 5732 // First time through loop. 5733 *lbStartIdx = fp->fInputIdx - minML; 5734 } else { 5735 // 2nd through nth time through the loop. 5736 // Back up start position for match by one. 5737 if (*lbStartIdx == 0) { 5738 (*lbStartIdx)--; 5739 } else { 5740 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5741 } 5742 } 5743 5744 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5745 // We have tried all potential match starting points without 5746 // getting a match. Backtrack out, and out of the 5747 // Look Behind altogether. 5748 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5749 int64_t restoreInputLen = fData[opValue+3]; 5750 U_ASSERT(restoreInputLen >= fActiveLimit); 5751 U_ASSERT(restoreInputLen <= fInputLength); 5752 fActiveLimit = restoreInputLen; 5753 break; 5754 } 5755 5756 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5757 // (successful match will fall off the end of the loop.) 5758 fp = StateSave(fp, fp->fPatIdx-3, status); 5759 fp->fInputIdx = *lbStartIdx; 5760 } 5761 break; 5762 5763 case URX_LB_END: 5764 // End of a look-behind block, after a successful match. 5765 { 5766 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5767 if (fp->fInputIdx != fActiveLimit) { 5768 // The look-behind expression matched, but the match did not 5769 // extend all the way to the point that we are looking behind from. 5770 // FAIL out of here, which will take us back to the LB_CONT, which 5771 // will retry the match starting at another position or fail 5772 // the look-behind altogether, whichever is appropriate. 5773 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5774 break; 5775 } 5776 5777 // Look-behind match is good. Restore the orignal input string length, 5778 // which had been truncated to pin the end of the lookbehind match to the 5779 // position being looked-behind. 5780 int64_t originalInputLen = fData[opValue+3]; 5781 U_ASSERT(originalInputLen >= fActiveLimit); 5782 U_ASSERT(originalInputLen <= fInputLength); 5783 fActiveLimit = originalInputLen; 5784 } 5785 break; 5786 5787 5788 case URX_LBN_CONT: 5789 { 5790 // Negative Look-Behind, at top of loop checking for matches of LB expression 5791 // at all possible input starting positions. 5792 5793 // Fetch the extra parameters of this op. 5794 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5795 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5796 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 5797 continueLoc = URX_VAL(continueLoc); 5798 U_ASSERT(minML <= maxML); 5799 U_ASSERT(minML >= 0); 5800 U_ASSERT(continueLoc > fp->fPatIdx); 5801 5802 // Fetch (from data) the last input index where a match was attempted. 5803 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5804 int64_t *lbStartIdx = &fData[opValue+2]; 5805 if (*lbStartIdx < 0) { 5806 // First time through loop. 5807 *lbStartIdx = fp->fInputIdx - minML; 5808 } else { 5809 // 2nd through nth time through the loop. 5810 // Back up start position for match by one. 5811 if (*lbStartIdx == 0) { 5812 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. 5813 } else { 5814 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5815 } 5816 } 5817 5818 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5819 // We have tried all potential match starting points without 5820 // getting a match, which means that the negative lookbehind as 5821 // a whole has succeeded. Jump forward to the continue location 5822 int64_t restoreInputLen = fData[opValue+3]; 5823 U_ASSERT(restoreInputLen >= fActiveLimit); 5824 U_ASSERT(restoreInputLen <= fInputLength); 5825 fActiveLimit = restoreInputLen; 5826 fp->fPatIdx = continueLoc; 5827 break; 5828 } 5829 5830 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5831 // (successful match will cause a FAIL out of the loop altogether.) 5832 fp = StateSave(fp, fp->fPatIdx-4, status); 5833 fp->fInputIdx = *lbStartIdx; 5834 } 5835 break; 5836 5837 case URX_LBN_END: 5838 // End of a negative look-behind block, after a successful match. 5839 { 5840 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5841 if (fp->fInputIdx != fActiveLimit) { 5842 // The look-behind expression matched, but the match did not 5843 // extend all the way to the point that we are looking behind from. 5844 // FAIL out of here, which will take us back to the LB_CONT, which 5845 // will retry the match starting at another position or succeed 5846 // the look-behind altogether, whichever is appropriate. 5847 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5848 break; 5849 } 5850 5851 // Look-behind expression matched, which means look-behind test as 5852 // a whole Fails 5853 5854 // Restore the orignal input string length, which had been truncated 5855 // inorder to pin the end of the lookbehind match 5856 // to the position being looked-behind. 5857 int64_t originalInputLen = fData[opValue+3]; 5858 U_ASSERT(originalInputLen >= fActiveLimit); 5859 U_ASSERT(originalInputLen <= fInputLength); 5860 fActiveLimit = originalInputLen; 5861 5862 // Restore original stack position, discarding any state saved 5863 // by the successful pattern match. 5864 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5865 int32_t newStackSize = (int32_t)fData[opValue]; 5866 U_ASSERT(fStack->size() > newStackSize); 5867 fStack->setSize(newStackSize); 5868 5869 // FAIL, which will take control back to someplace 5870 // prior to entering the look-behind test. 5871 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5872 } 5873 break; 5874 5875 5876 case URX_LOOP_SR_I: 5877 // Loop Initialization for the optimized implementation of 5878 // [some character set]* 5879 // This op scans through all matching input. 5880 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5881 { 5882 U_ASSERT(opValue > 0 && opValue < sets->size()); 5883 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5884 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5885 5886 // Loop through input, until either the input is exhausted or 5887 // we reach a character that is not a member of the set. 5888 int32_t ix = (int32_t)fp->fInputIdx; 5889 for (;;) { 5890 if (ix >= fActiveLimit) { 5891 fHitEnd = TRUE; 5892 break; 5893 } 5894 UChar32 c; 5895 U16_NEXT(inputBuf, ix, fActiveLimit, c); 5896 if (c<256) { 5897 if (s8->contains(c) == FALSE) { 5898 U16_BACK_1(inputBuf, 0, ix); 5899 break; 5900 } 5901 } else { 5902 if (s->contains(c) == FALSE) { 5903 U16_BACK_1(inputBuf, 0, ix); 5904 break; 5905 } 5906 } 5907 } 5908 5909 // If there were no matching characters, skip over the loop altogether. 5910 // The loop doesn't run at all, a * op always succeeds. 5911 if (ix == fp->fInputIdx) { 5912 fp->fPatIdx++; // skip the URX_LOOP_C op. 5913 break; 5914 } 5915 5916 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5917 // must follow. It's operand is the stack location 5918 // that holds the starting input index for the match of this [set]* 5919 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5920 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5921 int32_t stackLoc = URX_VAL(loopcOp); 5922 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5923 fp->fExtra[stackLoc] = fp->fInputIdx; 5924 #ifdef REGEX_SMART_BACKTRACKING 5925 backSearchIndex = fp->fInputIdx; 5926 #endif 5927 fp->fInputIdx = ix; 5928 5929 // Save State to the URX_LOOP_C op that follows this one, 5930 // so that match failures in the following code will return to there. 5931 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5932 fp = StateSave(fp, fp->fPatIdx, status); 5933 fp->fPatIdx++; 5934 } 5935 break; 5936 5937 5938 case URX_LOOP_DOT_I: 5939 // Loop Initialization for the optimized implementation of .* 5940 // This op scans through all remaining input. 5941 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5942 { 5943 // Loop through input until the input is exhausted (we reach an end-of-line) 5944 // In DOTALL mode, we can just go straight to the end of the input. 5945 int32_t ix; 5946 if ((opValue & 1) == 1) { 5947 // Dot-matches-All mode. Jump straight to the end of the string. 5948 ix = (int32_t)fActiveLimit; 5949 fHitEnd = TRUE; 5950 } else { 5951 // NOT DOT ALL mode. Line endings do not match '.' 5952 // Scan forward until a line ending or end of input. 5953 ix = (int32_t)fp->fInputIdx; 5954 for (;;) { 5955 if (ix >= fActiveLimit) { 5956 fHitEnd = TRUE; 5957 break; 5958 } 5959 UChar32 c; 5960 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] 5961 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 5962 if ((c == 0x0a) || // 0x0a is newline in both modes. 5963 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 5964 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { 5965 // char is a line ending. Put the input pos back to the 5966 // line ending char, and exit the scanning loop. 5967 U16_BACK_1(inputBuf, 0, ix); 5968 break; 5969 } 5970 } 5971 } 5972 } 5973 5974 // If there were no matching characters, skip over the loop altogether. 5975 // The loop doesn't run at all, a * op always succeeds. 5976 if (ix == fp->fInputIdx) { 5977 fp->fPatIdx++; // skip the URX_LOOP_C op. 5978 break; 5979 } 5980 5981 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5982 // must follow. It's operand is the stack location 5983 // that holds the starting input index for the match of this .* 5984 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5985 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5986 int32_t stackLoc = URX_VAL(loopcOp); 5987 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5988 fp->fExtra[stackLoc] = fp->fInputIdx; 5989 #ifdef REGEX_SMART_BACKTRACKING 5990 backSearchIndex = fp->fInputIdx; 5991 #endif 5992 fp->fInputIdx = ix; 5993 5994 // Save State to the URX_LOOP_C op that follows this one, 5995 // so that match failures in the following code will return to there. 5996 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5997 fp = StateSave(fp, fp->fPatIdx, status); 5998 fp->fPatIdx++; 5999 } 6000 break; 6001 6002 6003 case URX_LOOP_C: 6004 { 6005 U_ASSERT(opValue>=0 && opValue<fFrameSize); 6006 backSearchIndex = (int32_t)fp->fExtra[opValue]; 6007 U_ASSERT(backSearchIndex <= fp->fInputIdx); 6008 if (backSearchIndex == fp->fInputIdx) { 6009 // We've backed up the input idx to the point that the loop started. 6010 // The loop is done. Leave here without saving state. 6011 // Subsequent failures won't come back here. 6012 break; 6013 } 6014 // Set up for the next iteration of the loop, with input index 6015 // backed up by one from the last time through, 6016 // and a state save to this instruction in case the following code fails again. 6017 // (We're going backwards because this loop emulates stack unwinding, not 6018 // the initial scan forward.) 6019 U_ASSERT(fp->fInputIdx > 0); 6020 UChar32 prevC; 6021 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? 6022 6023 if (prevC == 0x0a && 6024 fp->fInputIdx > backSearchIndex && 6025 inputBuf[fp->fInputIdx-1] == 0x0d) { 6026 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 6027 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 6028 // .*, stepping back over CRLF pair. 6029 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 6030 } 6031 } 6032 6033 6034 fp = StateSave(fp, fp->fPatIdx-1, status); 6035 } 6036 break; 6037 6038 6039 6040 default: 6041 // Trouble. The compiled pattern contains an entry with an 6042 // unrecognized type tag. 6043 U_ASSERT(FALSE); 6044 } 6045 6046 if (U_FAILURE(status)) { 6047 isMatch = FALSE; 6048 break; 6049 } 6050 } 6051 6052 breakFromLoop: 6053 fMatch = isMatch; 6054 if (isMatch) { 6055 fLastMatchEnd = fMatchEnd; 6056 fMatchStart = startIdx; 6057 fMatchEnd = fp->fInputIdx; 6058 if (fTraceDebug) { 6059 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); 6060 } 6061 } 6062 else 6063 { 6064 if (fTraceDebug) { 6065 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 6066 } 6067 } 6068 6069 fFrame = fp; // The active stack frame when the engine stopped. 6070 // Contains the capture group results that we need to 6071 // access later. 6072 6073 return; 6074 } 6075 6076 6077 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 6078 6079 U_NAMESPACE_END 6080 6081 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 6082