1 /* 2 ************************************************************************** 3 * Copyright (C) 2002-2010 International Business Machines Corporation * 4 * and others. All rights reserved. * 5 ************************************************************************** 6 */ 7 // 8 // file: rematch.cpp 9 // 10 // Contains the implementation of class RegexMatcher, 11 // which is one of the main API classes for the ICU regular expression package. 12 // 13 14 #include "unicode/utypes.h" 15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 16 17 #include "unicode/regex.h" 18 #include "unicode/uniset.h" 19 #include "unicode/uchar.h" 20 #include "unicode/ustring.h" 21 #include "unicode/rbbi.h" 22 #include "uassert.h" 23 #include "cmemory.h" 24 #include "uvector.h" 25 #include "uvectr32.h" 26 #include "uvectr64.h" 27 #include "regeximp.h" 28 #include "regexst.h" 29 #include "regextxt.h" 30 #include "ucase.h" 31 32 // #include <malloc.h> // Needed for heapcheck testing 33 34 35 36 // Smart Backtracking 37 // ------------------ 38 // When a failure would go back to a LOOP_C instruction, 39 // strings, characters, and setrefs scan backwards for a valid start 40 // character themselves, pop the stack, and save state, emulating the 41 // LOOP_C's effect but assured that the next character of input is a 42 // possible matching character. 43 // 44 // Good idea in theory; unfortunately it only helps out a few specific 45 // cases and slows the engine down a little in the rest. 46 47 //#define REGEX_SMART_BACKTRACKING 1 48 49 U_NAMESPACE_BEGIN 50 51 // Default limit for the size of the back track stack, to avoid system 52 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. 53 // This value puts ICU's limits higher than most other regexp implementations, 54 // which use recursion rather than the heap, and take more storage per 55 // backtrack point. 56 // 57 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 58 59 // Time limit counter constant. 60 // Time limits for expression evaluation are in terms of quanta of work by 61 // the engine, each of which is 10,000 state saves. 62 // This constant determines that state saves per tick number. 63 static const int32_t TIMER_INITIAL_VALUE = 10000; 64 65 //----------------------------------------------------------------------------- 66 // 67 // Constructor and Destructor 68 // 69 //----------------------------------------------------------------------------- 70 RegexMatcher::RegexMatcher(const RegexPattern *pat) { 71 fDeferredStatus = U_ZERO_ERROR; 72 init(fDeferredStatus); 73 if (U_FAILURE(fDeferredStatus)) { 74 return; 75 } 76 if (pat==NULL) { 77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 78 return; 79 } 80 fPattern = pat; 81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); 82 } 83 84 85 86 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 87 uint32_t flags, UErrorCode &status) { 88 init(status); 89 if (U_FAILURE(status)) { 90 return; 91 } 92 UParseError pe; 93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 94 fPattern = fPatternOwned; 95 96 UText inputText = UTEXT_INITIALIZER; 97 utext_openConstUnicodeString(&inputText, &input, &status); 98 init2(&inputText, status); 99 utext_close(&inputText); 100 101 fInputUniStrMaybeMutable = TRUE; 102 } 103 104 105 RegexMatcher::RegexMatcher(UText *regexp, UText *input, 106 uint32_t flags, UErrorCode &status) { 107 init(status); 108 if (U_FAILURE(status)) { 109 return; 110 } 111 UParseError pe; 112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 113 if (U_FAILURE(status)) { 114 return; 115 } 116 117 fPattern = fPatternOwned; 118 init2(input, status); 119 } 120 121 122 RegexMatcher::RegexMatcher(const UnicodeString ®exp, 123 uint32_t flags, UErrorCode &status) { 124 init(status); 125 if (U_FAILURE(status)) { 126 return; 127 } 128 UParseError pe; 129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 130 if (U_FAILURE(status)) { 131 return; 132 } 133 fPattern = fPatternOwned; 134 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 135 } 136 137 RegexMatcher::RegexMatcher(UText *regexp, 138 uint32_t flags, UErrorCode &status) { 139 init(status); 140 if (U_FAILURE(status)) { 141 return; 142 } 143 UParseError pe; 144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 145 if (U_FAILURE(status)) { 146 return; 147 } 148 149 fPattern = fPatternOwned; 150 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 151 } 152 153 154 155 156 RegexMatcher::~RegexMatcher() { 157 delete fStack; 158 if (fData != fSmallData) { 159 uprv_free(fData); 160 fData = NULL; 161 } 162 if (fPatternOwned) { 163 delete fPatternOwned; 164 fPatternOwned = NULL; 165 fPattern = NULL; 166 } 167 168 if (fInput) { 169 delete fInput; 170 } 171 if (fInputText) { 172 utext_close(fInputText); 173 } 174 if (fAltInputText) { 175 utext_close(fAltInputText); 176 } 177 178 #if UCONFIG_NO_BREAK_ITERATION==0 179 delete fWordBreakItr; 180 #endif 181 } 182 183 // 184 // init() common initialization for use by all constructors. 185 // Initialize all fields, get the object into a consistent state. 186 // This must be done even when the initial status shows an error, 187 // so that the object is initialized sufficiently well for the destructor 188 // to run safely. 189 // 190 void RegexMatcher::init(UErrorCode &status) { 191 fPattern = NULL; 192 fPatternOwned = NULL; 193 fFrameSize = 0; 194 fRegionStart = 0; 195 fRegionLimit = 0; 196 fAnchorStart = 0; 197 fAnchorLimit = 0; 198 fLookStart = 0; 199 fLookLimit = 0; 200 fActiveStart = 0; 201 fActiveLimit = 0; 202 fTransparentBounds = FALSE; 203 fAnchoringBounds = TRUE; 204 fMatch = FALSE; 205 fMatchStart = 0; 206 fMatchEnd = 0; 207 fLastMatchEnd = -1; 208 fAppendPosition = 0; 209 fHitEnd = FALSE; 210 fRequireEnd = FALSE; 211 fStack = NULL; 212 fFrame = NULL; 213 fTimeLimit = 0; 214 fTime = 0; 215 fTickCounter = 0; 216 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 217 fCallbackFn = NULL; 218 fCallbackContext = NULL; 219 fTraceDebug = FALSE; 220 fDeferredStatus = status; 221 fData = fSmallData; 222 fWordBreakItr = NULL; 223 224 fStack = new UVector64(status); 225 fInputText = NULL; 226 fAltInputText = NULL; 227 fInput = NULL; 228 fInputLength = 0; 229 fInputUniStrMaybeMutable = FALSE; 230 231 if (U_FAILURE(status)) { 232 fDeferredStatus = status; 233 } 234 } 235 236 // 237 // init2() Common initialization for use by RegexMatcher constructors, part 2. 238 // This handles the common setup to be done after the Pattern is available. 239 // 240 void RegexMatcher::init2(UText *input, UErrorCode &status) { 241 if (U_FAILURE(status)) { 242 fDeferredStatus = status; 243 return; 244 } 245 246 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { 247 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 248 if (fData == NULL) { 249 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 250 return; 251 } 252 } 253 254 reset(input); 255 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 256 if (U_FAILURE(status)) { 257 fDeferredStatus = status; 258 return; 259 } 260 } 261 262 263 static const UChar BACKSLASH = 0x5c; 264 static const UChar DOLLARSIGN = 0x24; 265 //-------------------------------------------------------------------------------- 266 // 267 // appendReplacement 268 // 269 //-------------------------------------------------------------------------------- 270 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 271 const UnicodeString &replacement, 272 UErrorCode &status) { 273 UText replacementText = UTEXT_INITIALIZER; 274 275 utext_openConstUnicodeString(&replacementText, &replacement, &status); 276 if (U_SUCCESS(status)) { 277 UText resultText = UTEXT_INITIALIZER; 278 utext_openUnicodeString(&resultText, &dest, &status); 279 280 if (U_SUCCESS(status)) { 281 appendReplacement(&resultText, &replacementText, status); 282 utext_close(&resultText); 283 } 284 utext_close(&replacementText); 285 } 286 287 return *this; 288 } 289 290 // 291 // appendReplacement, UText mode 292 // 293 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, 294 UText *replacement, 295 UErrorCode &status) { 296 if (U_FAILURE(status)) { 297 return *this; 298 } 299 if (U_FAILURE(fDeferredStatus)) { 300 status = fDeferredStatus; 301 return *this; 302 } 303 if (fMatch == FALSE) { 304 status = U_REGEX_INVALID_STATE; 305 return *this; 306 } 307 308 // Copy input string from the end of previous match to start of current match 309 int64_t destLen = utext_nativeLength(dest); 310 if (fMatchStart > fAppendPosition) { 311 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 312 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, (int32_t)(fMatchStart-fAppendPosition), &status); 313 } else { 314 int32_t len16; 315 if (UTEXT_USES_U16(fInputText)) { 316 len16 = (int32_t)(fMatchStart-fAppendPosition); 317 } else { 318 UErrorCode lengthStatus = U_ZERO_ERROR; 319 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); 320 } 321 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 322 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); 323 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); 324 uprv_free(inputChars); 325 } 326 } 327 fAppendPosition = fMatchEnd; 328 329 330 // scan the replacement text, looking for substitutions ($n) and \escapes. 331 // TODO: optimize this loop by efficiently scanning for '$' or '\', 332 // move entire ranges not containing substitutions. 333 UTEXT_SETNATIVEINDEX(replacement, 0); 334 UChar32 c = UTEXT_NEXT32(replacement); 335 while (c != U_SENTINEL) { 336 if (c == BACKSLASH) { 337 // Backslash Escape. Copy the following char out without further checks. 338 // Note: Surrogate pairs don't need any special handling 339 // The second half wont be a '$' or a '\', and 340 // will move to the dest normally on the next 341 // loop iteration. 342 c = UTEXT_CURRENT32(replacement); 343 if (c == U_SENTINEL) { 344 break; 345 } 346 347 if (c==0x55/*U*/ || c==0x75/*u*/) { 348 // We have a \udddd or \Udddddddd escape sequence. 349 int32_t offset = 0; 350 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); 351 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 352 if (escapedChar != (UChar32)0xFFFFFFFF) { 353 if (U_IS_BMP(escapedChar)) { 354 UChar c16 = (UChar)escapedChar; 355 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 356 } else { 357 UChar surrogate[2]; 358 surrogate[0] = U16_LEAD(escapedChar); 359 surrogate[1] = U16_TRAIL(escapedChar); 360 if (U_SUCCESS(status)) { 361 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 362 } 363 } 364 // TODO: Report errors for mal-formed \u escapes? 365 // As this is, the original sequence is output, which may be OK. 366 if (context.lastOffset == offset) { 367 UTEXT_PREVIOUS32(replacement); 368 } else if (context.lastOffset != offset-1) { 369 utext_moveIndex32(replacement, offset - context.lastOffset - 1); 370 } 371 } 372 } else { 373 UTEXT_NEXT32(replacement); 374 // Plain backslash escape. Just put out the escaped character. 375 if (U_IS_BMP(c)) { 376 UChar c16 = (UChar)c; 377 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 378 } else { 379 UChar surrogate[2]; 380 surrogate[0] = U16_LEAD(c); 381 surrogate[1] = U16_TRAIL(c); 382 if (U_SUCCESS(status)) { 383 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 384 } 385 } 386 } 387 } else if (c != DOLLARSIGN) { 388 // Normal char, not a $. Copy it out without further checks. 389 if (U_IS_BMP(c)) { 390 UChar c16 = (UChar)c; 391 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 392 } else { 393 UChar surrogate[2]; 394 surrogate[0] = U16_LEAD(c); 395 surrogate[1] = U16_TRAIL(c); 396 if (U_SUCCESS(status)) { 397 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 398 } 399 } 400 } else { 401 // We've got a $. Pick up a capture group number if one follows. 402 // Consume at most the number of digits necessary for the largest capture 403 // number that is valid for this pattern. 404 405 int32_t numDigits = 0; 406 int32_t groupNum = 0; 407 UChar32 digitC; 408 for (;;) { 409 digitC = UTEXT_CURRENT32(replacement); 410 if (digitC == U_SENTINEL) { 411 break; 412 } 413 if (u_isdigit(digitC) == FALSE) { 414 break; 415 } 416 UTEXT_NEXT32(replacement); 417 groupNum=groupNum*10 + u_charDigitValue(digitC); 418 numDigits++; 419 if (numDigits >= fPattern->fMaxCaptureDigits) { 420 break; 421 } 422 } 423 424 425 if (numDigits == 0) { 426 // The $ didn't introduce a group number at all. 427 // Treat it as just part of the substitution text. 428 UChar c16 = DOLLARSIGN; 429 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 430 } else { 431 // Finally, append the capture group data to the destination. 432 destLen += appendGroup(groupNum, dest, status); 433 if (U_FAILURE(status)) { 434 // Can fail if group number is out of range. 435 break; 436 } 437 } 438 } 439 440 if (U_FAILURE(status)) { 441 break; 442 } else { 443 c = UTEXT_NEXT32(replacement); 444 } 445 } 446 447 return *this; 448 } 449 450 451 452 //-------------------------------------------------------------------------------- 453 // 454 // appendTail Intended to be used in conjunction with appendReplacement() 455 // To the destination string, append everything following 456 // the last match position from the input string. 457 // 458 // Note: Match ranges do not affect appendTail or appendReplacement 459 // 460 //-------------------------------------------------------------------------------- 461 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 462 UErrorCode status = U_ZERO_ERROR; 463 UText resultText = UTEXT_INITIALIZER; 464 utext_openUnicodeString(&resultText, &dest, &status); 465 466 if (U_SUCCESS(status)) { 467 appendTail(&resultText); 468 utext_close(&resultText); 469 } 470 471 return dest; 472 } 473 474 // 475 // appendTail, UText mode 476 // 477 UText *RegexMatcher::appendTail(UText *dest) { 478 if (fInputLength > fAppendPosition) { 479 UErrorCode status = U_ZERO_ERROR; 480 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 481 int64_t destLen = utext_nativeLength(dest); 482 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, (int32_t)(fInputLength-fAppendPosition), &status); 483 } else { 484 int32_t len16; 485 if (UTEXT_USES_U16(fInputText)) { 486 len16 = (int32_t)(fInputLength-fAppendPosition); 487 } else { 488 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); 489 status = U_ZERO_ERROR; // buffer overflow 490 } 491 492 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); 493 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 494 495 int64_t destLen = utext_nativeLength(dest); 496 utext_replace(dest, destLen, destLen, inputChars, len16, &status); 497 498 uprv_free(inputChars); 499 } 500 } 501 return dest; 502 } 503 504 505 506 //-------------------------------------------------------------------------------- 507 // 508 // end 509 // 510 //-------------------------------------------------------------------------------- 511 int32_t RegexMatcher::end(UErrorCode &err) const { 512 return end(0, err); 513 } 514 515 516 517 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 518 if (U_FAILURE(err)) { 519 return -1; 520 } 521 if (fMatch == FALSE) { 522 err = U_REGEX_INVALID_STATE; 523 return -1; 524 } 525 if (group < 0 || group > fPattern->fGroupMap->size()) { 526 err = U_INDEX_OUTOFBOUNDS_ERROR; 527 return -1; 528 } 529 int64_t e = -1; 530 if (group == 0) { 531 e = fMatchEnd; 532 } else { 533 // Get the position within the stack frame of the variables for 534 // this capture group. 535 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 536 U_ASSERT(groupOffset < fPattern->fFrameSize); 537 U_ASSERT(groupOffset >= 0); 538 e = fFrame->fExtra[groupOffset + 1]; 539 } 540 541 if (e == -1 || UTEXT_USES_U16(fInputText)) { 542 return (int32_t)e; 543 } else { 544 // !!!: Would like a better way to do this! 545 UErrorCode status = U_ZERO_ERROR; 546 return utext_extract(fInputText, 0, e, NULL, 0, &status); 547 } 548 } 549 550 551 552 //-------------------------------------------------------------------------------- 553 // 554 // find() 555 // 556 //-------------------------------------------------------------------------------- 557 UBool RegexMatcher::find() { 558 // Start at the position of the last match end. (Will be zero if the 559 // matcher has been reset.) 560 // 561 if (U_FAILURE(fDeferredStatus)) { 562 return FALSE; 563 } 564 565 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 566 return findUsingChunk(); 567 } 568 569 int64_t startPos = fMatchEnd; 570 if (startPos==0) { 571 startPos = fActiveStart; 572 } 573 574 if (fMatch) { 575 // Save the position of any previous successful match. 576 fLastMatchEnd = fMatchEnd; 577 578 if (fMatchStart == fMatchEnd) { 579 // Previous match had zero length. Move start position up one position 580 // to avoid sending find() into a loop on zero-length matches. 581 if (startPos >= fActiveLimit) { 582 fMatch = FALSE; 583 fHitEnd = TRUE; 584 return FALSE; 585 } 586 UTEXT_SETNATIVEINDEX(fInputText, startPos); 587 UTEXT_NEXT32(fInputText); 588 startPos = UTEXT_GETNATIVEINDEX(fInputText); 589 } 590 } else { 591 if (fLastMatchEnd >= 0) { 592 // A previous find() failed to match. Don't try again. 593 // (without this test, a pattern with a zero-length match 594 // could match again at the end of an input string.) 595 fHitEnd = TRUE; 596 return FALSE; 597 } 598 } 599 600 601 // Compute the position in the input string beyond which a match can not begin, because 602 // the minimum length match would extend past the end of the input. 603 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 604 // Be aware of possible overflows if making changes here. 605 int64_t testStartLimit; 606 if (UTEXT_USES_U16(fInputText)) { 607 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; 608 if (startPos > testStartLimit) { 609 fMatch = FALSE; 610 fHitEnd = TRUE; 611 return FALSE; 612 } 613 } else { 614 // For now, let the matcher discover that it can't match on its own 615 // We don't know how long the match len is in native characters 616 testStartLimit = fActiveLimit; 617 } 618 619 UChar32 c; 620 U_ASSERT(startPos >= 0); 621 622 switch (fPattern->fStartType) { 623 case START_NO_INFO: 624 // No optimization was found. 625 // Try a match at each input position. 626 for (;;) { 627 MatchAt(startPos, FALSE, fDeferredStatus); 628 if (U_FAILURE(fDeferredStatus)) { 629 return FALSE; 630 } 631 if (fMatch) { 632 return TRUE; 633 } 634 if (startPos >= testStartLimit) { 635 fHitEnd = TRUE; 636 return FALSE; 637 } 638 UTEXT_SETNATIVEINDEX(fInputText, startPos); 639 UTEXT_NEXT32(fInputText); 640 startPos = UTEXT_GETNATIVEINDEX(fInputText); 641 // Note that it's perfectly OK for a pattern to have a zero-length 642 // match at the end of a string, so we must make sure that the loop 643 // runs with startPos == testStartLimit the last time through. 644 } 645 U_ASSERT(FALSE); 646 647 case START_START: 648 // Matches are only possible at the start of the input string 649 // (pattern begins with ^ or \A) 650 if (startPos > fActiveStart) { 651 fMatch = FALSE; 652 return FALSE; 653 } 654 MatchAt(startPos, FALSE, fDeferredStatus); 655 if (U_FAILURE(fDeferredStatus)) { 656 return FALSE; 657 } 658 return fMatch; 659 660 661 case START_SET: 662 { 663 // Match may start on any char from a pre-computed set. 664 U_ASSERT(fPattern->fMinMatchLen > 0); 665 int64_t pos; 666 UTEXT_SETNATIVEINDEX(fInputText, startPos); 667 for (;;) { 668 c = UTEXT_NEXT32(fInputText); 669 pos = UTEXT_GETNATIVEINDEX(fInputText); 670 // c will be -1 (U_SENTINEL) at end of text, in which case we 671 // skip this next block (so we don't have a negative array index) 672 // and handle end of text in the following block. 673 if (c >= 0 && (c<256 && fPattern->fInitialChars8->contains(c) || 674 c>=256 && fPattern->fInitialChars->contains(c))) { 675 MatchAt(startPos, FALSE, fDeferredStatus); 676 if (U_FAILURE(fDeferredStatus)) { 677 return FALSE; 678 } 679 if (fMatch) { 680 return TRUE; 681 } 682 UTEXT_SETNATIVEINDEX(fInputText, pos); 683 } 684 if (startPos >= testStartLimit) { 685 fMatch = FALSE; 686 fHitEnd = TRUE; 687 return FALSE; 688 } 689 startPos = pos; 690 } 691 } 692 U_ASSERT(FALSE); 693 694 case START_STRING: 695 case START_CHAR: 696 { 697 // Match starts on exactly one char. 698 U_ASSERT(fPattern->fMinMatchLen > 0); 699 UChar32 theChar = fPattern->fInitialChar; 700 int64_t pos; 701 UTEXT_SETNATIVEINDEX(fInputText, startPos); 702 for (;;) { 703 c = UTEXT_NEXT32(fInputText); 704 pos = UTEXT_GETNATIVEINDEX(fInputText); 705 if (c == theChar) { 706 MatchAt(startPos, FALSE, fDeferredStatus); 707 if (U_FAILURE(fDeferredStatus)) { 708 return FALSE; 709 } 710 if (fMatch) { 711 return TRUE; 712 } 713 UTEXT_SETNATIVEINDEX(fInputText, pos); 714 } 715 if (startPos >= testStartLimit) { 716 fMatch = FALSE; 717 fHitEnd = TRUE; 718 return FALSE; 719 } 720 startPos = pos; 721 } 722 } 723 U_ASSERT(FALSE); 724 725 case START_LINE: 726 { 727 UChar32 c; 728 if (startPos == fAnchorStart) { 729 MatchAt(startPos, FALSE, fDeferredStatus); 730 if (U_FAILURE(fDeferredStatus)) { 731 return FALSE; 732 } 733 if (fMatch) { 734 return TRUE; 735 } 736 UTEXT_SETNATIVEINDEX(fInputText, startPos); 737 c = UTEXT_NEXT32(fInputText); 738 startPos = UTEXT_GETNATIVEINDEX(fInputText); 739 } else { 740 UTEXT_SETNATIVEINDEX(fInputText, startPos); 741 c = UTEXT_PREVIOUS32(fInputText); 742 UTEXT_SETNATIVEINDEX(fInputText, startPos); 743 } 744 745 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 746 for (;;) { 747 if (c == 0x0a) { 748 MatchAt(startPos, FALSE, fDeferredStatus); 749 if (U_FAILURE(fDeferredStatus)) { 750 return FALSE; 751 } 752 if (fMatch) { 753 return TRUE; 754 } 755 UTEXT_SETNATIVEINDEX(fInputText, startPos); 756 } 757 if (startPos >= testStartLimit) { 758 fMatch = FALSE; 759 fHitEnd = TRUE; 760 return FALSE; 761 } 762 c = UTEXT_NEXT32(fInputText); 763 startPos = UTEXT_GETNATIVEINDEX(fInputText); 764 // Note that it's perfectly OK for a pattern to have a zero-length 765 // match at the end of a string, so we must make sure that the loop 766 // runs with startPos == testStartLimit the last time through. 767 } 768 } else { 769 for (;;) { 770 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 771 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 772 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 773 UTEXT_NEXT32(fInputText); 774 startPos = UTEXT_GETNATIVEINDEX(fInputText); 775 } 776 MatchAt(startPos, FALSE, fDeferredStatus); 777 if (U_FAILURE(fDeferredStatus)) { 778 return FALSE; 779 } 780 if (fMatch) { 781 return TRUE; 782 } 783 UTEXT_SETNATIVEINDEX(fInputText, startPos); 784 } 785 if (startPos >= testStartLimit) { 786 fMatch = FALSE; 787 fHitEnd = TRUE; 788 return FALSE; 789 } 790 c = UTEXT_NEXT32(fInputText); 791 startPos = UTEXT_GETNATIVEINDEX(fInputText); 792 // Note that it's perfectly OK for a pattern to have a zero-length 793 // match at the end of a string, so we must make sure that the loop 794 // runs with startPos == testStartLimit the last time through. 795 } 796 } 797 } 798 799 default: 800 U_ASSERT(FALSE); 801 } 802 803 U_ASSERT(FALSE); 804 return FALSE; 805 } 806 807 808 809 UBool RegexMatcher::find(int32_t start, UErrorCode &status) { 810 if (U_FAILURE(status)) { 811 return FALSE; 812 } 813 if (U_FAILURE(fDeferredStatus)) { 814 status = fDeferredStatus; 815 return FALSE; 816 } 817 this->reset(); // Note: Reset() is specified by Java Matcher documentation. 818 // This will reset the region to be the full input length. 819 if (start < 0) { 820 status = U_INDEX_OUTOFBOUNDS_ERROR; 821 return FALSE; 822 } 823 824 UBool couldFindStart = TRUE; 825 int64_t nativeStart; 826 if (UTEXT_USES_U16(fInputText)) { 827 nativeStart = start; 828 } else { 829 UTEXT_SETNATIVEINDEX(fInputText, 0); 830 int32_t i = 0; 831 while (i < start) { 832 UChar32 c = UTEXT_NEXT32(fInputText); 833 if (c != U_SENTINEL) { 834 i += U16_LENGTH(c); 835 } else { 836 couldFindStart = FALSE; 837 break; 838 } 839 } 840 nativeStart = UTEXT_GETNATIVEINDEX(fInputText); 841 } 842 if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) { 843 status = U_INDEX_OUTOFBOUNDS_ERROR; 844 return FALSE; 845 } 846 fMatchEnd = nativeStart; 847 return find(); 848 } 849 850 851 //-------------------------------------------------------------------------------- 852 // 853 // findUsingChunk() -- like find(), but with the advance knowledge that the 854 // entire string is available in the UText's chunk buffer. 855 // 856 //-------------------------------------------------------------------------------- 857 UBool RegexMatcher::findUsingChunk() { 858 // Start at the position of the last match end. (Will be zero if the 859 // matcher has been reset. 860 // 861 862 int32_t startPos = (int32_t)fMatchEnd; 863 if (startPos==0) { 864 startPos = (int32_t)fActiveStart; 865 } 866 867 const UChar *inputBuf = fInputText->chunkContents; 868 869 if (fMatch) { 870 // Save the position of any previous successful match. 871 fLastMatchEnd = fMatchEnd; 872 873 if (fMatchStart == fMatchEnd) { 874 // Previous match had zero length. Move start position up one position 875 // to avoid sending find() into a loop on zero-length matches. 876 if (startPos >= fActiveLimit) { 877 fMatch = FALSE; 878 fHitEnd = TRUE; 879 return FALSE; 880 } 881 U16_FWD_1(inputBuf, startPos, fInputLength); 882 } 883 } else { 884 if (fLastMatchEnd >= 0) { 885 // A previous find() failed to match. Don't try again. 886 // (without this test, a pattern with a zero-length match 887 // could match again at the end of an input string.) 888 fHitEnd = TRUE; 889 return FALSE; 890 } 891 } 892 893 894 // Compute the position in the input string beyond which a match can not begin, because 895 // the minimum length match would extend past the end of the input. 896 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 897 // Be aware of possible overflows if making changes here. 898 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); 899 if (startPos > testLen) { 900 fMatch = FALSE; 901 fHitEnd = TRUE; 902 return FALSE; 903 } 904 905 UChar32 c; 906 U_ASSERT(startPos >= 0); 907 908 switch (fPattern->fStartType) { 909 case START_NO_INFO: 910 // No optimization was found. 911 // Try a match at each input position. 912 for (;;) { 913 MatchChunkAt(startPos, FALSE, fDeferredStatus); 914 if (U_FAILURE(fDeferredStatus)) { 915 return FALSE; 916 } 917 if (fMatch) { 918 return TRUE; 919 } 920 if (startPos >= testLen) { 921 fHitEnd = TRUE; 922 return FALSE; 923 } 924 U16_FWD_1(inputBuf, startPos, fActiveLimit); 925 // Note that it's perfectly OK for a pattern to have a zero-length 926 // match at the end of a string, so we must make sure that the loop 927 // runs with startPos == testLen the last time through. 928 } 929 U_ASSERT(FALSE); 930 931 case START_START: 932 // Matches are only possible at the start of the input string 933 // (pattern begins with ^ or \A) 934 if (startPos > fActiveStart) { 935 fMatch = FALSE; 936 return FALSE; 937 } 938 MatchChunkAt(startPos, FALSE, fDeferredStatus); 939 if (U_FAILURE(fDeferredStatus)) { 940 return FALSE; 941 } 942 return fMatch; 943 944 945 case START_SET: 946 { 947 // Match may start on any char from a pre-computed set. 948 U_ASSERT(fPattern->fMinMatchLen > 0); 949 for (;;) { 950 int32_t pos = startPos; 951 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 952 if (c<256 && fPattern->fInitialChars8->contains(c) || 953 c>=256 && fPattern->fInitialChars->contains(c)) { 954 MatchChunkAt(pos, FALSE, fDeferredStatus); 955 if (U_FAILURE(fDeferredStatus)) { 956 return FALSE; 957 } 958 if (fMatch) { 959 return TRUE; 960 } 961 } 962 if (pos >= testLen) { 963 fMatch = FALSE; 964 fHitEnd = TRUE; 965 return FALSE; 966 } 967 } 968 } 969 U_ASSERT(FALSE); 970 971 case START_STRING: 972 case START_CHAR: 973 { 974 // Match starts on exactly one char. 975 U_ASSERT(fPattern->fMinMatchLen > 0); 976 UChar32 theChar = fPattern->fInitialChar; 977 for (;;) { 978 int32_t pos = startPos; 979 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 980 if (c == theChar) { 981 MatchChunkAt(pos, FALSE, fDeferredStatus); 982 if (U_FAILURE(fDeferredStatus)) { 983 return FALSE; 984 } 985 if (fMatch) { 986 return TRUE; 987 } 988 } 989 if (pos >= testLen) { 990 fMatch = FALSE; 991 fHitEnd = TRUE; 992 return FALSE; 993 } 994 } 995 } 996 U_ASSERT(FALSE); 997 998 case START_LINE: 999 { 1000 UChar32 c; 1001 if (startPos == fAnchorStart) { 1002 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1003 if (U_FAILURE(fDeferredStatus)) { 1004 return FALSE; 1005 } 1006 if (fMatch) { 1007 return TRUE; 1008 } 1009 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1010 } 1011 1012 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1013 for (;;) { 1014 c = inputBuf[startPos-1]; 1015 if (c == 0x0a) { 1016 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1017 if (U_FAILURE(fDeferredStatus)) { 1018 return FALSE; 1019 } 1020 if (fMatch) { 1021 return TRUE; 1022 } 1023 } 1024 if (startPos >= testLen) { 1025 fMatch = FALSE; 1026 fHitEnd = TRUE; 1027 return FALSE; 1028 } 1029 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1030 // Note that it's perfectly OK for a pattern to have a zero-length 1031 // match at the end of a string, so we must make sure that the loop 1032 // runs with startPos == testLen the last time through. 1033 } 1034 } else { 1035 for (;;) { 1036 c = inputBuf[startPos-1]; 1037 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1038 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 1039 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { 1040 startPos++; 1041 } 1042 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1043 if (U_FAILURE(fDeferredStatus)) { 1044 return FALSE; 1045 } 1046 if (fMatch) { 1047 return TRUE; 1048 } 1049 } 1050 if (startPos >= testLen) { 1051 fMatch = FALSE; 1052 fHitEnd = TRUE; 1053 return FALSE; 1054 } 1055 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1056 // Note that it's perfectly OK for a pattern to have a zero-length 1057 // match at the end of a string, so we must make sure that the loop 1058 // runs with startPos == testLen the last time through. 1059 } 1060 } 1061 } 1062 1063 default: 1064 U_ASSERT(FALSE); 1065 } 1066 1067 U_ASSERT(FALSE); 1068 return FALSE; 1069 } 1070 1071 1072 1073 //-------------------------------------------------------------------------------- 1074 // 1075 // group() 1076 // 1077 //-------------------------------------------------------------------------------- 1078 UnicodeString RegexMatcher::group(UErrorCode &status) const { 1079 return group(0, status); 1080 } 1081 1082 UText *RegexMatcher::group(UText *dest, MatcherDestIsUTextFlag /*flag*/, UErrorCode &status) const { 1083 return group(0, dest, status); 1084 } 1085 1086 1087 1088 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1089 UnicodeString result; 1090 UText resultText = UTEXT_INITIALIZER; 1091 utext_openUnicodeString(&resultText, &result, &status); 1092 group(groupNum, &resultText, status); 1093 utext_close(&resultText); 1094 return result; 1095 } 1096 1097 1098 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { 1099 UBool bailOut = FALSE; 1100 if (U_FAILURE(status)) { 1101 bailOut = TRUE; 1102 } 1103 if (U_FAILURE(fDeferredStatus)) { 1104 status = fDeferredStatus; 1105 bailOut = TRUE; 1106 } 1107 1108 if (fMatch == FALSE) { 1109 status = U_REGEX_INVALID_STATE; 1110 bailOut = TRUE; 1111 } 1112 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1113 status = U_INDEX_OUTOFBOUNDS_ERROR; 1114 bailOut = TRUE; 1115 } 1116 1117 if (bailOut) { 1118 if (dest) { 1119 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1120 return dest; 1121 } else { 1122 return utext_openUChars(NULL, NULL, 0, &status); 1123 } 1124 } 1125 1126 int64_t s, e; 1127 if (groupNum == 0) { 1128 s = fMatchStart; 1129 e = fMatchEnd; 1130 } else { 1131 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1132 U_ASSERT(groupOffset < fPattern->fFrameSize); 1133 U_ASSERT(groupOffset >= 0); 1134 s = fFrame->fExtra[groupOffset]; 1135 e = fFrame->fExtra[groupOffset+1]; 1136 } 1137 1138 if (s < 0) { 1139 // A capture group wasn't part of the match 1140 if (dest) { 1141 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1142 return dest; 1143 } else { 1144 return utext_openUChars(NULL, NULL, 0, &status); 1145 } 1146 } 1147 U_ASSERT(s <= e); 1148 1149 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1150 U_ASSERT(e <= fInputLength); 1151 if (dest) { 1152 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status); 1153 } else { 1154 UText groupText = UTEXT_INITIALIZER; 1155 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); 1156 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1157 utext_close(&groupText); 1158 } 1159 } else { 1160 int32_t len16; 1161 if (UTEXT_USES_U16(fInputText)) { 1162 len16 = (int32_t)(e-s); 1163 } else { 1164 UErrorCode lengthStatus = U_ZERO_ERROR; 1165 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1166 } 1167 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1168 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1169 1170 if (dest) { 1171 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); 1172 } else { 1173 UText groupText = UTEXT_INITIALIZER; 1174 utext_openUChars(&groupText, groupChars, len16, &status); 1175 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1176 utext_close(&groupText); 1177 } 1178 1179 uprv_free(groupChars); 1180 } 1181 return dest; 1182 } 1183 1184 //-------------------------------------------------------------------------------- 1185 // 1186 // appendGroup() -- currently internal only, appends a group to a UText rather 1187 // than replacing its contents 1188 // 1189 //-------------------------------------------------------------------------------- 1190 1191 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { 1192 int64_t destLen = utext_nativeLength(dest); 1193 1194 if (U_FAILURE(status)) { 1195 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1196 } 1197 if (U_FAILURE(fDeferredStatus)) { 1198 status = fDeferredStatus; 1199 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1200 } 1201 1202 if (fMatch == FALSE) { 1203 status = U_REGEX_INVALID_STATE; 1204 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1205 } 1206 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1207 status = U_INDEX_OUTOFBOUNDS_ERROR; 1208 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1209 } 1210 1211 int64_t s, e; 1212 if (groupNum == 0) { 1213 s = fMatchStart; 1214 e = fMatchEnd; 1215 } else { 1216 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1217 U_ASSERT(groupOffset < fPattern->fFrameSize); 1218 U_ASSERT(groupOffset >= 0); 1219 s = fFrame->fExtra[groupOffset]; 1220 e = fFrame->fExtra[groupOffset+1]; 1221 } 1222 1223 if (s < 0) { 1224 // A capture group wasn't part of the match 1225 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1226 } 1227 U_ASSERT(s <= e); 1228 1229 int64_t deltaLen; 1230 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1231 U_ASSERT(e <= fInputLength); 1232 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); 1233 } else { 1234 int32_t len16; 1235 if (UTEXT_USES_U16(fInputText)) { 1236 len16 = (int32_t)(e-s); 1237 } else { 1238 UErrorCode lengthStatus = U_ZERO_ERROR; 1239 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1240 } 1241 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1242 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1243 1244 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); 1245 uprv_free(groupChars); 1246 } 1247 return deltaLen; 1248 } 1249 1250 1251 1252 //-------------------------------------------------------------------------------- 1253 // 1254 // groupCount() 1255 // 1256 //-------------------------------------------------------------------------------- 1257 int32_t RegexMatcher::groupCount() const { 1258 return fPattern->fGroupMap->size(); 1259 } 1260 1261 1262 1263 //-------------------------------------------------------------------------------- 1264 // 1265 // hasAnchoringBounds() 1266 // 1267 //-------------------------------------------------------------------------------- 1268 UBool RegexMatcher::hasAnchoringBounds() const { 1269 return fAnchoringBounds; 1270 } 1271 1272 1273 //-------------------------------------------------------------------------------- 1274 // 1275 // hasTransparentBounds() 1276 // 1277 //-------------------------------------------------------------------------------- 1278 UBool RegexMatcher::hasTransparentBounds() const { 1279 return fTransparentBounds; 1280 } 1281 1282 1283 1284 //-------------------------------------------------------------------------------- 1285 // 1286 // hitEnd() 1287 // 1288 //-------------------------------------------------------------------------------- 1289 UBool RegexMatcher::hitEnd() const { 1290 return fHitEnd; 1291 } 1292 1293 1294 //-------------------------------------------------------------------------------- 1295 // 1296 // input() 1297 // 1298 //-------------------------------------------------------------------------------- 1299 const UnicodeString &RegexMatcher::input() const { 1300 if (!fInput) { 1301 UErrorCode status = U_ZERO_ERROR; 1302 int32_t len16; 1303 if (UTEXT_USES_U16(fInputText)) { 1304 len16 = (int32_t)fInputLength; 1305 } else { 1306 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); 1307 status = U_ZERO_ERROR; // overflow, length status 1308 } 1309 UnicodeString *result = new UnicodeString(len16, 0, 0); 1310 1311 UChar *inputChars = result->getBuffer(len16); 1312 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning 1313 result->releaseBuffer(len16); 1314 1315 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= 1316 } 1317 1318 return *fInput; 1319 } 1320 1321 //-------------------------------------------------------------------------------- 1322 // 1323 // inputText() 1324 // 1325 //-------------------------------------------------------------------------------- 1326 UText *RegexMatcher::inputText() const { 1327 return fInputText; 1328 } 1329 1330 1331 //-------------------------------------------------------------------------------- 1332 // 1333 // getInput() -- like inputText(), but makes a clone or copies into another UText 1334 // 1335 //-------------------------------------------------------------------------------- 1336 UText *RegexMatcher::getInput (UText *dest) const { 1337 UErrorCode status = U_ZERO_ERROR; // ignored 1338 if (dest) { 1339 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1340 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); 1341 } else { 1342 int32_t input16Len; 1343 if (UTEXT_USES_U16(fInputText)) { 1344 input16Len = (int32_t)fInputLength; 1345 } else { 1346 UErrorCode lengthStatus = U_ZERO_ERROR; 1347 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error 1348 } 1349 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); 1350 1351 status = U_ZERO_ERROR; 1352 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning 1353 status = U_ZERO_ERROR; 1354 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); 1355 1356 uprv_free(inputChars); 1357 } 1358 return dest; 1359 } else { 1360 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1361 } 1362 } 1363 1364 1365 static UBool compat_SyncMutableUTextContents(UText *ut); 1366 static UBool compat_SyncMutableUTextContents(UText *ut) { 1367 UBool retVal = FALSE; 1368 1369 // In the following test, we're really only interested in whether the UText should switch 1370 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents 1371 // will still point to the correct data. 1372 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { 1373 UnicodeString *us=(UnicodeString *)ut->context; 1374 1375 // Update to the latest length. 1376 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). 1377 int32_t newLength = us->length(); 1378 1379 // Update the chunk description. 1380 // The buffer may have switched between stack- and heap-based. 1381 ut->chunkContents = us->getBuffer(); 1382 ut->chunkLength = newLength; 1383 ut->chunkNativeLimit = newLength; 1384 ut->nativeIndexingLimit = newLength; 1385 retVal = TRUE; 1386 } 1387 1388 return retVal; 1389 } 1390 1391 //-------------------------------------------------------------------------------- 1392 // 1393 // lookingAt() 1394 // 1395 //-------------------------------------------------------------------------------- 1396 UBool RegexMatcher::lookingAt(UErrorCode &status) { 1397 if (U_FAILURE(status)) { 1398 return FALSE; 1399 } 1400 if (U_FAILURE(fDeferredStatus)) { 1401 status = fDeferredStatus; 1402 return FALSE; 1403 } 1404 1405 if (fInputUniStrMaybeMutable) { 1406 if (compat_SyncMutableUTextContents(fInputText)) { 1407 fInputLength = utext_nativeLength(fInputText); 1408 reset(); 1409 } 1410 } 1411 else { 1412 resetPreserveRegion(); 1413 } 1414 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1415 MatchChunkAt((int32_t)fActiveStart, FALSE, status); 1416 } else { 1417 MatchAt(fActiveStart, FALSE, status); 1418 } 1419 return fMatch; 1420 } 1421 1422 1423 UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) { 1424 if (U_FAILURE(status)) { 1425 return FALSE; 1426 } 1427 if (U_FAILURE(fDeferredStatus)) { 1428 status = fDeferredStatus; 1429 return FALSE; 1430 } 1431 reset(); 1432 1433 if (start < 0) { 1434 status = U_INDEX_OUTOFBOUNDS_ERROR; 1435 return FALSE; 1436 } 1437 1438 if (fInputUniStrMaybeMutable) { 1439 if (compat_SyncMutableUTextContents(fInputText)) { 1440 fInputLength = utext_nativeLength(fInputText); 1441 reset(); 1442 } 1443 } 1444 1445 int64_t nativeStart; 1446 UBool couldFindStart = TRUE; 1447 if (UTEXT_USES_U16(fInputText)) { 1448 nativeStart = start; 1449 } else { 1450 UTEXT_SETNATIVEINDEX(fInputText, 0); 1451 int32_t i = 0; 1452 while (i < start) { 1453 UChar32 c = UTEXT_NEXT32(fInputText); 1454 if (c != U_SENTINEL) { 1455 i += U16_LENGTH(c); 1456 } else { 1457 couldFindStart = FALSE; 1458 break; 1459 } 1460 } 1461 nativeStart = UTEXT_GETNATIVEINDEX(fInputText); 1462 } 1463 if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1464 status = U_INDEX_OUTOFBOUNDS_ERROR; 1465 return FALSE; 1466 } 1467 1468 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1469 MatchChunkAt((int32_t)nativeStart, FALSE, status); 1470 } else { 1471 MatchAt(nativeStart, FALSE, status); 1472 } 1473 return fMatch; 1474 } 1475 1476 1477 1478 //-------------------------------------------------------------------------------- 1479 // 1480 // matches() 1481 // 1482 //-------------------------------------------------------------------------------- 1483 UBool RegexMatcher::matches(UErrorCode &status) { 1484 if (U_FAILURE(status)) { 1485 return FALSE; 1486 } 1487 if (U_FAILURE(fDeferredStatus)) { 1488 status = fDeferredStatus; 1489 return FALSE; 1490 } 1491 1492 if (fInputUniStrMaybeMutable) { 1493 if (compat_SyncMutableUTextContents(fInputText)) { 1494 fInputLength = utext_nativeLength(fInputText); 1495 reset(); 1496 } 1497 } 1498 else { 1499 resetPreserveRegion(); 1500 } 1501 1502 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1503 MatchChunkAt((int32_t)fActiveStart, TRUE, status); 1504 } else { 1505 MatchAt(fActiveStart, TRUE, status); 1506 } 1507 return fMatch; 1508 } 1509 1510 1511 UBool RegexMatcher::matches(int32_t start, UErrorCode &status) { 1512 if (U_FAILURE(status)) { 1513 return FALSE; 1514 } 1515 if (U_FAILURE(fDeferredStatus)) { 1516 status = fDeferredStatus; 1517 return FALSE; 1518 } 1519 reset(); 1520 1521 if (start < 0) { 1522 status = U_INDEX_OUTOFBOUNDS_ERROR; 1523 return FALSE; 1524 } 1525 1526 if (fInputUniStrMaybeMutable) { 1527 if (compat_SyncMutableUTextContents(fInputText)) { 1528 fInputLength = utext_nativeLength(fInputText); 1529 reset(); 1530 } 1531 } 1532 1533 int64_t nativeStart; 1534 UBool couldFindStart = TRUE; 1535 if (UTEXT_USES_U16(fInputText)) { 1536 nativeStart = start; 1537 } else { 1538 UTEXT_SETNATIVEINDEX(fInputText, 0); 1539 int32_t i = 0; 1540 while (i < start) { 1541 UChar32 c = UTEXT_NEXT32(fInputText); 1542 if (c != U_SENTINEL) { 1543 i += U16_LENGTH(c); 1544 } else { 1545 couldFindStart = FALSE; 1546 break; 1547 } 1548 } 1549 nativeStart = UTEXT_GETNATIVEINDEX(fInputText); 1550 } 1551 if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1552 status = U_INDEX_OUTOFBOUNDS_ERROR; 1553 return FALSE; 1554 } 1555 1556 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1557 MatchChunkAt((int32_t)nativeStart, TRUE, status); 1558 } else { 1559 MatchAt(nativeStart, TRUE, status); 1560 } 1561 return fMatch; 1562 } 1563 1564 1565 1566 //-------------------------------------------------------------------------------- 1567 // 1568 // pattern 1569 // 1570 //-------------------------------------------------------------------------------- 1571 const RegexPattern &RegexMatcher::pattern() const { 1572 return *fPattern; 1573 } 1574 1575 1576 1577 //-------------------------------------------------------------------------------- 1578 // 1579 // region 1580 // 1581 //-------------------------------------------------------------------------------- 1582 RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &status) { 1583 if (U_FAILURE(status)) { 1584 return *this; 1585 } 1586 1587 if (start>limit || start<0 || limit<0) { 1588 status = U_ILLEGAL_ARGUMENT_ERROR; 1589 } 1590 1591 int64_t nativeStart; 1592 int32_t i = 0; 1593 UBool couldFindStart = TRUE; 1594 if (UTEXT_USES_U16(fInputText)) { 1595 nativeStart = start; 1596 couldFindStart = (nativeStart <= fInputLength); 1597 } else { 1598 UTEXT_SETNATIVEINDEX(fInputText, 0); 1599 while (i < start) { 1600 UChar32 c = UTEXT_NEXT32(fInputText); 1601 if (c != U_SENTINEL) { 1602 i += U16_LENGTH(c); 1603 } else { 1604 couldFindStart = FALSE; 1605 break; 1606 } 1607 } 1608 nativeStart = UTEXT_GETNATIVEINDEX(fInputText); 1609 } 1610 int64_t nativeLimit = nativeStart; 1611 1612 if (!couldFindStart) { 1613 status = U_ILLEGAL_ARGUMENT_ERROR; 1614 } else { 1615 UBool couldFindLimit = TRUE; 1616 if (UTEXT_USES_U16(fInputText)) { 1617 nativeLimit = limit; 1618 couldFindLimit = (nativeLimit <= fInputLength); 1619 } else { 1620 while (i < limit) { 1621 UChar32 c = UTEXT_NEXT32(fInputText); 1622 if (c != U_SENTINEL) { 1623 i += U16_LENGTH(c); 1624 } else { 1625 couldFindLimit = FALSE; 1626 break; 1627 } 1628 } 1629 nativeLimit = UTEXT_GETNATIVEINDEX(fInputText); 1630 } 1631 if (!couldFindLimit) { 1632 status = U_ILLEGAL_ARGUMENT_ERROR; 1633 } 1634 } 1635 1636 this->reset(); 1637 fRegionStart = nativeStart; 1638 fRegionLimit = nativeLimit; 1639 fActiveStart = nativeStart; 1640 fActiveLimit = nativeLimit; 1641 if (!fTransparentBounds) { 1642 fLookStart = nativeStart; 1643 fLookLimit = nativeLimit; 1644 } 1645 if (fAnchoringBounds) { 1646 fAnchorStart = nativeStart; 1647 fAnchorLimit = nativeLimit; 1648 } 1649 return *this; 1650 } 1651 1652 1653 1654 //-------------------------------------------------------------------------------- 1655 // 1656 // regionEnd 1657 // 1658 //-------------------------------------------------------------------------------- 1659 int32_t RegexMatcher::regionEnd() const { 1660 if (UTEXT_USES_U16(fInputText)) { 1661 return (int32_t)fRegionLimit; 1662 } else { 1663 // !!!: Would like a better way to do this! 1664 UErrorCode status = U_ZERO_ERROR; 1665 return utext_extract(fInputText, 0, fRegionLimit, NULL, 0, &status); 1666 } 1667 } 1668 1669 1670 //-------------------------------------------------------------------------------- 1671 // 1672 // regionStart 1673 // 1674 //-------------------------------------------------------------------------------- 1675 int32_t RegexMatcher::regionStart() const { 1676 if (UTEXT_USES_U16(fInputText)) { 1677 return (int32_t)fRegionStart; 1678 } else { 1679 // !!!: Would like a better way to do this! 1680 UErrorCode status = U_ZERO_ERROR; 1681 return utext_extract(fInputText, 0, fRegionStart, NULL, 0, &status); 1682 } 1683 } 1684 1685 1686 //-------------------------------------------------------------------------------- 1687 // 1688 // replaceAll 1689 // 1690 //-------------------------------------------------------------------------------- 1691 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { 1692 UText replacementText = UTEXT_INITIALIZER; 1693 UText resultText = UTEXT_INITIALIZER; 1694 UnicodeString resultString; 1695 1696 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1697 utext_openUnicodeString(&resultText, &resultString, &status); 1698 1699 replaceAll(&replacementText, &resultText, status); 1700 1701 utext_close(&resultText); 1702 utext_close(&replacementText); 1703 1704 return resultString; 1705 } 1706 1707 1708 // 1709 // replaceAll, UText mode 1710 // 1711 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { 1712 if (U_FAILURE(status)) { 1713 return getInput(dest); 1714 } 1715 if (U_FAILURE(fDeferredStatus)) { 1716 status = fDeferredStatus; 1717 return getInput(dest); 1718 } 1719 1720 if (dest == NULL) { 1721 UnicodeString emptyString; 1722 UText empty = UTEXT_INITIALIZER; 1723 1724 utext_openUnicodeString(&empty, &emptyString, &status); 1725 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1726 utext_close(&empty); 1727 } 1728 1729 if (U_SUCCESS(status)) { 1730 reset(); 1731 while (find()) { 1732 appendReplacement(dest, replacement, status); 1733 if (U_FAILURE(status)) { 1734 break; 1735 } 1736 } 1737 appendTail(dest); 1738 } 1739 1740 return dest; 1741 } 1742 1743 1744 //-------------------------------------------------------------------------------- 1745 // 1746 // replaceFirst 1747 // 1748 //-------------------------------------------------------------------------------- 1749 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { 1750 UText replacementText = UTEXT_INITIALIZER; 1751 UText resultText = UTEXT_INITIALIZER; 1752 UnicodeString resultString; 1753 1754 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1755 utext_openUnicodeString(&resultText, &resultString, &status); 1756 1757 replaceFirst(&replacementText, &resultText, status); 1758 1759 utext_close(&resultText); 1760 utext_close(&replacementText); 1761 1762 return resultString; 1763 } 1764 1765 // 1766 // replaceFirst, UText mode 1767 // 1768 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { 1769 if (U_FAILURE(status)) { 1770 return getInput(dest); 1771 } 1772 if (U_FAILURE(fDeferredStatus)) { 1773 status = fDeferredStatus; 1774 return getInput(dest); 1775 } 1776 1777 reset(); 1778 if (!find()) { 1779 return getInput(dest); 1780 } 1781 1782 if (dest == NULL) { 1783 UnicodeString emptyString; 1784 UText empty = UTEXT_INITIALIZER; 1785 1786 utext_openUnicodeString(&empty, &emptyString, &status); 1787 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1788 utext_close(&empty); 1789 } 1790 1791 appendReplacement(dest, replacement, status); 1792 appendTail(dest); 1793 1794 return dest; 1795 } 1796 1797 1798 //-------------------------------------------------------------------------------- 1799 // 1800 // requireEnd 1801 // 1802 //-------------------------------------------------------------------------------- 1803 UBool RegexMatcher::requireEnd() const { 1804 return fRequireEnd; 1805 } 1806 1807 1808 //-------------------------------------------------------------------------------- 1809 // 1810 // reset 1811 // 1812 //-------------------------------------------------------------------------------- 1813 RegexMatcher &RegexMatcher::reset() { 1814 fRegionStart = 0; 1815 fRegionLimit = fInputLength; 1816 fActiveStart = 0; 1817 fActiveLimit = fInputLength; 1818 fAnchorStart = 0; 1819 fAnchorLimit = fInputLength; 1820 fLookStart = 0; 1821 fLookLimit = fInputLength; 1822 resetPreserveRegion(); 1823 return *this; 1824 } 1825 1826 1827 1828 void RegexMatcher::resetPreserveRegion() { 1829 fMatchStart = 0; 1830 fMatchEnd = 0; 1831 fLastMatchEnd = -1; 1832 fAppendPosition = 0; 1833 fMatch = FALSE; 1834 fHitEnd = FALSE; 1835 fRequireEnd = FALSE; 1836 fTime = 0; 1837 fTickCounter = TIMER_INITIAL_VALUE; 1838 //resetStack(); // more expensive than it looks... 1839 } 1840 1841 1842 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1843 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); 1844 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1845 fInputLength = utext_nativeLength(fInputText); 1846 1847 reset(); 1848 delete fInput; 1849 fInput = NULL; 1850 1851 // Do the following for any UnicodeString. 1852 // This is for compatibility for those clients who modify the input string "live" during regex operations. 1853 fInputUniStrMaybeMutable = TRUE; 1854 1855 if (fWordBreakItr != NULL) { 1856 #if UCONFIG_NO_BREAK_ITERATION==0 1857 UErrorCode status = U_ZERO_ERROR; 1858 fWordBreakItr->setText(fInputText, status); 1859 #endif 1860 } 1861 return *this; 1862 } 1863 1864 1865 RegexMatcher &RegexMatcher::reset(UText *input) { 1866 if (fInputText != input) { 1867 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); 1868 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1869 fInputLength = utext_nativeLength(fInputText); 1870 1871 delete fInput; 1872 fInput = NULL; 1873 1874 if (fWordBreakItr != NULL) { 1875 #if UCONFIG_NO_BREAK_ITERATION==0 1876 UErrorCode status = U_ZERO_ERROR; 1877 fWordBreakItr->setText(input, status); 1878 #endif 1879 } 1880 } 1881 reset(); 1882 fInputUniStrMaybeMutable = FALSE; 1883 1884 return *this; 1885 } 1886 1887 /*RegexMatcher &RegexMatcher::reset(const UChar *) { 1888 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 1889 return *this; 1890 }*/ 1891 1892 1893 RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) { 1894 if (U_FAILURE(status)) { 1895 return *this; 1896 } 1897 reset(); // Reset also resets the region to be the entire string. 1898 1899 if (position < 0) { 1900 status = U_INDEX_OUTOFBOUNDS_ERROR; 1901 return *this; 1902 } 1903 1904 int64_t nativePos; 1905 UBool couldFindStart = TRUE; 1906 if (UTEXT_USES_U16(fInputText)) { 1907 nativePos = position; 1908 } else { 1909 UTEXT_SETNATIVEINDEX(fInputText, 0); 1910 int32_t i = 0; 1911 while (i < position) { 1912 UChar32 c = UTEXT_NEXT32(fInputText); 1913 if (c != U_SENTINEL) { 1914 i += U16_LENGTH(c); 1915 } else { 1916 couldFindStart = FALSE; 1917 break; 1918 } 1919 } 1920 nativePos = UTEXT_GETNATIVEINDEX(fInputText); 1921 } 1922 if (!couldFindStart || nativePos < fActiveStart || nativePos >= fActiveLimit) { 1923 status = U_INDEX_OUTOFBOUNDS_ERROR; 1924 return *this; 1925 } 1926 fMatchEnd = nativePos; 1927 return *this; 1928 } 1929 1930 // BEGIN android-added 1931 // Removed this function after Android upgrad to ICU4.6. 1932 //-------------------------------------------------------------------------------- 1933 // 1934 // refresh 1935 // 1936 //-------------------------------------------------------------------------------- 1937 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { 1938 if (U_FAILURE(status)) { 1939 return *this; 1940 } 1941 if (input == NULL) { 1942 status = U_ILLEGAL_ARGUMENT_ERROR; 1943 return *this; 1944 } 1945 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { 1946 status = U_ILLEGAL_ARGUMENT_ERROR; 1947 return *this; 1948 } 1949 int64_t pos = utext_getNativeIndex(fInputText); 1950 // Shallow read-only clone of the new UText into the existing input UText 1951 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status); 1952 if (U_FAILURE(status)) { 1953 return *this; 1954 } 1955 utext_setNativeIndex(fInputText, pos); 1956 1957 if (fAltInputText != NULL) { 1958 pos = utext_getNativeIndex(fAltInputText); 1959 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status); 1960 if (U_FAILURE(status)) { 1961 return *this; 1962 } 1963 utext_setNativeIndex(fAltInputText, pos); 1964 } 1965 return *this; 1966 } 1967 // END android-added 1968 1969 1970 1971 //-------------------------------------------------------------------------------- 1972 // 1973 // setTrace 1974 // 1975 //-------------------------------------------------------------------------------- 1976 void RegexMatcher::setTrace(UBool state) { 1977 fTraceDebug = state; 1978 } 1979 1980 1981 1982 //--------------------------------------------------------------------- 1983 // 1984 // split 1985 // 1986 //--------------------------------------------------------------------- 1987 int32_t RegexMatcher::split(const UnicodeString &input, 1988 UnicodeString dest[], 1989 int32_t destCapacity, 1990 UErrorCode &status) 1991 { 1992 UText inputText = UTEXT_INITIALIZER; 1993 utext_openConstUnicodeString(&inputText, &input, &status); 1994 1995 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); 1996 int32_t i; 1997 for (i = 0; i < destCapacity; i++) { 1998 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); 1999 } 2000 2001 int32_t fieldCount = split(&inputText, destText, destCapacity, status); 2002 2003 for (i = 0; i < destCapacity; i++) { 2004 utext_close(destText[i]); 2005 } 2006 2007 uprv_free(destText); 2008 utext_close(&inputText); 2009 return fieldCount; 2010 } 2011 2012 // 2013 // split, UText mode 2014 // 2015 int32_t RegexMatcher::split(UText *input, 2016 UText *dest[], 2017 int32_t destCapacity, 2018 UErrorCode &status) 2019 { 2020 // 2021 // Check arguements for validity 2022 // 2023 if (U_FAILURE(status)) { 2024 return 0; 2025 }; 2026 2027 if (destCapacity < 1) { 2028 status = U_ILLEGAL_ARGUMENT_ERROR; 2029 return 0; 2030 } 2031 2032 // 2033 // Reset for the input text 2034 // 2035 reset(input); 2036 int64_t nextOutputStringStart = 0; 2037 if (fActiveLimit == 0) { 2038 return 0; 2039 } 2040 2041 // 2042 // Loop through the input text, searching for the delimiter pattern 2043 // 2044 int32_t i; 2045 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 2046 for (i=0; ; i++) { 2047 if (i>=destCapacity-1) { 2048 // There is one or zero output string left. 2049 // Fill the last output string with whatever is left from the input, then exit the loop. 2050 // ( i will be == destCapacity if we filled the output array while processing 2051 // capture groups of the delimiter expression, in which case we will discard the 2052 // last capture group saved in favor of the unprocessed remainder of the 2053 // input string.) 2054 i = destCapacity-1; 2055 if (fActiveLimit > nextOutputStringStart) { 2056 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2057 if (dest[i]) { 2058 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2059 } else { 2060 UText remainingText = UTEXT_INITIALIZER; 2061 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fActiveLimit-nextOutputStringStart, &status); 2062 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2063 utext_close(&remainingText); 2064 } 2065 } else { 2066 UErrorCode lengthStatus = U_ZERO_ERROR; 2067 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2068 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2069 2070 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2071 if (dest[i]) { 2072 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2073 } else { 2074 UText remainingText = UTEXT_INITIALIZER; 2075 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2076 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2077 utext_close(&remainingText); 2078 } 2079 2080 uprv_free(remainingChars); 2081 } 2082 } 2083 break; 2084 } 2085 if (find()) { 2086 // We found another delimiter. Move everything from where we started looking 2087 // up until the start of the delimiter into the next output string. 2088 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2089 if (dest[i]) { 2090 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, (int32_t)(fMatchStart-nextOutputStringStart), &status); 2091 } else { 2092 UText remainingText = UTEXT_INITIALIZER; 2093 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fMatchStart-nextOutputStringStart, &status); 2094 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2095 utext_close(&remainingText); 2096 } 2097 } else { 2098 UErrorCode lengthStatus = U_ZERO_ERROR; 2099 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); 2100 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2101 2102 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); 2103 if (dest[i]) { 2104 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2105 } else { 2106 UText remainingText = UTEXT_INITIALIZER; 2107 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2108 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2109 utext_close(&remainingText); 2110 } 2111 2112 uprv_free(remainingChars); 2113 } 2114 nextOutputStringStart = fMatchEnd; 2115 2116 // If the delimiter pattern has capturing parentheses, the captured 2117 // text goes out into the next n destination strings. 2118 int32_t groupNum; 2119 UBool lastGroupWasNullUText = FALSE; 2120 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 2121 if (i==destCapacity-1) { 2122 break; 2123 } 2124 i++; 2125 lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE); 2126 dest[i] = group(groupNum, dest[i], status); 2127 } 2128 2129 if (nextOutputStringStart == fActiveLimit) { 2130 // The delimiter was at the end of the string. We're done. 2131 break; 2132 } else if (i == destCapacity-1) { 2133 // We're out of capture groups, and the rest of the string is more important 2134 if (lastGroupWasNullUText) { 2135 utext_close(dest[i]); 2136 dest[i] = NULL; 2137 } 2138 } 2139 2140 } 2141 else 2142 { 2143 // We ran off the end of the input while looking for the next delimiter. 2144 // All the remaining text goes into the current output string. 2145 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2146 if (dest[i]) { 2147 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2148 } else { 2149 UText remainingText = UTEXT_INITIALIZER; 2150 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fActiveLimit-nextOutputStringStart, &status); 2151 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2152 utext_close(&remainingText); 2153 } 2154 } else { 2155 UErrorCode lengthStatus = U_ZERO_ERROR; 2156 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2157 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2158 2159 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2160 if (dest[i]) { 2161 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2162 } else { 2163 UText remainingText = UTEXT_INITIALIZER; 2164 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2165 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2166 utext_close(&remainingText); 2167 } 2168 2169 uprv_free(remainingChars); 2170 } 2171 break; 2172 } 2173 } 2174 return i+1; 2175 } 2176 2177 2178 //-------------------------------------------------------------------------------- 2179 // 2180 // start 2181 // 2182 //-------------------------------------------------------------------------------- 2183 int32_t RegexMatcher::start(UErrorCode &status) const { 2184 return start(0, status); 2185 } 2186 2187 2188 2189 2190 //-------------------------------------------------------------------------------- 2191 // 2192 // start(int32_t group, UErrorCode &status) 2193 // 2194 //-------------------------------------------------------------------------------- 2195 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { 2196 if (U_FAILURE(status)) { 2197 return -1; 2198 } 2199 if (U_FAILURE(fDeferredStatus)) { 2200 status = fDeferredStatus; 2201 return -1; 2202 } 2203 if (fMatch == FALSE) { 2204 status = U_REGEX_INVALID_STATE; 2205 return -1; 2206 } 2207 if (group < 0 || group > fPattern->fGroupMap->size()) { 2208 status = U_INDEX_OUTOFBOUNDS_ERROR; 2209 return -1; 2210 } 2211 int64_t s; 2212 if (group == 0) { 2213 s = fMatchStart; 2214 } else { 2215 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 2216 U_ASSERT(groupOffset < fPattern->fFrameSize); 2217 U_ASSERT(groupOffset >= 0); 2218 s = fFrame->fExtra[groupOffset]; 2219 } 2220 2221 if (s == -1 || UTEXT_USES_U16(fInputText)) { 2222 return (int32_t)s; 2223 } else { 2224 // !!!: Would like a better way to do this! 2225 UErrorCode status = U_ZERO_ERROR; 2226 return utext_extract(fInputText, 0, s, NULL, 0, &status); 2227 } 2228 } 2229 2230 2231 2232 //-------------------------------------------------------------------------------- 2233 // 2234 // useAnchoringBounds 2235 // 2236 //-------------------------------------------------------------------------------- 2237 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { 2238 fAnchoringBounds = b; 2239 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); 2240 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); 2241 return *this; 2242 } 2243 2244 2245 //-------------------------------------------------------------------------------- 2246 // 2247 // useTransparentBounds 2248 // 2249 //-------------------------------------------------------------------------------- 2250 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { 2251 fTransparentBounds = b; 2252 fLookStart = (fTransparentBounds ? 0 : fRegionStart); 2253 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); 2254 return *this; 2255 } 2256 2257 //-------------------------------------------------------------------------------- 2258 // 2259 // setTimeLimit 2260 // 2261 //-------------------------------------------------------------------------------- 2262 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { 2263 if (U_FAILURE(status)) { 2264 return; 2265 } 2266 if (U_FAILURE(fDeferredStatus)) { 2267 status = fDeferredStatus; 2268 return; 2269 } 2270 if (limit < 0) { 2271 status = U_ILLEGAL_ARGUMENT_ERROR; 2272 return; 2273 } 2274 fTimeLimit = limit; 2275 } 2276 2277 2278 //-------------------------------------------------------------------------------- 2279 // 2280 // getTimeLimit 2281 // 2282 //-------------------------------------------------------------------------------- 2283 int32_t RegexMatcher::getTimeLimit() const { 2284 return fTimeLimit; 2285 } 2286 2287 2288 //-------------------------------------------------------------------------------- 2289 // 2290 // setStackLimit 2291 // 2292 //-------------------------------------------------------------------------------- 2293 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { 2294 if (U_FAILURE(status)) { 2295 return; 2296 } 2297 if (U_FAILURE(fDeferredStatus)) { 2298 status = fDeferredStatus; 2299 return; 2300 } 2301 if (limit < 0) { 2302 status = U_ILLEGAL_ARGUMENT_ERROR; 2303 return; 2304 } 2305 2306 // Reset the matcher. This is needed here in case there is a current match 2307 // whose final stack frame (containing the match results, pointed to by fFrame) 2308 // would be lost by resizing to a smaller stack size. 2309 reset(); 2310 2311 if (limit == 0) { 2312 // Unlimited stack expansion 2313 fStack->setMaxCapacity(0); 2314 } else { 2315 // Change the units of the limit from bytes to ints, and bump the size up 2316 // to be big enough to hold at least one stack frame for the pattern, 2317 // if it isn't there already. 2318 int32_t adjustedLimit = limit / sizeof(int32_t); 2319 if (adjustedLimit < fPattern->fFrameSize) { 2320 adjustedLimit = fPattern->fFrameSize; 2321 } 2322 fStack->setMaxCapacity(adjustedLimit); 2323 } 2324 fStackLimit = limit; 2325 } 2326 2327 2328 //-------------------------------------------------------------------------------- 2329 // 2330 // getStackLimit 2331 // 2332 //-------------------------------------------------------------------------------- 2333 int32_t RegexMatcher::getStackLimit() const { 2334 return fStackLimit; 2335 } 2336 2337 2338 //-------------------------------------------------------------------------------- 2339 // 2340 // setMatchCallback 2341 // 2342 //-------------------------------------------------------------------------------- 2343 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, 2344 const void *context, 2345 UErrorCode &status) { 2346 if (U_FAILURE(status)) { 2347 return; 2348 } 2349 fCallbackFn = callback; 2350 fCallbackContext = context; 2351 } 2352 2353 2354 //-------------------------------------------------------------------------------- 2355 // 2356 // getMatchCallback 2357 // 2358 //-------------------------------------------------------------------------------- 2359 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, 2360 const void *&context, 2361 UErrorCode &status) { 2362 if (U_FAILURE(status)) { 2363 return; 2364 } 2365 callback = fCallbackFn; 2366 context = fCallbackContext; 2367 } 2368 2369 2370 //================================================================================ 2371 // 2372 // Code following this point in this file is the internal 2373 // Match Engine Implementation. 2374 // 2375 //================================================================================ 2376 2377 2378 //-------------------------------------------------------------------------------- 2379 // 2380 // resetStack 2381 // Discard any previous contents of the state save stack, and initialize a 2382 // new stack frame to all -1. The -1s are needed for capture group limits, 2383 // where they indicate that a group has not yet matched anything. 2384 //-------------------------------------------------------------------------------- 2385 REStackFrame *RegexMatcher::resetStack() { 2386 // Discard any previous contents of the state save stack, and initialize a 2387 // new stack frame with all -1 data. The -1s are needed for capture group limits, 2388 // where they indicate that a group has not yet matched anything. 2389 fStack->removeAllElements(); 2390 2391 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); 2392 int32_t i; 2393 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 2394 iFrame->fExtra[i] = -1; 2395 } 2396 return iFrame; 2397 } 2398 2399 2400 2401 //-------------------------------------------------------------------------------- 2402 // 2403 // isWordBoundary 2404 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 2405 // For us, 2406 // If the current char is a combining mark, 2407 // \b is FALSE. 2408 // Else Scan backwards to the first non-combining char. 2409 // We are at a boundary if the this char and the original chars are 2410 // opposite in membership in \w set 2411 // 2412 // parameters: pos - the current position in the input buffer 2413 // 2414 // TODO: double-check edge cases at region boundaries. 2415 // 2416 //-------------------------------------------------------------------------------- 2417 UBool RegexMatcher::isWordBoundary(int64_t pos) { 2418 UBool isBoundary = FALSE; 2419 UBool cIsWord = FALSE; 2420 2421 if (pos >= fLookLimit) { 2422 fHitEnd = TRUE; 2423 } else { 2424 // Determine whether char c at current position is a member of the word set of chars. 2425 // If we're off the end of the string, behave as though we're not at a word char. 2426 UTEXT_SETNATIVEINDEX(fInputText, pos); 2427 UChar32 c = UTEXT_CURRENT32(fInputText); 2428 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2429 // Current char is a combining one. Not a boundary. 2430 return FALSE; 2431 } 2432 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2433 } 2434 2435 // Back up until we come to a non-combining char, determine whether 2436 // that char is a word char. 2437 UBool prevCIsWord = FALSE; 2438 for (;;) { 2439 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { 2440 break; 2441 } 2442 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); 2443 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2444 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2445 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2446 break; 2447 } 2448 } 2449 isBoundary = cIsWord ^ prevCIsWord; 2450 return isBoundary; 2451 } 2452 2453 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { 2454 UBool isBoundary = FALSE; 2455 UBool cIsWord = FALSE; 2456 2457 const UChar *inputBuf = fInputText->chunkContents; 2458 2459 if (pos >= fLookLimit) { 2460 fHitEnd = TRUE; 2461 } else { 2462 // Determine whether char c at current position is a member of the word set of chars. 2463 // If we're off the end of the string, behave as though we're not at a word char. 2464 UChar32 c; 2465 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); 2466 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2467 // Current char is a combining one. Not a boundary. 2468 return FALSE; 2469 } 2470 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2471 } 2472 2473 // Back up until we come to a non-combining char, determine whether 2474 // that char is a word char. 2475 UBool prevCIsWord = FALSE; 2476 for (;;) { 2477 if (pos <= fLookStart) { 2478 break; 2479 } 2480 UChar32 prevChar; 2481 U16_PREV(inputBuf, fLookStart, pos, prevChar); 2482 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2483 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2484 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2485 break; 2486 } 2487 } 2488 isBoundary = cIsWord ^ prevCIsWord; 2489 return isBoundary; 2490 } 2491 2492 //-------------------------------------------------------------------------------- 2493 // 2494 // isUWordBoundary 2495 // 2496 // Test for a word boundary using RBBI word break. 2497 // 2498 // parameters: pos - the current position in the input buffer 2499 // 2500 //-------------------------------------------------------------------------------- 2501 UBool RegexMatcher::isUWordBoundary(int64_t pos) { 2502 UBool returnVal = FALSE; 2503 #if UCONFIG_NO_BREAK_ITERATION==0 2504 2505 // If we haven't yet created a break iterator for this matcher, do it now. 2506 if (fWordBreakItr == NULL) { 2507 fWordBreakItr = 2508 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); 2509 if (U_FAILURE(fDeferredStatus)) { 2510 return FALSE; 2511 } 2512 fWordBreakItr->setText(fInputText, fDeferredStatus); 2513 } 2514 2515 if (pos >= fLookLimit) { 2516 fHitEnd = TRUE; 2517 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" 2518 // words are not boundaries. All non-word chars stand by themselves, 2519 // with word boundaries on both sides. 2520 } else { 2521 if (!UTEXT_USES_U16(fInputText)) { 2522 // !!!: Would like a better way to do this! 2523 UErrorCode status = U_ZERO_ERROR; 2524 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); 2525 } 2526 returnVal = fWordBreakItr->isBoundary((int32_t)pos); 2527 } 2528 #endif 2529 return returnVal; 2530 } 2531 2532 //-------------------------------------------------------------------------------- 2533 // 2534 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state 2535 // saves. Increment the "time" counter, and call the 2536 // user callback function if there is one installed. 2537 // 2538 // If the match operation needs to be aborted, either for a time-out 2539 // or because the user callback asked for it, just set an error status. 2540 // The engine will pick that up and stop in its outer loop. 2541 // 2542 //-------------------------------------------------------------------------------- 2543 void RegexMatcher::IncrementTime(UErrorCode &status) { 2544 fTickCounter = TIMER_INITIAL_VALUE; 2545 fTime++; 2546 if (fCallbackFn != NULL) { 2547 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { 2548 status = U_REGEX_STOPPED_BY_CALLER; 2549 return; 2550 } 2551 } 2552 if (fTimeLimit > 0 && fTime >= fTimeLimit) { 2553 status = U_REGEX_TIME_OUT; 2554 } 2555 } 2556 2557 //-------------------------------------------------------------------------------- 2558 // 2559 // StateSave 2560 // Make a new stack frame, initialized as a copy of the current stack frame. 2561 // Set the pattern index in the original stack frame from the operand value 2562 // in the opcode. Execution of the engine continues with the state in 2563 // the newly created stack frame 2564 // 2565 // Note that reserveBlock() may grow the stack, resulting in the 2566 // whole thing being relocated in memory. 2567 // 2568 // Parameters: 2569 // fp The top frame pointer when called. At return, a new 2570 // fame will be present 2571 // savePatIdx An index into the compiled pattern. Goes into the original 2572 // (not new) frame. If execution ever back-tracks out of the 2573 // new frame, this will be where we continue from in the pattern. 2574 // Return 2575 // The new frame pointer. 2576 // 2577 //-------------------------------------------------------------------------------- 2578 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { 2579 // push storage for a new frame. 2580 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); 2581 if (newFP == NULL) { 2582 // Failure on attempted stack expansion. 2583 // Stack function set some other error code, change it to a more 2584 // specific one for regular expressions. 2585 status = U_REGEX_STACK_OVERFLOW; 2586 // We need to return a writable stack frame, so just return the 2587 // previous frame. The match operation will stop quickly 2588 // because of the error status, after which the frame will never 2589 // be looked at again. 2590 return fp; 2591 } 2592 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. 2593 2594 // New stack frame = copy of old top frame. 2595 int64_t *source = (int64_t *)fp; 2596 int64_t *dest = newFP; 2597 for (;;) { 2598 *dest++ = *source++; 2599 if (source == newFP) { 2600 break; 2601 } 2602 } 2603 2604 fTickCounter--; 2605 if (fTickCounter <= 0) { 2606 IncrementTime(status); // Re-initializes fTickCounter 2607 } 2608 fp->fPatIdx = savePatIdx; 2609 return (REStackFrame *)newFP; 2610 } 2611 2612 2613 //-------------------------------------------------------------------------------- 2614 // 2615 // MatchAt This is the actual matching engine. 2616 // 2617 // startIdx: begin matching a this index. 2618 // toEnd: if true, match must extend to end of the input region 2619 // 2620 //-------------------------------------------------------------------------------- 2621 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { 2622 UBool isMatch = FALSE; // True if the we have a match. 2623 2624 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards 2625 2626 int32_t op; // Operation from the compiled pattern, split into 2627 int32_t opType; // the opcode 2628 int32_t opValue; // and the operand value. 2629 2630 #ifdef REGEX_RUN_DEBUG 2631 if (fTraceDebug) 2632 { 2633 printf("MatchAt(startIdx=%ld)\n", startIdx); 2634 printf("Original Pattern: "); 2635 UChar32 c = utext_next32From(fPattern->fPattern, 0); 2636 while (c != U_SENTINEL) { 2637 if (c<32 || c>256) { 2638 c = '.'; 2639 } 2640 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 2641 2642 c = UTEXT_NEXT32(fPattern->fPattern); 2643 } 2644 printf("\n"); 2645 printf("Input String: "); 2646 c = utext_next32From(fInputText, 0); 2647 while (c != U_SENTINEL) { 2648 if (c<32 || c>256) { 2649 c = '.'; 2650 } 2651 printf("%c", c); 2652 2653 c = UTEXT_NEXT32(fInputText); 2654 } 2655 printf("\n"); 2656 printf("\n"); 2657 } 2658 #endif 2659 2660 if (U_FAILURE(status)) { 2661 return; 2662 } 2663 2664 // Cache frequently referenced items from the compiled pattern 2665 // 2666 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 2667 2668 const UChar *litText = fPattern->fLiteralText.getBuffer(); 2669 UVector *sets = fPattern->fSets; 2670 2671 fFrameSize = fPattern->fFrameSize; 2672 REStackFrame *fp = resetStack(); 2673 2674 fp->fPatIdx = 0; 2675 fp->fInputIdx = startIdx; 2676 2677 // Zero out the pattern's static data 2678 int32_t i; 2679 for (i = 0; i<fPattern->fDataSize; i++) { 2680 fData[i] = 0; 2681 } 2682 2683 // 2684 // Main loop for interpreting the compiled pattern. 2685 // One iteration of the loop per pattern operation performed. 2686 // 2687 for (;;) { 2688 #if 0 2689 if (_heapchk() != _HEAPOK) { 2690 fprintf(stderr, "Heap Trouble\n"); 2691 } 2692 #endif 2693 2694 op = (int32_t)pat[fp->fPatIdx]; 2695 opType = URX_TYPE(op); 2696 opValue = URX_VAL(op); 2697 #ifdef REGEX_RUN_DEBUG 2698 if (fTraceDebug) { 2699 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2700 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, 2701 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 2702 fPattern->dumpOp(fp->fPatIdx); 2703 } 2704 #endif 2705 fp->fPatIdx++; 2706 2707 switch (opType) { 2708 2709 2710 case URX_NOP: 2711 break; 2712 2713 2714 case URX_BACKTRACK: 2715 // Force a backtrack. In some circumstances, the pattern compiler 2716 // will notice that the pattern can't possibly match anything, and will 2717 // emit one of these at that point. 2718 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2719 break; 2720 2721 2722 case URX_ONECHAR: 2723 if (fp->fInputIdx < fActiveLimit) { 2724 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2725 UChar32 c = UTEXT_NEXT32(fInputText); 2726 if (c == opValue) { 2727 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2728 break; 2729 } 2730 } else { 2731 fHitEnd = TRUE; 2732 } 2733 2734 #ifdef REGEX_SMART_BACKTRACKING 2735 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 2736 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 2737 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 2738 UBool success = FALSE; 2739 UChar32 c = UTEXT_PREVIOUS32(fInputText); 2740 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { 2741 if (c == opValue) { 2742 success = TRUE; 2743 break; 2744 } else if (c == U_SENTINEL) { 2745 break; 2746 } 2747 c = UTEXT_PREVIOUS32(fInputText); 2748 } 2749 if (success) { 2750 fHitEnd = FALSE; 2751 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2752 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2753 if (fp->fInputIdx > backSearchIndex) { 2754 fp = StateSave(fp, fp->fPatIdx, status); 2755 } 2756 fp->fPatIdx++; // Skip the LOOP_C, we just did that 2757 break; 2758 } 2759 } 2760 } 2761 #endif 2762 2763 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2764 break; 2765 2766 2767 case URX_STRING: 2768 { 2769 // Test input against a literal string. 2770 // Strings require two slots in the compiled pattern, one for the 2771 // offset to the string text, and one for the length. 2772 int32_t stringStartIdx = opValue; 2773 int32_t stringLen; 2774 2775 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 2776 fp->fPatIdx++; 2777 opType = URX_TYPE(op); 2778 stringLen = URX_VAL(op); 2779 U_ASSERT(opType == URX_STRING_LEN); 2780 U_ASSERT(stringLen >= 2); 2781 2782 const UChar *patternChars = litText+stringStartIdx; 2783 const UChar *patternEnd = patternChars+stringLen; 2784 2785 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2786 UChar32 c; 2787 UBool success = TRUE; 2788 2789 while (patternChars < patternEnd && success) { 2790 c = UTEXT_NEXT32(fInputText); 2791 2792 if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) { 2793 if (U_IS_BMP(c)) { 2794 success = (*patternChars == c); 2795 patternChars += 1; 2796 } else if (patternChars+1 < patternEnd) { 2797 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 2798 patternChars += 2; 2799 } 2800 } else { 2801 success = FALSE; 2802 fHitEnd = TRUE; // TODO: See ticket 6074 2803 } 2804 } 2805 2806 if (success) { 2807 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2808 } else { 2809 #ifdef REGEX_SMART_BACKTRACKING 2810 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 2811 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 2812 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 2813 // Reset to last start point 2814 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2815 patternChars = litText+stringStartIdx; 2816 2817 // Search backwards for a possible start 2818 do { 2819 c = UTEXT_PREVIOUS32(fInputText); 2820 if (c == U_SENTINEL) { 2821 break; 2822 } else if ((U_IS_BMP(c) && *patternChars == c) || 2823 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 2824 success = TRUE; 2825 break; 2826 } 2827 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 2828 2829 // And try again 2830 if (success) { 2831 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2832 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2833 if (fp->fInputIdx > backSearchIndex) { 2834 fp = StateSave(fp, fp->fPatIdx, status); 2835 } 2836 fp->fPatIdx++; // Skip the LOOP_C, we just did that 2837 break; 2838 } 2839 } 2840 } 2841 #endif 2842 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2843 } 2844 } 2845 break; 2846 2847 2848 case URX_STATE_SAVE: 2849 fp = StateSave(fp, opValue, status); 2850 break; 2851 2852 2853 case URX_END: 2854 // The match loop will exit via this path on a successful match, 2855 // when we reach the end of the pattern. 2856 if (toEnd && fp->fInputIdx != fActiveLimit) { 2857 // The pattern matched, but not to the end of input. Try some more. 2858 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2859 break; 2860 } 2861 isMatch = TRUE; 2862 goto breakFromLoop; 2863 2864 // Start and End Capture stack frame variables are laid out out like this: 2865 // fp->fExtra[opValue] - The start of a completed capture group 2866 // opValue+1 - The end of a completed capture group 2867 // opValue+2 - the start of a capture group whose end 2868 // has not yet been reached (and might not ever be). 2869 case URX_START_CAPTURE: 2870 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2871 fp->fExtra[opValue+2] = fp->fInputIdx; 2872 break; 2873 2874 2875 case URX_END_CAPTURE: 2876 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2877 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 2878 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 2879 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 2880 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 2881 break; 2882 2883 2884 case URX_DOLLAR: // $, test for End of line 2885 // or for position before new line at end of input 2886 { 2887 if (fp->fInputIdx >= fAnchorLimit) { 2888 // We really are at the end of input. Success. 2889 fHitEnd = TRUE; 2890 fRequireEnd = TRUE; 2891 break; 2892 } 2893 2894 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2895 2896 // If we are positioned just before a new-line that is located at the 2897 // end of input, succeed. 2898 UChar32 c = UTEXT_NEXT32(fInputText); 2899 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2900 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 2901 // If not in the middle of a CR/LF sequence 2902 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 2903 // At new-line at end of input. Success 2904 fHitEnd = TRUE; 2905 fRequireEnd = TRUE; 2906 2907 break; 2908 } 2909 } 2910 } else { 2911 UChar32 nextC = UTEXT_NEXT32(fInputText); 2912 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2913 fHitEnd = TRUE; 2914 fRequireEnd = TRUE; 2915 break; // At CR/LF at end of input. Success 2916 } 2917 } 2918 2919 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2920 } 2921 break; 2922 2923 2924 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 2925 if (fp->fInputIdx >= fAnchorLimit) { 2926 // Off the end of input. Success. 2927 fHitEnd = TRUE; 2928 fRequireEnd = TRUE; 2929 break; 2930 } else { 2931 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2932 UChar32 c = UTEXT_NEXT32(fInputText); 2933 // Either at the last character of input, or off the end. 2934 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { 2935 fHitEnd = TRUE; 2936 fRequireEnd = TRUE; 2937 break; 2938 } 2939 } 2940 2941 // Not at end of input. Back-track out. 2942 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2943 break; 2944 2945 2946 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 2947 { 2948 if (fp->fInputIdx >= fAnchorLimit) { 2949 // We really are at the end of input. Success. 2950 fHitEnd = TRUE; 2951 fRequireEnd = TRUE; 2952 break; 2953 } 2954 // If we are positioned just before a new-line, succeed. 2955 // It makes no difference where the new-line is within the input. 2956 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2957 UChar32 c = UTEXT_CURRENT32(fInputText); 2958 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 2959 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 2960 // In multi-line mode, hitting a new-line just before the end of input does not 2961 // set the hitEnd or requireEnd flags 2962 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { 2963 break; 2964 } 2965 } 2966 // not at a new line. Fail. 2967 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2968 } 2969 break; 2970 2971 2972 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 2973 { 2974 if (fp->fInputIdx >= fAnchorLimit) { 2975 // We really are at the end of input. Success. 2976 fHitEnd = TRUE; 2977 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 2978 break; // adding a new-line would not lose the match. 2979 } 2980 // If we are not positioned just before a new-line, the test fails; backtrack out. 2981 // It makes no difference where the new-line is within the input. 2982 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2983 if (UTEXT_CURRENT32(fInputText) != 0x0a) { 2984 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2985 } 2986 } 2987 break; 2988 2989 2990 case URX_CARET: // ^, test for start of line 2991 if (fp->fInputIdx != fAnchorStart) { 2992 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2993 } 2994 break; 2995 2996 2997 case URX_CARET_M: // ^, test for start of line in mulit-line mode 2998 { 2999 if (fp->fInputIdx == fAnchorStart) { 3000 // We are at the start input. Success. 3001 break; 3002 } 3003 // Check whether character just before the current pos is a new-line 3004 // unless we are at the end of input 3005 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3006 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3007 if ((fp->fInputIdx < fAnchorLimit) && 3008 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3009 // It's a new-line. ^ is true. Success. 3010 // TODO: what should be done with positions between a CR and LF? 3011 break; 3012 } 3013 // Not at the start of a line. Fail. 3014 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3015 } 3016 break; 3017 3018 3019 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 3020 { 3021 U_ASSERT(fp->fInputIdx >= fAnchorStart); 3022 if (fp->fInputIdx <= fAnchorStart) { 3023 // We are at the start input. Success. 3024 break; 3025 } 3026 // Check whether character just before the current pos is a new-line 3027 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 3028 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3029 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3030 if (c != 0x0a) { 3031 // Not at the start of a line. Back-track out. 3032 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3033 } 3034 } 3035 break; 3036 3037 case URX_BACKSLASH_B: // Test for word boundaries 3038 { 3039 UBool success = isWordBoundary(fp->fInputIdx); 3040 success ^= (opValue != 0); // flip sense for \B 3041 if (!success) { 3042 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3043 } 3044 } 3045 break; 3046 3047 3048 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 3049 { 3050 UBool success = isUWordBoundary(fp->fInputIdx); 3051 success ^= (opValue != 0); // flip sense for \B 3052 if (!success) { 3053 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3054 } 3055 } 3056 break; 3057 3058 3059 case URX_BACKSLASH_D: // Test for decimal digit 3060 { 3061 if (fp->fInputIdx >= fActiveLimit) { 3062 fHitEnd = TRUE; 3063 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3064 break; 3065 } 3066 3067 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3068 3069 UChar32 c = UTEXT_NEXT32(fInputText); 3070 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 3071 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 3072 success ^= (opValue != 0); // flip sense for \D 3073 if (success) { 3074 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3075 } else { 3076 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3077 } 3078 } 3079 break; 3080 3081 3082 case URX_BACKSLASH_G: // Test for position at end of previous match 3083 if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) { 3084 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3085 } 3086 break; 3087 3088 3089 case URX_BACKSLASH_X: 3090 // Match a Grapheme, as defined by Unicode TR 29. 3091 // Differs slightly from Perl, which consumes combining marks independently 3092 // of context. 3093 { 3094 3095 // Fail if at end of input 3096 if (fp->fInputIdx >= fActiveLimit) { 3097 fHitEnd = TRUE; 3098 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3099 break; 3100 } 3101 3102 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3103 3104 // Examine (and consume) the current char. 3105 // Dispatch into a little state machine, based on the char. 3106 UChar32 c; 3107 c = UTEXT_NEXT32(fInputText); 3108 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3109 UnicodeSet **sets = fPattern->fStaticSets; 3110 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 3111 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 3112 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3113 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3114 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3115 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3116 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3117 goto GC_Extend; 3118 3119 3120 3121 GC_L: 3122 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3123 c = UTEXT_NEXT32(fInputText); 3124 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3125 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3126 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3127 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3128 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3129 UTEXT_PREVIOUS32(fInputText); 3130 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3131 goto GC_Extend; 3132 3133 GC_V: 3134 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3135 c = UTEXT_NEXT32(fInputText); 3136 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3137 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3138 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3139 UTEXT_PREVIOUS32(fInputText); 3140 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3141 goto GC_Extend; 3142 3143 GC_T: 3144 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3145 c = UTEXT_NEXT32(fInputText); 3146 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3147 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3148 UTEXT_PREVIOUS32(fInputText); 3149 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3150 goto GC_Extend; 3151 3152 GC_Extend: 3153 // Combining characters are consumed here 3154 for (;;) { 3155 if (fp->fInputIdx >= fActiveLimit) { 3156 break; 3157 } 3158 c = UTEXT_CURRENT32(fInputText); 3159 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 3160 break; 3161 } 3162 UTEXT_NEXT32(fInputText); 3163 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3164 } 3165 goto GC_Done; 3166 3167 GC_Control: 3168 // Most control chars stand alone (don't combine with combining chars), 3169 // except for that CR/LF sequence is a single grapheme cluster. 3170 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 3171 c = UTEXT_NEXT32(fInputText); 3172 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3173 } 3174 3175 GC_Done: 3176 if (fp->fInputIdx >= fActiveLimit) { 3177 fHitEnd = TRUE; 3178 } 3179 break; 3180 } 3181 3182 3183 3184 3185 case URX_BACKSLASH_Z: // Test for end of Input 3186 if (fp->fInputIdx < fAnchorLimit) { 3187 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3188 } else { 3189 fHitEnd = TRUE; 3190 fRequireEnd = TRUE; 3191 } 3192 break; 3193 3194 3195 3196 case URX_STATIC_SETREF: 3197 { 3198 // Test input character against one of the predefined sets 3199 // (Word Characters, for example) 3200 // The high bit of the op value is a flag for the match polarity. 3201 // 0: success if input char is in set. 3202 // 1: success if input char is not in set. 3203 if (fp->fInputIdx >= fActiveLimit) { 3204 fHitEnd = TRUE; 3205 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3206 break; 3207 } 3208 3209 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 3210 opValue &= ~URX_NEG_SET; 3211 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3212 3213 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3214 UChar32 c = UTEXT_NEXT32(fInputText); 3215 if (c < 256) { 3216 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3217 if (s8->contains(c)) { 3218 success = !success; 3219 } 3220 } else { 3221 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3222 if (s->contains(c)) { 3223 success = !success; 3224 } 3225 } 3226 if (success) { 3227 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3228 } else { 3229 // the character wasn't in the set. 3230 #ifdef REGEX_SMART_BACKTRACKING 3231 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3232 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3233 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3234 // Try to find it, backwards 3235 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3236 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset 3237 do { 3238 c = UTEXT_PREVIOUS32(fInputText); 3239 if (c == U_SENTINEL) { 3240 break; 3241 } else if (c < 256) { 3242 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3243 if (s8->contains(c)) { 3244 success = !success; 3245 } 3246 } else { 3247 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3248 if (s->contains(c)) { 3249 success = !success; 3250 } 3251 } 3252 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success); 3253 3254 if (success && c != U_SENTINEL) { 3255 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3256 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3257 if (fp->fInputIdx > backSearchIndex) { 3258 fp = StateSave(fp, fp->fPatIdx, status); 3259 } 3260 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3261 break; 3262 } 3263 } 3264 } 3265 #endif 3266 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3267 } 3268 } 3269 break; 3270 3271 3272 case URX_STAT_SETREF_N: 3273 { 3274 // Test input character for NOT being a member of one of 3275 // the predefined sets (Word Characters, for example) 3276 if (fp->fInputIdx >= fActiveLimit) { 3277 fHitEnd = TRUE; 3278 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3279 break; 3280 } 3281 3282 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3283 3284 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3285 3286 UChar32 c = UTEXT_NEXT32(fInputText); 3287 if (c < 256) { 3288 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3289 if (s8->contains(c) == FALSE) { 3290 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3291 break; 3292 } 3293 } else { 3294 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3295 if (s->contains(c) == FALSE) { 3296 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3297 break; 3298 } 3299 } 3300 // the character wasn't in the set. 3301 #ifdef REGEX_SMART_BACKTRACKING 3302 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3303 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3304 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3305 // Try to find it, backwards 3306 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3307 UBool success = FALSE; 3308 do { 3309 c = UTEXT_PREVIOUS32(fInputText); 3310 if (c == U_SENTINEL) { 3311 break; 3312 } else if (c < 256) { 3313 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3314 if (s8->contains(c) == FALSE) { 3315 success = TRUE; 3316 break; 3317 } 3318 } else { 3319 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3320 if (s->contains(c) == FALSE) { 3321 success = TRUE; 3322 break; 3323 } 3324 } 3325 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 3326 3327 if (success) { 3328 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3329 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3330 if (fp->fInputIdx > backSearchIndex) { 3331 fp = StateSave(fp, fp->fPatIdx, status); 3332 } 3333 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3334 break; 3335 } 3336 } 3337 } 3338 #endif 3339 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3340 } 3341 break; 3342 3343 3344 case URX_SETREF: 3345 if (fp->fInputIdx >= fActiveLimit) { 3346 fHitEnd = TRUE; 3347 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3348 break; 3349 } else { 3350 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3351 3352 // There is input left. Pick up one char and test it for set membership. 3353 UChar32 c = UTEXT_NEXT32(fInputText); 3354 U_ASSERT(opValue > 0 && opValue < sets->size()); 3355 if (c<256) { 3356 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3357 if (s8->contains(c)) { 3358 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3359 break; 3360 } 3361 } else { 3362 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3363 if (s->contains(c)) { 3364 // The character is in the set. A Match. 3365 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3366 break; 3367 } 3368 } 3369 3370 // the character wasn't in the set. 3371 #ifdef REGEX_SMART_BACKTRACKING 3372 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3373 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3374 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3375 // Try to find it, backwards 3376 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3377 UBool success = FALSE; 3378 do { 3379 c = UTEXT_PREVIOUS32(fInputText); 3380 if (c == U_SENTINEL) { 3381 break; 3382 } else if (c < 256) { 3383 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3384 if (s8->contains(c)) { 3385 success = TRUE; 3386 break; 3387 } 3388 } else { 3389 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3390 if (s->contains(c)) { 3391 success = TRUE; 3392 break; 3393 } 3394 } 3395 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 3396 3397 if (success) { 3398 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3399 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3400 if (fp->fInputIdx > backSearchIndex) { 3401 fp = StateSave(fp, fp->fPatIdx, status); 3402 } 3403 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3404 break; 3405 } 3406 } 3407 } 3408 #endif 3409 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3410 } 3411 break; 3412 3413 3414 case URX_DOTANY: 3415 { 3416 // . matches anything, but stops at end-of-line. 3417 if (fp->fInputIdx >= fActiveLimit) { 3418 // At end of input. Match failed. Backtrack out. 3419 fHitEnd = TRUE; 3420 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3421 break; 3422 } 3423 3424 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3425 3426 // There is input left. Advance over one char, unless we've hit end-of-line 3427 UChar32 c = UTEXT_NEXT32(fInputText); 3428 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 3429 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3430 // End of line in normal mode. . does not match. 3431 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3432 break; 3433 } 3434 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3435 } 3436 break; 3437 3438 3439 case URX_DOTANY_ALL: 3440 { 3441 // ., in dot-matches-all (including new lines) mode 3442 if (fp->fInputIdx >= fActiveLimit) { 3443 // At end of input. Match failed. Backtrack out. 3444 fHitEnd = TRUE; 3445 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3446 break; 3447 } 3448 3449 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3450 3451 // There is input left. Advance over one char, except if we are 3452 // at a cr/lf, advance over both of them. 3453 UChar32 c; 3454 c = UTEXT_NEXT32(fInputText); 3455 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3456 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 3457 // In the case of a CR/LF, we need to advance over both. 3458 UChar32 nextc = UTEXT_CURRENT32(fInputText); 3459 if (nextc == 0x0a) { 3460 UTEXT_NEXT32(fInputText); 3461 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3462 } 3463 } 3464 } 3465 break; 3466 3467 3468 case URX_DOTANY_UNIX: 3469 { 3470 // '.' operator, matches all, but stops at end-of-line. 3471 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 3472 if (fp->fInputIdx >= fActiveLimit) { 3473 // At end of input. Match failed. Backtrack out. 3474 fHitEnd = TRUE; 3475 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3476 break; 3477 } 3478 3479 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3480 3481 // There is input left. Advance over one char, unless we've hit end-of-line 3482 UChar32 c = UTEXT_NEXT32(fInputText); 3483 if (c == 0x0a) { 3484 // End of line in normal mode. '.' does not match the \n 3485 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3486 } else { 3487 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3488 } 3489 } 3490 break; 3491 3492 3493 case URX_JMP: 3494 fp->fPatIdx = opValue; 3495 break; 3496 3497 case URX_FAIL: 3498 isMatch = FALSE; 3499 goto breakFromLoop; 3500 3501 case URX_JMP_SAV: 3502 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 3503 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3504 fp->fPatIdx = opValue; // Then JMP. 3505 break; 3506 3507 case URX_JMP_SAV_X: 3508 // This opcode is used with (x)+, when x can match a zero length string. 3509 // Same as JMP_SAV, except conditional on the match having made forward progress. 3510 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 3511 // data address of the input position at the start of the loop. 3512 { 3513 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 3514 int32_t stoOp = (int32_t)pat[opValue-1]; 3515 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 3516 int32_t frameLoc = URX_VAL(stoOp); 3517 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 3518 int64_t prevInputIdx = fp->fExtra[frameLoc]; 3519 U_ASSERT(prevInputIdx <= fp->fInputIdx); 3520 if (prevInputIdx < fp->fInputIdx) { 3521 // The match did make progress. Repeat the loop. 3522 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3523 fp->fPatIdx = opValue; 3524 fp->fExtra[frameLoc] = fp->fInputIdx; 3525 } 3526 // If the input position did not advance, we do nothing here, 3527 // execution will fall out of the loop. 3528 } 3529 break; 3530 3531 case URX_CTR_INIT: 3532 { 3533 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3534 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3535 3536 // Pick up the three extra operands that CTR_INIT has, and 3537 // skip the pattern location counter past 3538 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3539 fp->fPatIdx += 3; 3540 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3541 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3542 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3543 U_ASSERT(minCount>=0); 3544 U_ASSERT(maxCount>=minCount || maxCount==-1); 3545 U_ASSERT(loopLoc>fp->fPatIdx); 3546 3547 if (minCount == 0) { 3548 fp = StateSave(fp, loopLoc+1, status); 3549 } 3550 if (maxCount == 0) { 3551 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3552 } 3553 } 3554 break; 3555 3556 case URX_CTR_LOOP: 3557 { 3558 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3559 int32_t initOp = (int32_t)pat[opValue]; 3560 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 3561 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3562 int32_t minCount = (int32_t)pat[opValue+2]; 3563 int32_t maxCount = (int32_t)pat[opValue+3]; 3564 // Increment the counter. Note: we DIDN'T worry about counter 3565 // overflow, since the data comes from UnicodeStrings, which 3566 // stores its length in an int32_t. Do we have to think about 3567 // this now that we're using UText? Probably not, since the length 3568 // in UChar32s is still an int32_t. 3569 (*pCounter)++; 3570 U_ASSERT(*pCounter > 0); 3571 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 3572 U_ASSERT(*pCounter == maxCount || maxCount == -1); 3573 break; 3574 } 3575 if (*pCounter >= minCount) { 3576 fp = StateSave(fp, fp->fPatIdx, status); 3577 } 3578 fp->fPatIdx = opValue + 4; // Loop back. 3579 } 3580 break; 3581 3582 case URX_CTR_INIT_NG: 3583 { 3584 // Initialize a non-greedy loop 3585 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3586 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3587 3588 // Pick up the three extra operands that CTR_INIT has, and 3589 // skip the pattern location counter past 3590 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3591 fp->fPatIdx += 3; 3592 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3593 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3594 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3595 U_ASSERT(minCount>=0); 3596 U_ASSERT(maxCount>=minCount || maxCount==-1); 3597 U_ASSERT(loopLoc>fp->fPatIdx); 3598 3599 if (minCount == 0) { 3600 if (maxCount != 0) { 3601 fp = StateSave(fp, fp->fPatIdx, status); 3602 } 3603 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 3604 } 3605 } 3606 break; 3607 3608 case URX_CTR_LOOP_NG: 3609 { 3610 // Non-greedy {min, max} loops 3611 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3612 int32_t initOp = (int32_t)pat[opValue]; 3613 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 3614 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3615 int32_t minCount = (int32_t)pat[opValue+2]; 3616 int32_t maxCount = (int32_t)pat[opValue+3]; 3617 // Increment the counter. Note: we DIDN'T worry about counter 3618 // overflow, since the data comes from UnicodeStrings, which 3619 // stores its length in an int32_t. Do we have to think about 3620 // this now that we're using UText? Probably not, since the length 3621 // in UChar32s is still an int32_t. 3622 (*pCounter)++; 3623 U_ASSERT(*pCounter > 0); 3624 3625 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 3626 // The loop has matched the maximum permitted number of times. 3627 // Break out of here with no action. Matching will 3628 // continue with the following pattern. 3629 U_ASSERT(*pCounter == maxCount || maxCount == -1); 3630 break; 3631 } 3632 3633 if (*pCounter < minCount) { 3634 // We haven't met the minimum number of matches yet. 3635 // Loop back for another one. 3636 fp->fPatIdx = opValue + 4; // Loop back. 3637 } else { 3638 // We do have the minimum number of matches. 3639 // Fall into the following pattern, but first do 3640 // a state save to the top of the loop, so that a failure 3641 // in the following pattern will try another iteration of the loop. 3642 fp = StateSave(fp, opValue + 4, status); 3643 } 3644 } 3645 break; 3646 3647 case URX_STO_SP: 3648 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3649 fData[opValue] = fStack->size(); 3650 break; 3651 3652 case URX_LD_SP: 3653 { 3654 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3655 int32_t newStackSize = (int32_t)fData[opValue]; 3656 U_ASSERT(newStackSize <= fStack->size()); 3657 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3658 if (newFP == (int64_t *)fp) { 3659 break; 3660 } 3661 int32_t i; 3662 for (i=0; i<fFrameSize; i++) { 3663 newFP[i] = ((int64_t *)fp)[i]; 3664 } 3665 fp = (REStackFrame *)newFP; 3666 fStack->setSize(newStackSize); 3667 } 3668 break; 3669 3670 case URX_BACKREF: 3671 case URX_BACKREF_I: 3672 { 3673 U_ASSERT(opValue < fFrameSize); 3674 int64_t groupStartIdx = fp->fExtra[opValue]; 3675 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3676 U_ASSERT(groupStartIdx <= groupEndIdx); 3677 if (groupStartIdx < 0) { 3678 // This capture group has not participated in the match thus far, 3679 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3680 } 3681 3682 if (groupEndIdx == groupStartIdx) { 3683 // The capture group match was of an empty string. 3684 // Verified by testing: Perl matches succeed in this case, so 3685 // we do too. 3686 break; 3687 } 3688 3689 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); 3690 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3691 3692 UBool haveMatch = (opType == URX_BACKREF ? 3693 (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) : 3694 (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status))); 3695 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3696 3697 if (fp->fInputIdx > fActiveLimit) { 3698 fHitEnd = TRUE; 3699 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3700 } else if (!haveMatch) { 3701 if (fp->fInputIdx == fActiveLimit) { 3702 fHitEnd = TRUE; 3703 } 3704 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3705 } 3706 } 3707 break; 3708 3709 case URX_STO_INP_LOC: 3710 { 3711 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 3712 fp->fExtra[opValue] = fp->fInputIdx; 3713 } 3714 break; 3715 3716 case URX_JMPX: 3717 { 3718 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3719 fp->fPatIdx += 1; 3720 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 3721 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 3722 int64_t savedInputIdx = fp->fExtra[dataLoc]; 3723 U_ASSERT(savedInputIdx <= fp->fInputIdx); 3724 if (savedInputIdx < fp->fInputIdx) { 3725 fp->fPatIdx = opValue; // JMP 3726 } else { 3727 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 3728 } 3729 } 3730 break; 3731 3732 case URX_LA_START: 3733 { 3734 // Entering a lookahead block. 3735 // Save Stack Ptr, Input Pos. 3736 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3737 fData[opValue] = fStack->size(); 3738 fData[opValue+1] = fp->fInputIdx; 3739 fActiveStart = fLookStart; // Set the match region change for 3740 fActiveLimit = fLookLimit; // transparent bounds. 3741 } 3742 break; 3743 3744 case URX_LA_END: 3745 { 3746 // Leaving a look-ahead block. 3747 // restore Stack Ptr, Input Pos to positions they had on entry to block. 3748 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3749 int32_t stackSize = fStack->size(); 3750 int32_t newStackSize =(int32_t)fData[opValue]; 3751 U_ASSERT(stackSize >= newStackSize); 3752 if (stackSize > newStackSize) { 3753 // Copy the current top frame back to the new (cut back) top frame. 3754 // This makes the capture groups from within the look-ahead 3755 // expression available. 3756 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3757 int32_t i; 3758 for (i=0; i<fFrameSize; i++) { 3759 newFP[i] = ((int64_t *)fp)[i]; 3760 } 3761 fp = (REStackFrame *)newFP; 3762 fStack->setSize(newStackSize); 3763 } 3764 fp->fInputIdx = fData[opValue+1]; 3765 3766 // Restore the active region bounds in the input string; they may have 3767 // been changed because of transparent bounds on a Region. 3768 fActiveStart = fRegionStart; 3769 fActiveLimit = fRegionLimit; 3770 } 3771 break; 3772 3773 case URX_ONECHAR_I: 3774 if (fp->fInputIdx < fActiveLimit) { 3775 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3776 3777 UChar32 c = UTEXT_NEXT32(fInputText); 3778 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3779 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3780 break; 3781 } 3782 } else { 3783 fHitEnd = TRUE; 3784 } 3785 3786 #ifdef REGEX_SMART_BACKTRACKING 3787 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3788 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3789 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3790 UBool success = FALSE; 3791 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3792 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { 3793 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3794 success = TRUE; 3795 break; 3796 } else if (c == U_SENTINEL) { 3797 break; 3798 } 3799 c = UTEXT_PREVIOUS32(fInputText); 3800 } 3801 if (success) { 3802 fHitEnd = FALSE; 3803 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3804 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3805 if (fp->fInputIdx > backSearchIndex) { 3806 fp = StateSave(fp, fp->fPatIdx, status); 3807 } 3808 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3809 break; 3810 } 3811 } 3812 } 3813 #endif 3814 3815 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3816 break; 3817 3818 case URX_STRING_I: 3819 { 3820 // Test input against a literal string. 3821 // Strings require two slots in the compiled pattern, one for the 3822 // offset to the string text, and one for the length. 3823 const UCaseProps *csp = ucase_getSingleton(&status); 3824 if (U_SUCCESS(status)) { 3825 int32_t stringStartIdx, stringLen; 3826 stringStartIdx = opValue; 3827 3828 op = (int32_t)pat[fp->fPatIdx]; 3829 fp->fPatIdx++; 3830 opType = URX_TYPE(op); 3831 opValue = URX_VAL(op); 3832 U_ASSERT(opType == URX_STRING_LEN); 3833 stringLen = opValue; 3834 3835 const UChar *patternChars = litText+stringStartIdx; 3836 const UChar *patternEnd = patternChars+stringLen; 3837 3838 const UChar *foldChars = NULL; 3839 int32_t foldOffset, foldLength; 3840 UChar32 c; 3841 3842 foldOffset = foldLength = 0; 3843 UBool success = TRUE; 3844 3845 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3846 while (patternChars < patternEnd && success) { 3847 if(foldOffset < foldLength) { 3848 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 3849 } else { 3850 c = UTEXT_NEXT32(fInputText); 3851 if (c != U_SENTINEL) { 3852 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 3853 if(foldLength >= 0) { 3854 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 3855 foldOffset = 0; 3856 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 3857 } else { 3858 c = foldLength; 3859 foldLength = foldOffset; // to avoid reading chars from the folding buffer 3860 } 3861 } 3862 } 3863 3864 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3865 } 3866 3867 success = FALSE; 3868 if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) { 3869 if (U_IS_BMP(c)) { 3870 success = (*patternChars == c); 3871 patternChars += 1; 3872 } else if (patternChars+1 < patternEnd) { 3873 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 3874 patternChars += 2; 3875 } 3876 } else { 3877 fHitEnd = TRUE; // TODO: See ticket 6074 3878 } 3879 } 3880 3881 if (!success) { 3882 #ifdef REGEX_SMART_BACKTRACKING 3883 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 3884 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3885 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3886 // Reset to last start point 3887 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3888 patternChars = litText+stringStartIdx; 3889 3890 // Search backwards for a possible start 3891 do { 3892 c = UTEXT_PREVIOUS32(fInputText); 3893 if (c == U_SENTINEL) { 3894 break; 3895 } else { 3896 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 3897 if(foldLength >= 0) { 3898 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 3899 foldOffset = 0; 3900 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 3901 } else { 3902 c = foldLength; 3903 foldLength = foldOffset; // to avoid reading chars from the folding buffer 3904 } 3905 } 3906 3907 if ((U_IS_BMP(c) && *patternChars == c) || 3908 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 3909 success = TRUE; 3910 break; 3911 } 3912 } 3913 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 3914 3915 // And try again 3916 if (success) { 3917 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3918 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3919 if (fp->fInputIdx > backSearchIndex) { 3920 fp = StateSave(fp, fp->fPatIdx, status); 3921 } 3922 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3923 break; 3924 } 3925 } 3926 } 3927 #endif 3928 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3929 } 3930 } 3931 } 3932 break; 3933 3934 case URX_LB_START: 3935 { 3936 // Entering a look-behind block. 3937 // Save Stack Ptr, Input Pos. 3938 // TODO: implement transparent bounds. Ticket #6067 3939 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3940 fData[opValue] = fStack->size(); 3941 fData[opValue+1] = fp->fInputIdx; 3942 // Init the variable containing the start index for attempted matches. 3943 fData[opValue+2] = -1; 3944 // Save input string length, then reset to pin any matches to end at 3945 // the current position. 3946 fData[opValue+3] = fActiveLimit; 3947 fActiveLimit = fp->fInputIdx; 3948 } 3949 break; 3950 3951 3952 case URX_LB_CONT: 3953 { 3954 // Positive Look-Behind, at top of loop checking for matches of LB expression 3955 // at all possible input starting positions. 3956 3957 // Fetch the min and max possible match lengths. They are the operands 3958 // of this op in the pattern. 3959 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 3960 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 3961 U_ASSERT(minML <= maxML); 3962 U_ASSERT(minML >= 0); 3963 3964 // Fetch (from data) the last input index where a match was attempted. 3965 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3966 int64_t *lbStartIdx = &fData[opValue+2]; 3967 if (*lbStartIdx < 0) { 3968 // First time through loop. 3969 *lbStartIdx = fp->fInputIdx - minML; 3970 } else { 3971 // 2nd through nth time through the loop. 3972 // Back up start position for match by one. 3973 if (*lbStartIdx == 0) { 3974 (*lbStartIdx)--; 3975 } else { 3976 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 3977 UTEXT_PREVIOUS32(fInputText); 3978 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 3979 } 3980 } 3981 3982 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 3983 // We have tried all potential match starting points without 3984 // getting a match. Backtrack out, and out of the 3985 // Look Behind altogether. 3986 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3987 int64_t restoreInputLen = fData[opValue+3]; 3988 U_ASSERT(restoreInputLen >= fActiveLimit); 3989 U_ASSERT(restoreInputLen <= fInputLength); 3990 fActiveLimit = restoreInputLen; 3991 break; 3992 } 3993 3994 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 3995 // (successful match will fall off the end of the loop.) 3996 fp = StateSave(fp, fp->fPatIdx-3, status); 3997 fp->fInputIdx = *lbStartIdx; 3998 } 3999 break; 4000 4001 case URX_LB_END: 4002 // End of a look-behind block, after a successful match. 4003 { 4004 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4005 if (fp->fInputIdx != fActiveLimit) { 4006 // The look-behind expression matched, but the match did not 4007 // extend all the way to the point that we are looking behind from. 4008 // FAIL out of here, which will take us back to the LB_CONT, which 4009 // will retry the match starting at another position or fail 4010 // the look-behind altogether, whichever is appropriate. 4011 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4012 break; 4013 } 4014 4015 // Look-behind match is good. Restore the orignal input string length, 4016 // which had been truncated to pin the end of the lookbehind match to the 4017 // position being looked-behind. 4018 int64_t originalInputLen = fData[opValue+3]; 4019 U_ASSERT(originalInputLen >= fActiveLimit); 4020 U_ASSERT(originalInputLen <= fInputLength); 4021 fActiveLimit = originalInputLen; 4022 } 4023 break; 4024 4025 4026 case URX_LBN_CONT: 4027 { 4028 // Negative Look-Behind, at top of loop checking for matches of LB expression 4029 // at all possible input starting positions. 4030 4031 // Fetch the extra parameters of this op. 4032 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 4033 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 4034 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 4035 continueLoc = URX_VAL(continueLoc); 4036 U_ASSERT(minML <= maxML); 4037 U_ASSERT(minML >= 0); 4038 U_ASSERT(continueLoc > fp->fPatIdx); 4039 4040 // Fetch (from data) the last input index where a match was attempted. 4041 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4042 int64_t *lbStartIdx = &fData[opValue+2]; 4043 if (*lbStartIdx < 0) { 4044 // First time through loop. 4045 *lbStartIdx = fp->fInputIdx - minML; 4046 } else { 4047 // 2nd through nth time through the loop. 4048 // Back up start position for match by one. 4049 if (*lbStartIdx == 0) { 4050 (*lbStartIdx)--; 4051 } else { 4052 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 4053 UTEXT_PREVIOUS32(fInputText); 4054 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4055 } 4056 } 4057 4058 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 4059 // We have tried all potential match starting points without 4060 // getting a match, which means that the negative lookbehind as 4061 // a whole has succeeded. Jump forward to the continue location 4062 int64_t restoreInputLen = fData[opValue+3]; 4063 U_ASSERT(restoreInputLen >= fActiveLimit); 4064 U_ASSERT(restoreInputLen <= fInputLength); 4065 fActiveLimit = restoreInputLen; 4066 fp->fPatIdx = continueLoc; 4067 break; 4068 } 4069 4070 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 4071 // (successful match will cause a FAIL out of the loop altogether.) 4072 fp = StateSave(fp, fp->fPatIdx-4, status); 4073 fp->fInputIdx = *lbStartIdx; 4074 } 4075 break; 4076 4077 case URX_LBN_END: 4078 // End of a negative look-behind block, after a successful match. 4079 { 4080 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4081 if (fp->fInputIdx != fActiveLimit) { 4082 // The look-behind expression matched, but the match did not 4083 // extend all the way to the point that we are looking behind from. 4084 // FAIL out of here, which will take us back to the LB_CONT, which 4085 // will retry the match starting at another position or succeed 4086 // the look-behind altogether, whichever is appropriate. 4087 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4088 break; 4089 } 4090 4091 // Look-behind expression matched, which means look-behind test as 4092 // a whole Fails 4093 4094 // Restore the orignal input string length, which had been truncated 4095 // inorder to pin the end of the lookbehind match 4096 // to the position being looked-behind. 4097 int64_t originalInputLen = fData[opValue+3]; 4098 U_ASSERT(originalInputLen >= fActiveLimit); 4099 U_ASSERT(originalInputLen <= fInputLength); 4100 fActiveLimit = originalInputLen; 4101 4102 // Restore original stack position, discarding any state saved 4103 // by the successful pattern match. 4104 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4105 int32_t newStackSize = (int32_t)fData[opValue]; 4106 U_ASSERT(fStack->size() > newStackSize); 4107 fStack->setSize(newStackSize); 4108 4109 // FAIL, which will take control back to someplace 4110 // prior to entering the look-behind test. 4111 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4112 } 4113 break; 4114 4115 4116 case URX_LOOP_SR_I: 4117 // Loop Initialization for the optimized implementation of 4118 // [some character set]* 4119 // This op scans through all matching input. 4120 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4121 { 4122 U_ASSERT(opValue > 0 && opValue < sets->size()); 4123 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4124 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 4125 4126 // Loop through input, until either the input is exhausted or 4127 // we reach a character that is not a member of the set. 4128 int64_t ix = fp->fInputIdx; 4129 UTEXT_SETNATIVEINDEX(fInputText, ix); 4130 for (;;) { 4131 if (ix >= fActiveLimit) { 4132 fHitEnd = TRUE; 4133 break; 4134 } 4135 UChar32 c = UTEXT_NEXT32(fInputText); 4136 if (c<256) { 4137 if (s8->contains(c) == FALSE) { 4138 break; 4139 } 4140 } else { 4141 if (s->contains(c) == FALSE) { 4142 break; 4143 } 4144 } 4145 ix = UTEXT_GETNATIVEINDEX(fInputText); 4146 } 4147 4148 // If there were no matching characters, skip over the loop altogether. 4149 // The loop doesn't run at all, a * op always succeeds. 4150 if (ix == fp->fInputIdx) { 4151 fp->fPatIdx++; // skip the URX_LOOP_C op. 4152 break; 4153 } 4154 4155 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4156 // must follow. It's operand is the stack location 4157 // that holds the starting input index for the match of this [set]* 4158 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4159 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4160 int32_t stackLoc = URX_VAL(loopcOp); 4161 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4162 fp->fExtra[stackLoc] = fp->fInputIdx; 4163 #ifdef REGEX_SMART_BACKTRACKING 4164 backSearchIndex = fp->fInputIdx; 4165 #endif 4166 fp->fInputIdx = ix; 4167 4168 // Save State to the URX_LOOP_C op that follows this one, 4169 // so that match failures in the following code will return to there. 4170 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4171 fp = StateSave(fp, fp->fPatIdx, status); 4172 fp->fPatIdx++; 4173 } 4174 break; 4175 4176 4177 case URX_LOOP_DOT_I: 4178 // Loop Initialization for the optimized implementation of .* 4179 // This op scans through all remaining input. 4180 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4181 { 4182 // Loop through input until the input is exhausted (we reach an end-of-line) 4183 // In DOTALL mode, we can just go straight to the end of the input. 4184 int64_t ix; 4185 if ((opValue & 1) == 1) { 4186 // Dot-matches-All mode. Jump straight to the end of the string. 4187 ix = fActiveLimit; 4188 fHitEnd = TRUE; 4189 } else { 4190 // NOT DOT ALL mode. Line endings do not match '.' 4191 // Scan forward until a line ending or end of input. 4192 ix = fp->fInputIdx; 4193 UTEXT_SETNATIVEINDEX(fInputText, ix); 4194 for (;;) { 4195 if (ix >= fActiveLimit) { 4196 fHitEnd = TRUE; 4197 break; 4198 } 4199 UChar32 c = UTEXT_NEXT32(fInputText); 4200 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 4201 if ((c == 0x0a) || // 0x0a is newline in both modes. 4202 ((opValue & 2) == 0) && // IF not UNIX_LINES mode 4203 (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) { 4204 // char is a line ending. Exit the scanning loop. 4205 break; 4206 } 4207 } 4208 ix = UTEXT_GETNATIVEINDEX(fInputText); 4209 } 4210 } 4211 4212 // If there were no matching characters, skip over the loop altogether. 4213 // The loop doesn't run at all, a * op always succeeds. 4214 if (ix == fp->fInputIdx) { 4215 fp->fPatIdx++; // skip the URX_LOOP_C op. 4216 break; 4217 } 4218 4219 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4220 // must follow. It's operand is the stack location 4221 // that holds the starting input index for the match of this .* 4222 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4223 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4224 int32_t stackLoc = URX_VAL(loopcOp); 4225 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4226 fp->fExtra[stackLoc] = fp->fInputIdx; 4227 #ifdef REGEX_SMART_BACKTRACKING 4228 backSearchIndex = fp->fInputIdx; 4229 #endif 4230 fp->fInputIdx = ix; 4231 4232 // Save State to the URX_LOOP_C op that follows this one, 4233 // so that match failures in the following code will return to there. 4234 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4235 fp = StateSave(fp, fp->fPatIdx, status); 4236 fp->fPatIdx++; 4237 } 4238 break; 4239 4240 4241 case URX_LOOP_C: 4242 { 4243 U_ASSERT(opValue>=0 && opValue<fFrameSize); 4244 backSearchIndex = fp->fExtra[opValue]; 4245 U_ASSERT(backSearchIndex <= fp->fInputIdx); 4246 if (backSearchIndex == fp->fInputIdx) { 4247 // We've backed up the input idx to the point that the loop started. 4248 // The loop is done. Leave here without saving state. 4249 // Subsequent failures won't come back here. 4250 break; 4251 } 4252 // Set up for the next iteration of the loop, with input index 4253 // backed up by one from the last time through, 4254 // and a state save to this instruction in case the following code fails again. 4255 // (We're going backwards because this loop emulates stack unwinding, not 4256 // the initial scan forward.) 4257 U_ASSERT(fp->fInputIdx > 0); 4258 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4259 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); 4260 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4261 4262 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); 4263 if (prevC == 0x0a && 4264 fp->fInputIdx > backSearchIndex && 4265 twoPrevC == 0x0d) { 4266 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 4267 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 4268 // .*, stepping back over CRLF pair. 4269 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4270 } 4271 } 4272 4273 4274 fp = StateSave(fp, fp->fPatIdx-1, status); 4275 } 4276 break; 4277 4278 4279 4280 default: 4281 // Trouble. The compiled pattern contains an entry with an 4282 // unrecognized type tag. 4283 U_ASSERT(FALSE); 4284 } 4285 4286 if (U_FAILURE(status)) { 4287 isMatch = FALSE; 4288 break; 4289 } 4290 } 4291 4292 breakFromLoop: 4293 fMatch = isMatch; 4294 if (isMatch) { 4295 fLastMatchEnd = fMatchEnd; 4296 fMatchStart = startIdx; 4297 fMatchEnd = fp->fInputIdx; 4298 if (fTraceDebug) { 4299 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); 4300 } 4301 } 4302 else 4303 { 4304 if (fTraceDebug) { 4305 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 4306 } 4307 } 4308 4309 fFrame = fp; // The active stack frame when the engine stopped. 4310 // Contains the capture group results that we need to 4311 // access later. 4312 return; 4313 } 4314 4315 4316 //-------------------------------------------------------------------------------- 4317 // 4318 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the 4319 // assumption that the entire string is available in the UText's 4320 // chunk buffer. For now, that means we can use int32_t indexes, 4321 // except for anything that needs to be saved (like group starts 4322 // and ends). 4323 // 4324 // startIdx: begin matching a this index. 4325 // toEnd: if true, match must extend to end of the input region 4326 // 4327 //-------------------------------------------------------------------------------- 4328 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { 4329 UBool isMatch = FALSE; // True if the we have a match. 4330 4331 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards 4332 4333 int32_t op; // Operation from the compiled pattern, split into 4334 int32_t opType; // the opcode 4335 int32_t opValue; // and the operand value. 4336 4337 #ifdef REGEX_RUN_DEBUG 4338 if (fTraceDebug) 4339 { 4340 printf("MatchAt(startIdx=%ld)\n", startIdx); 4341 printf("Original Pattern: "); 4342 UChar32 c = utext_next32From(fPattern->fPattern, 0); 4343 while (c != U_SENTINEL) { 4344 if (c<32 || c>256) { 4345 c = '.'; 4346 } 4347 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 4348 4349 c = UTEXT_NEXT32(fPattern->fPattern); 4350 } 4351 printf("\n"); 4352 printf("Input String: "); 4353 c = utext_next32From(fInputText, 0); 4354 while (c != U_SENTINEL) { 4355 if (c<32 || c>256) { 4356 c = '.'; 4357 } 4358 printf("%c", c); 4359 4360 c = UTEXT_NEXT32(fInputText); 4361 } 4362 printf("\n"); 4363 printf("\n"); 4364 } 4365 #endif 4366 4367 if (U_FAILURE(status)) { 4368 return; 4369 } 4370 4371 // Cache frequently referenced items from the compiled pattern 4372 // 4373 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 4374 4375 const UChar *litText = fPattern->fLiteralText.getBuffer(); 4376 UVector *sets = fPattern->fSets; 4377 4378 const UChar *inputBuf = fInputText->chunkContents; 4379 4380 fFrameSize = fPattern->fFrameSize; 4381 REStackFrame *fp = resetStack(); 4382 4383 fp->fPatIdx = 0; 4384 fp->fInputIdx = startIdx; 4385 4386 // Zero out the pattern's static data 4387 int32_t i; 4388 for (i = 0; i<fPattern->fDataSize; i++) { 4389 fData[i] = 0; 4390 } 4391 4392 // 4393 // Main loop for interpreting the compiled pattern. 4394 // One iteration of the loop per pattern operation performed. 4395 // 4396 for (;;) { 4397 #if 0 4398 if (_heapchk() != _HEAPOK) { 4399 fprintf(stderr, "Heap Trouble\n"); 4400 } 4401 #endif 4402 4403 op = (int32_t)pat[fp->fPatIdx]; 4404 opType = URX_TYPE(op); 4405 opValue = URX_VAL(op); 4406 #ifdef REGEX_RUN_DEBUG 4407 if (fTraceDebug) { 4408 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4409 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, 4410 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 4411 fPattern->dumpOp(fp->fPatIdx); 4412 } 4413 #endif 4414 fp->fPatIdx++; 4415 4416 switch (opType) { 4417 4418 4419 case URX_NOP: 4420 break; 4421 4422 4423 case URX_BACKTRACK: 4424 // Force a backtrack. In some circumstances, the pattern compiler 4425 // will notice that the pattern can't possibly match anything, and will 4426 // emit one of these at that point. 4427 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4428 break; 4429 4430 4431 case URX_ONECHAR: 4432 if (fp->fInputIdx < fActiveLimit) { 4433 UChar32 c; 4434 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4435 if (c == opValue) { 4436 break; 4437 } 4438 } else { 4439 fHitEnd = TRUE; 4440 } 4441 4442 #ifdef REGEX_SMART_BACKTRACKING 4443 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 4444 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4445 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4446 int64_t reverseIndex = fp->fInputIdx; 4447 UChar32 c; 4448 do { 4449 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 4450 if (c == opValue) { 4451 break; 4452 } 4453 } while (reverseIndex > backSearchIndex); 4454 if (c == opValue) { 4455 fHitEnd = FALSE; 4456 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4457 fp->fInputIdx = reverseIndex; 4458 if (fp->fInputIdx > backSearchIndex) { 4459 fp = StateSave(fp, fp->fPatIdx, status); 4460 } 4461 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4462 break; 4463 } 4464 } 4465 } 4466 #endif 4467 4468 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4469 break; 4470 4471 4472 case URX_STRING: 4473 { 4474 // Test input against a literal string. 4475 // Strings require two slots in the compiled pattern, one for the 4476 // offset to the string text, and one for the length. 4477 int32_t stringStartIdx = opValue; 4478 int32_t stringLen; 4479 4480 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 4481 fp->fPatIdx++; 4482 opType = URX_TYPE(op); 4483 stringLen = URX_VAL(op); 4484 U_ASSERT(opType == URX_STRING_LEN); 4485 U_ASSERT(stringLen >= 2); 4486 4487 if (fp->fInputIdx + stringLen > fActiveLimit) { 4488 // No match. String is longer than the remaining input text. 4489 fHitEnd = TRUE; // TODO: See ticket 6074 4490 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4491 break; 4492 } 4493 4494 const UChar * pInp = inputBuf + fp->fInputIdx; 4495 const UChar * pPat = litText+stringStartIdx; 4496 const UChar * pEnd = pInp + stringLen; 4497 UBool success = FALSE; 4498 for(;;) { 4499 if (*pInp == *pPat) { 4500 pInp++; 4501 pPat++; 4502 if (pInp == pEnd) { 4503 // Successful Match. 4504 success = TRUE; 4505 break; 4506 } 4507 } else { 4508 // Match failed. 4509 break; 4510 } 4511 } 4512 4513 if (success) { 4514 fp->fInputIdx += stringLen; 4515 } else { 4516 #ifdef REGEX_SMART_BACKTRACKING 4517 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 4518 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4519 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4520 // Reset to last start point 4521 int64_t reverseIndex = fp->fInputIdx; 4522 UChar32 c; 4523 pPat = litText+stringStartIdx; 4524 4525 // Search backwards for a possible start 4526 do { 4527 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 4528 if ((U_IS_BMP(c) && *pPat == c) || 4529 (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) { 4530 success = TRUE; 4531 break; 4532 } 4533 } while (reverseIndex > backSearchIndex); 4534 4535 // And try again 4536 if (success) { 4537 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4538 fp->fInputIdx = reverseIndex; 4539 if (fp->fInputIdx > backSearchIndex) { 4540 fp = StateSave(fp, fp->fPatIdx, status); 4541 } 4542 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4543 break; 4544 } 4545 } 4546 } 4547 #endif 4548 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4549 } 4550 } 4551 break; 4552 4553 4554 case URX_STATE_SAVE: 4555 fp = StateSave(fp, opValue, status); 4556 break; 4557 4558 4559 case URX_END: 4560 // The match loop will exit via this path on a successful match, 4561 // when we reach the end of the pattern. 4562 if (toEnd && fp->fInputIdx != fActiveLimit) { 4563 // The pattern matched, but not to the end of input. Try some more. 4564 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4565 break; 4566 } 4567 isMatch = TRUE; 4568 goto breakFromLoop; 4569 4570 // Start and End Capture stack frame variables are laid out out like this: 4571 // fp->fExtra[opValue] - The start of a completed capture group 4572 // opValue+1 - The end of a completed capture group 4573 // opValue+2 - the start of a capture group whose end 4574 // has not yet been reached (and might not ever be). 4575 case URX_START_CAPTURE: 4576 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4577 fp->fExtra[opValue+2] = fp->fInputIdx; 4578 break; 4579 4580 4581 case URX_END_CAPTURE: 4582 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4583 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 4584 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 4585 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 4586 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 4587 break; 4588 4589 4590 case URX_DOLLAR: // $, test for End of line 4591 // or for position before new line at end of input 4592 if (fp->fInputIdx < fAnchorLimit-2) { 4593 // We are no where near the end of input. Fail. 4594 // This is the common case. Keep it first. 4595 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4596 break; 4597 } 4598 if (fp->fInputIdx >= fAnchorLimit) { 4599 // We really are at the end of input. Success. 4600 fHitEnd = TRUE; 4601 fRequireEnd = TRUE; 4602 break; 4603 } 4604 4605 // If we are positioned just before a new-line that is located at the 4606 // end of input, succeed. 4607 if (fp->fInputIdx == fAnchorLimit-1) { 4608 UChar32 c; 4609 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 4610 4611 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 4612 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4613 // At new-line at end of input. Success 4614 fHitEnd = TRUE; 4615 fRequireEnd = TRUE; 4616 break; 4617 } 4618 } 4619 } else if (fp->fInputIdx == fAnchorLimit-2 && 4620 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { 4621 fHitEnd = TRUE; 4622 fRequireEnd = TRUE; 4623 break; // At CR/LF at end of input. Success 4624 } 4625 4626 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4627 4628 break; 4629 4630 4631 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 4632 if (fp->fInputIdx >= fAnchorLimit-1) { 4633 // Either at the last character of input, or off the end. 4634 if (fp->fInputIdx == fAnchorLimit-1) { 4635 // At last char of input. Success if it's a new line. 4636 if (inputBuf[fp->fInputIdx] == 0x0a) { 4637 fHitEnd = TRUE; 4638 fRequireEnd = TRUE; 4639 break; 4640 } 4641 } else { 4642 // Off the end of input. Success. 4643 fHitEnd = TRUE; 4644 fRequireEnd = TRUE; 4645 break; 4646 } 4647 } 4648 4649 // Not at end of input. Back-track out. 4650 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4651 break; 4652 4653 4654 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 4655 { 4656 if (fp->fInputIdx >= fAnchorLimit) { 4657 // We really are at the end of input. Success. 4658 fHitEnd = TRUE; 4659 fRequireEnd = TRUE; 4660 break; 4661 } 4662 // If we are positioned just before a new-line, succeed. 4663 // It makes no difference where the new-line is within the input. 4664 UChar32 c = inputBuf[fp->fInputIdx]; 4665 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 4666 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 4667 // In multi-line mode, hitting a new-line just before the end of input does not 4668 // set the hitEnd or requireEnd flags 4669 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4670 break; 4671 } 4672 } 4673 // not at a new line. Fail. 4674 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4675 } 4676 break; 4677 4678 4679 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 4680 { 4681 if (fp->fInputIdx >= fAnchorLimit) { 4682 // We really are at the end of input. Success. 4683 fHitEnd = TRUE; 4684 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 4685 break; // adding a new-line would not lose the match. 4686 } 4687 // If we are not positioned just before a new-line, the test fails; backtrack out. 4688 // It makes no difference where the new-line is within the input. 4689 if (inputBuf[fp->fInputIdx] != 0x0a) { 4690 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4691 } 4692 } 4693 break; 4694 4695 4696 case URX_CARET: // ^, test for start of line 4697 if (fp->fInputIdx != fAnchorStart) { 4698 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4699 } 4700 break; 4701 4702 4703 case URX_CARET_M: // ^, test for start of line in mulit-line mode 4704 { 4705 if (fp->fInputIdx == fAnchorStart) { 4706 // We are at the start input. Success. 4707 break; 4708 } 4709 // Check whether character just before the current pos is a new-line 4710 // unless we are at the end of input 4711 UChar c = inputBuf[fp->fInputIdx - 1]; 4712 if ((fp->fInputIdx < fAnchorLimit) && 4713 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 4714 // It's a new-line. ^ is true. Success. 4715 // TODO: what should be done with positions between a CR and LF? 4716 break; 4717 } 4718 // Not at the start of a line. Fail. 4719 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4720 } 4721 break; 4722 4723 4724 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 4725 { 4726 U_ASSERT(fp->fInputIdx >= fAnchorStart); 4727 if (fp->fInputIdx <= fAnchorStart) { 4728 // We are at the start input. Success. 4729 break; 4730 } 4731 // Check whether character just before the current pos is a new-line 4732 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 4733 UChar c = inputBuf[fp->fInputIdx - 1]; 4734 if (c != 0x0a) { 4735 // Not at the start of a line. Back-track out. 4736 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4737 } 4738 } 4739 break; 4740 4741 case URX_BACKSLASH_B: // Test for word boundaries 4742 { 4743 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); 4744 success ^= (opValue != 0); // flip sense for \B 4745 if (!success) { 4746 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4747 } 4748 } 4749 break; 4750 4751 4752 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 4753 { 4754 UBool success = isUWordBoundary(fp->fInputIdx); 4755 success ^= (opValue != 0); // flip sense for \B 4756 if (!success) { 4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4758 } 4759 } 4760 break; 4761 4762 4763 case URX_BACKSLASH_D: // Test for decimal digit 4764 { 4765 if (fp->fInputIdx >= fActiveLimit) { 4766 fHitEnd = TRUE; 4767 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4768 break; 4769 } 4770 4771 UChar32 c; 4772 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4773 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 4774 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 4775 success ^= (opValue != 0); // flip sense for \D 4776 if (!success) { 4777 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4778 } 4779 } 4780 break; 4781 4782 4783 case URX_BACKSLASH_G: // Test for position at end of previous match 4784 if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) { 4785 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4786 } 4787 break; 4788 4789 4790 case URX_BACKSLASH_X: 4791 // Match a Grapheme, as defined by Unicode TR 29. 4792 // Differs slightly from Perl, which consumes combining marks independently 4793 // of context. 4794 { 4795 4796 // Fail if at end of input 4797 if (fp->fInputIdx >= fActiveLimit) { 4798 fHitEnd = TRUE; 4799 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4800 break; 4801 } 4802 4803 // Examine (and consume) the current char. 4804 // Dispatch into a little state machine, based on the char. 4805 UChar32 c; 4806 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4807 UnicodeSet **sets = fPattern->fStaticSets; 4808 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 4809 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 4810 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4811 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4812 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4813 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4814 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4815 goto GC_Extend; 4816 4817 4818 4819 GC_L: 4820 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4821 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4822 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4823 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4824 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4825 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4826 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4827 goto GC_Extend; 4828 4829 GC_V: 4830 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4831 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4832 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4833 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4834 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4835 goto GC_Extend; 4836 4837 GC_T: 4838 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4839 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4840 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4841 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4842 goto GC_Extend; 4843 4844 GC_Extend: 4845 // Combining characters are consumed here 4846 for (;;) { 4847 if (fp->fInputIdx >= fActiveLimit) { 4848 break; 4849 } 4850 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4851 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 4852 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 4853 break; 4854 } 4855 } 4856 goto GC_Done; 4857 4858 GC_Control: 4859 // Most control chars stand alone (don't combine with combining chars), 4860 // except for that CR/LF sequence is a single grapheme cluster. 4861 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { 4862 fp->fInputIdx++; 4863 } 4864 4865 GC_Done: 4866 if (fp->fInputIdx >= fActiveLimit) { 4867 fHitEnd = TRUE; 4868 } 4869 break; 4870 } 4871 4872 4873 4874 4875 case URX_BACKSLASH_Z: // Test for end of Input 4876 if (fp->fInputIdx < fAnchorLimit) { 4877 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4878 } else { 4879 fHitEnd = TRUE; 4880 fRequireEnd = TRUE; 4881 } 4882 break; 4883 4884 4885 4886 case URX_STATIC_SETREF: 4887 { 4888 // Test input character against one of the predefined sets 4889 // (Word Characters, for example) 4890 // The high bit of the op value is a flag for the match polarity. 4891 // 0: success if input char is in set. 4892 // 1: success if input char is not in set. 4893 if (fp->fInputIdx >= fActiveLimit) { 4894 fHitEnd = TRUE; 4895 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4896 break; 4897 } 4898 4899 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 4900 opValue &= ~URX_NEG_SET; 4901 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4902 4903 UChar32 c; 4904 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4905 if (c < 256) { 4906 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4907 if (s8->contains(c)) { 4908 success = !success; 4909 } 4910 } else { 4911 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4912 if (s->contains(c)) { 4913 success = !success; 4914 } 4915 } 4916 if (!success) { 4917 #ifdef REGEX_SMART_BACKTRACKING 4918 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 4919 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4920 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4921 // Try to find it, backwards 4922 int64_t reverseIndex = fp->fInputIdx; 4923 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 4924 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset 4925 do { 4926 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 4927 if (c < 256) { 4928 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4929 if (s8->contains(c)) { 4930 success = !success; 4931 } 4932 } else { 4933 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4934 if (s->contains(c)) { 4935 success = !success; 4936 } 4937 } 4938 } while (reverseIndex > backSearchIndex && !success); 4939 4940 if (success) { 4941 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4942 fp->fInputIdx = reverseIndex; 4943 if (fp->fInputIdx > backSearchIndex) { 4944 fp = StateSave(fp, fp->fPatIdx, status); 4945 } 4946 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4947 break; 4948 } 4949 } 4950 } 4951 #endif 4952 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4953 } 4954 } 4955 break; 4956 4957 4958 case URX_STAT_SETREF_N: 4959 { 4960 // Test input character for NOT being a member of one of 4961 // the predefined sets (Word Characters, for example) 4962 if (fp->fInputIdx >= fActiveLimit) { 4963 fHitEnd = TRUE; 4964 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4965 break; 4966 } 4967 4968 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4969 4970 UChar32 c; 4971 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4972 if (c < 256) { 4973 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4974 if (s8->contains(c) == FALSE) { 4975 break; 4976 } 4977 } else { 4978 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4979 if (s->contains(c) == FALSE) { 4980 break; 4981 } 4982 } 4983 4984 #ifdef REGEX_SMART_BACKTRACKING 4985 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 4986 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4987 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4988 // Try to find it, backwards 4989 int64_t reverseIndex = fp->fInputIdx; 4990 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 4991 UBool success = FALSE; 4992 do { 4993 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 4994 if (c < 256) { 4995 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4996 if (s8->contains(c) == FALSE) { 4997 success = TRUE; 4998 break; 4999 } 5000 } else { 5001 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5002 if (s->contains(c) == FALSE) { 5003 success = TRUE; 5004 break; 5005 } 5006 } 5007 } while (reverseIndex > backSearchIndex); 5008 5009 if (success) { 5010 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5011 fp->fInputIdx = reverseIndex; 5012 if (fp->fInputIdx > backSearchIndex) { 5013 fp = StateSave(fp, fp->fPatIdx, status); 5014 } 5015 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5016 break; 5017 } 5018 } 5019 } 5020 #endif 5021 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5022 } 5023 break; 5024 5025 5026 case URX_SETREF: 5027 { 5028 if (fp->fInputIdx >= fActiveLimit) { 5029 fHitEnd = TRUE; 5030 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5031 break; 5032 } 5033 5034 U_ASSERT(opValue > 0 && opValue < sets->size()); 5035 5036 // There is input left. Pick up one char and test it for set membership. 5037 UChar32 c; 5038 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5039 if (c<256) { 5040 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5041 if (s8->contains(c)) { 5042 // The character is in the set. A Match. 5043 break; 5044 } 5045 } else { 5046 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5047 if (s->contains(c)) { 5048 // The character is in the set. A Match. 5049 break; 5050 } 5051 } 5052 5053 // the character wasn't in the set. 5054 #ifdef REGEX_SMART_BACKTRACKING 5055 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5056 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5057 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5058 // Try to find it, backwards 5059 int64_t reverseIndex = fp->fInputIdx; 5060 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 5061 UBool success = FALSE; 5062 do { 5063 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5064 if (c < 256) { 5065 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5066 if (s8->contains(c)) { 5067 success = TRUE; 5068 break; 5069 } 5070 } else { 5071 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5072 if (s->contains(c)) { 5073 success = TRUE; 5074 break; 5075 } 5076 } 5077 } while (reverseIndex > backSearchIndex); 5078 5079 if (success) { 5080 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5081 fp->fInputIdx = reverseIndex; 5082 if (fp->fInputIdx > reverseIndex) { 5083 fp = StateSave(fp, fp->fPatIdx, status); 5084 } 5085 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5086 break; 5087 } 5088 } 5089 } 5090 #endif 5091 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5092 } 5093 break; 5094 5095 5096 case URX_DOTANY: 5097 { 5098 // . matches anything, but stops at end-of-line. 5099 if (fp->fInputIdx >= fActiveLimit) { 5100 // At end of input. Match failed. Backtrack out. 5101 fHitEnd = TRUE; 5102 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5103 break; 5104 } 5105 5106 // There is input left. Advance over one char, unless we've hit end-of-line 5107 UChar32 c; 5108 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5109 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 5110 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 5111 // End of line in normal mode. . does not match. 5112 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5113 break; 5114 } 5115 } 5116 break; 5117 5118 5119 case URX_DOTANY_ALL: 5120 { 5121 // . in dot-matches-all (including new lines) mode 5122 if (fp->fInputIdx >= fActiveLimit) { 5123 // At end of input. Match failed. Backtrack out. 5124 fHitEnd = TRUE; 5125 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5126 break; 5127 } 5128 5129 // There is input left. Advance over one char, except if we are 5130 // at a cr/lf, advance over both of them. 5131 UChar32 c; 5132 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5133 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 5134 // In the case of a CR/LF, we need to advance over both. 5135 if (inputBuf[fp->fInputIdx] == 0x0a) { 5136 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 5137 } 5138 } 5139 } 5140 break; 5141 5142 5143 case URX_DOTANY_UNIX: 5144 { 5145 // '.' operator, matches all, but stops at end-of-line. 5146 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 5147 if (fp->fInputIdx >= fActiveLimit) { 5148 // At end of input. Match failed. Backtrack out. 5149 fHitEnd = TRUE; 5150 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5151 break; 5152 } 5153 5154 // There is input left. Advance over one char, unless we've hit end-of-line 5155 UChar32 c; 5156 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5157 if (c == 0x0a) { 5158 // End of line in normal mode. '.' does not match the \n 5159 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5160 } 5161 } 5162 break; 5163 5164 5165 case URX_JMP: 5166 fp->fPatIdx = opValue; 5167 break; 5168 5169 case URX_FAIL: 5170 isMatch = FALSE; 5171 goto breakFromLoop; 5172 5173 case URX_JMP_SAV: 5174 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 5175 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5176 fp->fPatIdx = opValue; // Then JMP. 5177 break; 5178 5179 case URX_JMP_SAV_X: 5180 // This opcode is used with (x)+, when x can match a zero length string. 5181 // Same as JMP_SAV, except conditional on the match having made forward progress. 5182 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 5183 // data address of the input position at the start of the loop. 5184 { 5185 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 5186 int32_t stoOp = (int32_t)pat[opValue-1]; 5187 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 5188 int32_t frameLoc = URX_VAL(stoOp); 5189 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 5190 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; 5191 U_ASSERT(prevInputIdx <= fp->fInputIdx); 5192 if (prevInputIdx < fp->fInputIdx) { 5193 // The match did make progress. Repeat the loop. 5194 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5195 fp->fPatIdx = opValue; 5196 fp->fExtra[frameLoc] = fp->fInputIdx; 5197 } 5198 // If the input position did not advance, we do nothing here, 5199 // execution will fall out of the loop. 5200 } 5201 break; 5202 5203 case URX_CTR_INIT: 5204 { 5205 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5206 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5207 5208 // Pick up the three extra operands that CTR_INIT has, and 5209 // skip the pattern location counter past 5210 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5211 fp->fPatIdx += 3; 5212 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5213 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5214 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5215 U_ASSERT(minCount>=0); 5216 U_ASSERT(maxCount>=minCount || maxCount==-1); 5217 U_ASSERT(loopLoc>fp->fPatIdx); 5218 5219 if (minCount == 0) { 5220 fp = StateSave(fp, loopLoc+1, status); 5221 } 5222 if (maxCount == 0) { 5223 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5224 } 5225 } 5226 break; 5227 5228 case URX_CTR_LOOP: 5229 { 5230 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5231 int32_t initOp = (int32_t)pat[opValue]; 5232 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 5233 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5234 int32_t minCount = (int32_t)pat[opValue+2]; 5235 int32_t maxCount = (int32_t)pat[opValue+3]; 5236 // Increment the counter. Note: we DIDN'T worry about counter 5237 // overflow, since the data comes from UnicodeStrings, which 5238 // stores its length in an int32_t. Do we have to think about 5239 // this now that we're using UText? Probably not, since the length 5240 // in UChar32s is still an int32_t. 5241 (*pCounter)++; 5242 U_ASSERT(*pCounter > 0); 5243 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 5244 U_ASSERT(*pCounter == maxCount || maxCount == -1); 5245 break; 5246 } 5247 if (*pCounter >= minCount) { 5248 fp = StateSave(fp, fp->fPatIdx, status); 5249 } 5250 fp->fPatIdx = opValue + 4; // Loop back. 5251 } 5252 break; 5253 5254 case URX_CTR_INIT_NG: 5255 { 5256 // Initialize a non-greedy loop 5257 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5258 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5259 5260 // Pick up the three extra operands that CTR_INIT has, and 5261 // skip the pattern location counter past 5262 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5263 fp->fPatIdx += 3; 5264 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5265 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5266 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5267 U_ASSERT(minCount>=0); 5268 U_ASSERT(maxCount>=minCount || maxCount==-1); 5269 U_ASSERT(loopLoc>fp->fPatIdx); 5270 5271 if (minCount == 0) { 5272 if (maxCount != 0) { 5273 fp = StateSave(fp, fp->fPatIdx, status); 5274 } 5275 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 5276 } 5277 } 5278 break; 5279 5280 case URX_CTR_LOOP_NG: 5281 { 5282 // Non-greedy {min, max} loops 5283 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5284 int32_t initOp = (int32_t)pat[opValue]; 5285 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 5286 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5287 int32_t minCount = (int32_t)pat[opValue+2]; 5288 int32_t maxCount = (int32_t)pat[opValue+3]; 5289 // Increment the counter. Note: we DIDN'T worry about counter 5290 // overflow, since the data comes from UnicodeStrings, which 5291 // stores its length in an int32_t. Do we have to think about 5292 // this now that we're using UText? Probably not, since the length 5293 // in UChar32s is still an int32_t. 5294 (*pCounter)++; 5295 U_ASSERT(*pCounter > 0); 5296 5297 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 5298 // The loop has matched the maximum permitted number of times. 5299 // Break out of here with no action. Matching will 5300 // continue with the following pattern. 5301 U_ASSERT(*pCounter == maxCount || maxCount == -1); 5302 break; 5303 } 5304 5305 if (*pCounter < minCount) { 5306 // We haven't met the minimum number of matches yet. 5307 // Loop back for another one. 5308 fp->fPatIdx = opValue + 4; // Loop back. 5309 } else { 5310 // We do have the minimum number of matches. 5311 // Fall into the following pattern, but first do 5312 // a state save to the top of the loop, so that a failure 5313 // in the following pattern will try another iteration of the loop. 5314 fp = StateSave(fp, opValue + 4, status); 5315 } 5316 } 5317 break; 5318 5319 case URX_STO_SP: 5320 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5321 fData[opValue] = fStack->size(); 5322 break; 5323 5324 case URX_LD_SP: 5325 { 5326 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5327 int32_t newStackSize = (int32_t)fData[opValue]; 5328 U_ASSERT(newStackSize <= fStack->size()); 5329 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5330 if (newFP == (int64_t *)fp) { 5331 break; 5332 } 5333 int32_t i; 5334 for (i=0; i<fFrameSize; i++) { 5335 newFP[i] = ((int64_t *)fp)[i]; 5336 } 5337 fp = (REStackFrame *)newFP; 5338 fStack->setSize(newStackSize); 5339 } 5340 break; 5341 5342 case URX_BACKREF: 5343 case URX_BACKREF_I: 5344 { 5345 U_ASSERT(opValue < fFrameSize); 5346 int64_t groupStartIdx = fp->fExtra[opValue]; 5347 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5348 U_ASSERT(groupStartIdx <= groupEndIdx); 5349 int64_t len = groupEndIdx-groupStartIdx; 5350 if (groupStartIdx < 0) { 5351 // This capture group has not participated in the match thus far, 5352 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5353 } 5354 5355 if (len == 0) { 5356 // The capture group match was of an empty string. 5357 // Verified by testing: Perl matches succeed in this case, so 5358 // we do too. 5359 break; 5360 } 5361 5362 UBool haveMatch = FALSE; 5363 if (fp->fInputIdx + len <= fActiveLimit) { 5364 if (opType == URX_BACKREF) { 5365 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) { 5366 haveMatch = TRUE; 5367 } 5368 } else { 5369 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, 5370 (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) { 5371 haveMatch = TRUE; 5372 } 5373 } 5374 } else { 5375 // TODO: probably need to do a partial string comparison, and only 5376 // set HitEnd if the available input matched. Ticket #6074 5377 fHitEnd = TRUE; 5378 } 5379 if (haveMatch) { 5380 fp->fInputIdx += len; // Match. Advance current input position. 5381 } else { 5382 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5383 } 5384 } 5385 break; 5386 5387 case URX_STO_INP_LOC: 5388 { 5389 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 5390 fp->fExtra[opValue] = fp->fInputIdx; 5391 } 5392 break; 5393 5394 case URX_JMPX: 5395 { 5396 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5397 fp->fPatIdx += 1; 5398 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 5399 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 5400 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; 5401 U_ASSERT(savedInputIdx <= fp->fInputIdx); 5402 if (savedInputIdx < fp->fInputIdx) { 5403 fp->fPatIdx = opValue; // JMP 5404 } else { 5405 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 5406 } 5407 } 5408 break; 5409 5410 case URX_LA_START: 5411 { 5412 // Entering a lookahead block. 5413 // Save Stack Ptr, Input Pos. 5414 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5415 fData[opValue] = fStack->size(); 5416 fData[opValue+1] = fp->fInputIdx; 5417 fActiveStart = fLookStart; // Set the match region change for 5418 fActiveLimit = fLookLimit; // transparent bounds. 5419 } 5420 break; 5421 5422 case URX_LA_END: 5423 { 5424 // Leaving a look-ahead block. 5425 // restore Stack Ptr, Input Pos to positions they had on entry to block. 5426 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5427 int32_t stackSize = fStack->size(); 5428 int32_t newStackSize = (int32_t)fData[opValue]; 5429 U_ASSERT(stackSize >= newStackSize); 5430 if (stackSize > newStackSize) { 5431 // Copy the current top frame back to the new (cut back) top frame. 5432 // This makes the capture groups from within the look-ahead 5433 // expression available. 5434 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5435 int32_t i; 5436 for (i=0; i<fFrameSize; i++) { 5437 newFP[i] = ((int64_t *)fp)[i]; 5438 } 5439 fp = (REStackFrame *)newFP; 5440 fStack->setSize(newStackSize); 5441 } 5442 fp->fInputIdx = fData[opValue+1]; 5443 5444 // Restore the active region bounds in the input string; they may have 5445 // been changed because of transparent bounds on a Region. 5446 fActiveStart = fRegionStart; 5447 fActiveLimit = fRegionLimit; 5448 } 5449 break; 5450 5451 case URX_ONECHAR_I: 5452 if (fp->fInputIdx < fActiveLimit) { 5453 UChar32 c; 5454 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5455 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5456 break; 5457 } 5458 } else { 5459 fHitEnd = TRUE; 5460 } 5461 5462 #ifdef REGEX_SMART_BACKTRACKING 5463 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5464 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5465 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5466 UBool success = FALSE; 5467 int64_t reverseIndex = fp->fInputIdx; 5468 UChar32 c; 5469 while (reverseIndex > backSearchIndex) { 5470 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5471 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5472 success = TRUE; 5473 break; 5474 } else if (c == U_SENTINEL) { 5475 break; 5476 } 5477 } 5478 if (success) { 5479 fHitEnd = FALSE; 5480 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5481 fp->fInputIdx = reverseIndex; 5482 if (fp->fInputIdx > backSearchIndex) { 5483 fp = StateSave(fp, fp->fPatIdx, status); 5484 } 5485 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5486 break; 5487 } 5488 } 5489 } 5490 #endif 5491 5492 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5493 break; 5494 5495 case URX_STRING_I: 5496 { 5497 // Test input against a literal string. 5498 // Strings require two slots in the compiled pattern, one for the 5499 // offset to the string text, and one for the length. 5500 const UCaseProps *csp = ucase_getSingleton(&status); 5501 if (U_SUCCESS(status)) { 5502 int32_t stringStartIdx, stringLen; 5503 stringStartIdx = opValue; 5504 5505 op = (int32_t)pat[fp->fPatIdx]; 5506 fp->fPatIdx++; 5507 opType = URX_TYPE(op); 5508 opValue = URX_VAL(op); 5509 U_ASSERT(opType == URX_STRING_LEN); 5510 stringLen = opValue; 5511 5512 const UChar *patternChars = litText+stringStartIdx; 5513 const UChar *patternEnd = patternChars+stringLen; 5514 5515 const UChar *foldChars = NULL; 5516 int32_t foldOffset, foldLength; 5517 UChar32 c; 5518 5519 #ifdef REGEX_SMART_BACKTRACKING 5520 int32_t originalInputIdx = fp->fInputIdx; 5521 #endif 5522 UBool success = TRUE; 5523 5524 foldOffset = foldLength = 0; 5525 5526 while (patternChars < patternEnd && success) { 5527 if(foldOffset < foldLength) { 5528 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5529 } else { 5530 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5531 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 5532 if(foldLength >= 0) { 5533 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 5534 foldOffset = 0; 5535 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5536 } else { 5537 c = foldLength; 5538 foldLength = foldOffset; // to avoid reading chars from the folding buffer 5539 } 5540 } 5541 } 5542 5543 if (fp->fInputIdx <= fActiveLimit) { 5544 if (U_IS_BMP(c)) { 5545 success = (*patternChars == c); 5546 patternChars += 1; 5547 } else if (patternChars+1 < patternEnd) { 5548 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 5549 patternChars += 2; 5550 } 5551 } else { 5552 success = FALSE; 5553 fHitEnd = TRUE; // TODO: See ticket 6074 5554 } 5555 } 5556 5557 if (!success) { 5558 #ifdef REGEX_SMART_BACKTRACKING 5559 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 5560 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5561 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5562 // Reset to last start point 5563 int64_t reverseIndex = originalInputIdx; 5564 patternChars = litText+stringStartIdx; 5565 5566 // Search backwards for a possible start 5567 do { 5568 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5569 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 5570 if(foldLength >= 0) { 5571 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 5572 foldOffset = 0; 5573 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5574 } else { 5575 c = foldLength; 5576 foldLength = foldOffset; // to avoid reading chars from the folding buffer 5577 } 5578 } 5579 5580 if ((U_IS_BMP(c) && *patternChars == c) || 5581 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 5582 success = TRUE; 5583 break; 5584 } 5585 } while (reverseIndex > backSearchIndex); 5586 5587 // And try again 5588 if (success) { 5589 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5590 fp->fInputIdx = reverseIndex; 5591 if (fp->fInputIdx > backSearchIndex) { 5592 fp = StateSave(fp, fp->fPatIdx, status); 5593 } 5594 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5595 break; 5596 } 5597 } 5598 } 5599 #endif 5600 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5601 } 5602 } 5603 } 5604 break; 5605 5606 case URX_LB_START: 5607 { 5608 // Entering a look-behind block. 5609 // Save Stack Ptr, Input Pos. 5610 // TODO: implement transparent bounds. Ticket #6067 5611 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5612 fData[opValue] = fStack->size(); 5613 fData[opValue+1] = fp->fInputIdx; 5614 // Init the variable containing the start index for attempted matches. 5615 fData[opValue+2] = -1; 5616 // Save input string length, then reset to pin any matches to end at 5617 // the current position. 5618 fData[opValue+3] = fActiveLimit; 5619 fActiveLimit = fp->fInputIdx; 5620 } 5621 break; 5622 5623 5624 case URX_LB_CONT: 5625 { 5626 // Positive Look-Behind, at top of loop checking for matches of LB expression 5627 // at all possible input starting positions. 5628 5629 // Fetch the min and max possible match lengths. They are the operands 5630 // of this op in the pattern. 5631 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5632 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5633 U_ASSERT(minML <= maxML); 5634 U_ASSERT(minML >= 0); 5635 5636 // Fetch (from data) the last input index where a match was attempted. 5637 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5638 int64_t *lbStartIdx = &fData[opValue+2]; 5639 if (*lbStartIdx < 0) { 5640 // First time through loop. 5641 *lbStartIdx = fp->fInputIdx - minML; 5642 } else { 5643 // 2nd through nth time through the loop. 5644 // Back up start position for match by one. 5645 if (*lbStartIdx == 0) { 5646 (*lbStartIdx)--; 5647 } else { 5648 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5649 } 5650 } 5651 5652 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5653 // We have tried all potential match starting points without 5654 // getting a match. Backtrack out, and out of the 5655 // Look Behind altogether. 5656 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5657 int64_t restoreInputLen = fData[opValue+3]; 5658 U_ASSERT(restoreInputLen >= fActiveLimit); 5659 U_ASSERT(restoreInputLen <= fInputLength); 5660 fActiveLimit = restoreInputLen; 5661 break; 5662 } 5663 5664 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5665 // (successful match will fall off the end of the loop.) 5666 fp = StateSave(fp, fp->fPatIdx-3, status); 5667 fp->fInputIdx = *lbStartIdx; 5668 } 5669 break; 5670 5671 case URX_LB_END: 5672 // End of a look-behind block, after a successful match. 5673 { 5674 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5675 if (fp->fInputIdx != fActiveLimit) { 5676 // The look-behind expression matched, but the match did not 5677 // extend all the way to the point that we are looking behind from. 5678 // FAIL out of here, which will take us back to the LB_CONT, which 5679 // will retry the match starting at another position or fail 5680 // the look-behind altogether, whichever is appropriate. 5681 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5682 break; 5683 } 5684 5685 // Look-behind match is good. Restore the orignal input string length, 5686 // which had been truncated to pin the end of the lookbehind match to the 5687 // position being looked-behind. 5688 int64_t originalInputLen = fData[opValue+3]; 5689 U_ASSERT(originalInputLen >= fActiveLimit); 5690 U_ASSERT(originalInputLen <= fInputLength); 5691 fActiveLimit = originalInputLen; 5692 } 5693 break; 5694 5695 5696 case URX_LBN_CONT: 5697 { 5698 // Negative Look-Behind, at top of loop checking for matches of LB expression 5699 // at all possible input starting positions. 5700 5701 // Fetch the extra parameters of this op. 5702 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5703 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5704 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 5705 continueLoc = URX_VAL(continueLoc); 5706 U_ASSERT(minML <= maxML); 5707 U_ASSERT(minML >= 0); 5708 U_ASSERT(continueLoc > fp->fPatIdx); 5709 5710 // Fetch (from data) the last input index where a match was attempted. 5711 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5712 int64_t *lbStartIdx = &fData[opValue+2]; 5713 if (*lbStartIdx < 0) { 5714 // First time through loop. 5715 *lbStartIdx = fp->fInputIdx - minML; 5716 } else { 5717 // 2nd through nth time through the loop. 5718 // Back up start position for match by one. 5719 if (*lbStartIdx == 0) { 5720 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. 5721 } else { 5722 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5723 } 5724 } 5725 5726 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5727 // We have tried all potential match starting points without 5728 // getting a match, which means that the negative lookbehind as 5729 // a whole has succeeded. Jump forward to the continue location 5730 int64_t restoreInputLen = fData[opValue+3]; 5731 U_ASSERT(restoreInputLen >= fActiveLimit); 5732 U_ASSERT(restoreInputLen <= fInputLength); 5733 fActiveLimit = restoreInputLen; 5734 fp->fPatIdx = continueLoc; 5735 break; 5736 } 5737 5738 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5739 // (successful match will cause a FAIL out of the loop altogether.) 5740 fp = StateSave(fp, fp->fPatIdx-4, status); 5741 fp->fInputIdx = *lbStartIdx; 5742 } 5743 break; 5744 5745 case URX_LBN_END: 5746 // End of a negative look-behind block, after a successful match. 5747 { 5748 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5749 if (fp->fInputIdx != fActiveLimit) { 5750 // The look-behind expression matched, but the match did not 5751 // extend all the way to the point that we are looking behind from. 5752 // FAIL out of here, which will take us back to the LB_CONT, which 5753 // will retry the match starting at another position or succeed 5754 // the look-behind altogether, whichever is appropriate. 5755 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5756 break; 5757 } 5758 5759 // Look-behind expression matched, which means look-behind test as 5760 // a whole Fails 5761 5762 // Restore the orignal input string length, which had been truncated 5763 // inorder to pin the end of the lookbehind match 5764 // to the position being looked-behind. 5765 int64_t originalInputLen = fData[opValue+3]; 5766 U_ASSERT(originalInputLen >= fActiveLimit); 5767 U_ASSERT(originalInputLen <= fInputLength); 5768 fActiveLimit = originalInputLen; 5769 5770 // Restore original stack position, discarding any state saved 5771 // by the successful pattern match. 5772 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5773 int32_t newStackSize = (int32_t)fData[opValue]; 5774 U_ASSERT(fStack->size() > newStackSize); 5775 fStack->setSize(newStackSize); 5776 5777 // FAIL, which will take control back to someplace 5778 // prior to entering the look-behind test. 5779 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5780 } 5781 break; 5782 5783 5784 case URX_LOOP_SR_I: 5785 // Loop Initialization for the optimized implementation of 5786 // [some character set]* 5787 // This op scans through all matching input. 5788 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5789 { 5790 U_ASSERT(opValue > 0 && opValue < sets->size()); 5791 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5792 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5793 5794 // Loop through input, until either the input is exhausted or 5795 // we reach a character that is not a member of the set. 5796 int32_t ix = (int32_t)fp->fInputIdx; 5797 for (;;) { 5798 if (ix >= fActiveLimit) { 5799 fHitEnd = TRUE; 5800 break; 5801 } 5802 UChar32 c; 5803 U16_NEXT(inputBuf, ix, fActiveLimit, c); 5804 if (c<256) { 5805 if (s8->contains(c) == FALSE) { 5806 U16_BACK_1(inputBuf, 0, ix); 5807 break; 5808 } 5809 } else { 5810 if (s->contains(c) == FALSE) { 5811 U16_BACK_1(inputBuf, 0, ix); 5812 break; 5813 } 5814 } 5815 } 5816 5817 // If there were no matching characters, skip over the loop altogether. 5818 // The loop doesn't run at all, a * op always succeeds. 5819 if (ix == fp->fInputIdx) { 5820 fp->fPatIdx++; // skip the URX_LOOP_C op. 5821 break; 5822 } 5823 5824 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5825 // must follow. It's operand is the stack location 5826 // that holds the starting input index for the match of this [set]* 5827 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5828 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5829 int32_t stackLoc = URX_VAL(loopcOp); 5830 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5831 fp->fExtra[stackLoc] = fp->fInputIdx; 5832 #ifdef REGEX_SMART_BACKTRACKING 5833 backSearchIndex = fp->fInputIdx; 5834 #endif 5835 fp->fInputIdx = ix; 5836 5837 // Save State to the URX_LOOP_C op that follows this one, 5838 // so that match failures in the following code will return to there. 5839 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5840 fp = StateSave(fp, fp->fPatIdx, status); 5841 fp->fPatIdx++; 5842 } 5843 break; 5844 5845 5846 case URX_LOOP_DOT_I: 5847 // Loop Initialization for the optimized implementation of .* 5848 // This op scans through all remaining input. 5849 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5850 { 5851 // Loop through input until the input is exhausted (we reach an end-of-line) 5852 // In DOTALL mode, we can just go straight to the end of the input. 5853 int32_t ix; 5854 if ((opValue & 1) == 1) { 5855 // Dot-matches-All mode. Jump straight to the end of the string. 5856 ix = (int32_t)fActiveLimit; 5857 fHitEnd = TRUE; 5858 } else { 5859 // NOT DOT ALL mode. Line endings do not match '.' 5860 // Scan forward until a line ending or end of input. 5861 ix = (int32_t)fp->fInputIdx; 5862 for (;;) { 5863 if (ix >= fActiveLimit) { 5864 fHitEnd = TRUE; 5865 break; 5866 } 5867 UChar32 c; 5868 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] 5869 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 5870 if ((c == 0x0a) || // 0x0a is newline in both modes. 5871 ((opValue & 2) == 0) && // IF not UNIX_LINES mode 5872 (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) { 5873 // char is a line ending. Put the input pos back to the 5874 // line ending char, and exit the scanning loop. 5875 U16_BACK_1(inputBuf, 0, ix); 5876 break; 5877 } 5878 } 5879 } 5880 } 5881 5882 // If there were no matching characters, skip over the loop altogether. 5883 // The loop doesn't run at all, a * op always succeeds. 5884 if (ix == fp->fInputIdx) { 5885 fp->fPatIdx++; // skip the URX_LOOP_C op. 5886 break; 5887 } 5888 5889 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5890 // must follow. It's operand is the stack location 5891 // that holds the starting input index for the match of this .* 5892 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5893 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5894 int32_t stackLoc = URX_VAL(loopcOp); 5895 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5896 fp->fExtra[stackLoc] = fp->fInputIdx; 5897 #ifdef REGEX_SMART_BACKTRACKING 5898 backSearchIndex = fp->fInputIdx; 5899 #endif 5900 fp->fInputIdx = ix; 5901 5902 // Save State to the URX_LOOP_C op that follows this one, 5903 // so that match failures in the following code will return to there. 5904 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5905 fp = StateSave(fp, fp->fPatIdx, status); 5906 fp->fPatIdx++; 5907 } 5908 break; 5909 5910 5911 case URX_LOOP_C: 5912 { 5913 U_ASSERT(opValue>=0 && opValue<fFrameSize); 5914 backSearchIndex = (int32_t)fp->fExtra[opValue]; 5915 U_ASSERT(backSearchIndex <= fp->fInputIdx); 5916 if (backSearchIndex == fp->fInputIdx) { 5917 // We've backed up the input idx to the point that the loop started. 5918 // The loop is done. Leave here without saving state. 5919 // Subsequent failures won't come back here. 5920 break; 5921 } 5922 // Set up for the next iteration of the loop, with input index 5923 // backed up by one from the last time through, 5924 // and a state save to this instruction in case the following code fails again. 5925 // (We're going backwards because this loop emulates stack unwinding, not 5926 // the initial scan forward.) 5927 U_ASSERT(fp->fInputIdx > 0); 5928 UChar32 prevC; 5929 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? 5930 5931 if (prevC == 0x0a && 5932 fp->fInputIdx > backSearchIndex && 5933 inputBuf[fp->fInputIdx-1] == 0x0d) { 5934 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 5935 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 5936 // .*, stepping back over CRLF pair. 5937 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 5938 } 5939 } 5940 5941 5942 fp = StateSave(fp, fp->fPatIdx-1, status); 5943 } 5944 break; 5945 5946 5947 5948 default: 5949 // Trouble. The compiled pattern contains an entry with an 5950 // unrecognized type tag. 5951 U_ASSERT(FALSE); 5952 } 5953 5954 if (U_FAILURE(status)) { 5955 isMatch = FALSE; 5956 break; 5957 } 5958 } 5959 5960 breakFromLoop: 5961 fMatch = isMatch; 5962 if (isMatch) { 5963 fLastMatchEnd = fMatchEnd; 5964 fMatchStart = startIdx; 5965 fMatchEnd = fp->fInputIdx; 5966 if (fTraceDebug) { 5967 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); 5968 } 5969 } 5970 else 5971 { 5972 if (fTraceDebug) { 5973 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 5974 } 5975 } 5976 5977 fFrame = fp; // The active stack frame when the engine stopped. 5978 // Contains the capture group results that we need to 5979 // access later. 5980 5981 return; 5982 } 5983 5984 5985 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 5986 5987 U_NAMESPACE_END 5988 5989 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 5990 5991