1 /* 2 ************************************************************************** 3 * Copyright (C) 2002-2013 International Business Machines Corporation * 4 * and others. All rights reserved. * 5 ************************************************************************** 6 */ 7 // 8 // file: rematch.cpp 9 // 10 // Contains the implementation of class RegexMatcher, 11 // which is one of the main API classes for the ICU regular expression package. 12 // 13 14 #include "unicode/utypes.h" 15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 16 17 #include "unicode/regex.h" 18 #include "unicode/uniset.h" 19 #include "unicode/uchar.h" 20 #include "unicode/ustring.h" 21 #include "unicode/rbbi.h" 22 #include "unicode/utf.h" 23 #include "unicode/utf16.h" 24 #include "uassert.h" 25 #include "cmemory.h" 26 #include "uvector.h" 27 #include "uvectr32.h" 28 #include "uvectr64.h" 29 #include "regeximp.h" 30 #include "regexst.h" 31 #include "regextxt.h" 32 #include "ucase.h" 33 34 // #include <malloc.h> // Needed for heapcheck testing 35 36 37 // Find progress callback 38 // ---------------------- 39 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call. 40 // 41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \ 42 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE) 43 44 45 // Smart Backtracking 46 // ------------------ 47 // When a failure would go back to a LOOP_C instruction, 48 // strings, characters, and setrefs scan backwards for a valid start 49 // character themselves, pop the stack, and save state, emulating the 50 // LOOP_C's effect but assured that the next character of input is a 51 // possible matching character. 52 // 53 // Good idea in theory; unfortunately it only helps out a few specific 54 // cases and slows the engine down a little in the rest. 55 56 U_NAMESPACE_BEGIN 57 58 // Default limit for the size of the back track stack, to avoid system 59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. 60 // This value puts ICU's limits higher than most other regexp implementations, 61 // which use recursion rather than the heap, and take more storage per 62 // backtrack point. 63 // 64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 65 66 // Time limit counter constant. 67 // Time limits for expression evaluation are in terms of quanta of work by 68 // the engine, each of which is 10,000 state saves. 69 // This constant determines that state saves per tick number. 70 static const int32_t TIMER_INITIAL_VALUE = 10000; 71 72 //----------------------------------------------------------------------------- 73 // 74 // Constructor and Destructor 75 // 76 //----------------------------------------------------------------------------- 77 RegexMatcher::RegexMatcher(const RegexPattern *pat) { 78 fDeferredStatus = U_ZERO_ERROR; 79 init(fDeferredStatus); 80 if (U_FAILURE(fDeferredStatus)) { 81 return; 82 } 83 if (pat==NULL) { 84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 85 return; 86 } 87 fPattern = pat; 88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); 89 } 90 91 92 93 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 94 uint32_t flags, UErrorCode &status) { 95 init(status); 96 if (U_FAILURE(status)) { 97 return; 98 } 99 UParseError pe; 100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 101 fPattern = fPatternOwned; 102 103 UText inputText = UTEXT_INITIALIZER; 104 utext_openConstUnicodeString(&inputText, &input, &status); 105 init2(&inputText, status); 106 utext_close(&inputText); 107 108 fInputUniStrMaybeMutable = TRUE; 109 } 110 111 112 RegexMatcher::RegexMatcher(UText *regexp, UText *input, 113 uint32_t flags, UErrorCode &status) { 114 init(status); 115 if (U_FAILURE(status)) { 116 return; 117 } 118 UParseError pe; 119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 120 if (U_FAILURE(status)) { 121 return; 122 } 123 124 fPattern = fPatternOwned; 125 init2(input, status); 126 } 127 128 129 RegexMatcher::RegexMatcher(const UnicodeString ®exp, 130 uint32_t flags, UErrorCode &status) { 131 init(status); 132 if (U_FAILURE(status)) { 133 return; 134 } 135 UParseError pe; 136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 137 if (U_FAILURE(status)) { 138 return; 139 } 140 fPattern = fPatternOwned; 141 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 142 } 143 144 RegexMatcher::RegexMatcher(UText *regexp, 145 uint32_t flags, UErrorCode &status) { 146 init(status); 147 if (U_FAILURE(status)) { 148 return; 149 } 150 UParseError pe; 151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 152 if (U_FAILURE(status)) { 153 return; 154 } 155 156 fPattern = fPatternOwned; 157 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 158 } 159 160 161 162 163 RegexMatcher::~RegexMatcher() { 164 delete fStack; 165 if (fData != fSmallData) { 166 uprv_free(fData); 167 fData = NULL; 168 } 169 if (fPatternOwned) { 170 delete fPatternOwned; 171 fPatternOwned = NULL; 172 fPattern = NULL; 173 } 174 175 if (fInput) { 176 delete fInput; 177 } 178 if (fInputText) { 179 utext_close(fInputText); 180 } 181 if (fAltInputText) { 182 utext_close(fAltInputText); 183 } 184 185 #if UCONFIG_NO_BREAK_ITERATION==0 186 delete fWordBreakItr; 187 #endif 188 } 189 190 // 191 // init() common initialization for use by all constructors. 192 // Initialize all fields, get the object into a consistent state. 193 // This must be done even when the initial status shows an error, 194 // so that the object is initialized sufficiently well for the destructor 195 // to run safely. 196 // 197 void RegexMatcher::init(UErrorCode &status) { 198 fPattern = NULL; 199 fPatternOwned = NULL; 200 fFrameSize = 0; 201 fRegionStart = 0; 202 fRegionLimit = 0; 203 fAnchorStart = 0; 204 fAnchorLimit = 0; 205 fLookStart = 0; 206 fLookLimit = 0; 207 fActiveStart = 0; 208 fActiveLimit = 0; 209 fTransparentBounds = FALSE; 210 fAnchoringBounds = TRUE; 211 fMatch = FALSE; 212 fMatchStart = 0; 213 fMatchEnd = 0; 214 fLastMatchEnd = -1; 215 fAppendPosition = 0; 216 fHitEnd = FALSE; 217 fRequireEnd = FALSE; 218 fStack = NULL; 219 fFrame = NULL; 220 fTimeLimit = 0; 221 fTime = 0; 222 fTickCounter = 0; 223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 224 fCallbackFn = NULL; 225 fCallbackContext = NULL; 226 fFindProgressCallbackFn = NULL; 227 fFindProgressCallbackContext = NULL; 228 fTraceDebug = FALSE; 229 fDeferredStatus = status; 230 fData = fSmallData; 231 fWordBreakItr = NULL; 232 233 fStack = NULL; 234 fInputText = NULL; 235 fAltInputText = NULL; 236 fInput = NULL; 237 fInputLength = 0; 238 fInputUniStrMaybeMutable = FALSE; 239 240 if (U_FAILURE(status)) { 241 fDeferredStatus = status; 242 } 243 } 244 245 // 246 // init2() Common initialization for use by RegexMatcher constructors, part 2. 247 // This handles the common setup to be done after the Pattern is available. 248 // 249 void RegexMatcher::init2(UText *input, UErrorCode &status) { 250 if (U_FAILURE(status)) { 251 fDeferredStatus = status; 252 return; 253 } 254 255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { 256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 257 if (fData == NULL) { 258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 259 return; 260 } 261 } 262 263 fStack = new UVector64(status); 264 if (fStack == NULL) { 265 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 266 return; 267 } 268 269 reset(input); 270 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 271 if (U_FAILURE(status)) { 272 fDeferredStatus = status; 273 return; 274 } 275 } 276 277 278 static const UChar BACKSLASH = 0x5c; 279 static const UChar DOLLARSIGN = 0x24; 280 //-------------------------------------------------------------------------------- 281 // 282 // appendReplacement 283 // 284 //-------------------------------------------------------------------------------- 285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 286 const UnicodeString &replacement, 287 UErrorCode &status) { 288 UText replacementText = UTEXT_INITIALIZER; 289 290 utext_openConstUnicodeString(&replacementText, &replacement, &status); 291 if (U_SUCCESS(status)) { 292 UText resultText = UTEXT_INITIALIZER; 293 utext_openUnicodeString(&resultText, &dest, &status); 294 295 if (U_SUCCESS(status)) { 296 appendReplacement(&resultText, &replacementText, status); 297 utext_close(&resultText); 298 } 299 utext_close(&replacementText); 300 } 301 302 return *this; 303 } 304 305 // 306 // appendReplacement, UText mode 307 // 308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, 309 UText *replacement, 310 UErrorCode &status) { 311 if (U_FAILURE(status)) { 312 return *this; 313 } 314 if (U_FAILURE(fDeferredStatus)) { 315 status = fDeferredStatus; 316 return *this; 317 } 318 if (fMatch == FALSE) { 319 status = U_REGEX_INVALID_STATE; 320 return *this; 321 } 322 323 // Copy input string from the end of previous match to start of current match 324 int64_t destLen = utext_nativeLength(dest); 325 if (fMatchStart > fAppendPosition) { 326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 327 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 328 (int32_t)(fMatchStart-fAppendPosition), &status); 329 } else { 330 int32_t len16; 331 if (UTEXT_USES_U16(fInputText)) { 332 len16 = (int32_t)(fMatchStart-fAppendPosition); 333 } else { 334 UErrorCode lengthStatus = U_ZERO_ERROR; 335 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); 336 } 337 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 338 if (inputChars == NULL) { 339 status = U_MEMORY_ALLOCATION_ERROR; 340 return *this; 341 } 342 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); 343 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); 344 uprv_free(inputChars); 345 } 346 } 347 fAppendPosition = fMatchEnd; 348 349 350 // scan the replacement text, looking for substitutions ($n) and \escapes. 351 // TODO: optimize this loop by efficiently scanning for '$' or '\', 352 // move entire ranges not containing substitutions. 353 UTEXT_SETNATIVEINDEX(replacement, 0); 354 UChar32 c = UTEXT_NEXT32(replacement); 355 while (c != U_SENTINEL) { 356 if (c == BACKSLASH) { 357 // Backslash Escape. Copy the following char out without further checks. 358 // Note: Surrogate pairs don't need any special handling 359 // The second half wont be a '$' or a '\', and 360 // will move to the dest normally on the next 361 // loop iteration. 362 c = UTEXT_CURRENT32(replacement); 363 if (c == U_SENTINEL) { 364 break; 365 } 366 367 if (c==0x55/*U*/ || c==0x75/*u*/) { 368 // We have a \udddd or \Udddddddd escape sequence. 369 int32_t offset = 0; 370 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); 371 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 372 if (escapedChar != (UChar32)0xFFFFFFFF) { 373 if (U_IS_BMP(escapedChar)) { 374 UChar c16 = (UChar)escapedChar; 375 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 376 } else { 377 UChar surrogate[2]; 378 surrogate[0] = U16_LEAD(escapedChar); 379 surrogate[1] = U16_TRAIL(escapedChar); 380 if (U_SUCCESS(status)) { 381 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 382 } 383 } 384 // TODO: Report errors for mal-formed \u escapes? 385 // As this is, the original sequence is output, which may be OK. 386 if (context.lastOffset == offset) { 387 (void)UTEXT_PREVIOUS32(replacement); 388 } else if (context.lastOffset != offset-1) { 389 utext_moveIndex32(replacement, offset - context.lastOffset - 1); 390 } 391 } 392 } else { 393 (void)UTEXT_NEXT32(replacement); 394 // Plain backslash escape. Just put out the escaped character. 395 if (U_IS_BMP(c)) { 396 UChar c16 = (UChar)c; 397 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 398 } else { 399 UChar surrogate[2]; 400 surrogate[0] = U16_LEAD(c); 401 surrogate[1] = U16_TRAIL(c); 402 if (U_SUCCESS(status)) { 403 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 404 } 405 } 406 } 407 } else if (c != DOLLARSIGN) { 408 // Normal char, not a $. Copy it out without further checks. 409 if (U_IS_BMP(c)) { 410 UChar c16 = (UChar)c; 411 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 412 } else { 413 UChar surrogate[2]; 414 surrogate[0] = U16_LEAD(c); 415 surrogate[1] = U16_TRAIL(c); 416 if (U_SUCCESS(status)) { 417 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 418 } 419 } 420 } else { 421 // We've got a $. Pick up a capture group number if one follows. 422 // Consume at most the number of digits necessary for the largest capture 423 // number that is valid for this pattern. 424 425 int32_t numDigits = 0; 426 int32_t groupNum = 0; 427 UChar32 digitC; 428 for (;;) { 429 digitC = UTEXT_CURRENT32(replacement); 430 if (digitC == U_SENTINEL) { 431 break; 432 } 433 if (u_isdigit(digitC) == FALSE) { 434 break; 435 } 436 (void)UTEXT_NEXT32(replacement); 437 groupNum=groupNum*10 + u_charDigitValue(digitC); 438 numDigits++; 439 if (numDigits >= fPattern->fMaxCaptureDigits) { 440 break; 441 } 442 } 443 444 445 if (numDigits == 0) { 446 // The $ didn't introduce a group number at all. 447 // Treat it as just part of the substitution text. 448 UChar c16 = DOLLARSIGN; 449 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 450 } else { 451 // Finally, append the capture group data to the destination. 452 destLen += appendGroup(groupNum, dest, status); 453 if (U_FAILURE(status)) { 454 // Can fail if group number is out of range. 455 break; 456 } 457 } 458 } 459 460 if (U_FAILURE(status)) { 461 break; 462 } else { 463 c = UTEXT_NEXT32(replacement); 464 } 465 } 466 467 return *this; 468 } 469 470 471 472 //-------------------------------------------------------------------------------- 473 // 474 // appendTail Intended to be used in conjunction with appendReplacement() 475 // To the destination string, append everything following 476 // the last match position from the input string. 477 // 478 // Note: Match ranges do not affect appendTail or appendReplacement 479 // 480 //-------------------------------------------------------------------------------- 481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 482 UErrorCode status = U_ZERO_ERROR; 483 UText resultText = UTEXT_INITIALIZER; 484 utext_openUnicodeString(&resultText, &dest, &status); 485 486 if (U_SUCCESS(status)) { 487 appendTail(&resultText, status); 488 utext_close(&resultText); 489 } 490 491 return dest; 492 } 493 494 // 495 // appendTail, UText mode 496 // 497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { 498 UBool bailOut = FALSE; 499 if (U_FAILURE(status)) { 500 bailOut = TRUE; 501 } 502 if (U_FAILURE(fDeferredStatus)) { 503 status = fDeferredStatus; 504 bailOut = TRUE; 505 } 506 507 if (bailOut) { 508 // dest must not be NULL 509 if (dest) { 510 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status); 511 return dest; 512 } 513 } 514 515 if (fInputLength > fAppendPosition) { 516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 517 int64_t destLen = utext_nativeLength(dest); 518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 519 (int32_t)(fInputLength-fAppendPosition), &status); 520 } else { 521 int32_t len16; 522 if (UTEXT_USES_U16(fInputText)) { 523 len16 = (int32_t)(fInputLength-fAppendPosition); 524 } else { 525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); 526 status = U_ZERO_ERROR; // buffer overflow 527 } 528 529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); 530 if (inputChars == NULL) { 531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 532 } else { 533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 534 int64_t destLen = utext_nativeLength(dest); 535 utext_replace(dest, destLen, destLen, inputChars, len16, &status); 536 uprv_free(inputChars); 537 } 538 } 539 } 540 return dest; 541 } 542 543 544 545 //-------------------------------------------------------------------------------- 546 // 547 // end 548 // 549 //-------------------------------------------------------------------------------- 550 int32_t RegexMatcher::end(UErrorCode &err) const { 551 return end(0, err); 552 } 553 554 int64_t RegexMatcher::end64(UErrorCode &err) const { 555 return end64(0, err); 556 } 557 558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { 559 if (U_FAILURE(err)) { 560 return -1; 561 } 562 if (fMatch == FALSE) { 563 err = U_REGEX_INVALID_STATE; 564 return -1; 565 } 566 if (group < 0 || group > fPattern->fGroupMap->size()) { 567 err = U_INDEX_OUTOFBOUNDS_ERROR; 568 return -1; 569 } 570 int64_t e = -1; 571 if (group == 0) { 572 e = fMatchEnd; 573 } else { 574 // Get the position within the stack frame of the variables for 575 // this capture group. 576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 577 U_ASSERT(groupOffset < fPattern->fFrameSize); 578 U_ASSERT(groupOffset >= 0); 579 e = fFrame->fExtra[groupOffset + 1]; 580 } 581 582 return e; 583 } 584 585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 586 return (int32_t)end64(group, err); 587 } 588 589 590 //-------------------------------------------------------------------------------- 591 // 592 // find() 593 // 594 //-------------------------------------------------------------------------------- 595 UBool RegexMatcher::find() { 596 // Start at the position of the last match end. (Will be zero if the 597 // matcher has been reset.) 598 // 599 if (U_FAILURE(fDeferredStatus)) { 600 return FALSE; 601 } 602 603 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 604 return findUsingChunk(); 605 } 606 607 int64_t startPos = fMatchEnd; 608 if (startPos==0) { 609 startPos = fActiveStart; 610 } 611 612 if (fMatch) { 613 // Save the position of any previous successful match. 614 fLastMatchEnd = fMatchEnd; 615 616 if (fMatchStart == fMatchEnd) { 617 // Previous match had zero length. Move start position up one position 618 // to avoid sending find() into a loop on zero-length matches. 619 if (startPos >= fActiveLimit) { 620 fMatch = FALSE; 621 fHitEnd = TRUE; 622 return FALSE; 623 } 624 UTEXT_SETNATIVEINDEX(fInputText, startPos); 625 (void)UTEXT_NEXT32(fInputText); 626 startPos = UTEXT_GETNATIVEINDEX(fInputText); 627 } 628 } else { 629 if (fLastMatchEnd >= 0) { 630 // A previous find() failed to match. Don't try again. 631 // (without this test, a pattern with a zero-length match 632 // could match again at the end of an input string.) 633 fHitEnd = TRUE; 634 return FALSE; 635 } 636 } 637 638 639 // Compute the position in the input string beyond which a match can not begin, because 640 // the minimum length match would extend past the end of the input. 641 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 642 // Be aware of possible overflows if making changes here. 643 int64_t testStartLimit; 644 if (UTEXT_USES_U16(fInputText)) { 645 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; 646 if (startPos > testStartLimit) { 647 fMatch = FALSE; 648 fHitEnd = TRUE; 649 return FALSE; 650 } 651 } else { 652 // For now, let the matcher discover that it can't match on its own 653 // We don't know how long the match len is in native characters 654 testStartLimit = fActiveLimit; 655 } 656 657 UChar32 c; 658 U_ASSERT(startPos >= 0); 659 660 switch (fPattern->fStartType) { 661 case START_NO_INFO: 662 // No optimization was found. 663 // Try a match at each input position. 664 for (;;) { 665 MatchAt(startPos, FALSE, fDeferredStatus); 666 if (U_FAILURE(fDeferredStatus)) { 667 return FALSE; 668 } 669 if (fMatch) { 670 return TRUE; 671 } 672 if (startPos >= testStartLimit) { 673 fHitEnd = TRUE; 674 return FALSE; 675 } 676 UTEXT_SETNATIVEINDEX(fInputText, startPos); 677 (void)UTEXT_NEXT32(fInputText); 678 startPos = UTEXT_GETNATIVEINDEX(fInputText); 679 // Note that it's perfectly OK for a pattern to have a zero-length 680 // match at the end of a string, so we must make sure that the loop 681 // runs with startPos == testStartLimit the last time through. 682 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 683 return FALSE; 684 } 685 U_ASSERT(FALSE); 686 687 case START_START: 688 // Matches are only possible at the start of the input string 689 // (pattern begins with ^ or \A) 690 if (startPos > fActiveStart) { 691 fMatch = FALSE; 692 return FALSE; 693 } 694 MatchAt(startPos, FALSE, fDeferredStatus); 695 if (U_FAILURE(fDeferredStatus)) { 696 return FALSE; 697 } 698 return fMatch; 699 700 701 case START_SET: 702 { 703 // Match may start on any char from a pre-computed set. 704 U_ASSERT(fPattern->fMinMatchLen > 0); 705 int64_t pos; 706 UTEXT_SETNATIVEINDEX(fInputText, startPos); 707 for (;;) { 708 c = UTEXT_NEXT32(fInputText); 709 pos = UTEXT_GETNATIVEINDEX(fInputText); 710 // c will be -1 (U_SENTINEL) at end of text, in which case we 711 // skip this next block (so we don't have a negative array index) 712 // and handle end of text in the following block. 713 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || 714 (c>=256 && fPattern->fInitialChars->contains(c)))) { 715 MatchAt(startPos, FALSE, fDeferredStatus); 716 if (U_FAILURE(fDeferredStatus)) { 717 return FALSE; 718 } 719 if (fMatch) { 720 return TRUE; 721 } 722 UTEXT_SETNATIVEINDEX(fInputText, pos); 723 } 724 if (startPos >= testStartLimit) { 725 fMatch = FALSE; 726 fHitEnd = TRUE; 727 return FALSE; 728 } 729 startPos = pos; 730 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 731 return FALSE; 732 } 733 } 734 U_ASSERT(FALSE); 735 736 case START_STRING: 737 case START_CHAR: 738 { 739 // Match starts on exactly one char. 740 U_ASSERT(fPattern->fMinMatchLen > 0); 741 UChar32 theChar = fPattern->fInitialChar; 742 int64_t pos; 743 UTEXT_SETNATIVEINDEX(fInputText, startPos); 744 for (;;) { 745 c = UTEXT_NEXT32(fInputText); 746 pos = UTEXT_GETNATIVEINDEX(fInputText); 747 if (c == theChar) { 748 MatchAt(startPos, FALSE, fDeferredStatus); 749 if (U_FAILURE(fDeferredStatus)) { 750 return FALSE; 751 } 752 if (fMatch) { 753 return TRUE; 754 } 755 UTEXT_SETNATIVEINDEX(fInputText, pos); 756 } 757 if (startPos >= testStartLimit) { 758 fMatch = FALSE; 759 fHitEnd = TRUE; 760 return FALSE; 761 } 762 startPos = pos; 763 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 764 return FALSE; 765 } 766 } 767 U_ASSERT(FALSE); 768 769 case START_LINE: 770 { 771 UChar32 c; 772 if (startPos == fAnchorStart) { 773 MatchAt(startPos, FALSE, fDeferredStatus); 774 if (U_FAILURE(fDeferredStatus)) { 775 return FALSE; 776 } 777 if (fMatch) { 778 return TRUE; 779 } 780 UTEXT_SETNATIVEINDEX(fInputText, startPos); 781 c = UTEXT_NEXT32(fInputText); 782 startPos = UTEXT_GETNATIVEINDEX(fInputText); 783 } else { 784 UTEXT_SETNATIVEINDEX(fInputText, startPos); 785 c = UTEXT_PREVIOUS32(fInputText); 786 UTEXT_SETNATIVEINDEX(fInputText, startPos); 787 } 788 789 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 790 for (;;) { 791 if (c == 0x0a) { 792 MatchAt(startPos, FALSE, fDeferredStatus); 793 if (U_FAILURE(fDeferredStatus)) { 794 return FALSE; 795 } 796 if (fMatch) { 797 return TRUE; 798 } 799 UTEXT_SETNATIVEINDEX(fInputText, startPos); 800 } 801 if (startPos >= testStartLimit) { 802 fMatch = FALSE; 803 fHitEnd = TRUE; 804 return FALSE; 805 } 806 c = UTEXT_NEXT32(fInputText); 807 startPos = UTEXT_GETNATIVEINDEX(fInputText); 808 // Note that it's perfectly OK for a pattern to have a zero-length 809 // match at the end of a string, so we must make sure that the loop 810 // runs with startPos == testStartLimit the last time through. 811 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 812 return FALSE; 813 } 814 } else { 815 for (;;) { 816 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 817 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 818 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 819 (void)UTEXT_NEXT32(fInputText); 820 startPos = UTEXT_GETNATIVEINDEX(fInputText); 821 } 822 MatchAt(startPos, FALSE, fDeferredStatus); 823 if (U_FAILURE(fDeferredStatus)) { 824 return FALSE; 825 } 826 if (fMatch) { 827 return TRUE; 828 } 829 UTEXT_SETNATIVEINDEX(fInputText, startPos); 830 } 831 if (startPos >= testStartLimit) { 832 fMatch = FALSE; 833 fHitEnd = TRUE; 834 return FALSE; 835 } 836 c = UTEXT_NEXT32(fInputText); 837 startPos = UTEXT_GETNATIVEINDEX(fInputText); 838 // Note that it's perfectly OK for a pattern to have a zero-length 839 // match at the end of a string, so we must make sure that the loop 840 // runs with startPos == testStartLimit the last time through. 841 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 842 return FALSE; 843 } 844 } 845 } 846 847 default: 848 U_ASSERT(FALSE); 849 } 850 851 U_ASSERT(FALSE); 852 return FALSE; 853 } 854 855 856 857 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { 858 if (U_FAILURE(status)) { 859 return FALSE; 860 } 861 if (U_FAILURE(fDeferredStatus)) { 862 status = fDeferredStatus; 863 return FALSE; 864 } 865 this->reset(); // Note: Reset() is specified by Java Matcher documentation. 866 // This will reset the region to be the full input length. 867 if (start < 0) { 868 status = U_INDEX_OUTOFBOUNDS_ERROR; 869 return FALSE; 870 } 871 872 int64_t nativeStart = start; 873 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 874 status = U_INDEX_OUTOFBOUNDS_ERROR; 875 return FALSE; 876 } 877 fMatchEnd = nativeStart; 878 return find(); 879 } 880 881 882 //-------------------------------------------------------------------------------- 883 // 884 // findUsingChunk() -- like find(), but with the advance knowledge that the 885 // entire string is available in the UText's chunk buffer. 886 // 887 //-------------------------------------------------------------------------------- 888 UBool RegexMatcher::findUsingChunk() { 889 // Start at the position of the last match end. (Will be zero if the 890 // matcher has been reset. 891 // 892 893 int32_t startPos = (int32_t)fMatchEnd; 894 if (startPos==0) { 895 startPos = (int32_t)fActiveStart; 896 } 897 898 const UChar *inputBuf = fInputText->chunkContents; 899 900 if (fMatch) { 901 // Save the position of any previous successful match. 902 fLastMatchEnd = fMatchEnd; 903 904 if (fMatchStart == fMatchEnd) { 905 // Previous match had zero length. Move start position up one position 906 // to avoid sending find() into a loop on zero-length matches. 907 if (startPos >= fActiveLimit) { 908 fMatch = FALSE; 909 fHitEnd = TRUE; 910 return FALSE; 911 } 912 U16_FWD_1(inputBuf, startPos, fInputLength); 913 } 914 } else { 915 if (fLastMatchEnd >= 0) { 916 // A previous find() failed to match. Don't try again. 917 // (without this test, a pattern with a zero-length match 918 // could match again at the end of an input string.) 919 fHitEnd = TRUE; 920 return FALSE; 921 } 922 } 923 924 925 // Compute the position in the input string beyond which a match can not begin, because 926 // the minimum length match would extend past the end of the input. 927 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 928 // Be aware of possible overflows if making changes here. 929 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); 930 if (startPos > testLen) { 931 fMatch = FALSE; 932 fHitEnd = TRUE; 933 return FALSE; 934 } 935 936 UChar32 c; 937 U_ASSERT(startPos >= 0); 938 939 switch (fPattern->fStartType) { 940 case START_NO_INFO: 941 // No optimization was found. 942 // Try a match at each input position. 943 for (;;) { 944 MatchChunkAt(startPos, FALSE, fDeferredStatus); 945 if (U_FAILURE(fDeferredStatus)) { 946 return FALSE; 947 } 948 if (fMatch) { 949 return TRUE; 950 } 951 if (startPos >= testLen) { 952 fHitEnd = TRUE; 953 return FALSE; 954 } 955 U16_FWD_1(inputBuf, startPos, fActiveLimit); 956 // Note that it's perfectly OK for a pattern to have a zero-length 957 // match at the end of a string, so we must make sure that the loop 958 // runs with startPos == testLen the last time through. 959 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 960 return FALSE; 961 } 962 U_ASSERT(FALSE); 963 964 case START_START: 965 // Matches are only possible at the start of the input string 966 // (pattern begins with ^ or \A) 967 if (startPos > fActiveStart) { 968 fMatch = FALSE; 969 return FALSE; 970 } 971 MatchChunkAt(startPos, FALSE, fDeferredStatus); 972 if (U_FAILURE(fDeferredStatus)) { 973 return FALSE; 974 } 975 return fMatch; 976 977 978 case START_SET: 979 { 980 // Match may start on any char from a pre-computed set. 981 U_ASSERT(fPattern->fMinMatchLen > 0); 982 for (;;) { 983 int32_t pos = startPos; 984 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 985 if ((c<256 && fPattern->fInitialChars8->contains(c)) || 986 (c>=256 && fPattern->fInitialChars->contains(c))) { 987 MatchChunkAt(pos, FALSE, fDeferredStatus); 988 if (U_FAILURE(fDeferredStatus)) { 989 return FALSE; 990 } 991 if (fMatch) { 992 return TRUE; 993 } 994 } 995 if (pos >= testLen) { 996 fMatch = FALSE; 997 fHitEnd = TRUE; 998 return FALSE; 999 } 1000 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1001 return FALSE; 1002 } 1003 } 1004 U_ASSERT(FALSE); 1005 1006 case START_STRING: 1007 case START_CHAR: 1008 { 1009 // Match starts on exactly one char. 1010 U_ASSERT(fPattern->fMinMatchLen > 0); 1011 UChar32 theChar = fPattern->fInitialChar; 1012 for (;;) { 1013 int32_t pos = startPos; 1014 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1015 if (c == theChar) { 1016 MatchChunkAt(pos, FALSE, fDeferredStatus); 1017 if (U_FAILURE(fDeferredStatus)) { 1018 return FALSE; 1019 } 1020 if (fMatch) { 1021 return TRUE; 1022 } 1023 } 1024 if (pos >= testLen) { 1025 fMatch = FALSE; 1026 fHitEnd = TRUE; 1027 return FALSE; 1028 } 1029 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1030 return FALSE; 1031 } 1032 } 1033 U_ASSERT(FALSE); 1034 1035 case START_LINE: 1036 { 1037 UChar32 c; 1038 if (startPos == fAnchorStart) { 1039 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1040 if (U_FAILURE(fDeferredStatus)) { 1041 return FALSE; 1042 } 1043 if (fMatch) { 1044 return TRUE; 1045 } 1046 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1047 } 1048 1049 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1050 for (;;) { 1051 c = inputBuf[startPos-1]; 1052 if (c == 0x0a) { 1053 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1054 if (U_FAILURE(fDeferredStatus)) { 1055 return FALSE; 1056 } 1057 if (fMatch) { 1058 return TRUE; 1059 } 1060 } 1061 if (startPos >= testLen) { 1062 fMatch = FALSE; 1063 fHitEnd = TRUE; 1064 return FALSE; 1065 } 1066 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1067 // Note that it's perfectly OK for a pattern to have a zero-length 1068 // match at the end of a string, so we must make sure that the loop 1069 // runs with startPos == testLen the last time through. 1070 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1071 return FALSE; 1072 } 1073 } else { 1074 for (;;) { 1075 c = inputBuf[startPos-1]; 1076 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1077 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 1078 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { 1079 startPos++; 1080 } 1081 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1082 if (U_FAILURE(fDeferredStatus)) { 1083 return FALSE; 1084 } 1085 if (fMatch) { 1086 return TRUE; 1087 } 1088 } 1089 if (startPos >= testLen) { 1090 fMatch = FALSE; 1091 fHitEnd = TRUE; 1092 return FALSE; 1093 } 1094 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1095 // Note that it's perfectly OK for a pattern to have a zero-length 1096 // match at the end of a string, so we must make sure that the loop 1097 // runs with startPos == testLen the last time through. 1098 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1099 return FALSE; 1100 } 1101 } 1102 } 1103 1104 default: 1105 U_ASSERT(FALSE); 1106 } 1107 1108 U_ASSERT(FALSE); 1109 return FALSE; 1110 } 1111 1112 1113 1114 //-------------------------------------------------------------------------------- 1115 // 1116 // group() 1117 // 1118 //-------------------------------------------------------------------------------- 1119 UnicodeString RegexMatcher::group(UErrorCode &status) const { 1120 return group(0, status); 1121 } 1122 1123 // Return immutable shallow clone 1124 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { 1125 return group(0, dest, group_len, status); 1126 } 1127 1128 // Return immutable shallow clone 1129 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { 1130 group_len = 0; 1131 UBool bailOut = FALSE; 1132 if (U_FAILURE(status)) { 1133 return dest; 1134 } 1135 if (U_FAILURE(fDeferredStatus)) { 1136 status = fDeferredStatus; 1137 bailOut = TRUE; 1138 } 1139 if (fMatch == FALSE) { 1140 status = U_REGEX_INVALID_STATE; 1141 bailOut = TRUE; 1142 } 1143 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1144 status = U_INDEX_OUTOFBOUNDS_ERROR; 1145 bailOut = TRUE; 1146 } 1147 1148 if (bailOut) { 1149 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status); 1150 } 1151 1152 int64_t s, e; 1153 if (groupNum == 0) { 1154 s = fMatchStart; 1155 e = fMatchEnd; 1156 } else { 1157 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1158 U_ASSERT(groupOffset < fPattern->fFrameSize); 1159 U_ASSERT(groupOffset >= 0); 1160 s = fFrame->fExtra[groupOffset]; 1161 e = fFrame->fExtra[groupOffset+1]; 1162 } 1163 1164 if (s < 0) { 1165 // A capture group wasn't part of the match 1166 return utext_clone(dest, fInputText, FALSE, TRUE, &status); 1167 } 1168 U_ASSERT(s <= e); 1169 group_len = e - s; 1170 1171 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); 1172 if (dest) 1173 UTEXT_SETNATIVEINDEX(dest, s); 1174 return dest; 1175 } 1176 1177 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1178 UnicodeString result; 1179 if (U_FAILURE(status)) { 1180 return result; 1181 } 1182 UText resultText = UTEXT_INITIALIZER; 1183 utext_openUnicodeString(&resultText, &result, &status); 1184 group(groupNum, &resultText, status); 1185 utext_close(&resultText); 1186 return result; 1187 } 1188 1189 1190 // Return deep (mutable) clone 1191 // Technology Preview (as an API), but note that the UnicodeString API is implemented 1192 // using this function. 1193 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { 1194 UBool bailOut = FALSE; 1195 if (U_FAILURE(status)) { 1196 return dest; 1197 } 1198 if (U_FAILURE(fDeferredStatus)) { 1199 status = fDeferredStatus; 1200 bailOut = TRUE; 1201 } 1202 1203 if (fMatch == FALSE) { 1204 status = U_REGEX_INVALID_STATE; 1205 bailOut = TRUE; 1206 } 1207 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1208 status = U_INDEX_OUTOFBOUNDS_ERROR; 1209 bailOut = TRUE; 1210 } 1211 1212 if (bailOut) { 1213 if (dest) { 1214 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1215 return dest; 1216 } else { 1217 return utext_openUChars(NULL, NULL, 0, &status); 1218 } 1219 } 1220 1221 int64_t s, e; 1222 if (groupNum == 0) { 1223 s = fMatchStart; 1224 e = fMatchEnd; 1225 } else { 1226 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1227 U_ASSERT(groupOffset < fPattern->fFrameSize); 1228 U_ASSERT(groupOffset >= 0); 1229 s = fFrame->fExtra[groupOffset]; 1230 e = fFrame->fExtra[groupOffset+1]; 1231 } 1232 1233 if (s < 0) { 1234 // A capture group wasn't part of the match 1235 if (dest) { 1236 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1237 return dest; 1238 } else { 1239 return utext_openUChars(NULL, NULL, 0, &status); 1240 } 1241 } 1242 U_ASSERT(s <= e); 1243 1244 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1245 U_ASSERT(e <= fInputLength); 1246 if (dest) { 1247 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status); 1248 } else { 1249 UText groupText = UTEXT_INITIALIZER; 1250 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); 1251 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1252 utext_close(&groupText); 1253 } 1254 } else { 1255 int32_t len16; 1256 if (UTEXT_USES_U16(fInputText)) { 1257 len16 = (int32_t)(e-s); 1258 } else { 1259 UErrorCode lengthStatus = U_ZERO_ERROR; 1260 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1261 } 1262 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1263 if (groupChars == NULL) { 1264 status = U_MEMORY_ALLOCATION_ERROR; 1265 return dest; 1266 } 1267 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1268 1269 if (dest) { 1270 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); 1271 } else { 1272 UText groupText = UTEXT_INITIALIZER; 1273 utext_openUChars(&groupText, groupChars, len16, &status); 1274 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1275 utext_close(&groupText); 1276 } 1277 1278 uprv_free(groupChars); 1279 } 1280 return dest; 1281 } 1282 1283 //-------------------------------------------------------------------------------- 1284 // 1285 // appendGroup() -- currently internal only, appends a group to a UText rather 1286 // than replacing its contents 1287 // 1288 //-------------------------------------------------------------------------------- 1289 1290 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { 1291 if (U_FAILURE(status)) { 1292 return 0; 1293 } 1294 if (U_FAILURE(fDeferredStatus)) { 1295 status = fDeferredStatus; 1296 return 0; 1297 } 1298 int64_t destLen = utext_nativeLength(dest); 1299 1300 if (fMatch == FALSE) { 1301 status = U_REGEX_INVALID_STATE; 1302 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1303 } 1304 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1305 status = U_INDEX_OUTOFBOUNDS_ERROR; 1306 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1307 } 1308 1309 int64_t s, e; 1310 if (groupNum == 0) { 1311 s = fMatchStart; 1312 e = fMatchEnd; 1313 } else { 1314 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1315 U_ASSERT(groupOffset < fPattern->fFrameSize); 1316 U_ASSERT(groupOffset >= 0); 1317 s = fFrame->fExtra[groupOffset]; 1318 e = fFrame->fExtra[groupOffset+1]; 1319 } 1320 1321 if (s < 0) { 1322 // A capture group wasn't part of the match 1323 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1324 } 1325 U_ASSERT(s <= e); 1326 1327 int64_t deltaLen; 1328 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1329 U_ASSERT(e <= fInputLength); 1330 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); 1331 } else { 1332 int32_t len16; 1333 if (UTEXT_USES_U16(fInputText)) { 1334 len16 = (int32_t)(e-s); 1335 } else { 1336 UErrorCode lengthStatus = U_ZERO_ERROR; 1337 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1338 } 1339 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1340 if (groupChars == NULL) { 1341 status = U_MEMORY_ALLOCATION_ERROR; 1342 return 0; 1343 } 1344 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1345 1346 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); 1347 uprv_free(groupChars); 1348 } 1349 return deltaLen; 1350 } 1351 1352 1353 1354 //-------------------------------------------------------------------------------- 1355 // 1356 // groupCount() 1357 // 1358 //-------------------------------------------------------------------------------- 1359 int32_t RegexMatcher::groupCount() const { 1360 return fPattern->fGroupMap->size(); 1361 } 1362 1363 1364 1365 //-------------------------------------------------------------------------------- 1366 // 1367 // hasAnchoringBounds() 1368 // 1369 //-------------------------------------------------------------------------------- 1370 UBool RegexMatcher::hasAnchoringBounds() const { 1371 return fAnchoringBounds; 1372 } 1373 1374 1375 //-------------------------------------------------------------------------------- 1376 // 1377 // hasTransparentBounds() 1378 // 1379 //-------------------------------------------------------------------------------- 1380 UBool RegexMatcher::hasTransparentBounds() const { 1381 return fTransparentBounds; 1382 } 1383 1384 1385 1386 //-------------------------------------------------------------------------------- 1387 // 1388 // hitEnd() 1389 // 1390 //-------------------------------------------------------------------------------- 1391 UBool RegexMatcher::hitEnd() const { 1392 return fHitEnd; 1393 } 1394 1395 1396 //-------------------------------------------------------------------------------- 1397 // 1398 // input() 1399 // 1400 //-------------------------------------------------------------------------------- 1401 const UnicodeString &RegexMatcher::input() const { 1402 if (!fInput) { 1403 UErrorCode status = U_ZERO_ERROR; 1404 int32_t len16; 1405 if (UTEXT_USES_U16(fInputText)) { 1406 len16 = (int32_t)fInputLength; 1407 } else { 1408 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); 1409 status = U_ZERO_ERROR; // overflow, length status 1410 } 1411 UnicodeString *result = new UnicodeString(len16, 0, 0); 1412 1413 UChar *inputChars = result->getBuffer(len16); 1414 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning 1415 result->releaseBuffer(len16); 1416 1417 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= 1418 } 1419 1420 return *fInput; 1421 } 1422 1423 //-------------------------------------------------------------------------------- 1424 // 1425 // inputText() 1426 // 1427 //-------------------------------------------------------------------------------- 1428 UText *RegexMatcher::inputText() const { 1429 return fInputText; 1430 } 1431 1432 1433 //-------------------------------------------------------------------------------- 1434 // 1435 // getInput() -- like inputText(), but makes a clone or copies into another UText 1436 // 1437 //-------------------------------------------------------------------------------- 1438 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { 1439 UBool bailOut = FALSE; 1440 if (U_FAILURE(status)) { 1441 return dest; 1442 } 1443 if (U_FAILURE(fDeferredStatus)) { 1444 status = fDeferredStatus; 1445 bailOut = TRUE; 1446 } 1447 1448 if (bailOut) { 1449 if (dest) { 1450 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1451 return dest; 1452 } else { 1453 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1454 } 1455 } 1456 1457 if (dest) { 1458 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1459 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); 1460 } else { 1461 int32_t input16Len; 1462 if (UTEXT_USES_U16(fInputText)) { 1463 input16Len = (int32_t)fInputLength; 1464 } else { 1465 UErrorCode lengthStatus = U_ZERO_ERROR; 1466 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error 1467 } 1468 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); 1469 if (inputChars == NULL) { 1470 return dest; 1471 } 1472 1473 status = U_ZERO_ERROR; 1474 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning 1475 status = U_ZERO_ERROR; 1476 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); 1477 1478 uprv_free(inputChars); 1479 } 1480 return dest; 1481 } else { 1482 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1483 } 1484 } 1485 1486 1487 static UBool compat_SyncMutableUTextContents(UText *ut); 1488 static UBool compat_SyncMutableUTextContents(UText *ut) { 1489 UBool retVal = FALSE; 1490 1491 // In the following test, we're really only interested in whether the UText should switch 1492 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents 1493 // will still point to the correct data. 1494 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { 1495 UnicodeString *us=(UnicodeString *)ut->context; 1496 1497 // Update to the latest length. 1498 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). 1499 int32_t newLength = us->length(); 1500 1501 // Update the chunk description. 1502 // The buffer may have switched between stack- and heap-based. 1503 ut->chunkContents = us->getBuffer(); 1504 ut->chunkLength = newLength; 1505 ut->chunkNativeLimit = newLength; 1506 ut->nativeIndexingLimit = newLength; 1507 retVal = TRUE; 1508 } 1509 1510 return retVal; 1511 } 1512 1513 //-------------------------------------------------------------------------------- 1514 // 1515 // lookingAt() 1516 // 1517 //-------------------------------------------------------------------------------- 1518 UBool RegexMatcher::lookingAt(UErrorCode &status) { 1519 if (U_FAILURE(status)) { 1520 return FALSE; 1521 } 1522 if (U_FAILURE(fDeferredStatus)) { 1523 status = fDeferredStatus; 1524 return FALSE; 1525 } 1526 1527 if (fInputUniStrMaybeMutable) { 1528 if (compat_SyncMutableUTextContents(fInputText)) { 1529 fInputLength = utext_nativeLength(fInputText); 1530 reset(); 1531 } 1532 } 1533 else { 1534 resetPreserveRegion(); 1535 } 1536 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1537 MatchChunkAt((int32_t)fActiveStart, FALSE, status); 1538 } else { 1539 MatchAt(fActiveStart, FALSE, status); 1540 } 1541 return fMatch; 1542 } 1543 1544 1545 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { 1546 if (U_FAILURE(status)) { 1547 return FALSE; 1548 } 1549 if (U_FAILURE(fDeferredStatus)) { 1550 status = fDeferredStatus; 1551 return FALSE; 1552 } 1553 reset(); 1554 1555 if (start < 0) { 1556 status = U_INDEX_OUTOFBOUNDS_ERROR; 1557 return FALSE; 1558 } 1559 1560 if (fInputUniStrMaybeMutable) { 1561 if (compat_SyncMutableUTextContents(fInputText)) { 1562 fInputLength = utext_nativeLength(fInputText); 1563 reset(); 1564 } 1565 } 1566 1567 int64_t nativeStart; 1568 nativeStart = start; 1569 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1570 status = U_INDEX_OUTOFBOUNDS_ERROR; 1571 return FALSE; 1572 } 1573 1574 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1575 MatchChunkAt((int32_t)nativeStart, FALSE, status); 1576 } else { 1577 MatchAt(nativeStart, FALSE, status); 1578 } 1579 return fMatch; 1580 } 1581 1582 1583 1584 //-------------------------------------------------------------------------------- 1585 // 1586 // matches() 1587 // 1588 //-------------------------------------------------------------------------------- 1589 UBool RegexMatcher::matches(UErrorCode &status) { 1590 if (U_FAILURE(status)) { 1591 return FALSE; 1592 } 1593 if (U_FAILURE(fDeferredStatus)) { 1594 status = fDeferredStatus; 1595 return FALSE; 1596 } 1597 1598 if (fInputUniStrMaybeMutable) { 1599 if (compat_SyncMutableUTextContents(fInputText)) { 1600 fInputLength = utext_nativeLength(fInputText); 1601 reset(); 1602 } 1603 } 1604 else { 1605 resetPreserveRegion(); 1606 } 1607 1608 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1609 MatchChunkAt((int32_t)fActiveStart, TRUE, status); 1610 } else { 1611 MatchAt(fActiveStart, TRUE, status); 1612 } 1613 return fMatch; 1614 } 1615 1616 1617 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { 1618 if (U_FAILURE(status)) { 1619 return FALSE; 1620 } 1621 if (U_FAILURE(fDeferredStatus)) { 1622 status = fDeferredStatus; 1623 return FALSE; 1624 } 1625 reset(); 1626 1627 if (start < 0) { 1628 status = U_INDEX_OUTOFBOUNDS_ERROR; 1629 return FALSE; 1630 } 1631 1632 if (fInputUniStrMaybeMutable) { 1633 if (compat_SyncMutableUTextContents(fInputText)) { 1634 fInputLength = utext_nativeLength(fInputText); 1635 reset(); 1636 } 1637 } 1638 1639 int64_t nativeStart; 1640 nativeStart = start; 1641 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1642 status = U_INDEX_OUTOFBOUNDS_ERROR; 1643 return FALSE; 1644 } 1645 1646 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1647 MatchChunkAt((int32_t)nativeStart, TRUE, status); 1648 } else { 1649 MatchAt(nativeStart, TRUE, status); 1650 } 1651 return fMatch; 1652 } 1653 1654 1655 1656 //-------------------------------------------------------------------------------- 1657 // 1658 // pattern 1659 // 1660 //-------------------------------------------------------------------------------- 1661 const RegexPattern &RegexMatcher::pattern() const { 1662 return *fPattern; 1663 } 1664 1665 1666 1667 //-------------------------------------------------------------------------------- 1668 // 1669 // region 1670 // 1671 //-------------------------------------------------------------------------------- 1672 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { 1673 if (U_FAILURE(status)) { 1674 return *this; 1675 } 1676 1677 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { 1678 status = U_ILLEGAL_ARGUMENT_ERROR; 1679 } 1680 1681 int64_t nativeStart = regionStart; 1682 int64_t nativeLimit = regionLimit; 1683 if (nativeStart > fInputLength || nativeLimit > fInputLength) { 1684 status = U_ILLEGAL_ARGUMENT_ERROR; 1685 } 1686 1687 if (startIndex == -1) 1688 this->reset(); 1689 else 1690 resetPreserveRegion(); 1691 1692 fRegionStart = nativeStart; 1693 fRegionLimit = nativeLimit; 1694 fActiveStart = nativeStart; 1695 fActiveLimit = nativeLimit; 1696 1697 if (startIndex != -1) { 1698 if (startIndex < fActiveStart || startIndex > fActiveLimit) { 1699 status = U_INDEX_OUTOFBOUNDS_ERROR; 1700 } 1701 fMatchEnd = startIndex; 1702 } 1703 1704 if (!fTransparentBounds) { 1705 fLookStart = nativeStart; 1706 fLookLimit = nativeLimit; 1707 } 1708 if (fAnchoringBounds) { 1709 fAnchorStart = nativeStart; 1710 fAnchorLimit = nativeLimit; 1711 } 1712 return *this; 1713 } 1714 1715 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { 1716 return region(start, limit, -1, status); 1717 } 1718 1719 //-------------------------------------------------------------------------------- 1720 // 1721 // regionEnd 1722 // 1723 //-------------------------------------------------------------------------------- 1724 int32_t RegexMatcher::regionEnd() const { 1725 return (int32_t)fRegionLimit; 1726 } 1727 1728 int64_t RegexMatcher::regionEnd64() const { 1729 return fRegionLimit; 1730 } 1731 1732 //-------------------------------------------------------------------------------- 1733 // 1734 // regionStart 1735 // 1736 //-------------------------------------------------------------------------------- 1737 int32_t RegexMatcher::regionStart() const { 1738 return (int32_t)fRegionStart; 1739 } 1740 1741 int64_t RegexMatcher::regionStart64() const { 1742 return fRegionStart; 1743 } 1744 1745 1746 //-------------------------------------------------------------------------------- 1747 // 1748 // replaceAll 1749 // 1750 //-------------------------------------------------------------------------------- 1751 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { 1752 UText replacementText = UTEXT_INITIALIZER; 1753 UText resultText = UTEXT_INITIALIZER; 1754 UnicodeString resultString; 1755 if (U_FAILURE(status)) { 1756 return resultString; 1757 } 1758 1759 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1760 utext_openUnicodeString(&resultText, &resultString, &status); 1761 1762 replaceAll(&replacementText, &resultText, status); 1763 1764 utext_close(&resultText); 1765 utext_close(&replacementText); 1766 1767 return resultString; 1768 } 1769 1770 1771 // 1772 // replaceAll, UText mode 1773 // 1774 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { 1775 if (U_FAILURE(status)) { 1776 return dest; 1777 } 1778 if (U_FAILURE(fDeferredStatus)) { 1779 status = fDeferredStatus; 1780 return dest; 1781 } 1782 1783 if (dest == NULL) { 1784 UnicodeString emptyString; 1785 UText empty = UTEXT_INITIALIZER; 1786 1787 utext_openUnicodeString(&empty, &emptyString, &status); 1788 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1789 utext_close(&empty); 1790 } 1791 1792 if (U_SUCCESS(status)) { 1793 reset(); 1794 while (find()) { 1795 appendReplacement(dest, replacement, status); 1796 if (U_FAILURE(status)) { 1797 break; 1798 } 1799 } 1800 appendTail(dest, status); 1801 } 1802 1803 return dest; 1804 } 1805 1806 1807 //-------------------------------------------------------------------------------- 1808 // 1809 // replaceFirst 1810 // 1811 //-------------------------------------------------------------------------------- 1812 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { 1813 UText replacementText = UTEXT_INITIALIZER; 1814 UText resultText = UTEXT_INITIALIZER; 1815 UnicodeString resultString; 1816 1817 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1818 utext_openUnicodeString(&resultText, &resultString, &status); 1819 1820 replaceFirst(&replacementText, &resultText, status); 1821 1822 utext_close(&resultText); 1823 utext_close(&replacementText); 1824 1825 return resultString; 1826 } 1827 1828 // 1829 // replaceFirst, UText mode 1830 // 1831 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { 1832 if (U_FAILURE(status)) { 1833 return dest; 1834 } 1835 if (U_FAILURE(fDeferredStatus)) { 1836 status = fDeferredStatus; 1837 return dest; 1838 } 1839 1840 reset(); 1841 if (!find()) { 1842 return getInput(dest, status); 1843 } 1844 1845 if (dest == NULL) { 1846 UnicodeString emptyString; 1847 UText empty = UTEXT_INITIALIZER; 1848 1849 utext_openUnicodeString(&empty, &emptyString, &status); 1850 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1851 utext_close(&empty); 1852 } 1853 1854 appendReplacement(dest, replacement, status); 1855 appendTail(dest, status); 1856 1857 return dest; 1858 } 1859 1860 1861 //-------------------------------------------------------------------------------- 1862 // 1863 // requireEnd 1864 // 1865 //-------------------------------------------------------------------------------- 1866 UBool RegexMatcher::requireEnd() const { 1867 return fRequireEnd; 1868 } 1869 1870 1871 //-------------------------------------------------------------------------------- 1872 // 1873 // reset 1874 // 1875 //-------------------------------------------------------------------------------- 1876 RegexMatcher &RegexMatcher::reset() { 1877 fRegionStart = 0; 1878 fRegionLimit = fInputLength; 1879 fActiveStart = 0; 1880 fActiveLimit = fInputLength; 1881 fAnchorStart = 0; 1882 fAnchorLimit = fInputLength; 1883 fLookStart = 0; 1884 fLookLimit = fInputLength; 1885 resetPreserveRegion(); 1886 return *this; 1887 } 1888 1889 1890 1891 void RegexMatcher::resetPreserveRegion() { 1892 fMatchStart = 0; 1893 fMatchEnd = 0; 1894 fLastMatchEnd = -1; 1895 fAppendPosition = 0; 1896 fMatch = FALSE; 1897 fHitEnd = FALSE; 1898 fRequireEnd = FALSE; 1899 fTime = 0; 1900 fTickCounter = TIMER_INITIAL_VALUE; 1901 //resetStack(); // more expensive than it looks... 1902 } 1903 1904 1905 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1906 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); 1907 if (fPattern->fNeedsAltInput) { 1908 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1909 } 1910 fInputLength = utext_nativeLength(fInputText); 1911 1912 reset(); 1913 delete fInput; 1914 fInput = NULL; 1915 1916 // Do the following for any UnicodeString. 1917 // This is for compatibility for those clients who modify the input string "live" during regex operations. 1918 fInputUniStrMaybeMutable = TRUE; 1919 1920 if (fWordBreakItr != NULL) { 1921 #if UCONFIG_NO_BREAK_ITERATION==0 1922 UErrorCode status = U_ZERO_ERROR; 1923 fWordBreakItr->setText(fInputText, status); 1924 #endif 1925 } 1926 return *this; 1927 } 1928 1929 1930 RegexMatcher &RegexMatcher::reset(UText *input) { 1931 if (fInputText != input) { 1932 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); 1933 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1934 fInputLength = utext_nativeLength(fInputText); 1935 1936 delete fInput; 1937 fInput = NULL; 1938 1939 if (fWordBreakItr != NULL) { 1940 #if UCONFIG_NO_BREAK_ITERATION==0 1941 UErrorCode status = U_ZERO_ERROR; 1942 fWordBreakItr->setText(input, status); 1943 #endif 1944 } 1945 } 1946 reset(); 1947 fInputUniStrMaybeMutable = FALSE; 1948 1949 return *this; 1950 } 1951 1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) { 1953 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 1954 return *this; 1955 }*/ 1956 1957 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { 1958 if (U_FAILURE(status)) { 1959 return *this; 1960 } 1961 reset(); // Reset also resets the region to be the entire string. 1962 1963 if (position < 0 || position > fActiveLimit) { 1964 status = U_INDEX_OUTOFBOUNDS_ERROR; 1965 return *this; 1966 } 1967 fMatchEnd = position; 1968 return *this; 1969 } 1970 1971 1972 //-------------------------------------------------------------------------------- 1973 // 1974 // refresh 1975 // 1976 //-------------------------------------------------------------------------------- 1977 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { 1978 if (U_FAILURE(status)) { 1979 return *this; 1980 } 1981 if (input == NULL) { 1982 status = U_ILLEGAL_ARGUMENT_ERROR; 1983 return *this; 1984 } 1985 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { 1986 status = U_ILLEGAL_ARGUMENT_ERROR; 1987 return *this; 1988 } 1989 int64_t pos = utext_getNativeIndex(fInputText); 1990 // Shallow read-only clone of the new UText into the existing input UText 1991 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status); 1992 if (U_FAILURE(status)) { 1993 return *this; 1994 } 1995 utext_setNativeIndex(fInputText, pos); 1996 1997 if (fAltInputText != NULL) { 1998 pos = utext_getNativeIndex(fAltInputText); 1999 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status); 2000 if (U_FAILURE(status)) { 2001 return *this; 2002 } 2003 utext_setNativeIndex(fAltInputText, pos); 2004 } 2005 return *this; 2006 } 2007 2008 2009 2010 //-------------------------------------------------------------------------------- 2011 // 2012 // setTrace 2013 // 2014 //-------------------------------------------------------------------------------- 2015 void RegexMatcher::setTrace(UBool state) { 2016 fTraceDebug = state; 2017 } 2018 2019 2020 2021 //--------------------------------------------------------------------- 2022 // 2023 // split 2024 // 2025 //--------------------------------------------------------------------- 2026 int32_t RegexMatcher::split(const UnicodeString &input, 2027 UnicodeString dest[], 2028 int32_t destCapacity, 2029 UErrorCode &status) 2030 { 2031 UText inputText = UTEXT_INITIALIZER; 2032 utext_openConstUnicodeString(&inputText, &input, &status); 2033 if (U_FAILURE(status)) { 2034 return 0; 2035 } 2036 2037 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); 2038 if (destText == NULL) { 2039 status = U_MEMORY_ALLOCATION_ERROR; 2040 return 0; 2041 } 2042 int32_t i; 2043 for (i = 0; i < destCapacity; i++) { 2044 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); 2045 } 2046 2047 int32_t fieldCount = split(&inputText, destText, destCapacity, status); 2048 2049 for (i = 0; i < destCapacity; i++) { 2050 utext_close(destText[i]); 2051 } 2052 2053 uprv_free(destText); 2054 utext_close(&inputText); 2055 return fieldCount; 2056 } 2057 2058 // 2059 // split, UText mode 2060 // 2061 int32_t RegexMatcher::split(UText *input, 2062 UText *dest[], 2063 int32_t destCapacity, 2064 UErrorCode &status) 2065 { 2066 // 2067 // Check arguements for validity 2068 // 2069 if (U_FAILURE(status)) { 2070 return 0; 2071 }; 2072 2073 if (destCapacity < 1) { 2074 status = U_ILLEGAL_ARGUMENT_ERROR; 2075 return 0; 2076 } 2077 2078 // 2079 // Reset for the input text 2080 // 2081 reset(input); 2082 int64_t nextOutputStringStart = 0; 2083 if (fActiveLimit == 0) { 2084 return 0; 2085 } 2086 2087 // 2088 // Loop through the input text, searching for the delimiter pattern 2089 // 2090 int32_t i; 2091 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 2092 for (i=0; ; i++) { 2093 if (i>=destCapacity-1) { 2094 // There is one or zero output string left. 2095 // Fill the last output string with whatever is left from the input, then exit the loop. 2096 // ( i will be == destCapacity if we filled the output array while processing 2097 // capture groups of the delimiter expression, in which case we will discard the 2098 // last capture group saved in favor of the unprocessed remainder of the 2099 // input string.) 2100 i = destCapacity-1; 2101 if (fActiveLimit > nextOutputStringStart) { 2102 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2103 if (dest[i]) { 2104 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2105 input->chunkContents+nextOutputStringStart, 2106 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2107 } else { 2108 UText remainingText = UTEXT_INITIALIZER; 2109 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2110 fActiveLimit-nextOutputStringStart, &status); 2111 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2112 utext_close(&remainingText); 2113 } 2114 } else { 2115 UErrorCode lengthStatus = U_ZERO_ERROR; 2116 int32_t remaining16Length = 2117 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2118 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2119 if (remainingChars == NULL) { 2120 status = U_MEMORY_ALLOCATION_ERROR; 2121 break; 2122 } 2123 2124 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2125 if (dest[i]) { 2126 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2127 } else { 2128 UText remainingText = UTEXT_INITIALIZER; 2129 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2130 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2131 utext_close(&remainingText); 2132 } 2133 2134 uprv_free(remainingChars); 2135 } 2136 } 2137 break; 2138 } 2139 if (find()) { 2140 // We found another delimiter. Move everything from where we started looking 2141 // up until the start of the delimiter into the next output string. 2142 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2143 if (dest[i]) { 2144 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2145 input->chunkContents+nextOutputStringStart, 2146 (int32_t)(fMatchStart-nextOutputStringStart), &status); 2147 } else { 2148 UText remainingText = UTEXT_INITIALIZER; 2149 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2150 fMatchStart-nextOutputStringStart, &status); 2151 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2152 utext_close(&remainingText); 2153 } 2154 } else { 2155 UErrorCode lengthStatus = U_ZERO_ERROR; 2156 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); 2157 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2158 if (remainingChars == NULL) { 2159 status = U_MEMORY_ALLOCATION_ERROR; 2160 break; 2161 } 2162 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); 2163 if (dest[i]) { 2164 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2165 } else { 2166 UText remainingText = UTEXT_INITIALIZER; 2167 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2168 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2169 utext_close(&remainingText); 2170 } 2171 2172 uprv_free(remainingChars); 2173 } 2174 nextOutputStringStart = fMatchEnd; 2175 2176 // If the delimiter pattern has capturing parentheses, the captured 2177 // text goes out into the next n destination strings. 2178 int32_t groupNum; 2179 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 2180 if (i >= destCapacity-2) { 2181 // Never fill the last available output string with capture group text. 2182 // It will filled with the last field, the remainder of the 2183 // unsplit input text. 2184 break; 2185 } 2186 i++; 2187 dest[i] = group(groupNum, dest[i], status); 2188 } 2189 2190 if (nextOutputStringStart == fActiveLimit) { 2191 // The delimiter was at the end of the string. We're done, but first 2192 // we output one last empty string, for the empty field following 2193 // the delimiter at the end of input. 2194 if (i+1 < destCapacity) { 2195 ++i; 2196 if (dest[i] == NULL) { 2197 dest[i] = utext_openUChars(NULL, NULL, 0, &status); 2198 } else { 2199 static UChar emptyString[] = {(UChar)0}; 2200 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status); 2201 } 2202 } 2203 break; 2204 2205 } 2206 } 2207 else 2208 { 2209 // We ran off the end of the input while looking for the next delimiter. 2210 // All the remaining text goes into the current output string. 2211 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2212 if (dest[i]) { 2213 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2214 input->chunkContents+nextOutputStringStart, 2215 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2216 } else { 2217 UText remainingText = UTEXT_INITIALIZER; 2218 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2219 fActiveLimit-nextOutputStringStart, &status); 2220 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2221 utext_close(&remainingText); 2222 } 2223 } else { 2224 UErrorCode lengthStatus = U_ZERO_ERROR; 2225 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2226 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2227 if (remainingChars == NULL) { 2228 status = U_MEMORY_ALLOCATION_ERROR; 2229 break; 2230 } 2231 2232 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2233 if (dest[i]) { 2234 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2235 } else { 2236 UText remainingText = UTEXT_INITIALIZER; 2237 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2238 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2239 utext_close(&remainingText); 2240 } 2241 2242 uprv_free(remainingChars); 2243 } 2244 break; 2245 } 2246 if (U_FAILURE(status)) { 2247 break; 2248 } 2249 } // end of for loop 2250 return i+1; 2251 } 2252 2253 2254 //-------------------------------------------------------------------------------- 2255 // 2256 // start 2257 // 2258 //-------------------------------------------------------------------------------- 2259 int32_t RegexMatcher::start(UErrorCode &status) const { 2260 return start(0, status); 2261 } 2262 2263 int64_t RegexMatcher::start64(UErrorCode &status) const { 2264 return start64(0, status); 2265 } 2266 2267 //-------------------------------------------------------------------------------- 2268 // 2269 // start(int32_t group, UErrorCode &status) 2270 // 2271 //-------------------------------------------------------------------------------- 2272 2273 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { 2274 if (U_FAILURE(status)) { 2275 return -1; 2276 } 2277 if (U_FAILURE(fDeferredStatus)) { 2278 status = fDeferredStatus; 2279 return -1; 2280 } 2281 if (fMatch == FALSE) { 2282 status = U_REGEX_INVALID_STATE; 2283 return -1; 2284 } 2285 if (group < 0 || group > fPattern->fGroupMap->size()) { 2286 status = U_INDEX_OUTOFBOUNDS_ERROR; 2287 return -1; 2288 } 2289 int64_t s; 2290 if (group == 0) { 2291 s = fMatchStart; 2292 } else { 2293 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 2294 U_ASSERT(groupOffset < fPattern->fFrameSize); 2295 U_ASSERT(groupOffset >= 0); 2296 s = fFrame->fExtra[groupOffset]; 2297 } 2298 2299 return s; 2300 } 2301 2302 2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { 2304 return (int32_t)start64(group, status); 2305 } 2306 2307 //-------------------------------------------------------------------------------- 2308 // 2309 // useAnchoringBounds 2310 // 2311 //-------------------------------------------------------------------------------- 2312 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { 2313 fAnchoringBounds = b; 2314 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); 2315 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); 2316 return *this; 2317 } 2318 2319 2320 //-------------------------------------------------------------------------------- 2321 // 2322 // useTransparentBounds 2323 // 2324 //-------------------------------------------------------------------------------- 2325 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { 2326 fTransparentBounds = b; 2327 fLookStart = (fTransparentBounds ? 0 : fRegionStart); 2328 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); 2329 return *this; 2330 } 2331 2332 //-------------------------------------------------------------------------------- 2333 // 2334 // setTimeLimit 2335 // 2336 //-------------------------------------------------------------------------------- 2337 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { 2338 if (U_FAILURE(status)) { 2339 return; 2340 } 2341 if (U_FAILURE(fDeferredStatus)) { 2342 status = fDeferredStatus; 2343 return; 2344 } 2345 if (limit < 0) { 2346 status = U_ILLEGAL_ARGUMENT_ERROR; 2347 return; 2348 } 2349 fTimeLimit = limit; 2350 } 2351 2352 2353 //-------------------------------------------------------------------------------- 2354 // 2355 // getTimeLimit 2356 // 2357 //-------------------------------------------------------------------------------- 2358 int32_t RegexMatcher::getTimeLimit() const { 2359 return fTimeLimit; 2360 } 2361 2362 2363 //-------------------------------------------------------------------------------- 2364 // 2365 // setStackLimit 2366 // 2367 //-------------------------------------------------------------------------------- 2368 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { 2369 if (U_FAILURE(status)) { 2370 return; 2371 } 2372 if (U_FAILURE(fDeferredStatus)) { 2373 status = fDeferredStatus; 2374 return; 2375 } 2376 if (limit < 0) { 2377 status = U_ILLEGAL_ARGUMENT_ERROR; 2378 return; 2379 } 2380 2381 // Reset the matcher. This is needed here in case there is a current match 2382 // whose final stack frame (containing the match results, pointed to by fFrame) 2383 // would be lost by resizing to a smaller stack size. 2384 reset(); 2385 2386 if (limit == 0) { 2387 // Unlimited stack expansion 2388 fStack->setMaxCapacity(0); 2389 } else { 2390 // Change the units of the limit from bytes to ints, and bump the size up 2391 // to be big enough to hold at least one stack frame for the pattern, 2392 // if it isn't there already. 2393 int32_t adjustedLimit = limit / sizeof(int32_t); 2394 if (adjustedLimit < fPattern->fFrameSize) { 2395 adjustedLimit = fPattern->fFrameSize; 2396 } 2397 fStack->setMaxCapacity(adjustedLimit); 2398 } 2399 fStackLimit = limit; 2400 } 2401 2402 2403 //-------------------------------------------------------------------------------- 2404 // 2405 // getStackLimit 2406 // 2407 //-------------------------------------------------------------------------------- 2408 int32_t RegexMatcher::getStackLimit() const { 2409 return fStackLimit; 2410 } 2411 2412 2413 //-------------------------------------------------------------------------------- 2414 // 2415 // setMatchCallback 2416 // 2417 //-------------------------------------------------------------------------------- 2418 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, 2419 const void *context, 2420 UErrorCode &status) { 2421 if (U_FAILURE(status)) { 2422 return; 2423 } 2424 fCallbackFn = callback; 2425 fCallbackContext = context; 2426 } 2427 2428 2429 //-------------------------------------------------------------------------------- 2430 // 2431 // getMatchCallback 2432 // 2433 //-------------------------------------------------------------------------------- 2434 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, 2435 const void *&context, 2436 UErrorCode &status) { 2437 if (U_FAILURE(status)) { 2438 return; 2439 } 2440 callback = fCallbackFn; 2441 context = fCallbackContext; 2442 } 2443 2444 2445 //-------------------------------------------------------------------------------- 2446 // 2447 // setMatchCallback 2448 // 2449 //-------------------------------------------------------------------------------- 2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, 2451 const void *context, 2452 UErrorCode &status) { 2453 if (U_FAILURE(status)) { 2454 return; 2455 } 2456 fFindProgressCallbackFn = callback; 2457 fFindProgressCallbackContext = context; 2458 } 2459 2460 2461 //-------------------------------------------------------------------------------- 2462 // 2463 // getMatchCallback 2464 // 2465 //-------------------------------------------------------------------------------- 2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, 2467 const void *&context, 2468 UErrorCode &status) { 2469 if (U_FAILURE(status)) { 2470 return; 2471 } 2472 callback = fFindProgressCallbackFn; 2473 context = fFindProgressCallbackContext; 2474 } 2475 2476 2477 //================================================================================ 2478 // 2479 // Code following this point in this file is the internal 2480 // Match Engine Implementation. 2481 // 2482 //================================================================================ 2483 2484 2485 //-------------------------------------------------------------------------------- 2486 // 2487 // resetStack 2488 // Discard any previous contents of the state save stack, and initialize a 2489 // new stack frame to all -1. The -1s are needed for capture group limits, 2490 // where they indicate that a group has not yet matched anything. 2491 //-------------------------------------------------------------------------------- 2492 REStackFrame *RegexMatcher::resetStack() { 2493 // Discard any previous contents of the state save stack, and initialize a 2494 // new stack frame with all -1 data. The -1s are needed for capture group limits, 2495 // where they indicate that a group has not yet matched anything. 2496 fStack->removeAllElements(); 2497 2498 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); 2499 int32_t i; 2500 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 2501 iFrame->fExtra[i] = -1; 2502 } 2503 return iFrame; 2504 } 2505 2506 2507 2508 //-------------------------------------------------------------------------------- 2509 // 2510 // isWordBoundary 2511 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 2512 // For us, 2513 // If the current char is a combining mark, 2514 // \b is FALSE. 2515 // Else Scan backwards to the first non-combining char. 2516 // We are at a boundary if the this char and the original chars are 2517 // opposite in membership in \w set 2518 // 2519 // parameters: pos - the current position in the input buffer 2520 // 2521 // TODO: double-check edge cases at region boundaries. 2522 // 2523 //-------------------------------------------------------------------------------- 2524 UBool RegexMatcher::isWordBoundary(int64_t pos) { 2525 UBool isBoundary = FALSE; 2526 UBool cIsWord = FALSE; 2527 2528 if (pos >= fLookLimit) { 2529 fHitEnd = TRUE; 2530 } else { 2531 // Determine whether char c at current position is a member of the word set of chars. 2532 // If we're off the end of the string, behave as though we're not at a word char. 2533 UTEXT_SETNATIVEINDEX(fInputText, pos); 2534 UChar32 c = UTEXT_CURRENT32(fInputText); 2535 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2536 // Current char is a combining one. Not a boundary. 2537 return FALSE; 2538 } 2539 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2540 } 2541 2542 // Back up until we come to a non-combining char, determine whether 2543 // that char is a word char. 2544 UBool prevCIsWord = FALSE; 2545 for (;;) { 2546 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { 2547 break; 2548 } 2549 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); 2550 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2551 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2552 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2553 break; 2554 } 2555 } 2556 isBoundary = cIsWord ^ prevCIsWord; 2557 return isBoundary; 2558 } 2559 2560 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { 2561 UBool isBoundary = FALSE; 2562 UBool cIsWord = FALSE; 2563 2564 const UChar *inputBuf = fInputText->chunkContents; 2565 2566 if (pos >= fLookLimit) { 2567 fHitEnd = TRUE; 2568 } else { 2569 // Determine whether char c at current position is a member of the word set of chars. 2570 // If we're off the end of the string, behave as though we're not at a word char. 2571 UChar32 c; 2572 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); 2573 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2574 // Current char is a combining one. Not a boundary. 2575 return FALSE; 2576 } 2577 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2578 } 2579 2580 // Back up until we come to a non-combining char, determine whether 2581 // that char is a word char. 2582 UBool prevCIsWord = FALSE; 2583 for (;;) { 2584 if (pos <= fLookStart) { 2585 break; 2586 } 2587 UChar32 prevChar; 2588 U16_PREV(inputBuf, fLookStart, pos, prevChar); 2589 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2590 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2591 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2592 break; 2593 } 2594 } 2595 isBoundary = cIsWord ^ prevCIsWord; 2596 return isBoundary; 2597 } 2598 2599 //-------------------------------------------------------------------------------- 2600 // 2601 // isUWordBoundary 2602 // 2603 // Test for a word boundary using RBBI word break. 2604 // 2605 // parameters: pos - the current position in the input buffer 2606 // 2607 //-------------------------------------------------------------------------------- 2608 UBool RegexMatcher::isUWordBoundary(int64_t pos) { 2609 UBool returnVal = FALSE; 2610 #if UCONFIG_NO_BREAK_ITERATION==0 2611 2612 // If we haven't yet created a break iterator for this matcher, do it now. 2613 if (fWordBreakItr == NULL) { 2614 fWordBreakItr = 2615 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); 2616 if (U_FAILURE(fDeferredStatus)) { 2617 return FALSE; 2618 } 2619 fWordBreakItr->setText(fInputText, fDeferredStatus); 2620 } 2621 2622 if (pos >= fLookLimit) { 2623 fHitEnd = TRUE; 2624 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" 2625 // words are not boundaries. All non-word chars stand by themselves, 2626 // with word boundaries on both sides. 2627 } else { 2628 if (!UTEXT_USES_U16(fInputText)) { 2629 // !!!: Would like a better way to do this! 2630 UErrorCode status = U_ZERO_ERROR; 2631 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); 2632 } 2633 returnVal = fWordBreakItr->isBoundary((int32_t)pos); 2634 } 2635 #endif 2636 return returnVal; 2637 } 2638 2639 //-------------------------------------------------------------------------------- 2640 // 2641 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state 2642 // saves. Increment the "time" counter, and call the 2643 // user callback function if there is one installed. 2644 // 2645 // If the match operation needs to be aborted, either for a time-out 2646 // or because the user callback asked for it, just set an error status. 2647 // The engine will pick that up and stop in its outer loop. 2648 // 2649 //-------------------------------------------------------------------------------- 2650 void RegexMatcher::IncrementTime(UErrorCode &status) { 2651 fTickCounter = TIMER_INITIAL_VALUE; 2652 fTime++; 2653 if (fCallbackFn != NULL) { 2654 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { 2655 status = U_REGEX_STOPPED_BY_CALLER; 2656 return; 2657 } 2658 } 2659 if (fTimeLimit > 0 && fTime >= fTimeLimit) { 2660 status = U_REGEX_TIME_OUT; 2661 } 2662 } 2663 2664 //-------------------------------------------------------------------------------- 2665 // 2666 // ReportFindProgress This function is called once for each advance in the target 2667 // string from the find() function, and calls the user progress callback 2668 // function if there is one installed. 2669 // 2670 // NOTE: 2671 // 2672 // If the match operation needs to be aborted because the user 2673 // callback asked for it, just set an error status. 2674 // The engine will pick that up and stop in its outer loop. 2675 // 2676 //-------------------------------------------------------------------------------- 2677 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) { 2678 if (fFindProgressCallbackFn != NULL) { 2679 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) { 2680 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/; 2681 return FALSE; 2682 } 2683 } 2684 return TRUE; 2685 } 2686 2687 //-------------------------------------------------------------------------------- 2688 // 2689 // StateSave 2690 // Make a new stack frame, initialized as a copy of the current stack frame. 2691 // Set the pattern index in the original stack frame from the operand value 2692 // in the opcode. Execution of the engine continues with the state in 2693 // the newly created stack frame 2694 // 2695 // Note that reserveBlock() may grow the stack, resulting in the 2696 // whole thing being relocated in memory. 2697 // 2698 // Parameters: 2699 // fp The top frame pointer when called. At return, a new 2700 // fame will be present 2701 // savePatIdx An index into the compiled pattern. Goes into the original 2702 // (not new) frame. If execution ever back-tracks out of the 2703 // new frame, this will be where we continue from in the pattern. 2704 // Return 2705 // The new frame pointer. 2706 // 2707 //-------------------------------------------------------------------------------- 2708 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { 2709 // push storage for a new frame. 2710 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); 2711 if (newFP == NULL) { 2712 // Failure on attempted stack expansion. 2713 // Stack function set some other error code, change it to a more 2714 // specific one for regular expressions. 2715 status = U_REGEX_STACK_OVERFLOW; 2716 // We need to return a writable stack frame, so just return the 2717 // previous frame. The match operation will stop quickly 2718 // because of the error status, after which the frame will never 2719 // be looked at again. 2720 return fp; 2721 } 2722 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. 2723 2724 // New stack frame = copy of old top frame. 2725 int64_t *source = (int64_t *)fp; 2726 int64_t *dest = newFP; 2727 for (;;) { 2728 *dest++ = *source++; 2729 if (source == newFP) { 2730 break; 2731 } 2732 } 2733 2734 fTickCounter--; 2735 if (fTickCounter <= 0) { 2736 IncrementTime(status); // Re-initializes fTickCounter 2737 } 2738 fp->fPatIdx = savePatIdx; 2739 return (REStackFrame *)newFP; 2740 } 2741 2742 2743 //-------------------------------------------------------------------------------- 2744 // 2745 // MatchAt This is the actual matching engine. 2746 // 2747 // startIdx: begin matching a this index. 2748 // toEnd: if true, match must extend to end of the input region 2749 // 2750 //-------------------------------------------------------------------------------- 2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { 2752 UBool isMatch = FALSE; // True if the we have a match. 2753 2754 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards 2755 2756 int32_t op; // Operation from the compiled pattern, split into 2757 int32_t opType; // the opcode 2758 int32_t opValue; // and the operand value. 2759 2760 #ifdef REGEX_RUN_DEBUG 2761 if (fTraceDebug) 2762 { 2763 printf("MatchAt(startIdx=%ld)\n", startIdx); 2764 printf("Original Pattern: "); 2765 UChar32 c = utext_next32From(fPattern->fPattern, 0); 2766 while (c != U_SENTINEL) { 2767 if (c<32 || c>256) { 2768 c = '.'; 2769 } 2770 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 2771 2772 c = UTEXT_NEXT32(fPattern->fPattern); 2773 } 2774 printf("\n"); 2775 printf("Input String: "); 2776 c = utext_next32From(fInputText, 0); 2777 while (c != U_SENTINEL) { 2778 if (c<32 || c>256) { 2779 c = '.'; 2780 } 2781 printf("%c", c); 2782 2783 c = UTEXT_NEXT32(fInputText); 2784 } 2785 printf("\n"); 2786 printf("\n"); 2787 } 2788 #endif 2789 2790 if (U_FAILURE(status)) { 2791 return; 2792 } 2793 2794 // Cache frequently referenced items from the compiled pattern 2795 // 2796 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 2797 2798 const UChar *litText = fPattern->fLiteralText.getBuffer(); 2799 UVector *sets = fPattern->fSets; 2800 2801 fFrameSize = fPattern->fFrameSize; 2802 REStackFrame *fp = resetStack(); 2803 2804 fp->fPatIdx = 0; 2805 fp->fInputIdx = startIdx; 2806 2807 // Zero out the pattern's static data 2808 int32_t i; 2809 for (i = 0; i<fPattern->fDataSize; i++) { 2810 fData[i] = 0; 2811 } 2812 2813 // 2814 // Main loop for interpreting the compiled pattern. 2815 // One iteration of the loop per pattern operation performed. 2816 // 2817 for (;;) { 2818 #if 0 2819 if (_heapchk() != _HEAPOK) { 2820 fprintf(stderr, "Heap Trouble\n"); 2821 } 2822 #endif 2823 2824 op = (int32_t)pat[fp->fPatIdx]; 2825 opType = URX_TYPE(op); 2826 opValue = URX_VAL(op); 2827 #ifdef REGEX_RUN_DEBUG 2828 if (fTraceDebug) { 2829 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2830 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 2831 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 2832 fPattern->dumpOp(fp->fPatIdx); 2833 } 2834 #endif 2835 fp->fPatIdx++; 2836 2837 switch (opType) { 2838 2839 2840 case URX_NOP: 2841 break; 2842 2843 2844 case URX_BACKTRACK: 2845 // Force a backtrack. In some circumstances, the pattern compiler 2846 // will notice that the pattern can't possibly match anything, and will 2847 // emit one of these at that point. 2848 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2849 break; 2850 2851 2852 case URX_ONECHAR: 2853 if (fp->fInputIdx < fActiveLimit) { 2854 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2855 UChar32 c = UTEXT_NEXT32(fInputText); 2856 if (c == opValue) { 2857 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2858 break; 2859 } 2860 } else { 2861 fHitEnd = TRUE; 2862 } 2863 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2864 break; 2865 2866 2867 case URX_STRING: 2868 { 2869 // Test input against a literal string. 2870 // Strings require two slots in the compiled pattern, one for the 2871 // offset to the string text, and one for the length. 2872 2873 int32_t stringStartIdx = opValue; 2874 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 2875 fp->fPatIdx++; 2876 opType = URX_TYPE(op); 2877 int32_t stringLen = URX_VAL(op); 2878 U_ASSERT(opType == URX_STRING_LEN); 2879 U_ASSERT(stringLen >= 2); 2880 2881 const UChar *patternString = litText+stringStartIdx; 2882 int32_t patternStringIndex = 0; 2883 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2884 UChar32 inputChar; 2885 UChar32 patternChar; 2886 UBool success = TRUE; 2887 while (patternStringIndex < stringLen) { 2888 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 2889 success = FALSE; 2890 fHitEnd = TRUE; 2891 break; 2892 } 2893 inputChar = UTEXT_NEXT32(fInputText); 2894 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar); 2895 if (patternChar != inputChar) { 2896 success = FALSE; 2897 break; 2898 } 2899 } 2900 2901 if (success) { 2902 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2903 } else { 2904 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2905 } 2906 } 2907 break; 2908 2909 2910 case URX_STATE_SAVE: 2911 fp = StateSave(fp, opValue, status); 2912 break; 2913 2914 2915 case URX_END: 2916 // The match loop will exit via this path on a successful match, 2917 // when we reach the end of the pattern. 2918 if (toEnd && fp->fInputIdx != fActiveLimit) { 2919 // The pattern matched, but not to the end of input. Try some more. 2920 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2921 break; 2922 } 2923 isMatch = TRUE; 2924 goto breakFromLoop; 2925 2926 // Start and End Capture stack frame variables are laid out out like this: 2927 // fp->fExtra[opValue] - The start of a completed capture group 2928 // opValue+1 - The end of a completed capture group 2929 // opValue+2 - the start of a capture group whose end 2930 // has not yet been reached (and might not ever be). 2931 case URX_START_CAPTURE: 2932 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2933 fp->fExtra[opValue+2] = fp->fInputIdx; 2934 break; 2935 2936 2937 case URX_END_CAPTURE: 2938 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2939 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 2940 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 2941 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 2942 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 2943 break; 2944 2945 2946 case URX_DOLLAR: // $, test for End of line 2947 // or for position before new line at end of input 2948 { 2949 if (fp->fInputIdx >= fAnchorLimit) { 2950 // We really are at the end of input. Success. 2951 fHitEnd = TRUE; 2952 fRequireEnd = TRUE; 2953 break; 2954 } 2955 2956 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2957 2958 // If we are positioned just before a new-line that is located at the 2959 // end of input, succeed. 2960 UChar32 c = UTEXT_NEXT32(fInputText); 2961 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2962 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 2963 // If not in the middle of a CR/LF sequence 2964 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 2965 // At new-line at end of input. Success 2966 fHitEnd = TRUE; 2967 fRequireEnd = TRUE; 2968 2969 break; 2970 } 2971 } 2972 } else { 2973 UChar32 nextC = UTEXT_NEXT32(fInputText); 2974 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2975 fHitEnd = TRUE; 2976 fRequireEnd = TRUE; 2977 break; // At CR/LF at end of input. Success 2978 } 2979 } 2980 2981 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2982 } 2983 break; 2984 2985 2986 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 2987 if (fp->fInputIdx >= fAnchorLimit) { 2988 // Off the end of input. Success. 2989 fHitEnd = TRUE; 2990 fRequireEnd = TRUE; 2991 break; 2992 } else { 2993 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2994 UChar32 c = UTEXT_NEXT32(fInputText); 2995 // Either at the last character of input, or off the end. 2996 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { 2997 fHitEnd = TRUE; 2998 fRequireEnd = TRUE; 2999 break; 3000 } 3001 } 3002 3003 // Not at end of input. Back-track out. 3004 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3005 break; 3006 3007 3008 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 3009 { 3010 if (fp->fInputIdx >= fAnchorLimit) { 3011 // We really are at the end of input. Success. 3012 fHitEnd = TRUE; 3013 fRequireEnd = TRUE; 3014 break; 3015 } 3016 // If we are positioned just before a new-line, succeed. 3017 // It makes no difference where the new-line is within the input. 3018 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3019 UChar32 c = UTEXT_CURRENT32(fInputText); 3020 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 3021 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 3022 // In multi-line mode, hitting a new-line just before the end of input does not 3023 // set the hitEnd or requireEnd flags 3024 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { 3025 break; 3026 } 3027 } 3028 // not at a new line. Fail. 3029 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3030 } 3031 break; 3032 3033 3034 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 3035 { 3036 if (fp->fInputIdx >= fAnchorLimit) { 3037 // We really are at the end of input. Success. 3038 fHitEnd = TRUE; 3039 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 3040 break; // adding a new-line would not lose the match. 3041 } 3042 // If we are not positioned just before a new-line, the test fails; backtrack out. 3043 // It makes no difference where the new-line is within the input. 3044 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3045 if (UTEXT_CURRENT32(fInputText) != 0x0a) { 3046 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3047 } 3048 } 3049 break; 3050 3051 3052 case URX_CARET: // ^, test for start of line 3053 if (fp->fInputIdx != fAnchorStart) { 3054 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3055 } 3056 break; 3057 3058 3059 case URX_CARET_M: // ^, test for start of line in mulit-line mode 3060 { 3061 if (fp->fInputIdx == fAnchorStart) { 3062 // We are at the start input. Success. 3063 break; 3064 } 3065 // Check whether character just before the current pos is a new-line 3066 // unless we are at the end of input 3067 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3068 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3069 if ((fp->fInputIdx < fAnchorLimit) && 3070 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3071 // It's a new-line. ^ is true. Success. 3072 // TODO: what should be done with positions between a CR and LF? 3073 break; 3074 } 3075 // Not at the start of a line. Fail. 3076 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3077 } 3078 break; 3079 3080 3081 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 3082 { 3083 U_ASSERT(fp->fInputIdx >= fAnchorStart); 3084 if (fp->fInputIdx <= fAnchorStart) { 3085 // We are at the start input. Success. 3086 break; 3087 } 3088 // Check whether character just before the current pos is a new-line 3089 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 3090 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3091 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3092 if (c != 0x0a) { 3093 // Not at the start of a line. Back-track out. 3094 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3095 } 3096 } 3097 break; 3098 3099 case URX_BACKSLASH_B: // Test for word boundaries 3100 { 3101 UBool success = isWordBoundary(fp->fInputIdx); 3102 success ^= (UBool)(opValue != 0); // flip sense for \B 3103 if (!success) { 3104 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3105 } 3106 } 3107 break; 3108 3109 3110 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 3111 { 3112 UBool success = isUWordBoundary(fp->fInputIdx); 3113 success ^= (UBool)(opValue != 0); // flip sense for \B 3114 if (!success) { 3115 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3116 } 3117 } 3118 break; 3119 3120 3121 case URX_BACKSLASH_D: // Test for decimal digit 3122 { 3123 if (fp->fInputIdx >= fActiveLimit) { 3124 fHitEnd = TRUE; 3125 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3126 break; 3127 } 3128 3129 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3130 3131 UChar32 c = UTEXT_NEXT32(fInputText); 3132 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 3133 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 3134 success ^= (UBool)(opValue != 0); // flip sense for \D 3135 if (success) { 3136 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3137 } else { 3138 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3139 } 3140 } 3141 break; 3142 3143 3144 case URX_BACKSLASH_G: // Test for position at end of previous match 3145 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 3146 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3147 } 3148 break; 3149 3150 3151 case URX_BACKSLASH_X: 3152 // Match a Grapheme, as defined by Unicode TR 29. 3153 // Differs slightly from Perl, which consumes combining marks independently 3154 // of context. 3155 { 3156 3157 // Fail if at end of input 3158 if (fp->fInputIdx >= fActiveLimit) { 3159 fHitEnd = TRUE; 3160 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3161 break; 3162 } 3163 3164 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3165 3166 // Examine (and consume) the current char. 3167 // Dispatch into a little state machine, based on the char. 3168 UChar32 c; 3169 c = UTEXT_NEXT32(fInputText); 3170 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3171 UnicodeSet **sets = fPattern->fStaticSets; 3172 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 3173 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 3174 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3175 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3176 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3177 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3178 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3179 goto GC_Extend; 3180 3181 3182 3183 GC_L: 3184 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3185 c = UTEXT_NEXT32(fInputText); 3186 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3187 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3188 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3189 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3190 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3191 (void)UTEXT_PREVIOUS32(fInputText); 3192 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3193 goto GC_Extend; 3194 3195 GC_V: 3196 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3197 c = UTEXT_NEXT32(fInputText); 3198 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3199 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3200 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3201 (void)UTEXT_PREVIOUS32(fInputText); 3202 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3203 goto GC_Extend; 3204 3205 GC_T: 3206 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3207 c = UTEXT_NEXT32(fInputText); 3208 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3209 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3210 (void)UTEXT_PREVIOUS32(fInputText); 3211 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3212 goto GC_Extend; 3213 3214 GC_Extend: 3215 // Combining characters are consumed here 3216 for (;;) { 3217 if (fp->fInputIdx >= fActiveLimit) { 3218 break; 3219 } 3220 c = UTEXT_CURRENT32(fInputText); 3221 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 3222 break; 3223 } 3224 (void)UTEXT_NEXT32(fInputText); 3225 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3226 } 3227 goto GC_Done; 3228 3229 GC_Control: 3230 // Most control chars stand alone (don't combine with combining chars), 3231 // except for that CR/LF sequence is a single grapheme cluster. 3232 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 3233 c = UTEXT_NEXT32(fInputText); 3234 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3235 } 3236 3237 GC_Done: 3238 if (fp->fInputIdx >= fActiveLimit) { 3239 fHitEnd = TRUE; 3240 } 3241 break; 3242 } 3243 3244 3245 3246 3247 case URX_BACKSLASH_Z: // Test for end of Input 3248 if (fp->fInputIdx < fAnchorLimit) { 3249 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3250 } else { 3251 fHitEnd = TRUE; 3252 fRequireEnd = TRUE; 3253 } 3254 break; 3255 3256 3257 3258 case URX_STATIC_SETREF: 3259 { 3260 // Test input character against one of the predefined sets 3261 // (Word Characters, for example) 3262 // The high bit of the op value is a flag for the match polarity. 3263 // 0: success if input char is in set. 3264 // 1: success if input char is not in set. 3265 if (fp->fInputIdx >= fActiveLimit) { 3266 fHitEnd = TRUE; 3267 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3268 break; 3269 } 3270 3271 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 3272 opValue &= ~URX_NEG_SET; 3273 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3274 3275 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3276 UChar32 c = UTEXT_NEXT32(fInputText); 3277 if (c < 256) { 3278 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3279 if (s8->contains(c)) { 3280 success = !success; 3281 } 3282 } else { 3283 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3284 if (s->contains(c)) { 3285 success = !success; 3286 } 3287 } 3288 if (success) { 3289 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3290 } else { 3291 // the character wasn't in the set. 3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3293 } 3294 } 3295 break; 3296 3297 3298 case URX_STAT_SETREF_N: 3299 { 3300 // Test input character for NOT being a member of one of 3301 // the predefined sets (Word Characters, for example) 3302 if (fp->fInputIdx >= fActiveLimit) { 3303 fHitEnd = TRUE; 3304 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3305 break; 3306 } 3307 3308 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3309 3310 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3311 3312 UChar32 c = UTEXT_NEXT32(fInputText); 3313 if (c < 256) { 3314 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3315 if (s8->contains(c) == FALSE) { 3316 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3317 break; 3318 } 3319 } else { 3320 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3321 if (s->contains(c) == FALSE) { 3322 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3323 break; 3324 } 3325 } 3326 // the character wasn't in the set. 3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3328 } 3329 break; 3330 3331 3332 case URX_SETREF: 3333 if (fp->fInputIdx >= fActiveLimit) { 3334 fHitEnd = TRUE; 3335 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3336 break; 3337 } else { 3338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3339 3340 // There is input left. Pick up one char and test it for set membership. 3341 UChar32 c = UTEXT_NEXT32(fInputText); 3342 U_ASSERT(opValue > 0 && opValue < sets->size()); 3343 if (c<256) { 3344 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3345 if (s8->contains(c)) { 3346 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3347 break; 3348 } 3349 } else { 3350 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3351 if (s->contains(c)) { 3352 // The character is in the set. A Match. 3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3354 break; 3355 } 3356 } 3357 3358 // the character wasn't in the set. 3359 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3360 } 3361 break; 3362 3363 3364 case URX_DOTANY: 3365 { 3366 // . matches anything, but stops at end-of-line. 3367 if (fp->fInputIdx >= fActiveLimit) { 3368 // At end of input. Match failed. Backtrack out. 3369 fHitEnd = TRUE; 3370 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3371 break; 3372 } 3373 3374 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3375 3376 // There is input left. Advance over one char, unless we've hit end-of-line 3377 UChar32 c = UTEXT_NEXT32(fInputText); 3378 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 3379 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3380 // End of line in normal mode. . does not match. 3381 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3382 break; 3383 } 3384 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3385 } 3386 break; 3387 3388 3389 case URX_DOTANY_ALL: 3390 { 3391 // ., in dot-matches-all (including new lines) mode 3392 if (fp->fInputIdx >= fActiveLimit) { 3393 // At end of input. Match failed. Backtrack out. 3394 fHitEnd = TRUE; 3395 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3396 break; 3397 } 3398 3399 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3400 3401 // There is input left. Advance over one char, except if we are 3402 // at a cr/lf, advance over both of them. 3403 UChar32 c; 3404 c = UTEXT_NEXT32(fInputText); 3405 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3406 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 3407 // In the case of a CR/LF, we need to advance over both. 3408 UChar32 nextc = UTEXT_CURRENT32(fInputText); 3409 if (nextc == 0x0a) { 3410 (void)UTEXT_NEXT32(fInputText); 3411 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3412 } 3413 } 3414 } 3415 break; 3416 3417 3418 case URX_DOTANY_UNIX: 3419 { 3420 // '.' operator, matches all, but stops at end-of-line. 3421 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 3422 if (fp->fInputIdx >= fActiveLimit) { 3423 // At end of input. Match failed. Backtrack out. 3424 fHitEnd = TRUE; 3425 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3426 break; 3427 } 3428 3429 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3430 3431 // There is input left. Advance over one char, unless we've hit end-of-line 3432 UChar32 c = UTEXT_NEXT32(fInputText); 3433 if (c == 0x0a) { 3434 // End of line in normal mode. '.' does not match the \n 3435 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3436 } else { 3437 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3438 } 3439 } 3440 break; 3441 3442 3443 case URX_JMP: 3444 fp->fPatIdx = opValue; 3445 break; 3446 3447 case URX_FAIL: 3448 isMatch = FALSE; 3449 goto breakFromLoop; 3450 3451 case URX_JMP_SAV: 3452 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 3453 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3454 fp->fPatIdx = opValue; // Then JMP. 3455 break; 3456 3457 case URX_JMP_SAV_X: 3458 // This opcode is used with (x)+, when x can match a zero length string. 3459 // Same as JMP_SAV, except conditional on the match having made forward progress. 3460 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 3461 // data address of the input position at the start of the loop. 3462 { 3463 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 3464 int32_t stoOp = (int32_t)pat[opValue-1]; 3465 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 3466 int32_t frameLoc = URX_VAL(stoOp); 3467 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 3468 int64_t prevInputIdx = fp->fExtra[frameLoc]; 3469 U_ASSERT(prevInputIdx <= fp->fInputIdx); 3470 if (prevInputIdx < fp->fInputIdx) { 3471 // The match did make progress. Repeat the loop. 3472 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3473 fp->fPatIdx = opValue; 3474 fp->fExtra[frameLoc] = fp->fInputIdx; 3475 } 3476 // If the input position did not advance, we do nothing here, 3477 // execution will fall out of the loop. 3478 } 3479 break; 3480 3481 case URX_CTR_INIT: 3482 { 3483 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3484 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3485 3486 // Pick up the three extra operands that CTR_INIT has, and 3487 // skip the pattern location counter past 3488 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3489 fp->fPatIdx += 3; 3490 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3491 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3492 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3493 U_ASSERT(minCount>=0); 3494 U_ASSERT(maxCount>=minCount || maxCount==-1); 3495 U_ASSERT(loopLoc>=fp->fPatIdx); 3496 3497 if (minCount == 0) { 3498 fp = StateSave(fp, loopLoc+1, status); 3499 } 3500 if (maxCount == -1) { 3501 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 3502 } else if (maxCount == 0) { 3503 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3504 } 3505 } 3506 break; 3507 3508 case URX_CTR_LOOP: 3509 { 3510 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3511 int32_t initOp = (int32_t)pat[opValue]; 3512 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 3513 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3514 int32_t minCount = (int32_t)pat[opValue+2]; 3515 int32_t maxCount = (int32_t)pat[opValue+3]; 3516 (*pCounter)++; 3517 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 3518 U_ASSERT(*pCounter == maxCount); 3519 break; 3520 } 3521 if (*pCounter >= minCount) { 3522 if (maxCount == -1) { 3523 // Loop has no hard upper bound. 3524 // Check that it is progressing through the input, break if it is not. 3525 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 3526 if (fp->fInputIdx == *pLastInputIdx) { 3527 break; 3528 } else { 3529 *pLastInputIdx = fp->fInputIdx; 3530 } 3531 } 3532 fp = StateSave(fp, fp->fPatIdx, status); 3533 } 3534 fp->fPatIdx = opValue + 4; // Loop back. 3535 } 3536 break; 3537 3538 case URX_CTR_INIT_NG: 3539 { 3540 // Initialize a non-greedy loop 3541 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3542 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3543 3544 // Pick up the three extra operands that CTR_INIT_NG has, and 3545 // skip the pattern location counter past 3546 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3547 fp->fPatIdx += 3; 3548 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3549 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3550 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3551 U_ASSERT(minCount>=0); 3552 U_ASSERT(maxCount>=minCount || maxCount==-1); 3553 U_ASSERT(loopLoc>fp->fPatIdx); 3554 if (maxCount == -1) { 3555 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 3556 } 3557 3558 if (minCount == 0) { 3559 if (maxCount != 0) { 3560 fp = StateSave(fp, fp->fPatIdx, status); 3561 } 3562 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 3563 } 3564 } 3565 break; 3566 3567 case URX_CTR_LOOP_NG: 3568 { 3569 // Non-greedy {min, max} loops 3570 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3571 int32_t initOp = (int32_t)pat[opValue]; 3572 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 3573 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3574 int32_t minCount = (int32_t)pat[opValue+2]; 3575 int32_t maxCount = (int32_t)pat[opValue+3]; 3576 3577 (*pCounter)++; 3578 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 3579 // The loop has matched the maximum permitted number of times. 3580 // Break out of here with no action. Matching will 3581 // continue with the following pattern. 3582 U_ASSERT(*pCounter == maxCount); 3583 break; 3584 } 3585 3586 if (*pCounter < minCount) { 3587 // We haven't met the minimum number of matches yet. 3588 // Loop back for another one. 3589 fp->fPatIdx = opValue + 4; // Loop back. 3590 } else { 3591 // We do have the minimum number of matches. 3592 3593 // If there is no upper bound on the loop iterations, check that the input index 3594 // is progressing, and stop the loop if it is not. 3595 if (maxCount == -1) { 3596 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 3597 if (fp->fInputIdx == *pLastInputIdx) { 3598 break; 3599 } 3600 *pLastInputIdx = fp->fInputIdx; 3601 } 3602 3603 // Loop Continuation: we will fall into the pattern following the loop 3604 // (non-greedy, don't execute loop body first), but first do 3605 // a state save to the top of the loop, so that a match failure 3606 // in the following pattern will try another iteration of the loop. 3607 fp = StateSave(fp, opValue + 4, status); 3608 } 3609 } 3610 break; 3611 3612 case URX_STO_SP: 3613 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3614 fData[opValue] = fStack->size(); 3615 break; 3616 3617 case URX_LD_SP: 3618 { 3619 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3620 int32_t newStackSize = (int32_t)fData[opValue]; 3621 U_ASSERT(newStackSize <= fStack->size()); 3622 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3623 if (newFP == (int64_t *)fp) { 3624 break; 3625 } 3626 int32_t i; 3627 for (i=0; i<fFrameSize; i++) { 3628 newFP[i] = ((int64_t *)fp)[i]; 3629 } 3630 fp = (REStackFrame *)newFP; 3631 fStack->setSize(newStackSize); 3632 } 3633 break; 3634 3635 case URX_BACKREF: 3636 { 3637 U_ASSERT(opValue < fFrameSize); 3638 int64_t groupStartIdx = fp->fExtra[opValue]; 3639 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3640 U_ASSERT(groupStartIdx <= groupEndIdx); 3641 if (groupStartIdx < 0) { 3642 // This capture group has not participated in the match thus far, 3643 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3644 break; 3645 } 3646 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); 3647 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3648 3649 // Note: if the capture group match was of an empty string the backref 3650 // match succeeds. Verified by testing: Perl matches succeed 3651 // in this case, so we do too. 3652 3653 UBool success = TRUE; 3654 for (;;) { 3655 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 3656 success = TRUE; 3657 break; 3658 } 3659 if (utext_getNativeIndex(fInputText) >= fActiveLimit) { 3660 success = FALSE; 3661 fHitEnd = TRUE; 3662 break; 3663 } 3664 UChar32 captureGroupChar = utext_next32(fAltInputText); 3665 UChar32 inputChar = utext_next32(fInputText); 3666 if (inputChar != captureGroupChar) { 3667 success = FALSE; 3668 break; 3669 } 3670 } 3671 3672 if (success) { 3673 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3674 } else { 3675 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3676 } 3677 } 3678 break; 3679 3680 3681 3682 case URX_BACKREF_I: 3683 { 3684 U_ASSERT(opValue < fFrameSize); 3685 int64_t groupStartIdx = fp->fExtra[opValue]; 3686 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3687 U_ASSERT(groupStartIdx <= groupEndIdx); 3688 if (groupStartIdx < 0) { 3689 // This capture group has not participated in the match thus far, 3690 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3691 break; 3692 } 3693 utext_setNativeIndex(fAltInputText, groupStartIdx); 3694 utext_setNativeIndex(fInputText, fp->fInputIdx); 3695 CaseFoldingUTextIterator captureGroupItr(*fAltInputText); 3696 CaseFoldingUTextIterator inputItr(*fInputText); 3697 3698 // Note: if the capture group match was of an empty string the backref 3699 // match succeeds. Verified by testing: Perl matches succeed 3700 // in this case, so we do too. 3701 3702 UBool success = TRUE; 3703 for (;;) { 3704 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 3705 success = TRUE; 3706 break; 3707 } 3708 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) { 3709 success = FALSE; 3710 fHitEnd = TRUE; 3711 break; 3712 } 3713 UChar32 captureGroupChar = captureGroupItr.next(); 3714 UChar32 inputChar = inputItr.next(); 3715 if (inputChar != captureGroupChar) { 3716 success = FALSE; 3717 break; 3718 } 3719 } 3720 3721 if (success && inputItr.inExpansion()) { 3722 // We otained a match by consuming part of a string obtained from 3723 // case-folding a single code point of the input text. 3724 // This does not count as an overall match. 3725 success = FALSE; 3726 } 3727 3728 if (success) { 3729 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3730 } else { 3731 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3732 } 3733 3734 } 3735 break; 3736 3737 case URX_STO_INP_LOC: 3738 { 3739 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 3740 fp->fExtra[opValue] = fp->fInputIdx; 3741 } 3742 break; 3743 3744 case URX_JMPX: 3745 { 3746 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3747 fp->fPatIdx += 1; 3748 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 3749 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 3750 int64_t savedInputIdx = fp->fExtra[dataLoc]; 3751 U_ASSERT(savedInputIdx <= fp->fInputIdx); 3752 if (savedInputIdx < fp->fInputIdx) { 3753 fp->fPatIdx = opValue; // JMP 3754 } else { 3755 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 3756 } 3757 } 3758 break; 3759 3760 case URX_LA_START: 3761 { 3762 // Entering a lookahead block. 3763 // Save Stack Ptr, Input Pos. 3764 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3765 fData[opValue] = fStack->size(); 3766 fData[opValue+1] = fp->fInputIdx; 3767 fActiveStart = fLookStart; // Set the match region change for 3768 fActiveLimit = fLookLimit; // transparent bounds. 3769 } 3770 break; 3771 3772 case URX_LA_END: 3773 { 3774 // Leaving a look-ahead block. 3775 // restore Stack Ptr, Input Pos to positions they had on entry to block. 3776 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3777 int32_t stackSize = fStack->size(); 3778 int32_t newStackSize =(int32_t)fData[opValue]; 3779 U_ASSERT(stackSize >= newStackSize); 3780 if (stackSize > newStackSize) { 3781 // Copy the current top frame back to the new (cut back) top frame. 3782 // This makes the capture groups from within the look-ahead 3783 // expression available. 3784 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3785 int32_t i; 3786 for (i=0; i<fFrameSize; i++) { 3787 newFP[i] = ((int64_t *)fp)[i]; 3788 } 3789 fp = (REStackFrame *)newFP; 3790 fStack->setSize(newStackSize); 3791 } 3792 fp->fInputIdx = fData[opValue+1]; 3793 3794 // Restore the active region bounds in the input string; they may have 3795 // been changed because of transparent bounds on a Region. 3796 fActiveStart = fRegionStart; 3797 fActiveLimit = fRegionLimit; 3798 } 3799 break; 3800 3801 case URX_ONECHAR_I: 3802 // Case insensitive one char. The char from the pattern is already case folded. 3803 // Input text is not, but case folding the input can not reduce two or more code 3804 // points to one. 3805 if (fp->fInputIdx < fActiveLimit) { 3806 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3807 3808 UChar32 c = UTEXT_NEXT32(fInputText); 3809 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3810 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3811 break; 3812 } 3813 } else { 3814 fHitEnd = TRUE; 3815 } 3816 3817 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3818 break; 3819 3820 case URX_STRING_I: 3821 { 3822 // Case-insensitive test input against a literal string. 3823 // Strings require two slots in the compiled pattern, one for the 3824 // offset to the string text, and one for the length. 3825 // The compiled string has already been case folded. 3826 { 3827 const UChar *patternString = litText + opValue; 3828 int32_t patternStringIdx = 0; 3829 3830 op = (int32_t)pat[fp->fPatIdx]; 3831 fp->fPatIdx++; 3832 opType = URX_TYPE(op); 3833 opValue = URX_VAL(op); 3834 U_ASSERT(opType == URX_STRING_LEN); 3835 int32_t patternStringLen = opValue; // Length of the string from the pattern. 3836 3837 3838 UChar32 cPattern; 3839 UChar32 cText; 3840 UBool success = TRUE; 3841 3842 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3843 CaseFoldingUTextIterator inputIterator(*fInputText); 3844 while (patternStringIdx < patternStringLen) { 3845 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 3846 success = FALSE; 3847 fHitEnd = TRUE; 3848 break; 3849 } 3850 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 3851 cText = inputIterator.next(); 3852 if (cText != cPattern) { 3853 success = FALSE; 3854 break; 3855 } 3856 } 3857 if (inputIterator.inExpansion()) { 3858 success = FALSE; 3859 } 3860 3861 if (success) { 3862 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3863 } else { 3864 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3865 } 3866 } 3867 } 3868 break; 3869 3870 case URX_LB_START: 3871 { 3872 // Entering a look-behind block. 3873 // Save Stack Ptr, Input Pos. 3874 // TODO: implement transparent bounds. Ticket #6067 3875 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3876 fData[opValue] = fStack->size(); 3877 fData[opValue+1] = fp->fInputIdx; 3878 // Init the variable containing the start index for attempted matches. 3879 fData[opValue+2] = -1; 3880 // Save input string length, then reset to pin any matches to end at 3881 // the current position. 3882 fData[opValue+3] = fActiveLimit; 3883 fActiveLimit = fp->fInputIdx; 3884 } 3885 break; 3886 3887 3888 case URX_LB_CONT: 3889 { 3890 // Positive Look-Behind, at top of loop checking for matches of LB expression 3891 // at all possible input starting positions. 3892 3893 // Fetch the min and max possible match lengths. They are the operands 3894 // of this op in the pattern. 3895 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 3896 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 3897 U_ASSERT(minML <= maxML); 3898 U_ASSERT(minML >= 0); 3899 3900 // Fetch (from data) the last input index where a match was attempted. 3901 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3902 int64_t *lbStartIdx = &fData[opValue+2]; 3903 if (*lbStartIdx < 0) { 3904 // First time through loop. 3905 *lbStartIdx = fp->fInputIdx - minML; 3906 } else { 3907 // 2nd through nth time through the loop. 3908 // Back up start position for match by one. 3909 if (*lbStartIdx == 0) { 3910 (*lbStartIdx)--; 3911 } else { 3912 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 3913 (void)UTEXT_PREVIOUS32(fInputText); 3914 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 3915 } 3916 } 3917 3918 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 3919 // We have tried all potential match starting points without 3920 // getting a match. Backtrack out, and out of the 3921 // Look Behind altogether. 3922 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3923 int64_t restoreInputLen = fData[opValue+3]; 3924 U_ASSERT(restoreInputLen >= fActiveLimit); 3925 U_ASSERT(restoreInputLen <= fInputLength); 3926 fActiveLimit = restoreInputLen; 3927 break; 3928 } 3929 3930 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 3931 // (successful match will fall off the end of the loop.) 3932 fp = StateSave(fp, fp->fPatIdx-3, status); 3933 fp->fInputIdx = *lbStartIdx; 3934 } 3935 break; 3936 3937 case URX_LB_END: 3938 // End of a look-behind block, after a successful match. 3939 { 3940 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3941 if (fp->fInputIdx != fActiveLimit) { 3942 // The look-behind expression matched, but the match did not 3943 // extend all the way to the point that we are looking behind from. 3944 // FAIL out of here, which will take us back to the LB_CONT, which 3945 // will retry the match starting at another position or fail 3946 // the look-behind altogether, whichever is appropriate. 3947 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3948 break; 3949 } 3950 3951 // Look-behind match is good. Restore the orignal input string length, 3952 // which had been truncated to pin the end of the lookbehind match to the 3953 // position being looked-behind. 3954 int64_t originalInputLen = fData[opValue+3]; 3955 U_ASSERT(originalInputLen >= fActiveLimit); 3956 U_ASSERT(originalInputLen <= fInputLength); 3957 fActiveLimit = originalInputLen; 3958 } 3959 break; 3960 3961 3962 case URX_LBN_CONT: 3963 { 3964 // Negative Look-Behind, at top of loop checking for matches of LB expression 3965 // at all possible input starting positions. 3966 3967 // Fetch the extra parameters of this op. 3968 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 3969 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 3970 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 3971 continueLoc = URX_VAL(continueLoc); 3972 U_ASSERT(minML <= maxML); 3973 U_ASSERT(minML >= 0); 3974 U_ASSERT(continueLoc > fp->fPatIdx); 3975 3976 // Fetch (from data) the last input index where a match was attempted. 3977 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3978 int64_t *lbStartIdx = &fData[opValue+2]; 3979 if (*lbStartIdx < 0) { 3980 // First time through loop. 3981 *lbStartIdx = fp->fInputIdx - minML; 3982 } else { 3983 // 2nd through nth time through the loop. 3984 // Back up start position for match by one. 3985 if (*lbStartIdx == 0) { 3986 (*lbStartIdx)--; 3987 } else { 3988 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 3989 (void)UTEXT_PREVIOUS32(fInputText); 3990 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 3991 } 3992 } 3993 3994 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 3995 // We have tried all potential match starting points without 3996 // getting a match, which means that the negative lookbehind as 3997 // a whole has succeeded. Jump forward to the continue location 3998 int64_t restoreInputLen = fData[opValue+3]; 3999 U_ASSERT(restoreInputLen >= fActiveLimit); 4000 U_ASSERT(restoreInputLen <= fInputLength); 4001 fActiveLimit = restoreInputLen; 4002 fp->fPatIdx = continueLoc; 4003 break; 4004 } 4005 4006 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 4007 // (successful match will cause a FAIL out of the loop altogether.) 4008 fp = StateSave(fp, fp->fPatIdx-4, status); 4009 fp->fInputIdx = *lbStartIdx; 4010 } 4011 break; 4012 4013 case URX_LBN_END: 4014 // End of a negative look-behind block, after a successful match. 4015 { 4016 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4017 if (fp->fInputIdx != fActiveLimit) { 4018 // The look-behind expression matched, but the match did not 4019 // extend all the way to the point that we are looking behind from. 4020 // FAIL out of here, which will take us back to the LB_CONT, which 4021 // will retry the match starting at another position or succeed 4022 // the look-behind altogether, whichever is appropriate. 4023 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4024 break; 4025 } 4026 4027 // Look-behind expression matched, which means look-behind test as 4028 // a whole Fails 4029 4030 // Restore the orignal input string length, which had been truncated 4031 // inorder to pin the end of the lookbehind match 4032 // to the position being looked-behind. 4033 int64_t originalInputLen = fData[opValue+3]; 4034 U_ASSERT(originalInputLen >= fActiveLimit); 4035 U_ASSERT(originalInputLen <= fInputLength); 4036 fActiveLimit = originalInputLen; 4037 4038 // Restore original stack position, discarding any state saved 4039 // by the successful pattern match. 4040 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4041 int32_t newStackSize = (int32_t)fData[opValue]; 4042 U_ASSERT(fStack->size() > newStackSize); 4043 fStack->setSize(newStackSize); 4044 4045 // FAIL, which will take control back to someplace 4046 // prior to entering the look-behind test. 4047 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4048 } 4049 break; 4050 4051 4052 case URX_LOOP_SR_I: 4053 // Loop Initialization for the optimized implementation of 4054 // [some character set]* 4055 // This op scans through all matching input. 4056 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4057 { 4058 U_ASSERT(opValue > 0 && opValue < sets->size()); 4059 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4060 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 4061 4062 // Loop through input, until either the input is exhausted or 4063 // we reach a character that is not a member of the set. 4064 int64_t ix = fp->fInputIdx; 4065 UTEXT_SETNATIVEINDEX(fInputText, ix); 4066 for (;;) { 4067 if (ix >= fActiveLimit) { 4068 fHitEnd = TRUE; 4069 break; 4070 } 4071 UChar32 c = UTEXT_NEXT32(fInputText); 4072 if (c<256) { 4073 if (s8->contains(c) == FALSE) { 4074 break; 4075 } 4076 } else { 4077 if (s->contains(c) == FALSE) { 4078 break; 4079 } 4080 } 4081 ix = UTEXT_GETNATIVEINDEX(fInputText); 4082 } 4083 4084 // If there were no matching characters, skip over the loop altogether. 4085 // The loop doesn't run at all, a * op always succeeds. 4086 if (ix == fp->fInputIdx) { 4087 fp->fPatIdx++; // skip the URX_LOOP_C op. 4088 break; 4089 } 4090 4091 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4092 // must follow. It's operand is the stack location 4093 // that holds the starting input index for the match of this [set]* 4094 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4095 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4096 int32_t stackLoc = URX_VAL(loopcOp); 4097 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4098 fp->fExtra[stackLoc] = fp->fInputIdx; 4099 fp->fInputIdx = ix; 4100 4101 // Save State to the URX_LOOP_C op that follows this one, 4102 // so that match failures in the following code will return to there. 4103 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4104 fp = StateSave(fp, fp->fPatIdx, status); 4105 fp->fPatIdx++; 4106 } 4107 break; 4108 4109 4110 case URX_LOOP_DOT_I: 4111 // Loop Initialization for the optimized implementation of .* 4112 // This op scans through all remaining input. 4113 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4114 { 4115 // Loop through input until the input is exhausted (we reach an end-of-line) 4116 // In DOTALL mode, we can just go straight to the end of the input. 4117 int64_t ix; 4118 if ((opValue & 1) == 1) { 4119 // Dot-matches-All mode. Jump straight to the end of the string. 4120 ix = fActiveLimit; 4121 fHitEnd = TRUE; 4122 } else { 4123 // NOT DOT ALL mode. Line endings do not match '.' 4124 // Scan forward until a line ending or end of input. 4125 ix = fp->fInputIdx; 4126 UTEXT_SETNATIVEINDEX(fInputText, ix); 4127 for (;;) { 4128 if (ix >= fActiveLimit) { 4129 fHitEnd = TRUE; 4130 break; 4131 } 4132 UChar32 c = UTEXT_NEXT32(fInputText); 4133 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 4134 if ((c == 0x0a) || // 0x0a is newline in both modes. 4135 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 4136 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) { 4137 // char is a line ending. Exit the scanning loop. 4138 break; 4139 } 4140 } 4141 ix = UTEXT_GETNATIVEINDEX(fInputText); 4142 } 4143 } 4144 4145 // If there were no matching characters, skip over the loop altogether. 4146 // The loop doesn't run at all, a * op always succeeds. 4147 if (ix == fp->fInputIdx) { 4148 fp->fPatIdx++; // skip the URX_LOOP_C op. 4149 break; 4150 } 4151 4152 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4153 // must follow. It's operand is the stack location 4154 // that holds the starting input index for the match of this .* 4155 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4156 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4157 int32_t stackLoc = URX_VAL(loopcOp); 4158 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4159 fp->fExtra[stackLoc] = fp->fInputIdx; 4160 fp->fInputIdx = ix; 4161 4162 // Save State to the URX_LOOP_C op that follows this one, 4163 // so that match failures in the following code will return to there. 4164 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4165 fp = StateSave(fp, fp->fPatIdx, status); 4166 fp->fPatIdx++; 4167 } 4168 break; 4169 4170 4171 case URX_LOOP_C: 4172 { 4173 U_ASSERT(opValue>=0 && opValue<fFrameSize); 4174 backSearchIndex = fp->fExtra[opValue]; 4175 U_ASSERT(backSearchIndex <= fp->fInputIdx); 4176 if (backSearchIndex == fp->fInputIdx) { 4177 // We've backed up the input idx to the point that the loop started. 4178 // The loop is done. Leave here without saving state. 4179 // Subsequent failures won't come back here. 4180 break; 4181 } 4182 // Set up for the next iteration of the loop, with input index 4183 // backed up by one from the last time through, 4184 // and a state save to this instruction in case the following code fails again. 4185 // (We're going backwards because this loop emulates stack unwinding, not 4186 // the initial scan forward.) 4187 U_ASSERT(fp->fInputIdx > 0); 4188 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4189 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); 4190 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4191 4192 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); 4193 if (prevC == 0x0a && 4194 fp->fInputIdx > backSearchIndex && 4195 twoPrevC == 0x0d) { 4196 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 4197 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 4198 // .*, stepping back over CRLF pair. 4199 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4200 } 4201 } 4202 4203 4204 fp = StateSave(fp, fp->fPatIdx-1, status); 4205 } 4206 break; 4207 4208 4209 4210 default: 4211 // Trouble. The compiled pattern contains an entry with an 4212 // unrecognized type tag. 4213 U_ASSERT(FALSE); 4214 } 4215 4216 if (U_FAILURE(status)) { 4217 isMatch = FALSE; 4218 break; 4219 } 4220 } 4221 4222 breakFromLoop: 4223 fMatch = isMatch; 4224 if (isMatch) { 4225 fLastMatchEnd = fMatchEnd; 4226 fMatchStart = startIdx; 4227 fMatchEnd = fp->fInputIdx; 4228 if (fTraceDebug) { 4229 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd)); 4230 } 4231 } 4232 else 4233 { 4234 if (fTraceDebug) { 4235 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 4236 } 4237 } 4238 4239 fFrame = fp; // The active stack frame when the engine stopped. 4240 // Contains the capture group results that we need to 4241 // access later. 4242 return; 4243 } 4244 4245 4246 //-------------------------------------------------------------------------------- 4247 // 4248 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the 4249 // assumption that the entire string is available in the UText's 4250 // chunk buffer. For now, that means we can use int32_t indexes, 4251 // except for anything that needs to be saved (like group starts 4252 // and ends). 4253 // 4254 // startIdx: begin matching a this index. 4255 // toEnd: if true, match must extend to end of the input region 4256 // 4257 //-------------------------------------------------------------------------------- 4258 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { 4259 UBool isMatch = FALSE; // True if the we have a match. 4260 4261 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards 4262 4263 int32_t op; // Operation from the compiled pattern, split into 4264 int32_t opType; // the opcode 4265 int32_t opValue; // and the operand value. 4266 4267 #ifdef REGEX_RUN_DEBUG 4268 if (fTraceDebug) 4269 { 4270 printf("MatchAt(startIdx=%d)\n", startIdx); 4271 printf("Original Pattern: "); 4272 UChar32 c = utext_next32From(fPattern->fPattern, 0); 4273 while (c != U_SENTINEL) { 4274 if (c<32 || c>256) { 4275 c = '.'; 4276 } 4277 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 4278 4279 c = UTEXT_NEXT32(fPattern->fPattern); 4280 } 4281 printf("\n"); 4282 printf("Input String: "); 4283 c = utext_next32From(fInputText, 0); 4284 while (c != U_SENTINEL) { 4285 if (c<32 || c>256) { 4286 c = '.'; 4287 } 4288 printf("%c", c); 4289 4290 c = UTEXT_NEXT32(fInputText); 4291 } 4292 printf("\n"); 4293 printf("\n"); 4294 } 4295 #endif 4296 4297 if (U_FAILURE(status)) { 4298 return; 4299 } 4300 4301 // Cache frequently referenced items from the compiled pattern 4302 // 4303 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 4304 4305 const UChar *litText = fPattern->fLiteralText.getBuffer(); 4306 UVector *sets = fPattern->fSets; 4307 4308 const UChar *inputBuf = fInputText->chunkContents; 4309 4310 fFrameSize = fPattern->fFrameSize; 4311 REStackFrame *fp = resetStack(); 4312 4313 fp->fPatIdx = 0; 4314 fp->fInputIdx = startIdx; 4315 4316 // Zero out the pattern's static data 4317 int32_t i; 4318 for (i = 0; i<fPattern->fDataSize; i++) { 4319 fData[i] = 0; 4320 } 4321 4322 // 4323 // Main loop for interpreting the compiled pattern. 4324 // One iteration of the loop per pattern operation performed. 4325 // 4326 for (;;) { 4327 #if 0 4328 if (_heapchk() != _HEAPOK) { 4329 fprintf(stderr, "Heap Trouble\n"); 4330 } 4331 #endif 4332 4333 op = (int32_t)pat[fp->fPatIdx]; 4334 opType = URX_TYPE(op); 4335 opValue = URX_VAL(op); 4336 #ifdef REGEX_RUN_DEBUG 4337 if (fTraceDebug) { 4338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4339 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 4340 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 4341 fPattern->dumpOp(fp->fPatIdx); 4342 } 4343 #endif 4344 fp->fPatIdx++; 4345 4346 switch (opType) { 4347 4348 4349 case URX_NOP: 4350 break; 4351 4352 4353 case URX_BACKTRACK: 4354 // Force a backtrack. In some circumstances, the pattern compiler 4355 // will notice that the pattern can't possibly match anything, and will 4356 // emit one of these at that point. 4357 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4358 break; 4359 4360 4361 case URX_ONECHAR: 4362 if (fp->fInputIdx < fActiveLimit) { 4363 UChar32 c; 4364 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4365 if (c == opValue) { 4366 break; 4367 } 4368 } else { 4369 fHitEnd = TRUE; 4370 } 4371 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4372 break; 4373 4374 4375 case URX_STRING: 4376 { 4377 // Test input against a literal string. 4378 // Strings require two slots in the compiled pattern, one for the 4379 // offset to the string text, and one for the length. 4380 int32_t stringStartIdx = opValue; 4381 int32_t stringLen; 4382 4383 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 4384 fp->fPatIdx++; 4385 opType = URX_TYPE(op); 4386 stringLen = URX_VAL(op); 4387 U_ASSERT(opType == URX_STRING_LEN); 4388 U_ASSERT(stringLen >= 2); 4389 4390 const UChar * pInp = inputBuf + fp->fInputIdx; 4391 const UChar * pInpLimit = inputBuf + fActiveLimit; 4392 const UChar * pPat = litText+stringStartIdx; 4393 const UChar * pEnd = pInp + stringLen; 4394 UBool success = TRUE; 4395 while (pInp < pEnd) { 4396 if (pInp >= pInpLimit) { 4397 fHitEnd = TRUE; 4398 success = FALSE; 4399 break; 4400 } 4401 if (*pInp++ != *pPat++) { 4402 success = FALSE; 4403 break; 4404 } 4405 } 4406 4407 if (success) { 4408 fp->fInputIdx += stringLen; 4409 } else { 4410 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4411 } 4412 } 4413 break; 4414 4415 4416 case URX_STATE_SAVE: 4417 fp = StateSave(fp, opValue, status); 4418 break; 4419 4420 4421 case URX_END: 4422 // The match loop will exit via this path on a successful match, 4423 // when we reach the end of the pattern. 4424 if (toEnd && fp->fInputIdx != fActiveLimit) { 4425 // The pattern matched, but not to the end of input. Try some more. 4426 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4427 break; 4428 } 4429 isMatch = TRUE; 4430 goto breakFromLoop; 4431 4432 // Start and End Capture stack frame variables are laid out out like this: 4433 // fp->fExtra[opValue] - The start of a completed capture group 4434 // opValue+1 - The end of a completed capture group 4435 // opValue+2 - the start of a capture group whose end 4436 // has not yet been reached (and might not ever be). 4437 case URX_START_CAPTURE: 4438 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4439 fp->fExtra[opValue+2] = fp->fInputIdx; 4440 break; 4441 4442 4443 case URX_END_CAPTURE: 4444 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4445 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 4446 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 4447 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 4448 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 4449 break; 4450 4451 4452 case URX_DOLLAR: // $, test for End of line 4453 // or for position before new line at end of input 4454 if (fp->fInputIdx < fAnchorLimit-2) { 4455 // We are no where near the end of input. Fail. 4456 // This is the common case. Keep it first. 4457 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4458 break; 4459 } 4460 if (fp->fInputIdx >= fAnchorLimit) { 4461 // We really are at the end of input. Success. 4462 fHitEnd = TRUE; 4463 fRequireEnd = TRUE; 4464 break; 4465 } 4466 4467 // If we are positioned just before a new-line that is located at the 4468 // end of input, succeed. 4469 if (fp->fInputIdx == fAnchorLimit-1) { 4470 UChar32 c; 4471 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 4472 4473 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 4474 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4475 // At new-line at end of input. Success 4476 fHitEnd = TRUE; 4477 fRequireEnd = TRUE; 4478 break; 4479 } 4480 } 4481 } else if (fp->fInputIdx == fAnchorLimit-2 && 4482 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { 4483 fHitEnd = TRUE; 4484 fRequireEnd = TRUE; 4485 break; // At CR/LF at end of input. Success 4486 } 4487 4488 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4489 4490 break; 4491 4492 4493 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 4494 if (fp->fInputIdx >= fAnchorLimit-1) { 4495 // Either at the last character of input, or off the end. 4496 if (fp->fInputIdx == fAnchorLimit-1) { 4497 // At last char of input. Success if it's a new line. 4498 if (inputBuf[fp->fInputIdx] == 0x0a) { 4499 fHitEnd = TRUE; 4500 fRequireEnd = TRUE; 4501 break; 4502 } 4503 } else { 4504 // Off the end of input. Success. 4505 fHitEnd = TRUE; 4506 fRequireEnd = TRUE; 4507 break; 4508 } 4509 } 4510 4511 // Not at end of input. Back-track out. 4512 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4513 break; 4514 4515 4516 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 4517 { 4518 if (fp->fInputIdx >= fAnchorLimit) { 4519 // We really are at the end of input. Success. 4520 fHitEnd = TRUE; 4521 fRequireEnd = TRUE; 4522 break; 4523 } 4524 // If we are positioned just before a new-line, succeed. 4525 // It makes no difference where the new-line is within the input. 4526 UChar32 c = inputBuf[fp->fInputIdx]; 4527 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 4528 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 4529 // In multi-line mode, hitting a new-line just before the end of input does not 4530 // set the hitEnd or requireEnd flags 4531 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4532 break; 4533 } 4534 } 4535 // not at a new line. Fail. 4536 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4537 } 4538 break; 4539 4540 4541 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 4542 { 4543 if (fp->fInputIdx >= fAnchorLimit) { 4544 // We really are at the end of input. Success. 4545 fHitEnd = TRUE; 4546 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 4547 break; // adding a new-line would not lose the match. 4548 } 4549 // If we are not positioned just before a new-line, the test fails; backtrack out. 4550 // It makes no difference where the new-line is within the input. 4551 if (inputBuf[fp->fInputIdx] != 0x0a) { 4552 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4553 } 4554 } 4555 break; 4556 4557 4558 case URX_CARET: // ^, test for start of line 4559 if (fp->fInputIdx != fAnchorStart) { 4560 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4561 } 4562 break; 4563 4564 4565 case URX_CARET_M: // ^, test for start of line in mulit-line mode 4566 { 4567 if (fp->fInputIdx == fAnchorStart) { 4568 // We are at the start input. Success. 4569 break; 4570 } 4571 // Check whether character just before the current pos is a new-line 4572 // unless we are at the end of input 4573 UChar c = inputBuf[fp->fInputIdx - 1]; 4574 if ((fp->fInputIdx < fAnchorLimit) && 4575 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 4576 // It's a new-line. ^ is true. Success. 4577 // TODO: what should be done with positions between a CR and LF? 4578 break; 4579 } 4580 // Not at the start of a line. Fail. 4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4582 } 4583 break; 4584 4585 4586 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 4587 { 4588 U_ASSERT(fp->fInputIdx >= fAnchorStart); 4589 if (fp->fInputIdx <= fAnchorStart) { 4590 // We are at the start input. Success. 4591 break; 4592 } 4593 // Check whether character just before the current pos is a new-line 4594 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 4595 UChar c = inputBuf[fp->fInputIdx - 1]; 4596 if (c != 0x0a) { 4597 // Not at the start of a line. Back-track out. 4598 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4599 } 4600 } 4601 break; 4602 4603 case URX_BACKSLASH_B: // Test for word boundaries 4604 { 4605 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); 4606 success ^= (UBool)(opValue != 0); // flip sense for \B 4607 if (!success) { 4608 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4609 } 4610 } 4611 break; 4612 4613 4614 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 4615 { 4616 UBool success = isUWordBoundary(fp->fInputIdx); 4617 success ^= (UBool)(opValue != 0); // flip sense for \B 4618 if (!success) { 4619 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4620 } 4621 } 4622 break; 4623 4624 4625 case URX_BACKSLASH_D: // Test for decimal digit 4626 { 4627 if (fp->fInputIdx >= fActiveLimit) { 4628 fHitEnd = TRUE; 4629 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4630 break; 4631 } 4632 4633 UChar32 c; 4634 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4635 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 4636 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 4637 success ^= (UBool)(opValue != 0); // flip sense for \D 4638 if (!success) { 4639 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4640 } 4641 } 4642 break; 4643 4644 4645 case URX_BACKSLASH_G: // Test for position at end of previous match 4646 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 4647 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4648 } 4649 break; 4650 4651 4652 case URX_BACKSLASH_X: 4653 // Match a Grapheme, as defined by Unicode TR 29. 4654 // Differs slightly from Perl, which consumes combining marks independently 4655 // of context. 4656 { 4657 4658 // Fail if at end of input 4659 if (fp->fInputIdx >= fActiveLimit) { 4660 fHitEnd = TRUE; 4661 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4662 break; 4663 } 4664 4665 // Examine (and consume) the current char. 4666 // Dispatch into a little state machine, based on the char. 4667 UChar32 c; 4668 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4669 UnicodeSet **sets = fPattern->fStaticSets; 4670 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 4671 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 4672 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4673 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4674 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4675 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4676 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4677 goto GC_Extend; 4678 4679 4680 4681 GC_L: 4682 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4683 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4684 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4685 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4686 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4687 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4688 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4689 goto GC_Extend; 4690 4691 GC_V: 4692 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4693 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4694 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4695 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4696 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4697 goto GC_Extend; 4698 4699 GC_T: 4700 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4701 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4702 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4703 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4704 goto GC_Extend; 4705 4706 GC_Extend: 4707 // Combining characters are consumed here 4708 for (;;) { 4709 if (fp->fInputIdx >= fActiveLimit) { 4710 break; 4711 } 4712 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4713 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 4714 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 4715 break; 4716 } 4717 } 4718 goto GC_Done; 4719 4720 GC_Control: 4721 // Most control chars stand alone (don't combine with combining chars), 4722 // except for that CR/LF sequence is a single grapheme cluster. 4723 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { 4724 fp->fInputIdx++; 4725 } 4726 4727 GC_Done: 4728 if (fp->fInputIdx >= fActiveLimit) { 4729 fHitEnd = TRUE; 4730 } 4731 break; 4732 } 4733 4734 4735 4736 4737 case URX_BACKSLASH_Z: // Test for end of Input 4738 if (fp->fInputIdx < fAnchorLimit) { 4739 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4740 } else { 4741 fHitEnd = TRUE; 4742 fRequireEnd = TRUE; 4743 } 4744 break; 4745 4746 4747 4748 case URX_STATIC_SETREF: 4749 { 4750 // Test input character against one of the predefined sets 4751 // (Word Characters, for example) 4752 // The high bit of the op value is a flag for the match polarity. 4753 // 0: success if input char is in set. 4754 // 1: success if input char is not in set. 4755 if (fp->fInputIdx >= fActiveLimit) { 4756 fHitEnd = TRUE; 4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4758 break; 4759 } 4760 4761 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 4762 opValue &= ~URX_NEG_SET; 4763 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4764 4765 UChar32 c; 4766 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4767 if (c < 256) { 4768 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4769 if (s8->contains(c)) { 4770 success = !success; 4771 } 4772 } else { 4773 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4774 if (s->contains(c)) { 4775 success = !success; 4776 } 4777 } 4778 if (!success) { 4779 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4780 } 4781 } 4782 break; 4783 4784 4785 case URX_STAT_SETREF_N: 4786 { 4787 // Test input character for NOT being a member of one of 4788 // the predefined sets (Word Characters, for example) 4789 if (fp->fInputIdx >= fActiveLimit) { 4790 fHitEnd = TRUE; 4791 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4792 break; 4793 } 4794 4795 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4796 4797 UChar32 c; 4798 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4799 if (c < 256) { 4800 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4801 if (s8->contains(c) == FALSE) { 4802 break; 4803 } 4804 } else { 4805 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4806 if (s->contains(c) == FALSE) { 4807 break; 4808 } 4809 } 4810 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4811 } 4812 break; 4813 4814 4815 case URX_SETREF: 4816 { 4817 if (fp->fInputIdx >= fActiveLimit) { 4818 fHitEnd = TRUE; 4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4820 break; 4821 } 4822 4823 U_ASSERT(opValue > 0 && opValue < sets->size()); 4824 4825 // There is input left. Pick up one char and test it for set membership. 4826 UChar32 c; 4827 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4828 if (c<256) { 4829 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4830 if (s8->contains(c)) { 4831 // The character is in the set. A Match. 4832 break; 4833 } 4834 } else { 4835 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 4836 if (s->contains(c)) { 4837 // The character is in the set. A Match. 4838 break; 4839 } 4840 } 4841 4842 // the character wasn't in the set. 4843 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4844 } 4845 break; 4846 4847 4848 case URX_DOTANY: 4849 { 4850 // . matches anything, but stops at end-of-line. 4851 if (fp->fInputIdx >= fActiveLimit) { 4852 // At end of input. Match failed. Backtrack out. 4853 fHitEnd = TRUE; 4854 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4855 break; 4856 } 4857 4858 // There is input left. Advance over one char, unless we've hit end-of-line 4859 UChar32 c; 4860 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4861 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 4862 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 4863 // End of line in normal mode. . does not match. 4864 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4865 break; 4866 } 4867 } 4868 break; 4869 4870 4871 case URX_DOTANY_ALL: 4872 { 4873 // . in dot-matches-all (including new lines) mode 4874 if (fp->fInputIdx >= fActiveLimit) { 4875 // At end of input. Match failed. Backtrack out. 4876 fHitEnd = TRUE; 4877 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4878 break; 4879 } 4880 4881 // There is input left. Advance over one char, except if we are 4882 // at a cr/lf, advance over both of them. 4883 UChar32 c; 4884 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4885 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 4886 // In the case of a CR/LF, we need to advance over both. 4887 if (inputBuf[fp->fInputIdx] == 0x0a) { 4888 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 4889 } 4890 } 4891 } 4892 break; 4893 4894 4895 case URX_DOTANY_UNIX: 4896 { 4897 // '.' operator, matches all, but stops at end-of-line. 4898 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 4899 if (fp->fInputIdx >= fActiveLimit) { 4900 // At end of input. Match failed. Backtrack out. 4901 fHitEnd = TRUE; 4902 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4903 break; 4904 } 4905 4906 // There is input left. Advance over one char, unless we've hit end-of-line 4907 UChar32 c; 4908 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4909 if (c == 0x0a) { 4910 // End of line in normal mode. '.' does not match the \n 4911 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4912 } 4913 } 4914 break; 4915 4916 4917 case URX_JMP: 4918 fp->fPatIdx = opValue; 4919 break; 4920 4921 case URX_FAIL: 4922 isMatch = FALSE; 4923 goto breakFromLoop; 4924 4925 case URX_JMP_SAV: 4926 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 4927 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 4928 fp->fPatIdx = opValue; // Then JMP. 4929 break; 4930 4931 case URX_JMP_SAV_X: 4932 // This opcode is used with (x)+, when x can match a zero length string. 4933 // Same as JMP_SAV, except conditional on the match having made forward progress. 4934 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 4935 // data address of the input position at the start of the loop. 4936 { 4937 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 4938 int32_t stoOp = (int32_t)pat[opValue-1]; 4939 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 4940 int32_t frameLoc = URX_VAL(stoOp); 4941 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 4942 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; 4943 U_ASSERT(prevInputIdx <= fp->fInputIdx); 4944 if (prevInputIdx < fp->fInputIdx) { 4945 // The match did make progress. Repeat the loop. 4946 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 4947 fp->fPatIdx = opValue; 4948 fp->fExtra[frameLoc] = fp->fInputIdx; 4949 } 4950 // If the input position did not advance, we do nothing here, 4951 // execution will fall out of the loop. 4952 } 4953 break; 4954 4955 case URX_CTR_INIT: 4956 { 4957 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 4958 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 4959 4960 // Pick up the three extra operands that CTR_INIT has, and 4961 // skip the pattern location counter past 4962 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 4963 fp->fPatIdx += 3; 4964 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 4965 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 4966 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 4967 U_ASSERT(minCount>=0); 4968 U_ASSERT(maxCount>=minCount || maxCount==-1); 4969 U_ASSERT(loopLoc>=fp->fPatIdx); 4970 4971 if (minCount == 0) { 4972 fp = StateSave(fp, loopLoc+1, status); 4973 } 4974 if (maxCount == -1) { 4975 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 4976 } else if (maxCount == 0) { 4977 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4978 } 4979 } 4980 break; 4981 4982 case URX_CTR_LOOP: 4983 { 4984 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 4985 int32_t initOp = (int32_t)pat[opValue]; 4986 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 4987 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 4988 int32_t minCount = (int32_t)pat[opValue+2]; 4989 int32_t maxCount = (int32_t)pat[opValue+3]; 4990 (*pCounter)++; 4991 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 4992 U_ASSERT(*pCounter == maxCount); 4993 break; 4994 } 4995 if (*pCounter >= minCount) { 4996 if (maxCount == -1) { 4997 // Loop has no hard upper bound. 4998 // Check that it is progressing through the input, break if it is not. 4999 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 5000 if (fp->fInputIdx == *pLastInputIdx) { 5001 break; 5002 } else { 5003 *pLastInputIdx = fp->fInputIdx; 5004 } 5005 } 5006 fp = StateSave(fp, fp->fPatIdx, status); 5007 } 5008 fp->fPatIdx = opValue + 4; // Loop back. 5009 } 5010 break; 5011 5012 case URX_CTR_INIT_NG: 5013 { 5014 // Initialize a non-greedy loop 5015 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5016 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5017 5018 // Pick up the three extra operands that CTR_INIT_NG has, and 5019 // skip the pattern location counter past 5020 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5021 fp->fPatIdx += 3; 5022 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5023 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5024 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5025 U_ASSERT(minCount>=0); 5026 U_ASSERT(maxCount>=minCount || maxCount==-1); 5027 U_ASSERT(loopLoc>fp->fPatIdx); 5028 if (maxCount == -1) { 5029 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 5030 } 5031 5032 if (minCount == 0) { 5033 if (maxCount != 0) { 5034 fp = StateSave(fp, fp->fPatIdx, status); 5035 } 5036 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 5037 } 5038 } 5039 break; 5040 5041 case URX_CTR_LOOP_NG: 5042 { 5043 // Non-greedy {min, max} loops 5044 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5045 int32_t initOp = (int32_t)pat[opValue]; 5046 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 5047 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5048 int32_t minCount = (int32_t)pat[opValue+2]; 5049 int32_t maxCount = (int32_t)pat[opValue+3]; 5050 5051 (*pCounter)++; 5052 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 5053 // The loop has matched the maximum permitted number of times. 5054 // Break out of here with no action. Matching will 5055 // continue with the following pattern. 5056 U_ASSERT(*pCounter == maxCount); 5057 break; 5058 } 5059 5060 if (*pCounter < minCount) { 5061 // We haven't met the minimum number of matches yet. 5062 // Loop back for another one. 5063 fp->fPatIdx = opValue + 4; // Loop back. 5064 } else { 5065 // We do have the minimum number of matches. 5066 5067 // If there is no upper bound on the loop iterations, check that the input index 5068 // is progressing, and stop the loop if it is not. 5069 if (maxCount == -1) { 5070 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 5071 if (fp->fInputIdx == *pLastInputIdx) { 5072 break; 5073 } 5074 *pLastInputIdx = fp->fInputIdx; 5075 } 5076 5077 // Loop Continuation: we will fall into the pattern following the loop 5078 // (non-greedy, don't execute loop body first), but first do 5079 // a state save to the top of the loop, so that a match failure 5080 // in the following pattern will try another iteration of the loop. 5081 fp = StateSave(fp, opValue + 4, status); 5082 } 5083 } 5084 break; 5085 5086 case URX_STO_SP: 5087 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5088 fData[opValue] = fStack->size(); 5089 break; 5090 5091 case URX_LD_SP: 5092 { 5093 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5094 int32_t newStackSize = (int32_t)fData[opValue]; 5095 U_ASSERT(newStackSize <= fStack->size()); 5096 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5097 if (newFP == (int64_t *)fp) { 5098 break; 5099 } 5100 int32_t i; 5101 for (i=0; i<fFrameSize; i++) { 5102 newFP[i] = ((int64_t *)fp)[i]; 5103 } 5104 fp = (REStackFrame *)newFP; 5105 fStack->setSize(newStackSize); 5106 } 5107 break; 5108 5109 case URX_BACKREF: 5110 { 5111 U_ASSERT(opValue < fFrameSize); 5112 int64_t groupStartIdx = fp->fExtra[opValue]; 5113 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5114 U_ASSERT(groupStartIdx <= groupEndIdx); 5115 int64_t inputIndex = fp->fInputIdx; 5116 if (groupStartIdx < 0) { 5117 // This capture group has not participated in the match thus far, 5118 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5119 break; 5120 } 5121 UBool success = TRUE; 5122 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) { 5123 if (inputIndex >= fActiveLimit) { 5124 success = FALSE; 5125 fHitEnd = TRUE; 5126 break; 5127 } 5128 if (inputBuf[groupIndex] != inputBuf[inputIndex]) { 5129 success = FALSE; 5130 break; 5131 } 5132 } 5133 if (success) { 5134 fp->fInputIdx = inputIndex; 5135 } else { 5136 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5137 } 5138 } 5139 break; 5140 5141 case URX_BACKREF_I: 5142 { 5143 U_ASSERT(opValue < fFrameSize); 5144 int64_t groupStartIdx = fp->fExtra[opValue]; 5145 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5146 U_ASSERT(groupStartIdx <= groupEndIdx); 5147 if (groupStartIdx < 0) { 5148 // This capture group has not participated in the match thus far, 5149 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5150 break; 5151 } 5152 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx); 5153 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit); 5154 5155 // Note: if the capture group match was of an empty string the backref 5156 // match succeeds. Verified by testing: Perl matches succeed 5157 // in this case, so we do too. 5158 5159 UBool success = TRUE; 5160 for (;;) { 5161 UChar32 captureGroupChar = captureGroupItr.next(); 5162 if (captureGroupChar == U_SENTINEL) { 5163 success = TRUE; 5164 break; 5165 } 5166 UChar32 inputChar = inputItr.next(); 5167 if (inputChar == U_SENTINEL) { 5168 success = FALSE; 5169 fHitEnd = TRUE; 5170 break; 5171 } 5172 if (inputChar != captureGroupChar) { 5173 success = FALSE; 5174 break; 5175 } 5176 } 5177 5178 if (success && inputItr.inExpansion()) { 5179 // We otained a match by consuming part of a string obtained from 5180 // case-folding a single code point of the input text. 5181 // This does not count as an overall match. 5182 success = FALSE; 5183 } 5184 5185 if (success) { 5186 fp->fInputIdx = inputItr.getIndex(); 5187 } else { 5188 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5189 } 5190 } 5191 break; 5192 5193 case URX_STO_INP_LOC: 5194 { 5195 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 5196 fp->fExtra[opValue] = fp->fInputIdx; 5197 } 5198 break; 5199 5200 case URX_JMPX: 5201 { 5202 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5203 fp->fPatIdx += 1; 5204 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 5205 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 5206 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; 5207 U_ASSERT(savedInputIdx <= fp->fInputIdx); 5208 if (savedInputIdx < fp->fInputIdx) { 5209 fp->fPatIdx = opValue; // JMP 5210 } else { 5211 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 5212 } 5213 } 5214 break; 5215 5216 case URX_LA_START: 5217 { 5218 // Entering a lookahead block. 5219 // Save Stack Ptr, Input Pos. 5220 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5221 fData[opValue] = fStack->size(); 5222 fData[opValue+1] = fp->fInputIdx; 5223 fActiveStart = fLookStart; // Set the match region change for 5224 fActiveLimit = fLookLimit; // transparent bounds. 5225 } 5226 break; 5227 5228 case URX_LA_END: 5229 { 5230 // Leaving a look-ahead block. 5231 // restore Stack Ptr, Input Pos to positions they had on entry to block. 5232 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5233 int32_t stackSize = fStack->size(); 5234 int32_t newStackSize = (int32_t)fData[opValue]; 5235 U_ASSERT(stackSize >= newStackSize); 5236 if (stackSize > newStackSize) { 5237 // Copy the current top frame back to the new (cut back) top frame. 5238 // This makes the capture groups from within the look-ahead 5239 // expression available. 5240 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5241 int32_t i; 5242 for (i=0; i<fFrameSize; i++) { 5243 newFP[i] = ((int64_t *)fp)[i]; 5244 } 5245 fp = (REStackFrame *)newFP; 5246 fStack->setSize(newStackSize); 5247 } 5248 fp->fInputIdx = fData[opValue+1]; 5249 5250 // Restore the active region bounds in the input string; they may have 5251 // been changed because of transparent bounds on a Region. 5252 fActiveStart = fRegionStart; 5253 fActiveLimit = fRegionLimit; 5254 } 5255 break; 5256 5257 case URX_ONECHAR_I: 5258 if (fp->fInputIdx < fActiveLimit) { 5259 UChar32 c; 5260 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5261 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5262 break; 5263 } 5264 } else { 5265 fHitEnd = TRUE; 5266 } 5267 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5268 break; 5269 5270 case URX_STRING_I: 5271 // Case-insensitive test input against a literal string. 5272 // Strings require two slots in the compiled pattern, one for the 5273 // offset to the string text, and one for the length. 5274 // The compiled string has already been case folded. 5275 { 5276 const UChar *patternString = litText + opValue; 5277 5278 op = (int32_t)pat[fp->fPatIdx]; 5279 fp->fPatIdx++; 5280 opType = URX_TYPE(op); 5281 opValue = URX_VAL(op); 5282 U_ASSERT(opType == URX_STRING_LEN); 5283 int32_t patternStringLen = opValue; // Length of the string from the pattern. 5284 5285 UChar32 cText; 5286 UChar32 cPattern; 5287 UBool success = TRUE; 5288 int32_t patternStringIdx = 0; 5289 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit); 5290 while (patternStringIdx < patternStringLen) { 5291 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 5292 cText = inputIterator.next(); 5293 if (cText != cPattern) { 5294 success = FALSE; 5295 if (cText == U_SENTINEL) { 5296 fHitEnd = TRUE; 5297 } 5298 break; 5299 } 5300 } 5301 if (inputIterator.inExpansion()) { 5302 success = FALSE; 5303 } 5304 5305 if (success) { 5306 fp->fInputIdx = inputIterator.getIndex(); 5307 } else { 5308 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5309 } 5310 } 5311 break; 5312 5313 case URX_LB_START: 5314 { 5315 // Entering a look-behind block. 5316 // Save Stack Ptr, Input Pos. 5317 // TODO: implement transparent bounds. Ticket #6067 5318 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5319 fData[opValue] = fStack->size(); 5320 fData[opValue+1] = fp->fInputIdx; 5321 // Init the variable containing the start index for attempted matches. 5322 fData[opValue+2] = -1; 5323 // Save input string length, then reset to pin any matches to end at 5324 // the current position. 5325 fData[opValue+3] = fActiveLimit; 5326 fActiveLimit = fp->fInputIdx; 5327 } 5328 break; 5329 5330 5331 case URX_LB_CONT: 5332 { 5333 // Positive Look-Behind, at top of loop checking for matches of LB expression 5334 // at all possible input starting positions. 5335 5336 // Fetch the min and max possible match lengths. They are the operands 5337 // of this op in the pattern. 5338 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5339 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5340 U_ASSERT(minML <= maxML); 5341 U_ASSERT(minML >= 0); 5342 5343 // Fetch (from data) the last input index where a match was attempted. 5344 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5345 int64_t *lbStartIdx = &fData[opValue+2]; 5346 if (*lbStartIdx < 0) { 5347 // First time through loop. 5348 *lbStartIdx = fp->fInputIdx - minML; 5349 } else { 5350 // 2nd through nth time through the loop. 5351 // Back up start position for match by one. 5352 if (*lbStartIdx == 0) { 5353 (*lbStartIdx)--; 5354 } else { 5355 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5356 } 5357 } 5358 5359 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5360 // We have tried all potential match starting points without 5361 // getting a match. Backtrack out, and out of the 5362 // Look Behind altogether. 5363 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5364 int64_t restoreInputLen = fData[opValue+3]; 5365 U_ASSERT(restoreInputLen >= fActiveLimit); 5366 U_ASSERT(restoreInputLen <= fInputLength); 5367 fActiveLimit = restoreInputLen; 5368 break; 5369 } 5370 5371 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5372 // (successful match will fall off the end of the loop.) 5373 fp = StateSave(fp, fp->fPatIdx-3, status); 5374 fp->fInputIdx = *lbStartIdx; 5375 } 5376 break; 5377 5378 case URX_LB_END: 5379 // End of a look-behind block, after a successful match. 5380 { 5381 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5382 if (fp->fInputIdx != fActiveLimit) { 5383 // The look-behind expression matched, but the match did not 5384 // extend all the way to the point that we are looking behind from. 5385 // FAIL out of here, which will take us back to the LB_CONT, which 5386 // will retry the match starting at another position or fail 5387 // the look-behind altogether, whichever is appropriate. 5388 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5389 break; 5390 } 5391 5392 // Look-behind match is good. Restore the orignal input string length, 5393 // which had been truncated to pin the end of the lookbehind match to the 5394 // position being looked-behind. 5395 int64_t originalInputLen = fData[opValue+3]; 5396 U_ASSERT(originalInputLen >= fActiveLimit); 5397 U_ASSERT(originalInputLen <= fInputLength); 5398 fActiveLimit = originalInputLen; 5399 } 5400 break; 5401 5402 5403 case URX_LBN_CONT: 5404 { 5405 // Negative Look-Behind, at top of loop checking for matches of LB expression 5406 // at all possible input starting positions. 5407 5408 // Fetch the extra parameters of this op. 5409 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5410 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5411 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 5412 continueLoc = URX_VAL(continueLoc); 5413 U_ASSERT(minML <= maxML); 5414 U_ASSERT(minML >= 0); 5415 U_ASSERT(continueLoc > fp->fPatIdx); 5416 5417 // Fetch (from data) the last input index where a match was attempted. 5418 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5419 int64_t *lbStartIdx = &fData[opValue+2]; 5420 if (*lbStartIdx < 0) { 5421 // First time through loop. 5422 *lbStartIdx = fp->fInputIdx - minML; 5423 } else { 5424 // 2nd through nth time through the loop. 5425 // Back up start position for match by one. 5426 if (*lbStartIdx == 0) { 5427 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. 5428 } else { 5429 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5430 } 5431 } 5432 5433 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5434 // We have tried all potential match starting points without 5435 // getting a match, which means that the negative lookbehind as 5436 // a whole has succeeded. Jump forward to the continue location 5437 int64_t restoreInputLen = fData[opValue+3]; 5438 U_ASSERT(restoreInputLen >= fActiveLimit); 5439 U_ASSERT(restoreInputLen <= fInputLength); 5440 fActiveLimit = restoreInputLen; 5441 fp->fPatIdx = continueLoc; 5442 break; 5443 } 5444 5445 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5446 // (successful match will cause a FAIL out of the loop altogether.) 5447 fp = StateSave(fp, fp->fPatIdx-4, status); 5448 fp->fInputIdx = *lbStartIdx; 5449 } 5450 break; 5451 5452 case URX_LBN_END: 5453 // End of a negative look-behind block, after a successful match. 5454 { 5455 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5456 if (fp->fInputIdx != fActiveLimit) { 5457 // The look-behind expression matched, but the match did not 5458 // extend all the way to the point that we are looking behind from. 5459 // FAIL out of here, which will take us back to the LB_CONT, which 5460 // will retry the match starting at another position or succeed 5461 // the look-behind altogether, whichever is appropriate. 5462 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5463 break; 5464 } 5465 5466 // Look-behind expression matched, which means look-behind test as 5467 // a whole Fails 5468 5469 // Restore the orignal input string length, which had been truncated 5470 // inorder to pin the end of the lookbehind match 5471 // to the position being looked-behind. 5472 int64_t originalInputLen = fData[opValue+3]; 5473 U_ASSERT(originalInputLen >= fActiveLimit); 5474 U_ASSERT(originalInputLen <= fInputLength); 5475 fActiveLimit = originalInputLen; 5476 5477 // Restore original stack position, discarding any state saved 5478 // by the successful pattern match. 5479 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5480 int32_t newStackSize = (int32_t)fData[opValue]; 5481 U_ASSERT(fStack->size() > newStackSize); 5482 fStack->setSize(newStackSize); 5483 5484 // FAIL, which will take control back to someplace 5485 // prior to entering the look-behind test. 5486 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5487 } 5488 break; 5489 5490 5491 case URX_LOOP_SR_I: 5492 // Loop Initialization for the optimized implementation of 5493 // [some character set]* 5494 // This op scans through all matching input. 5495 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5496 { 5497 U_ASSERT(opValue > 0 && opValue < sets->size()); 5498 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5499 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5500 5501 // Loop through input, until either the input is exhausted or 5502 // we reach a character that is not a member of the set. 5503 int32_t ix = (int32_t)fp->fInputIdx; 5504 for (;;) { 5505 if (ix >= fActiveLimit) { 5506 fHitEnd = TRUE; 5507 break; 5508 } 5509 UChar32 c; 5510 U16_NEXT(inputBuf, ix, fActiveLimit, c); 5511 if (c<256) { 5512 if (s8->contains(c) == FALSE) { 5513 U16_BACK_1(inputBuf, 0, ix); 5514 break; 5515 } 5516 } else { 5517 if (s->contains(c) == FALSE) { 5518 U16_BACK_1(inputBuf, 0, ix); 5519 break; 5520 } 5521 } 5522 } 5523 5524 // If there were no matching characters, skip over the loop altogether. 5525 // The loop doesn't run at all, a * op always succeeds. 5526 if (ix == fp->fInputIdx) { 5527 fp->fPatIdx++; // skip the URX_LOOP_C op. 5528 break; 5529 } 5530 5531 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5532 // must follow. It's operand is the stack location 5533 // that holds the starting input index for the match of this [set]* 5534 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5535 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5536 int32_t stackLoc = URX_VAL(loopcOp); 5537 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5538 fp->fExtra[stackLoc] = fp->fInputIdx; 5539 fp->fInputIdx = ix; 5540 5541 // Save State to the URX_LOOP_C op that follows this one, 5542 // so that match failures in the following code will return to there. 5543 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5544 fp = StateSave(fp, fp->fPatIdx, status); 5545 fp->fPatIdx++; 5546 } 5547 break; 5548 5549 5550 case URX_LOOP_DOT_I: 5551 // Loop Initialization for the optimized implementation of .* 5552 // This op scans through all remaining input. 5553 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5554 { 5555 // Loop through input until the input is exhausted (we reach an end-of-line) 5556 // In DOTALL mode, we can just go straight to the end of the input. 5557 int32_t ix; 5558 if ((opValue & 1) == 1) { 5559 // Dot-matches-All mode. Jump straight to the end of the string. 5560 ix = (int32_t)fActiveLimit; 5561 fHitEnd = TRUE; 5562 } else { 5563 // NOT DOT ALL mode. Line endings do not match '.' 5564 // Scan forward until a line ending or end of input. 5565 ix = (int32_t)fp->fInputIdx; 5566 for (;;) { 5567 if (ix >= fActiveLimit) { 5568 fHitEnd = TRUE; 5569 break; 5570 } 5571 UChar32 c; 5572 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] 5573 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 5574 if ((c == 0x0a) || // 0x0a is newline in both modes. 5575 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 5576 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { 5577 // char is a line ending. Put the input pos back to the 5578 // line ending char, and exit the scanning loop. 5579 U16_BACK_1(inputBuf, 0, ix); 5580 break; 5581 } 5582 } 5583 } 5584 } 5585 5586 // If there were no matching characters, skip over the loop altogether. 5587 // The loop doesn't run at all, a * op always succeeds. 5588 if (ix == fp->fInputIdx) { 5589 fp->fPatIdx++; // skip the URX_LOOP_C op. 5590 break; 5591 } 5592 5593 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5594 // must follow. It's operand is the stack location 5595 // that holds the starting input index for the match of this .* 5596 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5597 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5598 int32_t stackLoc = URX_VAL(loopcOp); 5599 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5600 fp->fExtra[stackLoc] = fp->fInputIdx; 5601 fp->fInputIdx = ix; 5602 5603 // Save State to the URX_LOOP_C op that follows this one, 5604 // so that match failures in the following code will return to there. 5605 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5606 fp = StateSave(fp, fp->fPatIdx, status); 5607 fp->fPatIdx++; 5608 } 5609 break; 5610 5611 5612 case URX_LOOP_C: 5613 { 5614 U_ASSERT(opValue>=0 && opValue<fFrameSize); 5615 backSearchIndex = (int32_t)fp->fExtra[opValue]; 5616 U_ASSERT(backSearchIndex <= fp->fInputIdx); 5617 if (backSearchIndex == fp->fInputIdx) { 5618 // We've backed up the input idx to the point that the loop started. 5619 // The loop is done. Leave here without saving state. 5620 // Subsequent failures won't come back here. 5621 break; 5622 } 5623 // Set up for the next iteration of the loop, with input index 5624 // backed up by one from the last time through, 5625 // and a state save to this instruction in case the following code fails again. 5626 // (We're going backwards because this loop emulates stack unwinding, not 5627 // the initial scan forward.) 5628 U_ASSERT(fp->fInputIdx > 0); 5629 UChar32 prevC; 5630 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? 5631 5632 if (prevC == 0x0a && 5633 fp->fInputIdx > backSearchIndex && 5634 inputBuf[fp->fInputIdx-1] == 0x0d) { 5635 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 5636 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 5637 // .*, stepping back over CRLF pair. 5638 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 5639 } 5640 } 5641 5642 5643 fp = StateSave(fp, fp->fPatIdx-1, status); 5644 } 5645 break; 5646 5647 5648 5649 default: 5650 // Trouble. The compiled pattern contains an entry with an 5651 // unrecognized type tag. 5652 U_ASSERT(FALSE); 5653 } 5654 5655 if (U_FAILURE(status)) { 5656 isMatch = FALSE; 5657 break; 5658 } 5659 } 5660 5661 breakFromLoop: 5662 fMatch = isMatch; 5663 if (isMatch) { 5664 fLastMatchEnd = fMatchEnd; 5665 fMatchStart = startIdx; 5666 fMatchEnd = fp->fInputIdx; 5667 if (fTraceDebug) { 5668 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd)); 5669 } 5670 } 5671 else 5672 { 5673 if (fTraceDebug) { 5674 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 5675 } 5676 } 5677 5678 fFrame = fp; // The active stack frame when the engine stopped. 5679 // Contains the capture group results that we need to 5680 // access later. 5681 5682 return; 5683 } 5684 5685 5686 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 5687 5688 U_NAMESPACE_END 5689 5690 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 5691