1 /* 2 ************************************************************************** 3 * Copyright (C) 2002-2013 International Business Machines Corporation * 4 * and others. All rights reserved. * 5 ************************************************************************** 6 */ 7 // 8 // file: rematch.cpp 9 // 10 // Contains the implementation of class RegexMatcher, 11 // which is one of the main API classes for the ICU regular expression package. 12 // 13 14 #include "unicode/utypes.h" 15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 16 17 #include "unicode/regex.h" 18 #include "unicode/uniset.h" 19 #include "unicode/uchar.h" 20 #include "unicode/ustring.h" 21 #include "unicode/rbbi.h" 22 #include "unicode/utf.h" 23 #include "unicode/utf16.h" 24 #include "uassert.h" 25 #include "cmemory.h" 26 #include "uvector.h" 27 #include "uvectr32.h" 28 #include "uvectr64.h" 29 #include "regeximp.h" 30 #include "regexst.h" 31 #include "regextxt.h" 32 #include "ucase.h" 33 34 // #include <malloc.h> // Needed for heapcheck testing 35 36 37 // Find progress callback 38 // ---------------------- 39 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call. 40 // 41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \ 42 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE) 43 44 45 // Smart Backtracking 46 // ------------------ 47 // When a failure would go back to a LOOP_C instruction, 48 // strings, characters, and setrefs scan backwards for a valid start 49 // character themselves, pop the stack, and save state, emulating the 50 // LOOP_C's effect but assured that the next character of input is a 51 // possible matching character. 52 // 53 // Good idea in theory; unfortunately it only helps out a few specific 54 // cases and slows the engine down a little in the rest. 55 56 U_NAMESPACE_BEGIN 57 58 // Default limit for the size of the back track stack, to avoid system 59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. 60 // This value puts ICU's limits higher than most other regexp implementations, 61 // which use recursion rather than the heap, and take more storage per 62 // backtrack point. 63 // 64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 65 66 // Time limit counter constant. 67 // Time limits for expression evaluation are in terms of quanta of work by 68 // the engine, each of which is 10,000 state saves. 69 // This constant determines that state saves per tick number. 70 static const int32_t TIMER_INITIAL_VALUE = 10000; 71 72 //----------------------------------------------------------------------------- 73 // 74 // Constructor and Destructor 75 // 76 //----------------------------------------------------------------------------- 77 RegexMatcher::RegexMatcher(const RegexPattern *pat) { 78 fDeferredStatus = U_ZERO_ERROR; 79 init(fDeferredStatus); 80 if (U_FAILURE(fDeferredStatus)) { 81 return; 82 } 83 if (pat==NULL) { 84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 85 return; 86 } 87 fPattern = pat; 88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); 89 } 90 91 92 93 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 94 uint32_t flags, UErrorCode &status) { 95 init(status); 96 if (U_FAILURE(status)) { 97 return; 98 } 99 UParseError pe; 100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 101 fPattern = fPatternOwned; 102 103 UText inputText = UTEXT_INITIALIZER; 104 utext_openConstUnicodeString(&inputText, &input, &status); 105 init2(&inputText, status); 106 utext_close(&inputText); 107 108 fInputUniStrMaybeMutable = TRUE; 109 } 110 111 112 RegexMatcher::RegexMatcher(UText *regexp, UText *input, 113 uint32_t flags, UErrorCode &status) { 114 init(status); 115 if (U_FAILURE(status)) { 116 return; 117 } 118 UParseError pe; 119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 120 if (U_FAILURE(status)) { 121 return; 122 } 123 124 fPattern = fPatternOwned; 125 init2(input, status); 126 } 127 128 129 RegexMatcher::RegexMatcher(const UnicodeString ®exp, 130 uint32_t flags, UErrorCode &status) { 131 init(status); 132 if (U_FAILURE(status)) { 133 return; 134 } 135 UParseError pe; 136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 137 if (U_FAILURE(status)) { 138 return; 139 } 140 fPattern = fPatternOwned; 141 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 142 } 143 144 RegexMatcher::RegexMatcher(UText *regexp, 145 uint32_t flags, UErrorCode &status) { 146 init(status); 147 if (U_FAILURE(status)) { 148 return; 149 } 150 UParseError pe; 151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 152 if (U_FAILURE(status)) { 153 return; 154 } 155 156 fPattern = fPatternOwned; 157 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 158 } 159 160 161 162 163 RegexMatcher::~RegexMatcher() { 164 delete fStack; 165 if (fData != fSmallData) { 166 uprv_free(fData); 167 fData = NULL; 168 } 169 if (fPatternOwned) { 170 delete fPatternOwned; 171 fPatternOwned = NULL; 172 fPattern = NULL; 173 } 174 175 if (fInput) { 176 delete fInput; 177 } 178 if (fInputText) { 179 utext_close(fInputText); 180 } 181 if (fAltInputText) { 182 utext_close(fAltInputText); 183 } 184 185 #if UCONFIG_NO_BREAK_ITERATION==0 186 delete fWordBreakItr; 187 #endif 188 } 189 190 // 191 // init() common initialization for use by all constructors. 192 // Initialize all fields, get the object into a consistent state. 193 // This must be done even when the initial status shows an error, 194 // so that the object is initialized sufficiently well for the destructor 195 // to run safely. 196 // 197 void RegexMatcher::init(UErrorCode &status) { 198 fPattern = NULL; 199 fPatternOwned = NULL; 200 fFrameSize = 0; 201 fRegionStart = 0; 202 fRegionLimit = 0; 203 fAnchorStart = 0; 204 fAnchorLimit = 0; 205 fLookStart = 0; 206 fLookLimit = 0; 207 fActiveStart = 0; 208 fActiveLimit = 0; 209 fTransparentBounds = FALSE; 210 fAnchoringBounds = TRUE; 211 fMatch = FALSE; 212 fMatchStart = 0; 213 fMatchEnd = 0; 214 fLastMatchEnd = -1; 215 fAppendPosition = 0; 216 fHitEnd = FALSE; 217 fRequireEnd = FALSE; 218 fStack = NULL; 219 fFrame = NULL; 220 fTimeLimit = 0; 221 fTime = 0; 222 fTickCounter = 0; 223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 224 fCallbackFn = NULL; 225 fCallbackContext = NULL; 226 fFindProgressCallbackFn = NULL; 227 fFindProgressCallbackContext = NULL; 228 fTraceDebug = FALSE; 229 fDeferredStatus = status; 230 fData = fSmallData; 231 fWordBreakItr = NULL; 232 233 fStack = NULL; 234 fInputText = NULL; 235 fAltInputText = NULL; 236 fInput = NULL; 237 fInputLength = 0; 238 fInputUniStrMaybeMutable = FALSE; 239 240 if (U_FAILURE(status)) { 241 fDeferredStatus = status; 242 } 243 } 244 245 // 246 // init2() Common initialization for use by RegexMatcher constructors, part 2. 247 // This handles the common setup to be done after the Pattern is available. 248 // 249 void RegexMatcher::init2(UText *input, UErrorCode &status) { 250 if (U_FAILURE(status)) { 251 fDeferredStatus = status; 252 return; 253 } 254 255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { 256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 257 if (fData == NULL) { 258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 259 return; 260 } 261 } 262 263 fStack = new UVector64(status); 264 if (fStack == NULL) { 265 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 266 return; 267 } 268 269 reset(input); 270 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 271 if (U_FAILURE(status)) { 272 fDeferredStatus = status; 273 return; 274 } 275 } 276 277 278 static const UChar BACKSLASH = 0x5c; 279 static const UChar DOLLARSIGN = 0x24; 280 //-------------------------------------------------------------------------------- 281 // 282 // appendReplacement 283 // 284 //-------------------------------------------------------------------------------- 285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 286 const UnicodeString &replacement, 287 UErrorCode &status) { 288 UText replacementText = UTEXT_INITIALIZER; 289 290 utext_openConstUnicodeString(&replacementText, &replacement, &status); 291 if (U_SUCCESS(status)) { 292 UText resultText = UTEXT_INITIALIZER; 293 utext_openUnicodeString(&resultText, &dest, &status); 294 295 if (U_SUCCESS(status)) { 296 appendReplacement(&resultText, &replacementText, status); 297 utext_close(&resultText); 298 } 299 utext_close(&replacementText); 300 } 301 302 return *this; 303 } 304 305 // 306 // appendReplacement, UText mode 307 // 308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, 309 UText *replacement, 310 UErrorCode &status) { 311 if (U_FAILURE(status)) { 312 return *this; 313 } 314 if (U_FAILURE(fDeferredStatus)) { 315 status = fDeferredStatus; 316 return *this; 317 } 318 if (fMatch == FALSE) { 319 status = U_REGEX_INVALID_STATE; 320 return *this; 321 } 322 323 // Copy input string from the end of previous match to start of current match 324 int64_t destLen = utext_nativeLength(dest); 325 if (fMatchStart > fAppendPosition) { 326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 327 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 328 (int32_t)(fMatchStart-fAppendPosition), &status); 329 } else { 330 int32_t len16; 331 if (UTEXT_USES_U16(fInputText)) { 332 len16 = (int32_t)(fMatchStart-fAppendPosition); 333 } else { 334 UErrorCode lengthStatus = U_ZERO_ERROR; 335 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); 336 } 337 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 338 if (inputChars == NULL) { 339 status = U_MEMORY_ALLOCATION_ERROR; 340 return *this; 341 } 342 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); 343 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); 344 uprv_free(inputChars); 345 } 346 } 347 fAppendPosition = fMatchEnd; 348 349 350 // scan the replacement text, looking for substitutions ($n) and \escapes. 351 // TODO: optimize this loop by efficiently scanning for '$' or '\', 352 // move entire ranges not containing substitutions. 353 UTEXT_SETNATIVEINDEX(replacement, 0); 354 UChar32 c = UTEXT_NEXT32(replacement); 355 while (c != U_SENTINEL) { 356 if (c == BACKSLASH) { 357 // Backslash Escape. Copy the following char out without further checks. 358 // Note: Surrogate pairs don't need any special handling 359 // The second half wont be a '$' or a '\', and 360 // will move to the dest normally on the next 361 // loop iteration. 362 c = UTEXT_CURRENT32(replacement); 363 if (c == U_SENTINEL) { 364 break; 365 } 366 367 if (c==0x55/*U*/ || c==0x75/*u*/) { 368 // We have a \udddd or \Udddddddd escape sequence. 369 int32_t offset = 0; 370 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); 371 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 372 if (escapedChar != (UChar32)0xFFFFFFFF) { 373 if (U_IS_BMP(escapedChar)) { 374 UChar c16 = (UChar)escapedChar; 375 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 376 } else { 377 UChar surrogate[2]; 378 surrogate[0] = U16_LEAD(escapedChar); 379 surrogate[1] = U16_TRAIL(escapedChar); 380 if (U_SUCCESS(status)) { 381 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 382 } 383 } 384 // TODO: Report errors for mal-formed \u escapes? 385 // As this is, the original sequence is output, which may be OK. 386 if (context.lastOffset == offset) { 387 (void)UTEXT_PREVIOUS32(replacement); 388 } else if (context.lastOffset != offset-1) { 389 utext_moveIndex32(replacement, offset - context.lastOffset - 1); 390 } 391 } 392 } else { 393 (void)UTEXT_NEXT32(replacement); 394 // Plain backslash escape. Just put out the escaped character. 395 if (U_IS_BMP(c)) { 396 UChar c16 = (UChar)c; 397 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 398 } else { 399 UChar surrogate[2]; 400 surrogate[0] = U16_LEAD(c); 401 surrogate[1] = U16_TRAIL(c); 402 if (U_SUCCESS(status)) { 403 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 404 } 405 } 406 } 407 } else if (c != DOLLARSIGN) { 408 // Normal char, not a $. Copy it out without further checks. 409 if (U_IS_BMP(c)) { 410 UChar c16 = (UChar)c; 411 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 412 } else { 413 UChar surrogate[2]; 414 surrogate[0] = U16_LEAD(c); 415 surrogate[1] = U16_TRAIL(c); 416 if (U_SUCCESS(status)) { 417 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 418 } 419 } 420 } else { 421 // We've got a $. Pick up a capture group number if one follows. 422 // Consume at most the number of digits necessary for the largest capture 423 // number that is valid for this pattern. 424 425 int32_t numDigits = 0; 426 int32_t groupNum = 0; 427 UChar32 digitC; 428 for (;;) { 429 digitC = UTEXT_CURRENT32(replacement); 430 if (digitC == U_SENTINEL) { 431 break; 432 } 433 if (u_isdigit(digitC) == FALSE) { 434 break; 435 } 436 (void)UTEXT_NEXT32(replacement); 437 groupNum=groupNum*10 + u_charDigitValue(digitC); 438 numDigits++; 439 if (numDigits >= fPattern->fMaxCaptureDigits) { 440 break; 441 } 442 } 443 444 445 if (numDigits == 0) { 446 // The $ didn't introduce a group number at all. 447 // Treat it as just part of the substitution text. 448 UChar c16 = DOLLARSIGN; 449 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 450 } else { 451 // Finally, append the capture group data to the destination. 452 destLen += appendGroup(groupNum, dest, status); 453 if (U_FAILURE(status)) { 454 // Can fail if group number is out of range. 455 break; 456 } 457 } 458 } 459 460 if (U_FAILURE(status)) { 461 break; 462 } else { 463 c = UTEXT_NEXT32(replacement); 464 } 465 } 466 467 return *this; 468 } 469 470 471 472 //-------------------------------------------------------------------------------- 473 // 474 // appendTail Intended to be used in conjunction with appendReplacement() 475 // To the destination string, append everything following 476 // the last match position from the input string. 477 // 478 // Note: Match ranges do not affect appendTail or appendReplacement 479 // 480 //-------------------------------------------------------------------------------- 481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 482 UErrorCode status = U_ZERO_ERROR; 483 UText resultText = UTEXT_INITIALIZER; 484 utext_openUnicodeString(&resultText, &dest, &status); 485 486 if (U_SUCCESS(status)) { 487 appendTail(&resultText, status); 488 utext_close(&resultText); 489 } 490 491 return dest; 492 } 493 494 // 495 // appendTail, UText mode 496 // 497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { 498 if (U_FAILURE(status)) { 499 return dest; 500 } 501 if (U_FAILURE(fDeferredStatus)) { 502 status = fDeferredStatus; 503 return dest; 504 } 505 506 if (fInputLength > fAppendPosition) { 507 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 508 int64_t destLen = utext_nativeLength(dest); 509 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 510 (int32_t)(fInputLength-fAppendPosition), &status); 511 } else { 512 int32_t len16; 513 if (UTEXT_USES_U16(fInputText)) { 514 len16 = (int32_t)(fInputLength-fAppendPosition); 515 } else { 516 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); 517 status = U_ZERO_ERROR; // buffer overflow 518 } 519 520 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); 521 if (inputChars == NULL) { 522 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 523 } else { 524 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 525 int64_t destLen = utext_nativeLength(dest); 526 utext_replace(dest, destLen, destLen, inputChars, len16, &status); 527 uprv_free(inputChars); 528 } 529 } 530 } 531 return dest; 532 } 533 534 535 536 //-------------------------------------------------------------------------------- 537 // 538 // end 539 // 540 //-------------------------------------------------------------------------------- 541 int32_t RegexMatcher::end(UErrorCode &err) const { 542 return end(0, err); 543 } 544 545 int64_t RegexMatcher::end64(UErrorCode &err) const { 546 return end64(0, err); 547 } 548 549 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { 550 if (U_FAILURE(err)) { 551 return -1; 552 } 553 if (fMatch == FALSE) { 554 err = U_REGEX_INVALID_STATE; 555 return -1; 556 } 557 if (group < 0 || group > fPattern->fGroupMap->size()) { 558 err = U_INDEX_OUTOFBOUNDS_ERROR; 559 return -1; 560 } 561 int64_t e = -1; 562 if (group == 0) { 563 e = fMatchEnd; 564 } else { 565 // Get the position within the stack frame of the variables for 566 // this capture group. 567 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 568 U_ASSERT(groupOffset < fPattern->fFrameSize); 569 U_ASSERT(groupOffset >= 0); 570 e = fFrame->fExtra[groupOffset + 1]; 571 } 572 573 return e; 574 } 575 576 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 577 return (int32_t)end64(group, err); 578 } 579 580 581 //-------------------------------------------------------------------------------- 582 // 583 // find() 584 // 585 //-------------------------------------------------------------------------------- 586 UBool RegexMatcher::find() { 587 // Start at the position of the last match end. (Will be zero if the 588 // matcher has been reset.) 589 // 590 if (U_FAILURE(fDeferredStatus)) { 591 return FALSE; 592 } 593 594 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 595 return findUsingChunk(); 596 } 597 598 int64_t startPos = fMatchEnd; 599 if (startPos==0) { 600 startPos = fActiveStart; 601 } 602 603 if (fMatch) { 604 // Save the position of any previous successful match. 605 fLastMatchEnd = fMatchEnd; 606 607 if (fMatchStart == fMatchEnd) { 608 // Previous match had zero length. Move start position up one position 609 // to avoid sending find() into a loop on zero-length matches. 610 if (startPos >= fActiveLimit) { 611 fMatch = FALSE; 612 fHitEnd = TRUE; 613 return FALSE; 614 } 615 UTEXT_SETNATIVEINDEX(fInputText, startPos); 616 (void)UTEXT_NEXT32(fInputText); 617 startPos = UTEXT_GETNATIVEINDEX(fInputText); 618 } 619 } else { 620 if (fLastMatchEnd >= 0) { 621 // A previous find() failed to match. Don't try again. 622 // (without this test, a pattern with a zero-length match 623 // could match again at the end of an input string.) 624 fHitEnd = TRUE; 625 return FALSE; 626 } 627 } 628 629 630 // Compute the position in the input string beyond which a match can not begin, because 631 // the minimum length match would extend past the end of the input. 632 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 633 // Be aware of possible overflows if making changes here. 634 int64_t testStartLimit; 635 if (UTEXT_USES_U16(fInputText)) { 636 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; 637 if (startPos > testStartLimit) { 638 fMatch = FALSE; 639 fHitEnd = TRUE; 640 return FALSE; 641 } 642 } else { 643 // For now, let the matcher discover that it can't match on its own 644 // We don't know how long the match len is in native characters 645 testStartLimit = fActiveLimit; 646 } 647 648 UChar32 c; 649 U_ASSERT(startPos >= 0); 650 651 switch (fPattern->fStartType) { 652 case START_NO_INFO: 653 // No optimization was found. 654 // Try a match at each input position. 655 for (;;) { 656 MatchAt(startPos, FALSE, fDeferredStatus); 657 if (U_FAILURE(fDeferredStatus)) { 658 return FALSE; 659 } 660 if (fMatch) { 661 return TRUE; 662 } 663 if (startPos >= testStartLimit) { 664 fHitEnd = TRUE; 665 return FALSE; 666 } 667 UTEXT_SETNATIVEINDEX(fInputText, startPos); 668 (void)UTEXT_NEXT32(fInputText); 669 startPos = UTEXT_GETNATIVEINDEX(fInputText); 670 // Note that it's perfectly OK for a pattern to have a zero-length 671 // match at the end of a string, so we must make sure that the loop 672 // runs with startPos == testStartLimit the last time through. 673 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 674 return FALSE; 675 } 676 U_ASSERT(FALSE); 677 678 case START_START: 679 // Matches are only possible at the start of the input string 680 // (pattern begins with ^ or \A) 681 if (startPos > fActiveStart) { 682 fMatch = FALSE; 683 return FALSE; 684 } 685 MatchAt(startPos, FALSE, fDeferredStatus); 686 if (U_FAILURE(fDeferredStatus)) { 687 return FALSE; 688 } 689 return fMatch; 690 691 692 case START_SET: 693 { 694 // Match may start on any char from a pre-computed set. 695 U_ASSERT(fPattern->fMinMatchLen > 0); 696 int64_t pos; 697 UTEXT_SETNATIVEINDEX(fInputText, startPos); 698 for (;;) { 699 c = UTEXT_NEXT32(fInputText); 700 pos = UTEXT_GETNATIVEINDEX(fInputText); 701 // c will be -1 (U_SENTINEL) at end of text, in which case we 702 // skip this next block (so we don't have a negative array index) 703 // and handle end of text in the following block. 704 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || 705 (c>=256 && fPattern->fInitialChars->contains(c)))) { 706 MatchAt(startPos, FALSE, fDeferredStatus); 707 if (U_FAILURE(fDeferredStatus)) { 708 return FALSE; 709 } 710 if (fMatch) { 711 return TRUE; 712 } 713 UTEXT_SETNATIVEINDEX(fInputText, pos); 714 } 715 if (startPos >= testStartLimit) { 716 fMatch = FALSE; 717 fHitEnd = TRUE; 718 return FALSE; 719 } 720 startPos = pos; 721 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 722 return FALSE; 723 } 724 } 725 U_ASSERT(FALSE); 726 727 case START_STRING: 728 case START_CHAR: 729 { 730 // Match starts on exactly one char. 731 U_ASSERT(fPattern->fMinMatchLen > 0); 732 UChar32 theChar = fPattern->fInitialChar; 733 int64_t pos; 734 UTEXT_SETNATIVEINDEX(fInputText, startPos); 735 for (;;) { 736 c = UTEXT_NEXT32(fInputText); 737 pos = UTEXT_GETNATIVEINDEX(fInputText); 738 if (c == theChar) { 739 MatchAt(startPos, FALSE, fDeferredStatus); 740 if (U_FAILURE(fDeferredStatus)) { 741 return FALSE; 742 } 743 if (fMatch) { 744 return TRUE; 745 } 746 UTEXT_SETNATIVEINDEX(fInputText, pos); 747 } 748 if (startPos >= testStartLimit) { 749 fMatch = FALSE; 750 fHitEnd = TRUE; 751 return FALSE; 752 } 753 startPos = pos; 754 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 755 return FALSE; 756 } 757 } 758 U_ASSERT(FALSE); 759 760 case START_LINE: 761 { 762 UChar32 c; 763 if (startPos == fAnchorStart) { 764 MatchAt(startPos, FALSE, fDeferredStatus); 765 if (U_FAILURE(fDeferredStatus)) { 766 return FALSE; 767 } 768 if (fMatch) { 769 return TRUE; 770 } 771 UTEXT_SETNATIVEINDEX(fInputText, startPos); 772 c = UTEXT_NEXT32(fInputText); 773 startPos = UTEXT_GETNATIVEINDEX(fInputText); 774 } else { 775 UTEXT_SETNATIVEINDEX(fInputText, startPos); 776 c = UTEXT_PREVIOUS32(fInputText); 777 UTEXT_SETNATIVEINDEX(fInputText, startPos); 778 } 779 780 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 781 for (;;) { 782 if (c == 0x0a) { 783 MatchAt(startPos, FALSE, fDeferredStatus); 784 if (U_FAILURE(fDeferredStatus)) { 785 return FALSE; 786 } 787 if (fMatch) { 788 return TRUE; 789 } 790 UTEXT_SETNATIVEINDEX(fInputText, startPos); 791 } 792 if (startPos >= testStartLimit) { 793 fMatch = FALSE; 794 fHitEnd = TRUE; 795 return FALSE; 796 } 797 c = UTEXT_NEXT32(fInputText); 798 startPos = UTEXT_GETNATIVEINDEX(fInputText); 799 // Note that it's perfectly OK for a pattern to have a zero-length 800 // match at the end of a string, so we must make sure that the loop 801 // runs with startPos == testStartLimit the last time through. 802 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 803 return FALSE; 804 } 805 } else { 806 for (;;) { 807 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 808 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 809 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 810 (void)UTEXT_NEXT32(fInputText); 811 startPos = UTEXT_GETNATIVEINDEX(fInputText); 812 } 813 MatchAt(startPos, FALSE, fDeferredStatus); 814 if (U_FAILURE(fDeferredStatus)) { 815 return FALSE; 816 } 817 if (fMatch) { 818 return TRUE; 819 } 820 UTEXT_SETNATIVEINDEX(fInputText, startPos); 821 } 822 if (startPos >= testStartLimit) { 823 fMatch = FALSE; 824 fHitEnd = TRUE; 825 return FALSE; 826 } 827 c = UTEXT_NEXT32(fInputText); 828 startPos = UTEXT_GETNATIVEINDEX(fInputText); 829 // Note that it's perfectly OK for a pattern to have a zero-length 830 // match at the end of a string, so we must make sure that the loop 831 // runs with startPos == testStartLimit the last time through. 832 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 833 return FALSE; 834 } 835 } 836 } 837 838 default: 839 U_ASSERT(FALSE); 840 } 841 842 U_ASSERT(FALSE); 843 return FALSE; 844 } 845 846 847 848 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { 849 if (U_FAILURE(status)) { 850 return FALSE; 851 } 852 if (U_FAILURE(fDeferredStatus)) { 853 status = fDeferredStatus; 854 return FALSE; 855 } 856 this->reset(); // Note: Reset() is specified by Java Matcher documentation. 857 // This will reset the region to be the full input length. 858 if (start < 0) { 859 status = U_INDEX_OUTOFBOUNDS_ERROR; 860 return FALSE; 861 } 862 863 int64_t nativeStart = start; 864 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 865 status = U_INDEX_OUTOFBOUNDS_ERROR; 866 return FALSE; 867 } 868 fMatchEnd = nativeStart; 869 return find(); 870 } 871 872 873 //-------------------------------------------------------------------------------- 874 // 875 // findUsingChunk() -- like find(), but with the advance knowledge that the 876 // entire string is available in the UText's chunk buffer. 877 // 878 //-------------------------------------------------------------------------------- 879 UBool RegexMatcher::findUsingChunk() { 880 // Start at the position of the last match end. (Will be zero if the 881 // matcher has been reset. 882 // 883 884 int32_t startPos = (int32_t)fMatchEnd; 885 if (startPos==0) { 886 startPos = (int32_t)fActiveStart; 887 } 888 889 const UChar *inputBuf = fInputText->chunkContents; 890 891 if (fMatch) { 892 // Save the position of any previous successful match. 893 fLastMatchEnd = fMatchEnd; 894 895 if (fMatchStart == fMatchEnd) { 896 // Previous match had zero length. Move start position up one position 897 // to avoid sending find() into a loop on zero-length matches. 898 if (startPos >= fActiveLimit) { 899 fMatch = FALSE; 900 fHitEnd = TRUE; 901 return FALSE; 902 } 903 U16_FWD_1(inputBuf, startPos, fInputLength); 904 } 905 } else { 906 if (fLastMatchEnd >= 0) { 907 // A previous find() failed to match. Don't try again. 908 // (without this test, a pattern with a zero-length match 909 // could match again at the end of an input string.) 910 fHitEnd = TRUE; 911 return FALSE; 912 } 913 } 914 915 916 // Compute the position in the input string beyond which a match can not begin, because 917 // the minimum length match would extend past the end of the input. 918 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 919 // Be aware of possible overflows if making changes here. 920 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); 921 if (startPos > testLen) { 922 fMatch = FALSE; 923 fHitEnd = TRUE; 924 return FALSE; 925 } 926 927 UChar32 c; 928 U_ASSERT(startPos >= 0); 929 930 switch (fPattern->fStartType) { 931 case START_NO_INFO: 932 // No optimization was found. 933 // Try a match at each input position. 934 for (;;) { 935 MatchChunkAt(startPos, FALSE, fDeferredStatus); 936 if (U_FAILURE(fDeferredStatus)) { 937 return FALSE; 938 } 939 if (fMatch) { 940 return TRUE; 941 } 942 if (startPos >= testLen) { 943 fHitEnd = TRUE; 944 return FALSE; 945 } 946 U16_FWD_1(inputBuf, startPos, fActiveLimit); 947 // Note that it's perfectly OK for a pattern to have a zero-length 948 // match at the end of a string, so we must make sure that the loop 949 // runs with startPos == testLen the last time through. 950 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 951 return FALSE; 952 } 953 U_ASSERT(FALSE); 954 955 case START_START: 956 // Matches are only possible at the start of the input string 957 // (pattern begins with ^ or \A) 958 if (startPos > fActiveStart) { 959 fMatch = FALSE; 960 return FALSE; 961 } 962 MatchChunkAt(startPos, FALSE, fDeferredStatus); 963 if (U_FAILURE(fDeferredStatus)) { 964 return FALSE; 965 } 966 return fMatch; 967 968 969 case START_SET: 970 { 971 // Match may start on any char from a pre-computed set. 972 U_ASSERT(fPattern->fMinMatchLen > 0); 973 for (;;) { 974 int32_t pos = startPos; 975 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 976 if ((c<256 && fPattern->fInitialChars8->contains(c)) || 977 (c>=256 && fPattern->fInitialChars->contains(c))) { 978 MatchChunkAt(pos, FALSE, fDeferredStatus); 979 if (U_FAILURE(fDeferredStatus)) { 980 return FALSE; 981 } 982 if (fMatch) { 983 return TRUE; 984 } 985 } 986 if (pos >= testLen) { 987 fMatch = FALSE; 988 fHitEnd = TRUE; 989 return FALSE; 990 } 991 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 992 return FALSE; 993 } 994 } 995 U_ASSERT(FALSE); 996 997 case START_STRING: 998 case START_CHAR: 999 { 1000 // Match starts on exactly one char. 1001 U_ASSERT(fPattern->fMinMatchLen > 0); 1002 UChar32 theChar = fPattern->fInitialChar; 1003 for (;;) { 1004 int32_t pos = startPos; 1005 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1006 if (c == theChar) { 1007 MatchChunkAt(pos, FALSE, fDeferredStatus); 1008 if (U_FAILURE(fDeferredStatus)) { 1009 return FALSE; 1010 } 1011 if (fMatch) { 1012 return TRUE; 1013 } 1014 } 1015 if (pos >= testLen) { 1016 fMatch = FALSE; 1017 fHitEnd = TRUE; 1018 return FALSE; 1019 } 1020 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1021 return FALSE; 1022 } 1023 } 1024 U_ASSERT(FALSE); 1025 1026 case START_LINE: 1027 { 1028 UChar32 c; 1029 if (startPos == fAnchorStart) { 1030 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1031 if (U_FAILURE(fDeferredStatus)) { 1032 return FALSE; 1033 } 1034 if (fMatch) { 1035 return TRUE; 1036 } 1037 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1038 } 1039 1040 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1041 for (;;) { 1042 c = inputBuf[startPos-1]; 1043 if (c == 0x0a) { 1044 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1045 if (U_FAILURE(fDeferredStatus)) { 1046 return FALSE; 1047 } 1048 if (fMatch) { 1049 return TRUE; 1050 } 1051 } 1052 if (startPos >= testLen) { 1053 fMatch = FALSE; 1054 fHitEnd = TRUE; 1055 return FALSE; 1056 } 1057 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1058 // Note that it's perfectly OK for a pattern to have a zero-length 1059 // match at the end of a string, so we must make sure that the loop 1060 // runs with startPos == testLen the last time through. 1061 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1062 return FALSE; 1063 } 1064 } else { 1065 for (;;) { 1066 c = inputBuf[startPos-1]; 1067 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1068 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 1069 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { 1070 startPos++; 1071 } 1072 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1073 if (U_FAILURE(fDeferredStatus)) { 1074 return FALSE; 1075 } 1076 if (fMatch) { 1077 return TRUE; 1078 } 1079 } 1080 if (startPos >= testLen) { 1081 fMatch = FALSE; 1082 fHitEnd = TRUE; 1083 return FALSE; 1084 } 1085 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1086 // Note that it's perfectly OK for a pattern to have a zero-length 1087 // match at the end of a string, so we must make sure that the loop 1088 // runs with startPos == testLen the last time through. 1089 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1090 return FALSE; 1091 } 1092 } 1093 } 1094 1095 default: 1096 U_ASSERT(FALSE); 1097 } 1098 1099 U_ASSERT(FALSE); 1100 return FALSE; 1101 } 1102 1103 1104 1105 //-------------------------------------------------------------------------------- 1106 // 1107 // group() 1108 // 1109 //-------------------------------------------------------------------------------- 1110 UnicodeString RegexMatcher::group(UErrorCode &status) const { 1111 return group(0, status); 1112 } 1113 1114 // Return immutable shallow clone 1115 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { 1116 return group(0, dest, group_len, status); 1117 } 1118 1119 // Return immutable shallow clone 1120 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { 1121 group_len = 0; 1122 if (U_FAILURE(status)) { 1123 return dest; 1124 } 1125 if (U_FAILURE(fDeferredStatus)) { 1126 status = fDeferredStatus; 1127 } else if (fMatch == FALSE) { 1128 status = U_REGEX_INVALID_STATE; 1129 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1130 status = U_INDEX_OUTOFBOUNDS_ERROR; 1131 } 1132 1133 if (U_FAILURE(status)) { 1134 return dest; 1135 } 1136 1137 int64_t s, e; 1138 if (groupNum == 0) { 1139 s = fMatchStart; 1140 e = fMatchEnd; 1141 } else { 1142 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1143 U_ASSERT(groupOffset < fPattern->fFrameSize); 1144 U_ASSERT(groupOffset >= 0); 1145 s = fFrame->fExtra[groupOffset]; 1146 e = fFrame->fExtra[groupOffset+1]; 1147 } 1148 1149 if (s < 0) { 1150 // A capture group wasn't part of the match 1151 return utext_clone(dest, fInputText, FALSE, TRUE, &status); 1152 } 1153 U_ASSERT(s <= e); 1154 group_len = e - s; 1155 1156 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); 1157 if (dest) 1158 UTEXT_SETNATIVEINDEX(dest, s); 1159 return dest; 1160 } 1161 1162 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1163 UnicodeString result; 1164 if (U_FAILURE(status)) { 1165 return result; 1166 } 1167 UText resultText = UTEXT_INITIALIZER; 1168 utext_openUnicodeString(&resultText, &result, &status); 1169 group(groupNum, &resultText, status); 1170 utext_close(&resultText); 1171 return result; 1172 } 1173 1174 1175 // Return deep (mutable) clone 1176 // Technology Preview (as an API), but note that the UnicodeString API is implemented 1177 // using this function. 1178 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { 1179 if (U_FAILURE(status)) { 1180 return dest; 1181 } 1182 1183 if (U_FAILURE(fDeferredStatus)) { 1184 status = fDeferredStatus; 1185 } else if (fMatch == FALSE) { 1186 status = U_REGEX_INVALID_STATE; 1187 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1188 status = U_INDEX_OUTOFBOUNDS_ERROR; 1189 } 1190 if (U_FAILURE(status)) { 1191 return dest; 1192 } 1193 1194 int64_t s, e; 1195 if (groupNum == 0) { 1196 s = fMatchStart; 1197 e = fMatchEnd; 1198 } else { 1199 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1200 U_ASSERT(groupOffset < fPattern->fFrameSize); 1201 U_ASSERT(groupOffset >= 0); 1202 s = fFrame->fExtra[groupOffset]; 1203 e = fFrame->fExtra[groupOffset+1]; 1204 } 1205 1206 if (s < 0) { 1207 // A capture group wasn't part of the match 1208 if (dest) { 1209 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1210 return dest; 1211 } else { 1212 return utext_openUChars(NULL, NULL, 0, &status); 1213 } 1214 } 1215 U_ASSERT(s <= e); 1216 1217 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1218 U_ASSERT(e <= fInputLength); 1219 if (dest) { 1220 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status); 1221 } else { 1222 UText groupText = UTEXT_INITIALIZER; 1223 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); 1224 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1225 utext_close(&groupText); 1226 } 1227 } else { 1228 int32_t len16; 1229 if (UTEXT_USES_U16(fInputText)) { 1230 len16 = (int32_t)(e-s); 1231 } else { 1232 UErrorCode lengthStatus = U_ZERO_ERROR; 1233 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1234 } 1235 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1236 if (groupChars == NULL) { 1237 status = U_MEMORY_ALLOCATION_ERROR; 1238 return dest; 1239 } 1240 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1241 1242 if (dest) { 1243 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); 1244 } else { 1245 UText groupText = UTEXT_INITIALIZER; 1246 utext_openUChars(&groupText, groupChars, len16, &status); 1247 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1248 utext_close(&groupText); 1249 } 1250 1251 uprv_free(groupChars); 1252 } 1253 return dest; 1254 } 1255 1256 //-------------------------------------------------------------------------------- 1257 // 1258 // appendGroup() -- currently internal only, appends a group to a UText rather 1259 // than replacing its contents 1260 // 1261 //-------------------------------------------------------------------------------- 1262 1263 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { 1264 if (U_FAILURE(status)) { 1265 return 0; 1266 } 1267 if (U_FAILURE(fDeferredStatus)) { 1268 status = fDeferredStatus; 1269 return 0; 1270 } 1271 int64_t destLen = utext_nativeLength(dest); 1272 1273 if (fMatch == FALSE) { 1274 status = U_REGEX_INVALID_STATE; 1275 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1276 } 1277 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1278 status = U_INDEX_OUTOFBOUNDS_ERROR; 1279 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1280 } 1281 1282 int64_t s, e; 1283 if (groupNum == 0) { 1284 s = fMatchStart; 1285 e = fMatchEnd; 1286 } else { 1287 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1288 U_ASSERT(groupOffset < fPattern->fFrameSize); 1289 U_ASSERT(groupOffset >= 0); 1290 s = fFrame->fExtra[groupOffset]; 1291 e = fFrame->fExtra[groupOffset+1]; 1292 } 1293 1294 if (s < 0) { 1295 // A capture group wasn't part of the match 1296 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1297 } 1298 U_ASSERT(s <= e); 1299 1300 int64_t deltaLen; 1301 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1302 U_ASSERT(e <= fInputLength); 1303 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); 1304 } else { 1305 int32_t len16; 1306 if (UTEXT_USES_U16(fInputText)) { 1307 len16 = (int32_t)(e-s); 1308 } else { 1309 UErrorCode lengthStatus = U_ZERO_ERROR; 1310 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1311 } 1312 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1313 if (groupChars == NULL) { 1314 status = U_MEMORY_ALLOCATION_ERROR; 1315 return 0; 1316 } 1317 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1318 1319 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); 1320 uprv_free(groupChars); 1321 } 1322 return deltaLen; 1323 } 1324 1325 1326 1327 //-------------------------------------------------------------------------------- 1328 // 1329 // groupCount() 1330 // 1331 //-------------------------------------------------------------------------------- 1332 int32_t RegexMatcher::groupCount() const { 1333 return fPattern->fGroupMap->size(); 1334 } 1335 1336 1337 1338 //-------------------------------------------------------------------------------- 1339 // 1340 // hasAnchoringBounds() 1341 // 1342 //-------------------------------------------------------------------------------- 1343 UBool RegexMatcher::hasAnchoringBounds() const { 1344 return fAnchoringBounds; 1345 } 1346 1347 1348 //-------------------------------------------------------------------------------- 1349 // 1350 // hasTransparentBounds() 1351 // 1352 //-------------------------------------------------------------------------------- 1353 UBool RegexMatcher::hasTransparentBounds() const { 1354 return fTransparentBounds; 1355 } 1356 1357 1358 1359 //-------------------------------------------------------------------------------- 1360 // 1361 // hitEnd() 1362 // 1363 //-------------------------------------------------------------------------------- 1364 UBool RegexMatcher::hitEnd() const { 1365 return fHitEnd; 1366 } 1367 1368 1369 //-------------------------------------------------------------------------------- 1370 // 1371 // input() 1372 // 1373 //-------------------------------------------------------------------------------- 1374 const UnicodeString &RegexMatcher::input() const { 1375 if (!fInput) { 1376 UErrorCode status = U_ZERO_ERROR; 1377 int32_t len16; 1378 if (UTEXT_USES_U16(fInputText)) { 1379 len16 = (int32_t)fInputLength; 1380 } else { 1381 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); 1382 status = U_ZERO_ERROR; // overflow, length status 1383 } 1384 UnicodeString *result = new UnicodeString(len16, 0, 0); 1385 1386 UChar *inputChars = result->getBuffer(len16); 1387 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning 1388 result->releaseBuffer(len16); 1389 1390 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= 1391 } 1392 1393 return *fInput; 1394 } 1395 1396 //-------------------------------------------------------------------------------- 1397 // 1398 // inputText() 1399 // 1400 //-------------------------------------------------------------------------------- 1401 UText *RegexMatcher::inputText() const { 1402 return fInputText; 1403 } 1404 1405 1406 //-------------------------------------------------------------------------------- 1407 // 1408 // getInput() -- like inputText(), but makes a clone or copies into another UText 1409 // 1410 //-------------------------------------------------------------------------------- 1411 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { 1412 if (U_FAILURE(status)) { 1413 return dest; 1414 } 1415 if (U_FAILURE(fDeferredStatus)) { 1416 status = fDeferredStatus; 1417 return dest; 1418 } 1419 1420 if (dest) { 1421 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1422 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); 1423 } else { 1424 int32_t input16Len; 1425 if (UTEXT_USES_U16(fInputText)) { 1426 input16Len = (int32_t)fInputLength; 1427 } else { 1428 UErrorCode lengthStatus = U_ZERO_ERROR; 1429 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error 1430 } 1431 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); 1432 if (inputChars == NULL) { 1433 return dest; 1434 } 1435 1436 status = U_ZERO_ERROR; 1437 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning 1438 status = U_ZERO_ERROR; 1439 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); 1440 1441 uprv_free(inputChars); 1442 } 1443 return dest; 1444 } else { 1445 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1446 } 1447 } 1448 1449 1450 static UBool compat_SyncMutableUTextContents(UText *ut); 1451 static UBool compat_SyncMutableUTextContents(UText *ut) { 1452 UBool retVal = FALSE; 1453 1454 // In the following test, we're really only interested in whether the UText should switch 1455 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents 1456 // will still point to the correct data. 1457 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { 1458 UnicodeString *us=(UnicodeString *)ut->context; 1459 1460 // Update to the latest length. 1461 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). 1462 int32_t newLength = us->length(); 1463 1464 // Update the chunk description. 1465 // The buffer may have switched between stack- and heap-based. 1466 ut->chunkContents = us->getBuffer(); 1467 ut->chunkLength = newLength; 1468 ut->chunkNativeLimit = newLength; 1469 ut->nativeIndexingLimit = newLength; 1470 retVal = TRUE; 1471 } 1472 1473 return retVal; 1474 } 1475 1476 //-------------------------------------------------------------------------------- 1477 // 1478 // lookingAt() 1479 // 1480 //-------------------------------------------------------------------------------- 1481 UBool RegexMatcher::lookingAt(UErrorCode &status) { 1482 if (U_FAILURE(status)) { 1483 return FALSE; 1484 } 1485 if (U_FAILURE(fDeferredStatus)) { 1486 status = fDeferredStatus; 1487 return FALSE; 1488 } 1489 1490 if (fInputUniStrMaybeMutable) { 1491 if (compat_SyncMutableUTextContents(fInputText)) { 1492 fInputLength = utext_nativeLength(fInputText); 1493 reset(); 1494 } 1495 } 1496 else { 1497 resetPreserveRegion(); 1498 } 1499 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1500 MatchChunkAt((int32_t)fActiveStart, FALSE, status); 1501 } else { 1502 MatchAt(fActiveStart, FALSE, status); 1503 } 1504 return fMatch; 1505 } 1506 1507 1508 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { 1509 if (U_FAILURE(status)) { 1510 return FALSE; 1511 } 1512 if (U_FAILURE(fDeferredStatus)) { 1513 status = fDeferredStatus; 1514 return FALSE; 1515 } 1516 reset(); 1517 1518 if (start < 0) { 1519 status = U_INDEX_OUTOFBOUNDS_ERROR; 1520 return FALSE; 1521 } 1522 1523 if (fInputUniStrMaybeMutable) { 1524 if (compat_SyncMutableUTextContents(fInputText)) { 1525 fInputLength = utext_nativeLength(fInputText); 1526 reset(); 1527 } 1528 } 1529 1530 int64_t nativeStart; 1531 nativeStart = start; 1532 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1533 status = U_INDEX_OUTOFBOUNDS_ERROR; 1534 return FALSE; 1535 } 1536 1537 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1538 MatchChunkAt((int32_t)nativeStart, FALSE, status); 1539 } else { 1540 MatchAt(nativeStart, FALSE, status); 1541 } 1542 return fMatch; 1543 } 1544 1545 1546 1547 //-------------------------------------------------------------------------------- 1548 // 1549 // matches() 1550 // 1551 //-------------------------------------------------------------------------------- 1552 UBool RegexMatcher::matches(UErrorCode &status) { 1553 if (U_FAILURE(status)) { 1554 return FALSE; 1555 } 1556 if (U_FAILURE(fDeferredStatus)) { 1557 status = fDeferredStatus; 1558 return FALSE; 1559 } 1560 1561 if (fInputUniStrMaybeMutable) { 1562 if (compat_SyncMutableUTextContents(fInputText)) { 1563 fInputLength = utext_nativeLength(fInputText); 1564 reset(); 1565 } 1566 } 1567 else { 1568 resetPreserveRegion(); 1569 } 1570 1571 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1572 MatchChunkAt((int32_t)fActiveStart, TRUE, status); 1573 } else { 1574 MatchAt(fActiveStart, TRUE, status); 1575 } 1576 return fMatch; 1577 } 1578 1579 1580 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { 1581 if (U_FAILURE(status)) { 1582 return FALSE; 1583 } 1584 if (U_FAILURE(fDeferredStatus)) { 1585 status = fDeferredStatus; 1586 return FALSE; 1587 } 1588 reset(); 1589 1590 if (start < 0) { 1591 status = U_INDEX_OUTOFBOUNDS_ERROR; 1592 return FALSE; 1593 } 1594 1595 if (fInputUniStrMaybeMutable) { 1596 if (compat_SyncMutableUTextContents(fInputText)) { 1597 fInputLength = utext_nativeLength(fInputText); 1598 reset(); 1599 } 1600 } 1601 1602 int64_t nativeStart; 1603 nativeStart = start; 1604 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1605 status = U_INDEX_OUTOFBOUNDS_ERROR; 1606 return FALSE; 1607 } 1608 1609 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1610 MatchChunkAt((int32_t)nativeStart, TRUE, status); 1611 } else { 1612 MatchAt(nativeStart, TRUE, status); 1613 } 1614 return fMatch; 1615 } 1616 1617 1618 1619 //-------------------------------------------------------------------------------- 1620 // 1621 // pattern 1622 // 1623 //-------------------------------------------------------------------------------- 1624 const RegexPattern &RegexMatcher::pattern() const { 1625 return *fPattern; 1626 } 1627 1628 1629 1630 //-------------------------------------------------------------------------------- 1631 // 1632 // region 1633 // 1634 //-------------------------------------------------------------------------------- 1635 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { 1636 if (U_FAILURE(status)) { 1637 return *this; 1638 } 1639 1640 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { 1641 status = U_ILLEGAL_ARGUMENT_ERROR; 1642 } 1643 1644 int64_t nativeStart = regionStart; 1645 int64_t nativeLimit = regionLimit; 1646 if (nativeStart > fInputLength || nativeLimit > fInputLength) { 1647 status = U_ILLEGAL_ARGUMENT_ERROR; 1648 } 1649 1650 if (startIndex == -1) 1651 this->reset(); 1652 else 1653 resetPreserveRegion(); 1654 1655 fRegionStart = nativeStart; 1656 fRegionLimit = nativeLimit; 1657 fActiveStart = nativeStart; 1658 fActiveLimit = nativeLimit; 1659 1660 if (startIndex != -1) { 1661 if (startIndex < fActiveStart || startIndex > fActiveLimit) { 1662 status = U_INDEX_OUTOFBOUNDS_ERROR; 1663 } 1664 fMatchEnd = startIndex; 1665 } 1666 1667 if (!fTransparentBounds) { 1668 fLookStart = nativeStart; 1669 fLookLimit = nativeLimit; 1670 } 1671 if (fAnchoringBounds) { 1672 fAnchorStart = nativeStart; 1673 fAnchorLimit = nativeLimit; 1674 } 1675 return *this; 1676 } 1677 1678 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { 1679 return region(start, limit, -1, status); 1680 } 1681 1682 //-------------------------------------------------------------------------------- 1683 // 1684 // regionEnd 1685 // 1686 //-------------------------------------------------------------------------------- 1687 int32_t RegexMatcher::regionEnd() const { 1688 return (int32_t)fRegionLimit; 1689 } 1690 1691 int64_t RegexMatcher::regionEnd64() const { 1692 return fRegionLimit; 1693 } 1694 1695 //-------------------------------------------------------------------------------- 1696 // 1697 // regionStart 1698 // 1699 //-------------------------------------------------------------------------------- 1700 int32_t RegexMatcher::regionStart() const { 1701 return (int32_t)fRegionStart; 1702 } 1703 1704 int64_t RegexMatcher::regionStart64() const { 1705 return fRegionStart; 1706 } 1707 1708 1709 //-------------------------------------------------------------------------------- 1710 // 1711 // replaceAll 1712 // 1713 //-------------------------------------------------------------------------------- 1714 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { 1715 UText replacementText = UTEXT_INITIALIZER; 1716 UText resultText = UTEXT_INITIALIZER; 1717 UnicodeString resultString; 1718 if (U_FAILURE(status)) { 1719 return resultString; 1720 } 1721 1722 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1723 utext_openUnicodeString(&resultText, &resultString, &status); 1724 1725 replaceAll(&replacementText, &resultText, status); 1726 1727 utext_close(&resultText); 1728 utext_close(&replacementText); 1729 1730 return resultString; 1731 } 1732 1733 1734 // 1735 // replaceAll, UText mode 1736 // 1737 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { 1738 if (U_FAILURE(status)) { 1739 return dest; 1740 } 1741 if (U_FAILURE(fDeferredStatus)) { 1742 status = fDeferredStatus; 1743 return dest; 1744 } 1745 1746 if (dest == NULL) { 1747 UnicodeString emptyString; 1748 UText empty = UTEXT_INITIALIZER; 1749 1750 utext_openUnicodeString(&empty, &emptyString, &status); 1751 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1752 utext_close(&empty); 1753 } 1754 1755 if (U_SUCCESS(status)) { 1756 reset(); 1757 while (find()) { 1758 appendReplacement(dest, replacement, status); 1759 if (U_FAILURE(status)) { 1760 break; 1761 } 1762 } 1763 appendTail(dest, status); 1764 } 1765 1766 return dest; 1767 } 1768 1769 1770 //-------------------------------------------------------------------------------- 1771 // 1772 // replaceFirst 1773 // 1774 //-------------------------------------------------------------------------------- 1775 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { 1776 UText replacementText = UTEXT_INITIALIZER; 1777 UText resultText = UTEXT_INITIALIZER; 1778 UnicodeString resultString; 1779 1780 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1781 utext_openUnicodeString(&resultText, &resultString, &status); 1782 1783 replaceFirst(&replacementText, &resultText, status); 1784 1785 utext_close(&resultText); 1786 utext_close(&replacementText); 1787 1788 return resultString; 1789 } 1790 1791 // 1792 // replaceFirst, UText mode 1793 // 1794 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { 1795 if (U_FAILURE(status)) { 1796 return dest; 1797 } 1798 if (U_FAILURE(fDeferredStatus)) { 1799 status = fDeferredStatus; 1800 return dest; 1801 } 1802 1803 reset(); 1804 if (!find()) { 1805 return getInput(dest, status); 1806 } 1807 1808 if (dest == NULL) { 1809 UnicodeString emptyString; 1810 UText empty = UTEXT_INITIALIZER; 1811 1812 utext_openUnicodeString(&empty, &emptyString, &status); 1813 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1814 utext_close(&empty); 1815 } 1816 1817 appendReplacement(dest, replacement, status); 1818 appendTail(dest, status); 1819 1820 return dest; 1821 } 1822 1823 1824 //-------------------------------------------------------------------------------- 1825 // 1826 // requireEnd 1827 // 1828 //-------------------------------------------------------------------------------- 1829 UBool RegexMatcher::requireEnd() const { 1830 return fRequireEnd; 1831 } 1832 1833 1834 //-------------------------------------------------------------------------------- 1835 // 1836 // reset 1837 // 1838 //-------------------------------------------------------------------------------- 1839 RegexMatcher &RegexMatcher::reset() { 1840 fRegionStart = 0; 1841 fRegionLimit = fInputLength; 1842 fActiveStart = 0; 1843 fActiveLimit = fInputLength; 1844 fAnchorStart = 0; 1845 fAnchorLimit = fInputLength; 1846 fLookStart = 0; 1847 fLookLimit = fInputLength; 1848 resetPreserveRegion(); 1849 return *this; 1850 } 1851 1852 1853 1854 void RegexMatcher::resetPreserveRegion() { 1855 fMatchStart = 0; 1856 fMatchEnd = 0; 1857 fLastMatchEnd = -1; 1858 fAppendPosition = 0; 1859 fMatch = FALSE; 1860 fHitEnd = FALSE; 1861 fRequireEnd = FALSE; 1862 fTime = 0; 1863 fTickCounter = TIMER_INITIAL_VALUE; 1864 //resetStack(); // more expensive than it looks... 1865 } 1866 1867 1868 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1869 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); 1870 if (fPattern->fNeedsAltInput) { 1871 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1872 } 1873 fInputLength = utext_nativeLength(fInputText); 1874 1875 reset(); 1876 delete fInput; 1877 fInput = NULL; 1878 1879 // Do the following for any UnicodeString. 1880 // This is for compatibility for those clients who modify the input string "live" during regex operations. 1881 fInputUniStrMaybeMutable = TRUE; 1882 1883 if (fWordBreakItr != NULL) { 1884 #if UCONFIG_NO_BREAK_ITERATION==0 1885 UErrorCode status = U_ZERO_ERROR; 1886 fWordBreakItr->setText(fInputText, status); 1887 #endif 1888 } 1889 return *this; 1890 } 1891 1892 1893 RegexMatcher &RegexMatcher::reset(UText *input) { 1894 if (fInputText != input) { 1895 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); 1896 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1897 fInputLength = utext_nativeLength(fInputText); 1898 1899 delete fInput; 1900 fInput = NULL; 1901 1902 if (fWordBreakItr != NULL) { 1903 #if UCONFIG_NO_BREAK_ITERATION==0 1904 UErrorCode status = U_ZERO_ERROR; 1905 fWordBreakItr->setText(input, status); 1906 #endif 1907 } 1908 } 1909 reset(); 1910 fInputUniStrMaybeMutable = FALSE; 1911 1912 return *this; 1913 } 1914 1915 /*RegexMatcher &RegexMatcher::reset(const UChar *) { 1916 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 1917 return *this; 1918 }*/ 1919 1920 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { 1921 if (U_FAILURE(status)) { 1922 return *this; 1923 } 1924 reset(); // Reset also resets the region to be the entire string. 1925 1926 if (position < 0 || position > fActiveLimit) { 1927 status = U_INDEX_OUTOFBOUNDS_ERROR; 1928 return *this; 1929 } 1930 fMatchEnd = position; 1931 return *this; 1932 } 1933 1934 1935 //-------------------------------------------------------------------------------- 1936 // 1937 // refresh 1938 // 1939 //-------------------------------------------------------------------------------- 1940 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { 1941 if (U_FAILURE(status)) { 1942 return *this; 1943 } 1944 if (input == NULL) { 1945 status = U_ILLEGAL_ARGUMENT_ERROR; 1946 return *this; 1947 } 1948 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { 1949 status = U_ILLEGAL_ARGUMENT_ERROR; 1950 return *this; 1951 } 1952 int64_t pos = utext_getNativeIndex(fInputText); 1953 // Shallow read-only clone of the new UText into the existing input UText 1954 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status); 1955 if (U_FAILURE(status)) { 1956 return *this; 1957 } 1958 utext_setNativeIndex(fInputText, pos); 1959 1960 if (fAltInputText != NULL) { 1961 pos = utext_getNativeIndex(fAltInputText); 1962 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status); 1963 if (U_FAILURE(status)) { 1964 return *this; 1965 } 1966 utext_setNativeIndex(fAltInputText, pos); 1967 } 1968 return *this; 1969 } 1970 1971 1972 1973 //-------------------------------------------------------------------------------- 1974 // 1975 // setTrace 1976 // 1977 //-------------------------------------------------------------------------------- 1978 void RegexMatcher::setTrace(UBool state) { 1979 fTraceDebug = state; 1980 } 1981 1982 1983 1984 //--------------------------------------------------------------------- 1985 // 1986 // split 1987 // 1988 //--------------------------------------------------------------------- 1989 int32_t RegexMatcher::split(const UnicodeString &input, 1990 UnicodeString dest[], 1991 int32_t destCapacity, 1992 UErrorCode &status) 1993 { 1994 UText inputText = UTEXT_INITIALIZER; 1995 utext_openConstUnicodeString(&inputText, &input, &status); 1996 if (U_FAILURE(status)) { 1997 return 0; 1998 } 1999 2000 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); 2001 if (destText == NULL) { 2002 status = U_MEMORY_ALLOCATION_ERROR; 2003 return 0; 2004 } 2005 int32_t i; 2006 for (i = 0; i < destCapacity; i++) { 2007 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); 2008 } 2009 2010 int32_t fieldCount = split(&inputText, destText, destCapacity, status); 2011 2012 for (i = 0; i < destCapacity; i++) { 2013 utext_close(destText[i]); 2014 } 2015 2016 uprv_free(destText); 2017 utext_close(&inputText); 2018 return fieldCount; 2019 } 2020 2021 // 2022 // split, UText mode 2023 // 2024 int32_t RegexMatcher::split(UText *input, 2025 UText *dest[], 2026 int32_t destCapacity, 2027 UErrorCode &status) 2028 { 2029 // 2030 // Check arguements for validity 2031 // 2032 if (U_FAILURE(status)) { 2033 return 0; 2034 }; 2035 2036 if (destCapacity < 1) { 2037 status = U_ILLEGAL_ARGUMENT_ERROR; 2038 return 0; 2039 } 2040 2041 // 2042 // Reset for the input text 2043 // 2044 reset(input); 2045 int64_t nextOutputStringStart = 0; 2046 if (fActiveLimit == 0) { 2047 return 0; 2048 } 2049 2050 // 2051 // Loop through the input text, searching for the delimiter pattern 2052 // 2053 int32_t i; 2054 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 2055 for (i=0; ; i++) { 2056 if (i>=destCapacity-1) { 2057 // There is one or zero output string left. 2058 // Fill the last output string with whatever is left from the input, then exit the loop. 2059 // ( i will be == destCapacity if we filled the output array while processing 2060 // capture groups of the delimiter expression, in which case we will discard the 2061 // last capture group saved in favor of the unprocessed remainder of the 2062 // input string.) 2063 i = destCapacity-1; 2064 if (fActiveLimit > nextOutputStringStart) { 2065 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2066 if (dest[i]) { 2067 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2068 input->chunkContents+nextOutputStringStart, 2069 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2070 } else { 2071 UText remainingText = UTEXT_INITIALIZER; 2072 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2073 fActiveLimit-nextOutputStringStart, &status); 2074 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2075 utext_close(&remainingText); 2076 } 2077 } else { 2078 UErrorCode lengthStatus = U_ZERO_ERROR; 2079 int32_t remaining16Length = 2080 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2081 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2082 if (remainingChars == NULL) { 2083 status = U_MEMORY_ALLOCATION_ERROR; 2084 break; 2085 } 2086 2087 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2088 if (dest[i]) { 2089 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2090 } else { 2091 UText remainingText = UTEXT_INITIALIZER; 2092 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2093 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2094 utext_close(&remainingText); 2095 } 2096 2097 uprv_free(remainingChars); 2098 } 2099 } 2100 break; 2101 } 2102 if (find()) { 2103 // We found another delimiter. Move everything from where we started looking 2104 // up until the start of the delimiter into the next output string. 2105 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2106 if (dest[i]) { 2107 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2108 input->chunkContents+nextOutputStringStart, 2109 (int32_t)(fMatchStart-nextOutputStringStart), &status); 2110 } else { 2111 UText remainingText = UTEXT_INITIALIZER; 2112 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2113 fMatchStart-nextOutputStringStart, &status); 2114 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2115 utext_close(&remainingText); 2116 } 2117 } else { 2118 UErrorCode lengthStatus = U_ZERO_ERROR; 2119 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); 2120 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2121 if (remainingChars == NULL) { 2122 status = U_MEMORY_ALLOCATION_ERROR; 2123 break; 2124 } 2125 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); 2126 if (dest[i]) { 2127 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2128 } else { 2129 UText remainingText = UTEXT_INITIALIZER; 2130 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2131 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2132 utext_close(&remainingText); 2133 } 2134 2135 uprv_free(remainingChars); 2136 } 2137 nextOutputStringStart = fMatchEnd; 2138 2139 // If the delimiter pattern has capturing parentheses, the captured 2140 // text goes out into the next n destination strings. 2141 int32_t groupNum; 2142 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 2143 if (i >= destCapacity-2) { 2144 // Never fill the last available output string with capture group text. 2145 // It will filled with the last field, the remainder of the 2146 // unsplit input text. 2147 break; 2148 } 2149 i++; 2150 dest[i] = group(groupNum, dest[i], status); 2151 } 2152 2153 if (nextOutputStringStart == fActiveLimit) { 2154 // The delimiter was at the end of the string. We're done, but first 2155 // we output one last empty string, for the empty field following 2156 // the delimiter at the end of input. 2157 if (i+1 < destCapacity) { 2158 ++i; 2159 if (dest[i] == NULL) { 2160 dest[i] = utext_openUChars(NULL, NULL, 0, &status); 2161 } else { 2162 static UChar emptyString[] = {(UChar)0}; 2163 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status); 2164 } 2165 } 2166 break; 2167 2168 } 2169 } 2170 else 2171 { 2172 // We ran off the end of the input while looking for the next delimiter. 2173 // All the remaining text goes into the current output string. 2174 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2175 if (dest[i]) { 2176 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2177 input->chunkContents+nextOutputStringStart, 2178 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2179 } else { 2180 UText remainingText = UTEXT_INITIALIZER; 2181 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2182 fActiveLimit-nextOutputStringStart, &status); 2183 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2184 utext_close(&remainingText); 2185 } 2186 } else { 2187 UErrorCode lengthStatus = U_ZERO_ERROR; 2188 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2189 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2190 if (remainingChars == NULL) { 2191 status = U_MEMORY_ALLOCATION_ERROR; 2192 break; 2193 } 2194 2195 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2196 if (dest[i]) { 2197 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2198 } else { 2199 UText remainingText = UTEXT_INITIALIZER; 2200 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2201 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2202 utext_close(&remainingText); 2203 } 2204 2205 uprv_free(remainingChars); 2206 } 2207 break; 2208 } 2209 if (U_FAILURE(status)) { 2210 break; 2211 } 2212 } // end of for loop 2213 return i+1; 2214 } 2215 2216 2217 //-------------------------------------------------------------------------------- 2218 // 2219 // start 2220 // 2221 //-------------------------------------------------------------------------------- 2222 int32_t RegexMatcher::start(UErrorCode &status) const { 2223 return start(0, status); 2224 } 2225 2226 int64_t RegexMatcher::start64(UErrorCode &status) const { 2227 return start64(0, status); 2228 } 2229 2230 //-------------------------------------------------------------------------------- 2231 // 2232 // start(int32_t group, UErrorCode &status) 2233 // 2234 //-------------------------------------------------------------------------------- 2235 2236 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { 2237 if (U_FAILURE(status)) { 2238 return -1; 2239 } 2240 if (U_FAILURE(fDeferredStatus)) { 2241 status = fDeferredStatus; 2242 return -1; 2243 } 2244 if (fMatch == FALSE) { 2245 status = U_REGEX_INVALID_STATE; 2246 return -1; 2247 } 2248 if (group < 0 || group > fPattern->fGroupMap->size()) { 2249 status = U_INDEX_OUTOFBOUNDS_ERROR; 2250 return -1; 2251 } 2252 int64_t s; 2253 if (group == 0) { 2254 s = fMatchStart; 2255 } else { 2256 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 2257 U_ASSERT(groupOffset < fPattern->fFrameSize); 2258 U_ASSERT(groupOffset >= 0); 2259 s = fFrame->fExtra[groupOffset]; 2260 } 2261 2262 return s; 2263 } 2264 2265 2266 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { 2267 return (int32_t)start64(group, status); 2268 } 2269 2270 //-------------------------------------------------------------------------------- 2271 // 2272 // useAnchoringBounds 2273 // 2274 //-------------------------------------------------------------------------------- 2275 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { 2276 fAnchoringBounds = b; 2277 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); 2278 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); 2279 return *this; 2280 } 2281 2282 2283 //-------------------------------------------------------------------------------- 2284 // 2285 // useTransparentBounds 2286 // 2287 //-------------------------------------------------------------------------------- 2288 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { 2289 fTransparentBounds = b; 2290 fLookStart = (fTransparentBounds ? 0 : fRegionStart); 2291 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); 2292 return *this; 2293 } 2294 2295 //-------------------------------------------------------------------------------- 2296 // 2297 // setTimeLimit 2298 // 2299 //-------------------------------------------------------------------------------- 2300 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { 2301 if (U_FAILURE(status)) { 2302 return; 2303 } 2304 if (U_FAILURE(fDeferredStatus)) { 2305 status = fDeferredStatus; 2306 return; 2307 } 2308 if (limit < 0) { 2309 status = U_ILLEGAL_ARGUMENT_ERROR; 2310 return; 2311 } 2312 fTimeLimit = limit; 2313 } 2314 2315 2316 //-------------------------------------------------------------------------------- 2317 // 2318 // getTimeLimit 2319 // 2320 //-------------------------------------------------------------------------------- 2321 int32_t RegexMatcher::getTimeLimit() const { 2322 return fTimeLimit; 2323 } 2324 2325 2326 //-------------------------------------------------------------------------------- 2327 // 2328 // setStackLimit 2329 // 2330 //-------------------------------------------------------------------------------- 2331 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { 2332 if (U_FAILURE(status)) { 2333 return; 2334 } 2335 if (U_FAILURE(fDeferredStatus)) { 2336 status = fDeferredStatus; 2337 return; 2338 } 2339 if (limit < 0) { 2340 status = U_ILLEGAL_ARGUMENT_ERROR; 2341 return; 2342 } 2343 2344 // Reset the matcher. This is needed here in case there is a current match 2345 // whose final stack frame (containing the match results, pointed to by fFrame) 2346 // would be lost by resizing to a smaller stack size. 2347 reset(); 2348 2349 if (limit == 0) { 2350 // Unlimited stack expansion 2351 fStack->setMaxCapacity(0); 2352 } else { 2353 // Change the units of the limit from bytes to ints, and bump the size up 2354 // to be big enough to hold at least one stack frame for the pattern, 2355 // if it isn't there already. 2356 int32_t adjustedLimit = limit / sizeof(int32_t); 2357 if (adjustedLimit < fPattern->fFrameSize) { 2358 adjustedLimit = fPattern->fFrameSize; 2359 } 2360 fStack->setMaxCapacity(adjustedLimit); 2361 } 2362 fStackLimit = limit; 2363 } 2364 2365 2366 //-------------------------------------------------------------------------------- 2367 // 2368 // getStackLimit 2369 // 2370 //-------------------------------------------------------------------------------- 2371 int32_t RegexMatcher::getStackLimit() const { 2372 return fStackLimit; 2373 } 2374 2375 2376 //-------------------------------------------------------------------------------- 2377 // 2378 // setMatchCallback 2379 // 2380 //-------------------------------------------------------------------------------- 2381 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, 2382 const void *context, 2383 UErrorCode &status) { 2384 if (U_FAILURE(status)) { 2385 return; 2386 } 2387 fCallbackFn = callback; 2388 fCallbackContext = context; 2389 } 2390 2391 2392 //-------------------------------------------------------------------------------- 2393 // 2394 // getMatchCallback 2395 // 2396 //-------------------------------------------------------------------------------- 2397 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, 2398 const void *&context, 2399 UErrorCode &status) { 2400 if (U_FAILURE(status)) { 2401 return; 2402 } 2403 callback = fCallbackFn; 2404 context = fCallbackContext; 2405 } 2406 2407 2408 //-------------------------------------------------------------------------------- 2409 // 2410 // setMatchCallback 2411 // 2412 //-------------------------------------------------------------------------------- 2413 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, 2414 const void *context, 2415 UErrorCode &status) { 2416 if (U_FAILURE(status)) { 2417 return; 2418 } 2419 fFindProgressCallbackFn = callback; 2420 fFindProgressCallbackContext = context; 2421 } 2422 2423 2424 //-------------------------------------------------------------------------------- 2425 // 2426 // getMatchCallback 2427 // 2428 //-------------------------------------------------------------------------------- 2429 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, 2430 const void *&context, 2431 UErrorCode &status) { 2432 if (U_FAILURE(status)) { 2433 return; 2434 } 2435 callback = fFindProgressCallbackFn; 2436 context = fFindProgressCallbackContext; 2437 } 2438 2439 2440 //================================================================================ 2441 // 2442 // Code following this point in this file is the internal 2443 // Match Engine Implementation. 2444 // 2445 //================================================================================ 2446 2447 2448 //-------------------------------------------------------------------------------- 2449 // 2450 // resetStack 2451 // Discard any previous contents of the state save stack, and initialize a 2452 // new stack frame to all -1. The -1s are needed for capture group limits, 2453 // where they indicate that a group has not yet matched anything. 2454 //-------------------------------------------------------------------------------- 2455 REStackFrame *RegexMatcher::resetStack() { 2456 // Discard any previous contents of the state save stack, and initialize a 2457 // new stack frame with all -1 data. The -1s are needed for capture group limits, 2458 // where they indicate that a group has not yet matched anything. 2459 fStack->removeAllElements(); 2460 2461 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); 2462 int32_t i; 2463 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 2464 iFrame->fExtra[i] = -1; 2465 } 2466 return iFrame; 2467 } 2468 2469 2470 2471 //-------------------------------------------------------------------------------- 2472 // 2473 // isWordBoundary 2474 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 2475 // For us, 2476 // If the current char is a combining mark, 2477 // \b is FALSE. 2478 // Else Scan backwards to the first non-combining char. 2479 // We are at a boundary if the this char and the original chars are 2480 // opposite in membership in \w set 2481 // 2482 // parameters: pos - the current position in the input buffer 2483 // 2484 // TODO: double-check edge cases at region boundaries. 2485 // 2486 //-------------------------------------------------------------------------------- 2487 UBool RegexMatcher::isWordBoundary(int64_t pos) { 2488 UBool isBoundary = FALSE; 2489 UBool cIsWord = FALSE; 2490 2491 if (pos >= fLookLimit) { 2492 fHitEnd = TRUE; 2493 } else { 2494 // Determine whether char c at current position is a member of the word set of chars. 2495 // If we're off the end of the string, behave as though we're not at a word char. 2496 UTEXT_SETNATIVEINDEX(fInputText, pos); 2497 UChar32 c = UTEXT_CURRENT32(fInputText); 2498 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2499 // Current char is a combining one. Not a boundary. 2500 return FALSE; 2501 } 2502 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2503 } 2504 2505 // Back up until we come to a non-combining char, determine whether 2506 // that char is a word char. 2507 UBool prevCIsWord = FALSE; 2508 for (;;) { 2509 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { 2510 break; 2511 } 2512 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); 2513 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2514 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2515 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2516 break; 2517 } 2518 } 2519 isBoundary = cIsWord ^ prevCIsWord; 2520 return isBoundary; 2521 } 2522 2523 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { 2524 UBool isBoundary = FALSE; 2525 UBool cIsWord = FALSE; 2526 2527 const UChar *inputBuf = fInputText->chunkContents; 2528 2529 if (pos >= fLookLimit) { 2530 fHitEnd = TRUE; 2531 } else { 2532 // Determine whether char c at current position is a member of the word set of chars. 2533 // If we're off the end of the string, behave as though we're not at a word char. 2534 UChar32 c; 2535 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); 2536 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2537 // Current char is a combining one. Not a boundary. 2538 return FALSE; 2539 } 2540 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2541 } 2542 2543 // Back up until we come to a non-combining char, determine whether 2544 // that char is a word char. 2545 UBool prevCIsWord = FALSE; 2546 for (;;) { 2547 if (pos <= fLookStart) { 2548 break; 2549 } 2550 UChar32 prevChar; 2551 U16_PREV(inputBuf, fLookStart, pos, prevChar); 2552 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2553 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2554 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2555 break; 2556 } 2557 } 2558 isBoundary = cIsWord ^ prevCIsWord; 2559 return isBoundary; 2560 } 2561 2562 //-------------------------------------------------------------------------------- 2563 // 2564 // isUWordBoundary 2565 // 2566 // Test for a word boundary using RBBI word break. 2567 // 2568 // parameters: pos - the current position in the input buffer 2569 // 2570 //-------------------------------------------------------------------------------- 2571 UBool RegexMatcher::isUWordBoundary(int64_t pos) { 2572 UBool returnVal = FALSE; 2573 #if UCONFIG_NO_BREAK_ITERATION==0 2574 2575 // If we haven't yet created a break iterator for this matcher, do it now. 2576 if (fWordBreakItr == NULL) { 2577 fWordBreakItr = 2578 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); 2579 if (U_FAILURE(fDeferredStatus)) { 2580 return FALSE; 2581 } 2582 fWordBreakItr->setText(fInputText, fDeferredStatus); 2583 } 2584 2585 if (pos >= fLookLimit) { 2586 fHitEnd = TRUE; 2587 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" 2588 // words are not boundaries. All non-word chars stand by themselves, 2589 // with word boundaries on both sides. 2590 } else { 2591 if (!UTEXT_USES_U16(fInputText)) { 2592 // !!!: Would like a better way to do this! 2593 UErrorCode status = U_ZERO_ERROR; 2594 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); 2595 } 2596 returnVal = fWordBreakItr->isBoundary((int32_t)pos); 2597 } 2598 #endif 2599 return returnVal; 2600 } 2601 2602 //-------------------------------------------------------------------------------- 2603 // 2604 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state 2605 // saves. Increment the "time" counter, and call the 2606 // user callback function if there is one installed. 2607 // 2608 // If the match operation needs to be aborted, either for a time-out 2609 // or because the user callback asked for it, just set an error status. 2610 // The engine will pick that up and stop in its outer loop. 2611 // 2612 //-------------------------------------------------------------------------------- 2613 void RegexMatcher::IncrementTime(UErrorCode &status) { 2614 fTickCounter = TIMER_INITIAL_VALUE; 2615 fTime++; 2616 if (fCallbackFn != NULL) { 2617 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { 2618 status = U_REGEX_STOPPED_BY_CALLER; 2619 return; 2620 } 2621 } 2622 if (fTimeLimit > 0 && fTime >= fTimeLimit) { 2623 status = U_REGEX_TIME_OUT; 2624 } 2625 } 2626 2627 //-------------------------------------------------------------------------------- 2628 // 2629 // ReportFindProgress This function is called once for each advance in the target 2630 // string from the find() function, and calls the user progress callback 2631 // function if there is one installed. 2632 // 2633 // NOTE: 2634 // 2635 // If the match operation needs to be aborted because the user 2636 // callback asked for it, just set an error status. 2637 // The engine will pick that up and stop in its outer loop. 2638 // 2639 //-------------------------------------------------------------------------------- 2640 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) { 2641 if (fFindProgressCallbackFn != NULL) { 2642 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) { 2643 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/; 2644 return FALSE; 2645 } 2646 } 2647 return TRUE; 2648 } 2649 2650 //-------------------------------------------------------------------------------- 2651 // 2652 // StateSave 2653 // Make a new stack frame, initialized as a copy of the current stack frame. 2654 // Set the pattern index in the original stack frame from the operand value 2655 // in the opcode. Execution of the engine continues with the state in 2656 // the newly created stack frame 2657 // 2658 // Note that reserveBlock() may grow the stack, resulting in the 2659 // whole thing being relocated in memory. 2660 // 2661 // Parameters: 2662 // fp The top frame pointer when called. At return, a new 2663 // fame will be present 2664 // savePatIdx An index into the compiled pattern. Goes into the original 2665 // (not new) frame. If execution ever back-tracks out of the 2666 // new frame, this will be where we continue from in the pattern. 2667 // Return 2668 // The new frame pointer. 2669 // 2670 //-------------------------------------------------------------------------------- 2671 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { 2672 // push storage for a new frame. 2673 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); 2674 if (newFP == NULL) { 2675 // Failure on attempted stack expansion. 2676 // Stack function set some other error code, change it to a more 2677 // specific one for regular expressions. 2678 status = U_REGEX_STACK_OVERFLOW; 2679 // We need to return a writable stack frame, so just return the 2680 // previous frame. The match operation will stop quickly 2681 // because of the error status, after which the frame will never 2682 // be looked at again. 2683 return fp; 2684 } 2685 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. 2686 2687 // New stack frame = copy of old top frame. 2688 int64_t *source = (int64_t *)fp; 2689 int64_t *dest = newFP; 2690 for (;;) { 2691 *dest++ = *source++; 2692 if (source == newFP) { 2693 break; 2694 } 2695 } 2696 2697 fTickCounter--; 2698 if (fTickCounter <= 0) { 2699 IncrementTime(status); // Re-initializes fTickCounter 2700 } 2701 fp->fPatIdx = savePatIdx; 2702 return (REStackFrame *)newFP; 2703 } 2704 2705 2706 //-------------------------------------------------------------------------------- 2707 // 2708 // MatchAt This is the actual matching engine. 2709 // 2710 // startIdx: begin matching a this index. 2711 // toEnd: if true, match must extend to end of the input region 2712 // 2713 //-------------------------------------------------------------------------------- 2714 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { 2715 UBool isMatch = FALSE; // True if the we have a match. 2716 2717 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards 2718 2719 int32_t op; // Operation from the compiled pattern, split into 2720 int32_t opType; // the opcode 2721 int32_t opValue; // and the operand value. 2722 2723 #ifdef REGEX_RUN_DEBUG 2724 if (fTraceDebug) 2725 { 2726 printf("MatchAt(startIdx=%ld)\n", startIdx); 2727 printf("Original Pattern: "); 2728 UChar32 c = utext_next32From(fPattern->fPattern, 0); 2729 while (c != U_SENTINEL) { 2730 if (c<32 || c>256) { 2731 c = '.'; 2732 } 2733 printf("%c", c); 2734 2735 c = UTEXT_NEXT32(fPattern->fPattern); 2736 } 2737 printf("\n"); 2738 printf("Input String: "); 2739 c = utext_next32From(fInputText, 0); 2740 while (c != U_SENTINEL) { 2741 if (c<32 || c>256) { 2742 c = '.'; 2743 } 2744 printf("%c", c); 2745 2746 c = UTEXT_NEXT32(fInputText); 2747 } 2748 printf("\n"); 2749 printf("\n"); 2750 } 2751 #endif 2752 2753 if (U_FAILURE(status)) { 2754 return; 2755 } 2756 2757 // Cache frequently referenced items from the compiled pattern 2758 // 2759 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 2760 2761 const UChar *litText = fPattern->fLiteralText.getBuffer(); 2762 UVector *sets = fPattern->fSets; 2763 2764 fFrameSize = fPattern->fFrameSize; 2765 REStackFrame *fp = resetStack(); 2766 2767 fp->fPatIdx = 0; 2768 fp->fInputIdx = startIdx; 2769 2770 // Zero out the pattern's static data 2771 int32_t i; 2772 for (i = 0; i<fPattern->fDataSize; i++) { 2773 fData[i] = 0; 2774 } 2775 2776 // 2777 // Main loop for interpreting the compiled pattern. 2778 // One iteration of the loop per pattern operation performed. 2779 // 2780 for (;;) { 2781 op = (int32_t)pat[fp->fPatIdx]; 2782 opType = URX_TYPE(op); 2783 opValue = URX_VAL(op); 2784 #ifdef REGEX_RUN_DEBUG 2785 if (fTraceDebug) { 2786 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2787 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 2788 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 2789 fPattern->dumpOp(fp->fPatIdx); 2790 } 2791 #endif 2792 fp->fPatIdx++; 2793 2794 switch (opType) { 2795 2796 2797 case URX_NOP: 2798 break; 2799 2800 2801 case URX_BACKTRACK: 2802 // Force a backtrack. In some circumstances, the pattern compiler 2803 // will notice that the pattern can't possibly match anything, and will 2804 // emit one of these at that point. 2805 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2806 break; 2807 2808 2809 case URX_ONECHAR: 2810 if (fp->fInputIdx < fActiveLimit) { 2811 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2812 UChar32 c = UTEXT_NEXT32(fInputText); 2813 if (c == opValue) { 2814 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2815 break; 2816 } 2817 } else { 2818 fHitEnd = TRUE; 2819 } 2820 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2821 break; 2822 2823 2824 case URX_STRING: 2825 { 2826 // Test input against a literal string. 2827 // Strings require two slots in the compiled pattern, one for the 2828 // offset to the string text, and one for the length. 2829 2830 int32_t stringStartIdx = opValue; 2831 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 2832 fp->fPatIdx++; 2833 opType = URX_TYPE(op); 2834 int32_t stringLen = URX_VAL(op); 2835 U_ASSERT(opType == URX_STRING_LEN); 2836 U_ASSERT(stringLen >= 2); 2837 2838 const UChar *patternString = litText+stringStartIdx; 2839 int32_t patternStringIndex = 0; 2840 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2841 UChar32 inputChar; 2842 UChar32 patternChar; 2843 UBool success = TRUE; 2844 while (patternStringIndex < stringLen) { 2845 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 2846 success = FALSE; 2847 fHitEnd = TRUE; 2848 break; 2849 } 2850 inputChar = UTEXT_NEXT32(fInputText); 2851 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar); 2852 if (patternChar != inputChar) { 2853 success = FALSE; 2854 break; 2855 } 2856 } 2857 2858 if (success) { 2859 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2860 } else { 2861 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2862 } 2863 } 2864 break; 2865 2866 2867 case URX_STATE_SAVE: 2868 fp = StateSave(fp, opValue, status); 2869 break; 2870 2871 2872 case URX_END: 2873 // The match loop will exit via this path on a successful match, 2874 // when we reach the end of the pattern. 2875 if (toEnd && fp->fInputIdx != fActiveLimit) { 2876 // The pattern matched, but not to the end of input. Try some more. 2877 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2878 break; 2879 } 2880 isMatch = TRUE; 2881 goto breakFromLoop; 2882 2883 // Start and End Capture stack frame variables are laid out out like this: 2884 // fp->fExtra[opValue] - The start of a completed capture group 2885 // opValue+1 - The end of a completed capture group 2886 // opValue+2 - the start of a capture group whose end 2887 // has not yet been reached (and might not ever be). 2888 case URX_START_CAPTURE: 2889 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2890 fp->fExtra[opValue+2] = fp->fInputIdx; 2891 break; 2892 2893 2894 case URX_END_CAPTURE: 2895 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2896 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 2897 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 2898 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 2899 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 2900 break; 2901 2902 2903 case URX_DOLLAR: // $, test for End of line 2904 // or for position before new line at end of input 2905 { 2906 if (fp->fInputIdx >= fAnchorLimit) { 2907 // We really are at the end of input. Success. 2908 fHitEnd = TRUE; 2909 fRequireEnd = TRUE; 2910 break; 2911 } 2912 2913 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2914 2915 // If we are positioned just before a new-line that is located at the 2916 // end of input, succeed. 2917 UChar32 c = UTEXT_NEXT32(fInputText); 2918 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2919 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 2920 // If not in the middle of a CR/LF sequence 2921 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 2922 // At new-line at end of input. Success 2923 fHitEnd = TRUE; 2924 fRequireEnd = TRUE; 2925 2926 break; 2927 } 2928 } 2929 } else { 2930 UChar32 nextC = UTEXT_NEXT32(fInputText); 2931 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2932 fHitEnd = TRUE; 2933 fRequireEnd = TRUE; 2934 break; // At CR/LF at end of input. Success 2935 } 2936 } 2937 2938 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2939 } 2940 break; 2941 2942 2943 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 2944 if (fp->fInputIdx >= fAnchorLimit) { 2945 // Off the end of input. Success. 2946 fHitEnd = TRUE; 2947 fRequireEnd = TRUE; 2948 break; 2949 } else { 2950 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2951 UChar32 c = UTEXT_NEXT32(fInputText); 2952 // Either at the last character of input, or off the end. 2953 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { 2954 fHitEnd = TRUE; 2955 fRequireEnd = TRUE; 2956 break; 2957 } 2958 } 2959 2960 // Not at end of input. Back-track out. 2961 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2962 break; 2963 2964 2965 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 2966 { 2967 if (fp->fInputIdx >= fAnchorLimit) { 2968 // We really are at the end of input. Success. 2969 fHitEnd = TRUE; 2970 fRequireEnd = TRUE; 2971 break; 2972 } 2973 // If we are positioned just before a new-line, succeed. 2974 // It makes no difference where the new-line is within the input. 2975 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2976 UChar32 c = UTEXT_CURRENT32(fInputText); 2977 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 2978 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 2979 // In multi-line mode, hitting a new-line just before the end of input does not 2980 // set the hitEnd or requireEnd flags 2981 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { 2982 break; 2983 } 2984 } 2985 // not at a new line. Fail. 2986 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2987 } 2988 break; 2989 2990 2991 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 2992 { 2993 if (fp->fInputIdx >= fAnchorLimit) { 2994 // We really are at the end of input. Success. 2995 fHitEnd = TRUE; 2996 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 2997 break; // adding a new-line would not lose the match. 2998 } 2999 // If we are not positioned just before a new-line, the test fails; backtrack out. 3000 // It makes no difference where the new-line is within the input. 3001 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3002 if (UTEXT_CURRENT32(fInputText) != 0x0a) { 3003 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3004 } 3005 } 3006 break; 3007 3008 3009 case URX_CARET: // ^, test for start of line 3010 if (fp->fInputIdx != fAnchorStart) { 3011 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3012 } 3013 break; 3014 3015 3016 case URX_CARET_M: // ^, test for start of line in mulit-line mode 3017 { 3018 if (fp->fInputIdx == fAnchorStart) { 3019 // We are at the start input. Success. 3020 break; 3021 } 3022 // Check whether character just before the current pos is a new-line 3023 // unless we are at the end of input 3024 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3025 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3026 if ((fp->fInputIdx < fAnchorLimit) && 3027 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3028 // It's a new-line. ^ is true. Success. 3029 // TODO: what should be done with positions between a CR and LF? 3030 break; 3031 } 3032 // Not at the start of a line. Fail. 3033 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3034 } 3035 break; 3036 3037 3038 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 3039 { 3040 U_ASSERT(fp->fInputIdx >= fAnchorStart); 3041 if (fp->fInputIdx <= fAnchorStart) { 3042 // We are at the start input. Success. 3043 break; 3044 } 3045 // Check whether character just before the current pos is a new-line 3046 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 3047 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3048 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3049 if (c != 0x0a) { 3050 // Not at the start of a line. Back-track out. 3051 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3052 } 3053 } 3054 break; 3055 3056 case URX_BACKSLASH_B: // Test for word boundaries 3057 { 3058 UBool success = isWordBoundary(fp->fInputIdx); 3059 success ^= (UBool)(opValue != 0); // flip sense for \B 3060 if (!success) { 3061 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3062 } 3063 } 3064 break; 3065 3066 3067 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 3068 { 3069 UBool success = isUWordBoundary(fp->fInputIdx); 3070 success ^= (UBool)(opValue != 0); // flip sense for \B 3071 if (!success) { 3072 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3073 } 3074 } 3075 break; 3076 3077 3078 case URX_BACKSLASH_D: // Test for decimal digit 3079 { 3080 if (fp->fInputIdx >= fActiveLimit) { 3081 fHitEnd = TRUE; 3082 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3083 break; 3084 } 3085 3086 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3087 3088 UChar32 c = UTEXT_NEXT32(fInputText); 3089 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 3090 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 3091 success ^= (UBool)(opValue != 0); // flip sense for \D 3092 if (success) { 3093 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3094 } else { 3095 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3096 } 3097 } 3098 break; 3099 3100 3101 case URX_BACKSLASH_G: // Test for position at end of previous match 3102 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 3103 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3104 } 3105 break; 3106 3107 3108 case URX_BACKSLASH_X: 3109 // Match a Grapheme, as defined by Unicode TR 29. 3110 // Differs slightly from Perl, which consumes combining marks independently 3111 // of context. 3112 { 3113 3114 // Fail if at end of input 3115 if (fp->fInputIdx >= fActiveLimit) { 3116 fHitEnd = TRUE; 3117 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3118 break; 3119 } 3120 3121 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3122 3123 // Examine (and consume) the current char. 3124 // Dispatch into a little state machine, based on the char. 3125 UChar32 c; 3126 c = UTEXT_NEXT32(fInputText); 3127 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3128 UnicodeSet **sets = fPattern->fStaticSets; 3129 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 3130 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 3131 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3132 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3133 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3134 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3135 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3136 goto GC_Extend; 3137 3138 3139 3140 GC_L: 3141 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3142 c = UTEXT_NEXT32(fInputText); 3143 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3144 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3145 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3146 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3147 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3148 (void)UTEXT_PREVIOUS32(fInputText); 3149 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3150 goto GC_Extend; 3151 3152 GC_V: 3153 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3154 c = UTEXT_NEXT32(fInputText); 3155 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3156 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3157 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3158 (void)UTEXT_PREVIOUS32(fInputText); 3159 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3160 goto GC_Extend; 3161 3162 GC_T: 3163 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3164 c = UTEXT_NEXT32(fInputText); 3165 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3166 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3167 (void)UTEXT_PREVIOUS32(fInputText); 3168 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3169 goto GC_Extend; 3170 3171 GC_Extend: 3172 // Combining characters are consumed here 3173 for (;;) { 3174 if (fp->fInputIdx >= fActiveLimit) { 3175 break; 3176 } 3177 c = UTEXT_CURRENT32(fInputText); 3178 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 3179 break; 3180 } 3181 (void)UTEXT_NEXT32(fInputText); 3182 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3183 } 3184 goto GC_Done; 3185 3186 GC_Control: 3187 // Most control chars stand alone (don't combine with combining chars), 3188 // except for that CR/LF sequence is a single grapheme cluster. 3189 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 3190 c = UTEXT_NEXT32(fInputText); 3191 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3192 } 3193 3194 GC_Done: 3195 if (fp->fInputIdx >= fActiveLimit) { 3196 fHitEnd = TRUE; 3197 } 3198 break; 3199 } 3200 3201 3202 3203 3204 case URX_BACKSLASH_Z: // Test for end of Input 3205 if (fp->fInputIdx < fAnchorLimit) { 3206 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3207 } else { 3208 fHitEnd = TRUE; 3209 fRequireEnd = TRUE; 3210 } 3211 break; 3212 3213 3214 3215 case URX_STATIC_SETREF: 3216 { 3217 // Test input character against one of the predefined sets 3218 // (Word Characters, for example) 3219 // The high bit of the op value is a flag for the match polarity. 3220 // 0: success if input char is in set. 3221 // 1: success if input char is not in set. 3222 if (fp->fInputIdx >= fActiveLimit) { 3223 fHitEnd = TRUE; 3224 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3225 break; 3226 } 3227 3228 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 3229 opValue &= ~URX_NEG_SET; 3230 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3231 3232 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3233 UChar32 c = UTEXT_NEXT32(fInputText); 3234 if (c < 256) { 3235 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3236 if (s8->contains(c)) { 3237 success = !success; 3238 } 3239 } else { 3240 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3241 if (s->contains(c)) { 3242 success = !success; 3243 } 3244 } 3245 if (success) { 3246 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3247 } else { 3248 // the character wasn't in the set. 3249 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3250 } 3251 } 3252 break; 3253 3254 3255 case URX_STAT_SETREF_N: 3256 { 3257 // Test input character for NOT being a member of one of 3258 // the predefined sets (Word Characters, for example) 3259 if (fp->fInputIdx >= fActiveLimit) { 3260 fHitEnd = TRUE; 3261 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3262 break; 3263 } 3264 3265 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3266 3267 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3268 3269 UChar32 c = UTEXT_NEXT32(fInputText); 3270 if (c < 256) { 3271 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3272 if (s8->contains(c) == FALSE) { 3273 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3274 break; 3275 } 3276 } else { 3277 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3278 if (s->contains(c) == FALSE) { 3279 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3280 break; 3281 } 3282 } 3283 // the character wasn't in the set. 3284 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3285 } 3286 break; 3287 3288 3289 case URX_SETREF: 3290 if (fp->fInputIdx >= fActiveLimit) { 3291 fHitEnd = TRUE; 3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3293 break; 3294 } else { 3295 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3296 3297 // There is input left. Pick up one char and test it for set membership. 3298 UChar32 c = UTEXT_NEXT32(fInputText); 3299 U_ASSERT(opValue > 0 && opValue < sets->size()); 3300 if (c<256) { 3301 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3302 if (s8->contains(c)) { 3303 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3304 break; 3305 } 3306 } else { 3307 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3308 if (s->contains(c)) { 3309 // The character is in the set. A Match. 3310 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3311 break; 3312 } 3313 } 3314 3315 // the character wasn't in the set. 3316 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3317 } 3318 break; 3319 3320 3321 case URX_DOTANY: 3322 { 3323 // . matches anything, but stops at end-of-line. 3324 if (fp->fInputIdx >= fActiveLimit) { 3325 // At end of input. Match failed. Backtrack out. 3326 fHitEnd = TRUE; 3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3328 break; 3329 } 3330 3331 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3332 3333 // There is input left. Advance over one char, unless we've hit end-of-line 3334 UChar32 c = UTEXT_NEXT32(fInputText); 3335 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 3336 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3337 // End of line in normal mode. . does not match. 3338 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3339 break; 3340 } 3341 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3342 } 3343 break; 3344 3345 3346 case URX_DOTANY_ALL: 3347 { 3348 // ., in dot-matches-all (including new lines) mode 3349 if (fp->fInputIdx >= fActiveLimit) { 3350 // At end of input. Match failed. Backtrack out. 3351 fHitEnd = TRUE; 3352 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3353 break; 3354 } 3355 3356 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3357 3358 // There is input left. Advance over one char, except if we are 3359 // at a cr/lf, advance over both of them. 3360 UChar32 c; 3361 c = UTEXT_NEXT32(fInputText); 3362 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3363 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 3364 // In the case of a CR/LF, we need to advance over both. 3365 UChar32 nextc = UTEXT_CURRENT32(fInputText); 3366 if (nextc == 0x0a) { 3367 (void)UTEXT_NEXT32(fInputText); 3368 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3369 } 3370 } 3371 } 3372 break; 3373 3374 3375 case URX_DOTANY_UNIX: 3376 { 3377 // '.' operator, matches all, but stops at end-of-line. 3378 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 3379 if (fp->fInputIdx >= fActiveLimit) { 3380 // At end of input. Match failed. Backtrack out. 3381 fHitEnd = TRUE; 3382 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3383 break; 3384 } 3385 3386 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3387 3388 // There is input left. Advance over one char, unless we've hit end-of-line 3389 UChar32 c = UTEXT_NEXT32(fInputText); 3390 if (c == 0x0a) { 3391 // End of line in normal mode. '.' does not match the \n 3392 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3393 } else { 3394 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3395 } 3396 } 3397 break; 3398 3399 3400 case URX_JMP: 3401 fp->fPatIdx = opValue; 3402 break; 3403 3404 case URX_FAIL: 3405 isMatch = FALSE; 3406 goto breakFromLoop; 3407 3408 case URX_JMP_SAV: 3409 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 3410 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3411 fp->fPatIdx = opValue; // Then JMP. 3412 break; 3413 3414 case URX_JMP_SAV_X: 3415 // This opcode is used with (x)+, when x can match a zero length string. 3416 // Same as JMP_SAV, except conditional on the match having made forward progress. 3417 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 3418 // data address of the input position at the start of the loop. 3419 { 3420 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 3421 int32_t stoOp = (int32_t)pat[opValue-1]; 3422 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 3423 int32_t frameLoc = URX_VAL(stoOp); 3424 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 3425 int64_t prevInputIdx = fp->fExtra[frameLoc]; 3426 U_ASSERT(prevInputIdx <= fp->fInputIdx); 3427 if (prevInputIdx < fp->fInputIdx) { 3428 // The match did make progress. Repeat the loop. 3429 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3430 fp->fPatIdx = opValue; 3431 fp->fExtra[frameLoc] = fp->fInputIdx; 3432 } 3433 // If the input position did not advance, we do nothing here, 3434 // execution will fall out of the loop. 3435 } 3436 break; 3437 3438 case URX_CTR_INIT: 3439 { 3440 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3441 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3442 3443 // Pick up the three extra operands that CTR_INIT has, and 3444 // skip the pattern location counter past 3445 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3446 fp->fPatIdx += 3; 3447 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3448 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3449 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3450 U_ASSERT(minCount>=0); 3451 U_ASSERT(maxCount>=minCount || maxCount==-1); 3452 U_ASSERT(loopLoc>=fp->fPatIdx); 3453 3454 if (minCount == 0) { 3455 fp = StateSave(fp, loopLoc+1, status); 3456 } 3457 if (maxCount == -1) { 3458 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 3459 } else if (maxCount == 0) { 3460 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3461 } 3462 } 3463 break; 3464 3465 case URX_CTR_LOOP: 3466 { 3467 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3468 int32_t initOp = (int32_t)pat[opValue]; 3469 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 3470 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3471 int32_t minCount = (int32_t)pat[opValue+2]; 3472 int32_t maxCount = (int32_t)pat[opValue+3]; 3473 (*pCounter)++; 3474 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 3475 U_ASSERT(*pCounter == maxCount); 3476 break; 3477 } 3478 if (*pCounter >= minCount) { 3479 if (maxCount == -1) { 3480 // Loop has no hard upper bound. 3481 // Check that it is progressing through the input, break if it is not. 3482 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 3483 if (fp->fInputIdx == *pLastInputIdx) { 3484 break; 3485 } else { 3486 *pLastInputIdx = fp->fInputIdx; 3487 } 3488 } 3489 fp = StateSave(fp, fp->fPatIdx, status); 3490 } 3491 fp->fPatIdx = opValue + 4; // Loop back. 3492 } 3493 break; 3494 3495 case URX_CTR_INIT_NG: 3496 { 3497 // Initialize a non-greedy loop 3498 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3499 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3500 3501 // Pick up the three extra operands that CTR_INIT_NG has, and 3502 // skip the pattern location counter past 3503 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3504 fp->fPatIdx += 3; 3505 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3506 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3507 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3508 U_ASSERT(minCount>=0); 3509 U_ASSERT(maxCount>=minCount || maxCount==-1); 3510 U_ASSERT(loopLoc>fp->fPatIdx); 3511 if (maxCount == -1) { 3512 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 3513 } 3514 3515 if (minCount == 0) { 3516 if (maxCount != 0) { 3517 fp = StateSave(fp, fp->fPatIdx, status); 3518 } 3519 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 3520 } 3521 } 3522 break; 3523 3524 case URX_CTR_LOOP_NG: 3525 { 3526 // Non-greedy {min, max} loops 3527 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3528 int32_t initOp = (int32_t)pat[opValue]; 3529 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 3530 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3531 int32_t minCount = (int32_t)pat[opValue+2]; 3532 int32_t maxCount = (int32_t)pat[opValue+3]; 3533 3534 (*pCounter)++; 3535 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 3536 // The loop has matched the maximum permitted number of times. 3537 // Break out of here with no action. Matching will 3538 // continue with the following pattern. 3539 U_ASSERT(*pCounter == maxCount); 3540 break; 3541 } 3542 3543 if (*pCounter < minCount) { 3544 // We haven't met the minimum number of matches yet. 3545 // Loop back for another one. 3546 fp->fPatIdx = opValue + 4; // Loop back. 3547 } else { 3548 // We do have the minimum number of matches. 3549 3550 // If there is no upper bound on the loop iterations, check that the input index 3551 // is progressing, and stop the loop if it is not. 3552 if (maxCount == -1) { 3553 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 3554 if (fp->fInputIdx == *pLastInputIdx) { 3555 break; 3556 } 3557 *pLastInputIdx = fp->fInputIdx; 3558 } 3559 3560 // Loop Continuation: we will fall into the pattern following the loop 3561 // (non-greedy, don't execute loop body first), but first do 3562 // a state save to the top of the loop, so that a match failure 3563 // in the following pattern will try another iteration of the loop. 3564 fp = StateSave(fp, opValue + 4, status); 3565 } 3566 } 3567 break; 3568 3569 case URX_STO_SP: 3570 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3571 fData[opValue] = fStack->size(); 3572 break; 3573 3574 case URX_LD_SP: 3575 { 3576 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3577 int32_t newStackSize = (int32_t)fData[opValue]; 3578 U_ASSERT(newStackSize <= fStack->size()); 3579 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3580 if (newFP == (int64_t *)fp) { 3581 break; 3582 } 3583 int32_t i; 3584 for (i=0; i<fFrameSize; i++) { 3585 newFP[i] = ((int64_t *)fp)[i]; 3586 } 3587 fp = (REStackFrame *)newFP; 3588 fStack->setSize(newStackSize); 3589 } 3590 break; 3591 3592 case URX_BACKREF: 3593 { 3594 U_ASSERT(opValue < fFrameSize); 3595 int64_t groupStartIdx = fp->fExtra[opValue]; 3596 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3597 U_ASSERT(groupStartIdx <= groupEndIdx); 3598 if (groupStartIdx < 0) { 3599 // This capture group has not participated in the match thus far, 3600 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3601 break; 3602 } 3603 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); 3604 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3605 3606 // Note: if the capture group match was of an empty string the backref 3607 // match succeeds. Verified by testing: Perl matches succeed 3608 // in this case, so we do too. 3609 3610 UBool success = TRUE; 3611 for (;;) { 3612 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 3613 success = TRUE; 3614 break; 3615 } 3616 if (utext_getNativeIndex(fInputText) >= fActiveLimit) { 3617 success = FALSE; 3618 fHitEnd = TRUE; 3619 break; 3620 } 3621 UChar32 captureGroupChar = utext_next32(fAltInputText); 3622 UChar32 inputChar = utext_next32(fInputText); 3623 if (inputChar != captureGroupChar) { 3624 success = FALSE; 3625 break; 3626 } 3627 } 3628 3629 if (success) { 3630 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3631 } else { 3632 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3633 } 3634 } 3635 break; 3636 3637 3638 3639 case URX_BACKREF_I: 3640 { 3641 U_ASSERT(opValue < fFrameSize); 3642 int64_t groupStartIdx = fp->fExtra[opValue]; 3643 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3644 U_ASSERT(groupStartIdx <= groupEndIdx); 3645 if (groupStartIdx < 0) { 3646 // This capture group has not participated in the match thus far, 3647 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3648 break; 3649 } 3650 utext_setNativeIndex(fAltInputText, groupStartIdx); 3651 utext_setNativeIndex(fInputText, fp->fInputIdx); 3652 CaseFoldingUTextIterator captureGroupItr(*fAltInputText); 3653 CaseFoldingUTextIterator inputItr(*fInputText); 3654 3655 // Note: if the capture group match was of an empty string the backref 3656 // match succeeds. Verified by testing: Perl matches succeed 3657 // in this case, so we do too. 3658 3659 UBool success = TRUE; 3660 for (;;) { 3661 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 3662 success = TRUE; 3663 break; 3664 } 3665 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) { 3666 success = FALSE; 3667 fHitEnd = TRUE; 3668 break; 3669 } 3670 UChar32 captureGroupChar = captureGroupItr.next(); 3671 UChar32 inputChar = inputItr.next(); 3672 if (inputChar != captureGroupChar) { 3673 success = FALSE; 3674 break; 3675 } 3676 } 3677 3678 if (success && inputItr.inExpansion()) { 3679 // We otained a match by consuming part of a string obtained from 3680 // case-folding a single code point of the input text. 3681 // This does not count as an overall match. 3682 success = FALSE; 3683 } 3684 3685 if (success) { 3686 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3687 } else { 3688 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3689 } 3690 3691 } 3692 break; 3693 3694 case URX_STO_INP_LOC: 3695 { 3696 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 3697 fp->fExtra[opValue] = fp->fInputIdx; 3698 } 3699 break; 3700 3701 case URX_JMPX: 3702 { 3703 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3704 fp->fPatIdx += 1; 3705 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 3706 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 3707 int64_t savedInputIdx = fp->fExtra[dataLoc]; 3708 U_ASSERT(savedInputIdx <= fp->fInputIdx); 3709 if (savedInputIdx < fp->fInputIdx) { 3710 fp->fPatIdx = opValue; // JMP 3711 } else { 3712 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 3713 } 3714 } 3715 break; 3716 3717 case URX_LA_START: 3718 { 3719 // Entering a lookahead block. 3720 // Save Stack Ptr, Input Pos. 3721 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3722 fData[opValue] = fStack->size(); 3723 fData[opValue+1] = fp->fInputIdx; 3724 fActiveStart = fLookStart; // Set the match region change for 3725 fActiveLimit = fLookLimit; // transparent bounds. 3726 } 3727 break; 3728 3729 case URX_LA_END: 3730 { 3731 // Leaving a look-ahead block. 3732 // restore Stack Ptr, Input Pos to positions they had on entry to block. 3733 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3734 int32_t stackSize = fStack->size(); 3735 int32_t newStackSize =(int32_t)fData[opValue]; 3736 U_ASSERT(stackSize >= newStackSize); 3737 if (stackSize > newStackSize) { 3738 // Copy the current top frame back to the new (cut back) top frame. 3739 // This makes the capture groups from within the look-ahead 3740 // expression available. 3741 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3742 int32_t i; 3743 for (i=0; i<fFrameSize; i++) { 3744 newFP[i] = ((int64_t *)fp)[i]; 3745 } 3746 fp = (REStackFrame *)newFP; 3747 fStack->setSize(newStackSize); 3748 } 3749 fp->fInputIdx = fData[opValue+1]; 3750 3751 // Restore the active region bounds in the input string; they may have 3752 // been changed because of transparent bounds on a Region. 3753 fActiveStart = fRegionStart; 3754 fActiveLimit = fRegionLimit; 3755 } 3756 break; 3757 3758 case URX_ONECHAR_I: 3759 // Case insensitive one char. The char from the pattern is already case folded. 3760 // Input text is not, but case folding the input can not reduce two or more code 3761 // points to one. 3762 if (fp->fInputIdx < fActiveLimit) { 3763 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3764 3765 UChar32 c = UTEXT_NEXT32(fInputText); 3766 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3767 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3768 break; 3769 } 3770 } else { 3771 fHitEnd = TRUE; 3772 } 3773 3774 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3775 break; 3776 3777 case URX_STRING_I: 3778 { 3779 // Case-insensitive test input against a literal string. 3780 // Strings require two slots in the compiled pattern, one for the 3781 // offset to the string text, and one for the length. 3782 // The compiled string has already been case folded. 3783 { 3784 const UChar *patternString = litText + opValue; 3785 int32_t patternStringIdx = 0; 3786 3787 op = (int32_t)pat[fp->fPatIdx]; 3788 fp->fPatIdx++; 3789 opType = URX_TYPE(op); 3790 opValue = URX_VAL(op); 3791 U_ASSERT(opType == URX_STRING_LEN); 3792 int32_t patternStringLen = opValue; // Length of the string from the pattern. 3793 3794 3795 UChar32 cPattern; 3796 UChar32 cText; 3797 UBool success = TRUE; 3798 3799 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3800 CaseFoldingUTextIterator inputIterator(*fInputText); 3801 while (patternStringIdx < patternStringLen) { 3802 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 3803 success = FALSE; 3804 fHitEnd = TRUE; 3805 break; 3806 } 3807 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 3808 cText = inputIterator.next(); 3809 if (cText != cPattern) { 3810 success = FALSE; 3811 break; 3812 } 3813 } 3814 if (inputIterator.inExpansion()) { 3815 success = FALSE; 3816 } 3817 3818 if (success) { 3819 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3820 } else { 3821 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3822 } 3823 } 3824 } 3825 break; 3826 3827 case URX_LB_START: 3828 { 3829 // Entering a look-behind block. 3830 // Save Stack Ptr, Input Pos. 3831 // TODO: implement transparent bounds. Ticket #6067 3832 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3833 fData[opValue] = fStack->size(); 3834 fData[opValue+1] = fp->fInputIdx; 3835 // Init the variable containing the start index for attempted matches. 3836 fData[opValue+2] = -1; 3837 // Save input string length, then reset to pin any matches to end at 3838 // the current position. 3839 fData[opValue+3] = fActiveLimit; 3840 fActiveLimit = fp->fInputIdx; 3841 } 3842 break; 3843 3844 3845 case URX_LB_CONT: 3846 { 3847 // Positive Look-Behind, at top of loop checking for matches of LB expression 3848 // at all possible input starting positions. 3849 3850 // Fetch the min and max possible match lengths. They are the operands 3851 // of this op in the pattern. 3852 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 3853 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 3854 U_ASSERT(minML <= maxML); 3855 U_ASSERT(minML >= 0); 3856 3857 // Fetch (from data) the last input index where a match was attempted. 3858 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3859 int64_t *lbStartIdx = &fData[opValue+2]; 3860 if (*lbStartIdx < 0) { 3861 // First time through loop. 3862 *lbStartIdx = fp->fInputIdx - minML; 3863 } else { 3864 // 2nd through nth time through the loop. 3865 // Back up start position for match by one. 3866 if (*lbStartIdx == 0) { 3867 (*lbStartIdx)--; 3868 } else { 3869 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 3870 (void)UTEXT_PREVIOUS32(fInputText); 3871 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 3872 } 3873 } 3874 3875 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 3876 // We have tried all potential match starting points without 3877 // getting a match. Backtrack out, and out of the 3878 // Look Behind altogether. 3879 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3880 int64_t restoreInputLen = fData[opValue+3]; 3881 U_ASSERT(restoreInputLen >= fActiveLimit); 3882 U_ASSERT(restoreInputLen <= fInputLength); 3883 fActiveLimit = restoreInputLen; 3884 break; 3885 } 3886 3887 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 3888 // (successful match will fall off the end of the loop.) 3889 fp = StateSave(fp, fp->fPatIdx-3, status); 3890 fp->fInputIdx = *lbStartIdx; 3891 } 3892 break; 3893 3894 case URX_LB_END: 3895 // End of a look-behind block, after a successful match. 3896 { 3897 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3898 if (fp->fInputIdx != fActiveLimit) { 3899 // The look-behind expression matched, but the match did not 3900 // extend all the way to the point that we are looking behind from. 3901 // FAIL out of here, which will take us back to the LB_CONT, which 3902 // will retry the match starting at another position or fail 3903 // the look-behind altogether, whichever is appropriate. 3904 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3905 break; 3906 } 3907 3908 // Look-behind match is good. Restore the orignal input string length, 3909 // which had been truncated to pin the end of the lookbehind match to the 3910 // position being looked-behind. 3911 int64_t originalInputLen = fData[opValue+3]; 3912 U_ASSERT(originalInputLen >= fActiveLimit); 3913 U_ASSERT(originalInputLen <= fInputLength); 3914 fActiveLimit = originalInputLen; 3915 } 3916 break; 3917 3918 3919 case URX_LBN_CONT: 3920 { 3921 // Negative Look-Behind, at top of loop checking for matches of LB expression 3922 // at all possible input starting positions. 3923 3924 // Fetch the extra parameters of this op. 3925 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 3926 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 3927 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 3928 continueLoc = URX_VAL(continueLoc); 3929 U_ASSERT(minML <= maxML); 3930 U_ASSERT(minML >= 0); 3931 U_ASSERT(continueLoc > fp->fPatIdx); 3932 3933 // Fetch (from data) the last input index where a match was attempted. 3934 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3935 int64_t *lbStartIdx = &fData[opValue+2]; 3936 if (*lbStartIdx < 0) { 3937 // First time through loop. 3938 *lbStartIdx = fp->fInputIdx - minML; 3939 } else { 3940 // 2nd through nth time through the loop. 3941 // Back up start position for match by one. 3942 if (*lbStartIdx == 0) { 3943 (*lbStartIdx)--; 3944 } else { 3945 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 3946 (void)UTEXT_PREVIOUS32(fInputText); 3947 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 3948 } 3949 } 3950 3951 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 3952 // We have tried all potential match starting points without 3953 // getting a match, which means that the negative lookbehind as 3954 // a whole has succeeded. Jump forward to the continue location 3955 int64_t restoreInputLen = fData[opValue+3]; 3956 U_ASSERT(restoreInputLen >= fActiveLimit); 3957 U_ASSERT(restoreInputLen <= fInputLength); 3958 fActiveLimit = restoreInputLen; 3959 fp->fPatIdx = continueLoc; 3960 break; 3961 } 3962 3963 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 3964 // (successful match will cause a FAIL out of the loop altogether.) 3965 fp = StateSave(fp, fp->fPatIdx-4, status); 3966 fp->fInputIdx = *lbStartIdx; 3967 } 3968 break; 3969 3970 case URX_LBN_END: 3971 // End of a negative look-behind block, after a successful match. 3972 { 3973 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3974 if (fp->fInputIdx != fActiveLimit) { 3975 // The look-behind expression matched, but the match did not 3976 // extend all the way to the point that we are looking behind from. 3977 // FAIL out of here, which will take us back to the LB_CONT, which 3978 // will retry the match starting at another position or succeed 3979 // the look-behind altogether, whichever is appropriate. 3980 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3981 break; 3982 } 3983 3984 // Look-behind expression matched, which means look-behind test as 3985 // a whole Fails 3986 3987 // Restore the orignal input string length, which had been truncated 3988 // inorder to pin the end of the lookbehind match 3989 // to the position being looked-behind. 3990 int64_t originalInputLen = fData[opValue+3]; 3991 U_ASSERT(originalInputLen >= fActiveLimit); 3992 U_ASSERT(originalInputLen <= fInputLength); 3993 fActiveLimit = originalInputLen; 3994 3995 // Restore original stack position, discarding any state saved 3996 // by the successful pattern match. 3997 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3998 int32_t newStackSize = (int32_t)fData[opValue]; 3999 U_ASSERT(fStack->size() > newStackSize); 4000 fStack->setSize(newStackSize); 4001 4002 // FAIL, which will take control back to someplace 4003 // prior to entering the look-behind test. 4004 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4005 } 4006 break; 4007 4008 4009 case URX_LOOP_SR_I: 4010 // Loop Initialization for the optimized implementation of 4011 // [some character set]* 4012 // This op scans through all matching input. 4013 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4014 { 4015 U_ASSERT(opValue > 0 && opValue < sets->size()); 4016 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4017 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 4018 4019 // Loop through input, until either the input is exhausted or 4020 // we reach a character that is not a member of the set. 4021 int64_t ix = fp->fInputIdx; 4022 UTEXT_SETNATIVEINDEX(fInputText, ix); 4023 for (;;) { 4024 if (ix >= fActiveLimit) { 4025 fHitEnd = TRUE; 4026 break; 4027 } 4028 UChar32 c = UTEXT_NEXT32(fInputText); 4029 if (c<256) { 4030 if (s8->contains(c) == FALSE) { 4031 break; 4032 } 4033 } else { 4034 if (s->contains(c) == FALSE) { 4035 break; 4036 } 4037 } 4038 ix = UTEXT_GETNATIVEINDEX(fInputText); 4039 } 4040 4041 // If there were no matching characters, skip over the loop altogether. 4042 // The loop doesn't run at all, a * op always succeeds. 4043 if (ix == fp->fInputIdx) { 4044 fp->fPatIdx++; // skip the URX_LOOP_C op. 4045 break; 4046 } 4047 4048 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4049 // must follow. It's operand is the stack location 4050 // that holds the starting input index for the match of this [set]* 4051 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4052 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4053 int32_t stackLoc = URX_VAL(loopcOp); 4054 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4055 fp->fExtra[stackLoc] = fp->fInputIdx; 4056 fp->fInputIdx = ix; 4057 4058 // Save State to the URX_LOOP_C op that follows this one, 4059 // so that match failures in the following code will return to there. 4060 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4061 fp = StateSave(fp, fp->fPatIdx, status); 4062 fp->fPatIdx++; 4063 } 4064 break; 4065 4066 4067 case URX_LOOP_DOT_I: 4068 // Loop Initialization for the optimized implementation of .* 4069 // This op scans through all remaining input. 4070 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4071 { 4072 // Loop through input until the input is exhausted (we reach an end-of-line) 4073 // In DOTALL mode, we can just go straight to the end of the input. 4074 int64_t ix; 4075 if ((opValue & 1) == 1) { 4076 // Dot-matches-All mode. Jump straight to the end of the string. 4077 ix = fActiveLimit; 4078 fHitEnd = TRUE; 4079 } else { 4080 // NOT DOT ALL mode. Line endings do not match '.' 4081 // Scan forward until a line ending or end of input. 4082 ix = fp->fInputIdx; 4083 UTEXT_SETNATIVEINDEX(fInputText, ix); 4084 for (;;) { 4085 if (ix >= fActiveLimit) { 4086 fHitEnd = TRUE; 4087 break; 4088 } 4089 UChar32 c = UTEXT_NEXT32(fInputText); 4090 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 4091 if ((c == 0x0a) || // 0x0a is newline in both modes. 4092 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 4093 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) { 4094 // char is a line ending. Exit the scanning loop. 4095 break; 4096 } 4097 } 4098 ix = UTEXT_GETNATIVEINDEX(fInputText); 4099 } 4100 } 4101 4102 // If there were no matching characters, skip over the loop altogether. 4103 // The loop doesn't run at all, a * op always succeeds. 4104 if (ix == fp->fInputIdx) { 4105 fp->fPatIdx++; // skip the URX_LOOP_C op. 4106 break; 4107 } 4108 4109 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4110 // must follow. It's operand is the stack location 4111 // that holds the starting input index for the match of this .* 4112 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4113 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4114 int32_t stackLoc = URX_VAL(loopcOp); 4115 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4116 fp->fExtra[stackLoc] = fp->fInputIdx; 4117 fp->fInputIdx = ix; 4118 4119 // Save State to the URX_LOOP_C op that follows this one, 4120 // so that match failures in the following code will return to there. 4121 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4122 fp = StateSave(fp, fp->fPatIdx, status); 4123 fp->fPatIdx++; 4124 } 4125 break; 4126 4127 4128 case URX_LOOP_C: 4129 { 4130 U_ASSERT(opValue>=0 && opValue<fFrameSize); 4131 backSearchIndex = fp->fExtra[opValue]; 4132 U_ASSERT(backSearchIndex <= fp->fInputIdx); 4133 if (backSearchIndex == fp->fInputIdx) { 4134 // We've backed up the input idx to the point that the loop started. 4135 // The loop is done. Leave here without saving state. 4136 // Subsequent failures won't come back here. 4137 break; 4138 } 4139 // Set up for the next iteration of the loop, with input index 4140 // backed up by one from the last time through, 4141 // and a state save to this instruction in case the following code fails again. 4142 // (We're going backwards because this loop emulates stack unwinding, not 4143 // the initial scan forward.) 4144 U_ASSERT(fp->fInputIdx > 0); 4145 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4146 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); 4147 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4148 4149 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); 4150 if (prevC == 0x0a && 4151 fp->fInputIdx > backSearchIndex && 4152 twoPrevC == 0x0d) { 4153 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 4154 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 4155 // .*, stepping back over CRLF pair. 4156 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4157 } 4158 } 4159 4160 4161 fp = StateSave(fp, fp->fPatIdx-1, status); 4162 } 4163 break; 4164 4165 4166 4167 default: 4168 // Trouble. The compiled pattern contains an entry with an 4169 // unrecognized type tag. 4170 U_ASSERT(FALSE); 4171 } 4172 4173 if (U_FAILURE(status)) { 4174 isMatch = FALSE; 4175 break; 4176 } 4177 } 4178 4179 breakFromLoop: 4180 fMatch = isMatch; 4181 if (isMatch) { 4182 fLastMatchEnd = fMatchEnd; 4183 fMatchStart = startIdx; 4184 fMatchEnd = fp->fInputIdx; 4185 } 4186 4187 #ifdef REGEX_RUN_DEBUG 4188 if (fTraceDebug) { 4189 if (isMatch) { 4190 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); 4191 } else { 4192 printf("No match\n\n"); 4193 } 4194 } 4195 #endif 4196 4197 fFrame = fp; // The active stack frame when the engine stopped. 4198 // Contains the capture group results that we need to 4199 // access later. 4200 return; 4201 } 4202 4203 4204 //-------------------------------------------------------------------------------- 4205 // 4206 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the 4207 // assumption that the entire string is available in the UText's 4208 // chunk buffer. For now, that means we can use int32_t indexes, 4209 // except for anything that needs to be saved (like group starts 4210 // and ends). 4211 // 4212 // startIdx: begin matching a this index. 4213 // toEnd: if true, match must extend to end of the input region 4214 // 4215 //-------------------------------------------------------------------------------- 4216 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { 4217 UBool isMatch = FALSE; // True if the we have a match. 4218 4219 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards 4220 4221 int32_t op; // Operation from the compiled pattern, split into 4222 int32_t opType; // the opcode 4223 int32_t opValue; // and the operand value. 4224 4225 #ifdef REGEX_RUN_DEBUG 4226 if (fTraceDebug) { 4227 printf("MatchAt(startIdx=%d)\n", startIdx); 4228 printf("Original Pattern: "); 4229 UChar32 c = utext_next32From(fPattern->fPattern, 0); 4230 while (c != U_SENTINEL) { 4231 if (c<32 || c>256) { 4232 c = '.'; 4233 } 4234 printf("%c", c); 4235 4236 c = UTEXT_NEXT32(fPattern->fPattern); 4237 } 4238 printf("\n"); 4239 printf("Input String: "); 4240 c = utext_next32From(fInputText, 0); 4241 while (c != U_SENTINEL) { 4242 if (c<32 || c>256) { 4243 c = '.'; 4244 } 4245 printf("%c", c); 4246 4247 c = UTEXT_NEXT32(fInputText); 4248 } 4249 printf("\n"); 4250 printf("\n"); 4251 } 4252 #endif 4253 4254 if (U_FAILURE(status)) { 4255 return; 4256 } 4257 4258 // Cache frequently referenced items from the compiled pattern 4259 // 4260 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 4261 4262 const UChar *litText = fPattern->fLiteralText.getBuffer(); 4263 UVector *sets = fPattern->fSets; 4264 4265 const UChar *inputBuf = fInputText->chunkContents; 4266 4267 fFrameSize = fPattern->fFrameSize; 4268 REStackFrame *fp = resetStack(); 4269 4270 fp->fPatIdx = 0; 4271 fp->fInputIdx = startIdx; 4272 4273 // Zero out the pattern's static data 4274 int32_t i; 4275 for (i = 0; i<fPattern->fDataSize; i++) { 4276 fData[i] = 0; 4277 } 4278 4279 // 4280 // Main loop for interpreting the compiled pattern. 4281 // One iteration of the loop per pattern operation performed. 4282 // 4283 for (;;) { 4284 op = (int32_t)pat[fp->fPatIdx]; 4285 opType = URX_TYPE(op); 4286 opValue = URX_VAL(op); 4287 #ifdef REGEX_RUN_DEBUG 4288 if (fTraceDebug) { 4289 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4290 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 4291 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 4292 fPattern->dumpOp(fp->fPatIdx); 4293 } 4294 #endif 4295 fp->fPatIdx++; 4296 4297 switch (opType) { 4298 4299 4300 case URX_NOP: 4301 break; 4302 4303 4304 case URX_BACKTRACK: 4305 // Force a backtrack. In some circumstances, the pattern compiler 4306 // will notice that the pattern can't possibly match anything, and will 4307 // emit one of these at that point. 4308 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4309 break; 4310 4311 4312 case URX_ONECHAR: 4313 if (fp->fInputIdx < fActiveLimit) { 4314 UChar32 c; 4315 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4316 if (c == opValue) { 4317 break; 4318 } 4319 } else { 4320 fHitEnd = TRUE; 4321 } 4322 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4323 break; 4324 4325 4326 case URX_STRING: 4327 { 4328 // Test input against a literal string. 4329 // Strings require two slots in the compiled pattern, one for the 4330 // offset to the string text, and one for the length. 4331 int32_t stringStartIdx = opValue; 4332 int32_t stringLen; 4333 4334 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 4335 fp->fPatIdx++; 4336 opType = URX_TYPE(op); 4337 stringLen = URX_VAL(op); 4338 U_ASSERT(opType == URX_STRING_LEN); 4339 U_ASSERT(stringLen >= 2); 4340 4341 const UChar * pInp = inputBuf + fp->fInputIdx; 4342 const UChar * pInpLimit = inputBuf + fActiveLimit; 4343 const UChar * pPat = litText+stringStartIdx; 4344 const UChar * pEnd = pInp + stringLen; 4345 UBool success = TRUE; 4346 while (pInp < pEnd) { 4347 if (pInp >= pInpLimit) { 4348 fHitEnd = TRUE; 4349 success = FALSE; 4350 break; 4351 } 4352 if (*pInp++ != *pPat++) { 4353 success = FALSE; 4354 break; 4355 } 4356 } 4357 4358 if (success) { 4359 fp->fInputIdx += stringLen; 4360 } else { 4361 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4362 } 4363 } 4364 break; 4365 4366 4367 case URX_STATE_SAVE: 4368 fp = StateSave(fp, opValue, status); 4369 break; 4370 4371 4372 case URX_END: 4373 // The match loop will exit via this path on a successful match, 4374 // when we reach the end of the pattern. 4375 if (toEnd && fp->fInputIdx != fActiveLimit) { 4376 // The pattern matched, but not to the end of input. Try some more. 4377 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4378 break; 4379 } 4380 isMatch = TRUE; 4381 goto breakFromLoop; 4382 4383 // Start and End Capture stack frame variables are laid out out like this: 4384 // fp->fExtra[opValue] - The start of a completed capture group 4385 // opValue+1 - The end of a completed capture group 4386 // opValue+2 - the start of a capture group whose end 4387 // has not yet been reached (and might not ever be). 4388 case URX_START_CAPTURE: 4389 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4390 fp->fExtra[opValue+2] = fp->fInputIdx; 4391 break; 4392 4393 4394 case URX_END_CAPTURE: 4395 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4396 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 4397 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 4398 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 4399 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 4400 break; 4401 4402 4403 case URX_DOLLAR: // $, test for End of line 4404 // or for position before new line at end of input 4405 if (fp->fInputIdx < fAnchorLimit-2) { 4406 // We are no where near the end of input. Fail. 4407 // This is the common case. Keep it first. 4408 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4409 break; 4410 } 4411 if (fp->fInputIdx >= fAnchorLimit) { 4412 // We really are at the end of input. Success. 4413 fHitEnd = TRUE; 4414 fRequireEnd = TRUE; 4415 break; 4416 } 4417 4418 // If we are positioned just before a new-line that is located at the 4419 // end of input, succeed. 4420 if (fp->fInputIdx == fAnchorLimit-1) { 4421 UChar32 c; 4422 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 4423 4424 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 4425 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4426 // At new-line at end of input. Success 4427 fHitEnd = TRUE; 4428 fRequireEnd = TRUE; 4429 break; 4430 } 4431 } 4432 } else if (fp->fInputIdx == fAnchorLimit-2 && 4433 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { 4434 fHitEnd = TRUE; 4435 fRequireEnd = TRUE; 4436 break; // At CR/LF at end of input. Success 4437 } 4438 4439 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4440 4441 break; 4442 4443 4444 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 4445 if (fp->fInputIdx >= fAnchorLimit-1) { 4446 // Either at the last character of input, or off the end. 4447 if (fp->fInputIdx == fAnchorLimit-1) { 4448 // At last char of input. Success if it's a new line. 4449 if (inputBuf[fp->fInputIdx] == 0x0a) { 4450 fHitEnd = TRUE; 4451 fRequireEnd = TRUE; 4452 break; 4453 } 4454 } else { 4455 // Off the end of input. Success. 4456 fHitEnd = TRUE; 4457 fRequireEnd = TRUE; 4458 break; 4459 } 4460 } 4461 4462 // Not at end of input. Back-track out. 4463 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4464 break; 4465 4466 4467 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 4468 { 4469 if (fp->fInputIdx >= fAnchorLimit) { 4470 // We really are at the end of input. Success. 4471 fHitEnd = TRUE; 4472 fRequireEnd = TRUE; 4473 break; 4474 } 4475 // If we are positioned just before a new-line, succeed. 4476 // It makes no difference where the new-line is within the input. 4477 UChar32 c = inputBuf[fp->fInputIdx]; 4478 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 4479 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 4480 // In multi-line mode, hitting a new-line just before the end of input does not 4481 // set the hitEnd or requireEnd flags 4482 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4483 break; 4484 } 4485 } 4486 // not at a new line. Fail. 4487 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4488 } 4489 break; 4490 4491 4492 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 4493 { 4494 if (fp->fInputIdx >= fAnchorLimit) { 4495 // We really are at the end of input. Success. 4496 fHitEnd = TRUE; 4497 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 4498 break; // adding a new-line would not lose the match. 4499 } 4500 // If we are not positioned just before a new-line, the test fails; backtrack out. 4501 // It makes no difference where the new-line is within the input. 4502 if (inputBuf[fp->fInputIdx] != 0x0a) { 4503 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4504 } 4505 } 4506 break; 4507 4508 4509 case URX_CARET: // ^, test for start of line 4510 if (fp->fInputIdx != fAnchorStart) { 4511 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4512 } 4513 break; 4514 4515 4516 case URX_CARET_M: // ^, test for start of line in mulit-line mode 4517 { 4518 if (fp->fInputIdx == fAnchorStart) { 4519 // We are at the start input. Success. 4520 break; 4521 } 4522 // Check whether character just before the current pos is a new-line 4523 // unless we are at the end of input 4524 UChar c = inputBuf[fp->fInputIdx - 1]; 4525 if ((fp->fInputIdx < fAnchorLimit) && 4526 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 4527 // It's a new-line. ^ is true. Success. 4528 // TODO: what should be done with positions between a CR and LF? 4529 break; 4530 } 4531 // Not at the start of a line. Fail. 4532 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4533 } 4534 break; 4535 4536 4537 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 4538 { 4539 U_ASSERT(fp->fInputIdx >= fAnchorStart); 4540 if (fp->fInputIdx <= fAnchorStart) { 4541 // We are at the start input. Success. 4542 break; 4543 } 4544 // Check whether character just before the current pos is a new-line 4545 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 4546 UChar c = inputBuf[fp->fInputIdx - 1]; 4547 if (c != 0x0a) { 4548 // Not at the start of a line. Back-track out. 4549 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4550 } 4551 } 4552 break; 4553 4554 case URX_BACKSLASH_B: // Test for word boundaries 4555 { 4556 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); 4557 success ^= (UBool)(opValue != 0); // flip sense for \B 4558 if (!success) { 4559 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4560 } 4561 } 4562 break; 4563 4564 4565 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 4566 { 4567 UBool success = isUWordBoundary(fp->fInputIdx); 4568 success ^= (UBool)(opValue != 0); // flip sense for \B 4569 if (!success) { 4570 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4571 } 4572 } 4573 break; 4574 4575 4576 case URX_BACKSLASH_D: // Test for decimal digit 4577 { 4578 if (fp->fInputIdx >= fActiveLimit) { 4579 fHitEnd = TRUE; 4580 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4581 break; 4582 } 4583 4584 UChar32 c; 4585 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4586 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 4587 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 4588 success ^= (UBool)(opValue != 0); // flip sense for \D 4589 if (!success) { 4590 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4591 } 4592 } 4593 break; 4594 4595 4596 case URX_BACKSLASH_G: // Test for position at end of previous match 4597 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 4598 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4599 } 4600 break; 4601 4602 4603 case URX_BACKSLASH_X: 4604 // Match a Grapheme, as defined by Unicode TR 29. 4605 // Differs slightly from Perl, which consumes combining marks independently 4606 // of context. 4607 { 4608 4609 // Fail if at end of input 4610 if (fp->fInputIdx >= fActiveLimit) { 4611 fHitEnd = TRUE; 4612 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4613 break; 4614 } 4615 4616 // Examine (and consume) the current char. 4617 // Dispatch into a little state machine, based on the char. 4618 UChar32 c; 4619 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4620 UnicodeSet **sets = fPattern->fStaticSets; 4621 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 4622 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 4623 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4624 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4625 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4626 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4627 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4628 goto GC_Extend; 4629 4630 4631 4632 GC_L: 4633 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4634 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4635 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4636 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4637 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4638 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4639 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4640 goto GC_Extend; 4641 4642 GC_V: 4643 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4644 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4645 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4646 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4647 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4648 goto GC_Extend; 4649 4650 GC_T: 4651 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4652 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4653 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4654 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4655 goto GC_Extend; 4656 4657 GC_Extend: 4658 // Combining characters are consumed here 4659 for (;;) { 4660 if (fp->fInputIdx >= fActiveLimit) { 4661 break; 4662 } 4663 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4664 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 4665 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 4666 break; 4667 } 4668 } 4669 goto GC_Done; 4670 4671 GC_Control: 4672 // Most control chars stand alone (don't combine with combining chars), 4673 // except for that CR/LF sequence is a single grapheme cluster. 4674 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { 4675 fp->fInputIdx++; 4676 } 4677 4678 GC_Done: 4679 if (fp->fInputIdx >= fActiveLimit) { 4680 fHitEnd = TRUE; 4681 } 4682 break; 4683 } 4684 4685 4686 4687 4688 case URX_BACKSLASH_Z: // Test for end of Input 4689 if (fp->fInputIdx < fAnchorLimit) { 4690 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4691 } else { 4692 fHitEnd = TRUE; 4693 fRequireEnd = TRUE; 4694 } 4695 break; 4696 4697 4698 4699 case URX_STATIC_SETREF: 4700 { 4701 // Test input character against one of the predefined sets 4702 // (Word Characters, for example) 4703 // The high bit of the op value is a flag for the match polarity. 4704 // 0: success if input char is in set. 4705 // 1: success if input char is not in set. 4706 if (fp->fInputIdx >= fActiveLimit) { 4707 fHitEnd = TRUE; 4708 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4709 break; 4710 } 4711 4712 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 4713 opValue &= ~URX_NEG_SET; 4714 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4715 4716 UChar32 c; 4717 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4718 if (c < 256) { 4719 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4720 if (s8->contains(c)) { 4721 success = !success; 4722 } 4723 } else { 4724 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4725 if (s->contains(c)) { 4726 success = !success; 4727 } 4728 } 4729 if (!success) { 4730 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4731 } 4732 } 4733 break; 4734 4735 4736 case URX_STAT_SETREF_N: 4737 { 4738 // Test input character for NOT being a member of one of 4739 // the predefined sets (Word Characters, for example) 4740 if (fp->fInputIdx >= fActiveLimit) { 4741 fHitEnd = TRUE; 4742 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4743 break; 4744 } 4745 4746 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4747 4748 UChar32 c; 4749 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4750 if (c < 256) { 4751 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4752 if (s8->contains(c) == FALSE) { 4753 break; 4754 } 4755 } else { 4756 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4757 if (s->contains(c) == FALSE) { 4758 break; 4759 } 4760 } 4761 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4762 } 4763 break; 4764 4765 4766 case URX_SETREF: 4767 { 4768 if (fp->fInputIdx >= fActiveLimit) { 4769 fHitEnd = TRUE; 4770 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4771 break; 4772 } 4773 4774 U_ASSERT(opValue > 0 && opValue < sets->size()); 4775 4776 // There is input left. Pick up one char and test it for set membership. 4777 UChar32 c; 4778 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4779 if (c<256) { 4780 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4781 if (s8->contains(c)) { 4782 // The character is in the set. A Match. 4783 break; 4784 } 4785 } else { 4786 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 4787 if (s->contains(c)) { 4788 // The character is in the set. A Match. 4789 break; 4790 } 4791 } 4792 4793 // the character wasn't in the set. 4794 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4795 } 4796 break; 4797 4798 4799 case URX_DOTANY: 4800 { 4801 // . matches anything, but stops at end-of-line. 4802 if (fp->fInputIdx >= fActiveLimit) { 4803 // At end of input. Match failed. Backtrack out. 4804 fHitEnd = TRUE; 4805 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4806 break; 4807 } 4808 4809 // There is input left. Advance over one char, unless we've hit end-of-line 4810 UChar32 c; 4811 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4812 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 4813 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 4814 // End of line in normal mode. . does not match. 4815 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4816 break; 4817 } 4818 } 4819 break; 4820 4821 4822 case URX_DOTANY_ALL: 4823 { 4824 // . in dot-matches-all (including new lines) mode 4825 if (fp->fInputIdx >= fActiveLimit) { 4826 // At end of input. Match failed. Backtrack out. 4827 fHitEnd = TRUE; 4828 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4829 break; 4830 } 4831 4832 // There is input left. Advance over one char, except if we are 4833 // at a cr/lf, advance over both of them. 4834 UChar32 c; 4835 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4836 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 4837 // In the case of a CR/LF, we need to advance over both. 4838 if (inputBuf[fp->fInputIdx] == 0x0a) { 4839 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 4840 } 4841 } 4842 } 4843 break; 4844 4845 4846 case URX_DOTANY_UNIX: 4847 { 4848 // '.' operator, matches all, but stops at end-of-line. 4849 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 4850 if (fp->fInputIdx >= fActiveLimit) { 4851 // At end of input. Match failed. Backtrack out. 4852 fHitEnd = TRUE; 4853 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4854 break; 4855 } 4856 4857 // There is input left. Advance over one char, unless we've hit end-of-line 4858 UChar32 c; 4859 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4860 if (c == 0x0a) { 4861 // End of line in normal mode. '.' does not match the \n 4862 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4863 } 4864 } 4865 break; 4866 4867 4868 case URX_JMP: 4869 fp->fPatIdx = opValue; 4870 break; 4871 4872 case URX_FAIL: 4873 isMatch = FALSE; 4874 goto breakFromLoop; 4875 4876 case URX_JMP_SAV: 4877 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 4878 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 4879 fp->fPatIdx = opValue; // Then JMP. 4880 break; 4881 4882 case URX_JMP_SAV_X: 4883 // This opcode is used with (x)+, when x can match a zero length string. 4884 // Same as JMP_SAV, except conditional on the match having made forward progress. 4885 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 4886 // data address of the input position at the start of the loop. 4887 { 4888 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 4889 int32_t stoOp = (int32_t)pat[opValue-1]; 4890 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 4891 int32_t frameLoc = URX_VAL(stoOp); 4892 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 4893 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; 4894 U_ASSERT(prevInputIdx <= fp->fInputIdx); 4895 if (prevInputIdx < fp->fInputIdx) { 4896 // The match did make progress. Repeat the loop. 4897 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 4898 fp->fPatIdx = opValue; 4899 fp->fExtra[frameLoc] = fp->fInputIdx; 4900 } 4901 // If the input position did not advance, we do nothing here, 4902 // execution will fall out of the loop. 4903 } 4904 break; 4905 4906 case URX_CTR_INIT: 4907 { 4908 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 4909 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 4910 4911 // Pick up the three extra operands that CTR_INIT has, and 4912 // skip the pattern location counter past 4913 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 4914 fp->fPatIdx += 3; 4915 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 4916 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 4917 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 4918 U_ASSERT(minCount>=0); 4919 U_ASSERT(maxCount>=minCount || maxCount==-1); 4920 U_ASSERT(loopLoc>=fp->fPatIdx); 4921 4922 if (minCount == 0) { 4923 fp = StateSave(fp, loopLoc+1, status); 4924 } 4925 if (maxCount == -1) { 4926 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 4927 } else if (maxCount == 0) { 4928 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4929 } 4930 } 4931 break; 4932 4933 case URX_CTR_LOOP: 4934 { 4935 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 4936 int32_t initOp = (int32_t)pat[opValue]; 4937 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 4938 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 4939 int32_t minCount = (int32_t)pat[opValue+2]; 4940 int32_t maxCount = (int32_t)pat[opValue+3]; 4941 (*pCounter)++; 4942 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 4943 U_ASSERT(*pCounter == maxCount); 4944 break; 4945 } 4946 if (*pCounter >= minCount) { 4947 if (maxCount == -1) { 4948 // Loop has no hard upper bound. 4949 // Check that it is progressing through the input, break if it is not. 4950 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 4951 if (fp->fInputIdx == *pLastInputIdx) { 4952 break; 4953 } else { 4954 *pLastInputIdx = fp->fInputIdx; 4955 } 4956 } 4957 fp = StateSave(fp, fp->fPatIdx, status); 4958 } 4959 fp->fPatIdx = opValue + 4; // Loop back. 4960 } 4961 break; 4962 4963 case URX_CTR_INIT_NG: 4964 { 4965 // Initialize a non-greedy loop 4966 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 4967 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 4968 4969 // Pick up the three extra operands that CTR_INIT_NG has, and 4970 // skip the pattern location counter past 4971 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 4972 fp->fPatIdx += 3; 4973 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 4974 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 4975 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 4976 U_ASSERT(minCount>=0); 4977 U_ASSERT(maxCount>=minCount || maxCount==-1); 4978 U_ASSERT(loopLoc>fp->fPatIdx); 4979 if (maxCount == -1) { 4980 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 4981 } 4982 4983 if (minCount == 0) { 4984 if (maxCount != 0) { 4985 fp = StateSave(fp, fp->fPatIdx, status); 4986 } 4987 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 4988 } 4989 } 4990 break; 4991 4992 case URX_CTR_LOOP_NG: 4993 { 4994 // Non-greedy {min, max} loops 4995 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 4996 int32_t initOp = (int32_t)pat[opValue]; 4997 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 4998 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 4999 int32_t minCount = (int32_t)pat[opValue+2]; 5000 int32_t maxCount = (int32_t)pat[opValue+3]; 5001 5002 (*pCounter)++; 5003 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 5004 // The loop has matched the maximum permitted number of times. 5005 // Break out of here with no action. Matching will 5006 // continue with the following pattern. 5007 U_ASSERT(*pCounter == maxCount); 5008 break; 5009 } 5010 5011 if (*pCounter < minCount) { 5012 // We haven't met the minimum number of matches yet. 5013 // Loop back for another one. 5014 fp->fPatIdx = opValue + 4; // Loop back. 5015 } else { 5016 // We do have the minimum number of matches. 5017 5018 // If there is no upper bound on the loop iterations, check that the input index 5019 // is progressing, and stop the loop if it is not. 5020 if (maxCount == -1) { 5021 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 5022 if (fp->fInputIdx == *pLastInputIdx) { 5023 break; 5024 } 5025 *pLastInputIdx = fp->fInputIdx; 5026 } 5027 5028 // Loop Continuation: we will fall into the pattern following the loop 5029 // (non-greedy, don't execute loop body first), but first do 5030 // a state save to the top of the loop, so that a match failure 5031 // in the following pattern will try another iteration of the loop. 5032 fp = StateSave(fp, opValue + 4, status); 5033 } 5034 } 5035 break; 5036 5037 case URX_STO_SP: 5038 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5039 fData[opValue] = fStack->size(); 5040 break; 5041 5042 case URX_LD_SP: 5043 { 5044 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5045 int32_t newStackSize = (int32_t)fData[opValue]; 5046 U_ASSERT(newStackSize <= fStack->size()); 5047 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5048 if (newFP == (int64_t *)fp) { 5049 break; 5050 } 5051 int32_t i; 5052 for (i=0; i<fFrameSize; i++) { 5053 newFP[i] = ((int64_t *)fp)[i]; 5054 } 5055 fp = (REStackFrame *)newFP; 5056 fStack->setSize(newStackSize); 5057 } 5058 break; 5059 5060 case URX_BACKREF: 5061 { 5062 U_ASSERT(opValue < fFrameSize); 5063 int64_t groupStartIdx = fp->fExtra[opValue]; 5064 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5065 U_ASSERT(groupStartIdx <= groupEndIdx); 5066 int64_t inputIndex = fp->fInputIdx; 5067 if (groupStartIdx < 0) { 5068 // This capture group has not participated in the match thus far, 5069 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5070 break; 5071 } 5072 UBool success = TRUE; 5073 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) { 5074 if (inputIndex >= fActiveLimit) { 5075 success = FALSE; 5076 fHitEnd = TRUE; 5077 break; 5078 } 5079 if (inputBuf[groupIndex] != inputBuf[inputIndex]) { 5080 success = FALSE; 5081 break; 5082 } 5083 } 5084 if (success) { 5085 fp->fInputIdx = inputIndex; 5086 } else { 5087 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5088 } 5089 } 5090 break; 5091 5092 case URX_BACKREF_I: 5093 { 5094 U_ASSERT(opValue < fFrameSize); 5095 int64_t groupStartIdx = fp->fExtra[opValue]; 5096 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5097 U_ASSERT(groupStartIdx <= groupEndIdx); 5098 if (groupStartIdx < 0) { 5099 // This capture group has not participated in the match thus far, 5100 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5101 break; 5102 } 5103 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx); 5104 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit); 5105 5106 // Note: if the capture group match was of an empty string the backref 5107 // match succeeds. Verified by testing: Perl matches succeed 5108 // in this case, so we do too. 5109 5110 UBool success = TRUE; 5111 for (;;) { 5112 UChar32 captureGroupChar = captureGroupItr.next(); 5113 if (captureGroupChar == U_SENTINEL) { 5114 success = TRUE; 5115 break; 5116 } 5117 UChar32 inputChar = inputItr.next(); 5118 if (inputChar == U_SENTINEL) { 5119 success = FALSE; 5120 fHitEnd = TRUE; 5121 break; 5122 } 5123 if (inputChar != captureGroupChar) { 5124 success = FALSE; 5125 break; 5126 } 5127 } 5128 5129 if (success && inputItr.inExpansion()) { 5130 // We otained a match by consuming part of a string obtained from 5131 // case-folding a single code point of the input text. 5132 // This does not count as an overall match. 5133 success = FALSE; 5134 } 5135 5136 if (success) { 5137 fp->fInputIdx = inputItr.getIndex(); 5138 } else { 5139 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5140 } 5141 } 5142 break; 5143 5144 case URX_STO_INP_LOC: 5145 { 5146 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 5147 fp->fExtra[opValue] = fp->fInputIdx; 5148 } 5149 break; 5150 5151 case URX_JMPX: 5152 { 5153 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5154 fp->fPatIdx += 1; 5155 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 5156 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 5157 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; 5158 U_ASSERT(savedInputIdx <= fp->fInputIdx); 5159 if (savedInputIdx < fp->fInputIdx) { 5160 fp->fPatIdx = opValue; // JMP 5161 } else { 5162 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 5163 } 5164 } 5165 break; 5166 5167 case URX_LA_START: 5168 { 5169 // Entering a lookahead block. 5170 // Save Stack Ptr, Input Pos. 5171 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5172 fData[opValue] = fStack->size(); 5173 fData[opValue+1] = fp->fInputIdx; 5174 fActiveStart = fLookStart; // Set the match region change for 5175 fActiveLimit = fLookLimit; // transparent bounds. 5176 } 5177 break; 5178 5179 case URX_LA_END: 5180 { 5181 // Leaving a look-ahead block. 5182 // restore Stack Ptr, Input Pos to positions they had on entry to block. 5183 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5184 int32_t stackSize = fStack->size(); 5185 int32_t newStackSize = (int32_t)fData[opValue]; 5186 U_ASSERT(stackSize >= newStackSize); 5187 if (stackSize > newStackSize) { 5188 // Copy the current top frame back to the new (cut back) top frame. 5189 // This makes the capture groups from within the look-ahead 5190 // expression available. 5191 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5192 int32_t i; 5193 for (i=0; i<fFrameSize; i++) { 5194 newFP[i] = ((int64_t *)fp)[i]; 5195 } 5196 fp = (REStackFrame *)newFP; 5197 fStack->setSize(newStackSize); 5198 } 5199 fp->fInputIdx = fData[opValue+1]; 5200 5201 // Restore the active region bounds in the input string; they may have 5202 // been changed because of transparent bounds on a Region. 5203 fActiveStart = fRegionStart; 5204 fActiveLimit = fRegionLimit; 5205 } 5206 break; 5207 5208 case URX_ONECHAR_I: 5209 if (fp->fInputIdx < fActiveLimit) { 5210 UChar32 c; 5211 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5212 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5213 break; 5214 } 5215 } else { 5216 fHitEnd = TRUE; 5217 } 5218 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5219 break; 5220 5221 case URX_STRING_I: 5222 // Case-insensitive test input against a literal string. 5223 // Strings require two slots in the compiled pattern, one for the 5224 // offset to the string text, and one for the length. 5225 // The compiled string has already been case folded. 5226 { 5227 const UChar *patternString = litText + opValue; 5228 5229 op = (int32_t)pat[fp->fPatIdx]; 5230 fp->fPatIdx++; 5231 opType = URX_TYPE(op); 5232 opValue = URX_VAL(op); 5233 U_ASSERT(opType == URX_STRING_LEN); 5234 int32_t patternStringLen = opValue; // Length of the string from the pattern. 5235 5236 UChar32 cText; 5237 UChar32 cPattern; 5238 UBool success = TRUE; 5239 int32_t patternStringIdx = 0; 5240 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit); 5241 while (patternStringIdx < patternStringLen) { 5242 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 5243 cText = inputIterator.next(); 5244 if (cText != cPattern) { 5245 success = FALSE; 5246 if (cText == U_SENTINEL) { 5247 fHitEnd = TRUE; 5248 } 5249 break; 5250 } 5251 } 5252 if (inputIterator.inExpansion()) { 5253 success = FALSE; 5254 } 5255 5256 if (success) { 5257 fp->fInputIdx = inputIterator.getIndex(); 5258 } else { 5259 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5260 } 5261 } 5262 break; 5263 5264 case URX_LB_START: 5265 { 5266 // Entering a look-behind block. 5267 // Save Stack Ptr, Input Pos. 5268 // TODO: implement transparent bounds. Ticket #6067 5269 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5270 fData[opValue] = fStack->size(); 5271 fData[opValue+1] = fp->fInputIdx; 5272 // Init the variable containing the start index for attempted matches. 5273 fData[opValue+2] = -1; 5274 // Save input string length, then reset to pin any matches to end at 5275 // the current position. 5276 fData[opValue+3] = fActiveLimit; 5277 fActiveLimit = fp->fInputIdx; 5278 } 5279 break; 5280 5281 5282 case URX_LB_CONT: 5283 { 5284 // Positive Look-Behind, at top of loop checking for matches of LB expression 5285 // at all possible input starting positions. 5286 5287 // Fetch the min and max possible match lengths. They are the operands 5288 // of this op in the pattern. 5289 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5290 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5291 U_ASSERT(minML <= maxML); 5292 U_ASSERT(minML >= 0); 5293 5294 // Fetch (from data) the last input index where a match was attempted. 5295 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5296 int64_t *lbStartIdx = &fData[opValue+2]; 5297 if (*lbStartIdx < 0) { 5298 // First time through loop. 5299 *lbStartIdx = fp->fInputIdx - minML; 5300 } else { 5301 // 2nd through nth time through the loop. 5302 // Back up start position for match by one. 5303 if (*lbStartIdx == 0) { 5304 (*lbStartIdx)--; 5305 } else { 5306 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5307 } 5308 } 5309 5310 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5311 // We have tried all potential match starting points without 5312 // getting a match. Backtrack out, and out of the 5313 // Look Behind altogether. 5314 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5315 int64_t restoreInputLen = fData[opValue+3]; 5316 U_ASSERT(restoreInputLen >= fActiveLimit); 5317 U_ASSERT(restoreInputLen <= fInputLength); 5318 fActiveLimit = restoreInputLen; 5319 break; 5320 } 5321 5322 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5323 // (successful match will fall off the end of the loop.) 5324 fp = StateSave(fp, fp->fPatIdx-3, status); 5325 fp->fInputIdx = *lbStartIdx; 5326 } 5327 break; 5328 5329 case URX_LB_END: 5330 // End of a look-behind block, after a successful match. 5331 { 5332 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5333 if (fp->fInputIdx != fActiveLimit) { 5334 // The look-behind expression matched, but the match did not 5335 // extend all the way to the point that we are looking behind from. 5336 // FAIL out of here, which will take us back to the LB_CONT, which 5337 // will retry the match starting at another position or fail 5338 // the look-behind altogether, whichever is appropriate. 5339 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5340 break; 5341 } 5342 5343 // Look-behind match is good. Restore the orignal input string length, 5344 // which had been truncated to pin the end of the lookbehind match to the 5345 // position being looked-behind. 5346 int64_t originalInputLen = fData[opValue+3]; 5347 U_ASSERT(originalInputLen >= fActiveLimit); 5348 U_ASSERT(originalInputLen <= fInputLength); 5349 fActiveLimit = originalInputLen; 5350 } 5351 break; 5352 5353 5354 case URX_LBN_CONT: 5355 { 5356 // Negative Look-Behind, at top of loop checking for matches of LB expression 5357 // at all possible input starting positions. 5358 5359 // Fetch the extra parameters of this op. 5360 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5361 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5362 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 5363 continueLoc = URX_VAL(continueLoc); 5364 U_ASSERT(minML <= maxML); 5365 U_ASSERT(minML >= 0); 5366 U_ASSERT(continueLoc > fp->fPatIdx); 5367 5368 // Fetch (from data) the last input index where a match was attempted. 5369 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5370 int64_t *lbStartIdx = &fData[opValue+2]; 5371 if (*lbStartIdx < 0) { 5372 // First time through loop. 5373 *lbStartIdx = fp->fInputIdx - minML; 5374 } else { 5375 // 2nd through nth time through the loop. 5376 // Back up start position for match by one. 5377 if (*lbStartIdx == 0) { 5378 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. 5379 } else { 5380 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5381 } 5382 } 5383 5384 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5385 // We have tried all potential match starting points without 5386 // getting a match, which means that the negative lookbehind as 5387 // a whole has succeeded. Jump forward to the continue location 5388 int64_t restoreInputLen = fData[opValue+3]; 5389 U_ASSERT(restoreInputLen >= fActiveLimit); 5390 U_ASSERT(restoreInputLen <= fInputLength); 5391 fActiveLimit = restoreInputLen; 5392 fp->fPatIdx = continueLoc; 5393 break; 5394 } 5395 5396 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5397 // (successful match will cause a FAIL out of the loop altogether.) 5398 fp = StateSave(fp, fp->fPatIdx-4, status); 5399 fp->fInputIdx = *lbStartIdx; 5400 } 5401 break; 5402 5403 case URX_LBN_END: 5404 // End of a negative look-behind block, after a successful match. 5405 { 5406 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5407 if (fp->fInputIdx != fActiveLimit) { 5408 // The look-behind expression matched, but the match did not 5409 // extend all the way to the point that we are looking behind from. 5410 // FAIL out of here, which will take us back to the LB_CONT, which 5411 // will retry the match starting at another position or succeed 5412 // the look-behind altogether, whichever is appropriate. 5413 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5414 break; 5415 } 5416 5417 // Look-behind expression matched, which means look-behind test as 5418 // a whole Fails 5419 5420 // Restore the orignal input string length, which had been truncated 5421 // inorder to pin the end of the lookbehind match 5422 // to the position being looked-behind. 5423 int64_t originalInputLen = fData[opValue+3]; 5424 U_ASSERT(originalInputLen >= fActiveLimit); 5425 U_ASSERT(originalInputLen <= fInputLength); 5426 fActiveLimit = originalInputLen; 5427 5428 // Restore original stack position, discarding any state saved 5429 // by the successful pattern match. 5430 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5431 int32_t newStackSize = (int32_t)fData[opValue]; 5432 U_ASSERT(fStack->size() > newStackSize); 5433 fStack->setSize(newStackSize); 5434 5435 // FAIL, which will take control back to someplace 5436 // prior to entering the look-behind test. 5437 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5438 } 5439 break; 5440 5441 5442 case URX_LOOP_SR_I: 5443 // Loop Initialization for the optimized implementation of 5444 // [some character set]* 5445 // This op scans through all matching input. 5446 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5447 { 5448 U_ASSERT(opValue > 0 && opValue < sets->size()); 5449 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5450 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5451 5452 // Loop through input, until either the input is exhausted or 5453 // we reach a character that is not a member of the set. 5454 int32_t ix = (int32_t)fp->fInputIdx; 5455 for (;;) { 5456 if (ix >= fActiveLimit) { 5457 fHitEnd = TRUE; 5458 break; 5459 } 5460 UChar32 c; 5461 U16_NEXT(inputBuf, ix, fActiveLimit, c); 5462 if (c<256) { 5463 if (s8->contains(c) == FALSE) { 5464 U16_BACK_1(inputBuf, 0, ix); 5465 break; 5466 } 5467 } else { 5468 if (s->contains(c) == FALSE) { 5469 U16_BACK_1(inputBuf, 0, ix); 5470 break; 5471 } 5472 } 5473 } 5474 5475 // If there were no matching characters, skip over the loop altogether. 5476 // The loop doesn't run at all, a * op always succeeds. 5477 if (ix == fp->fInputIdx) { 5478 fp->fPatIdx++; // skip the URX_LOOP_C op. 5479 break; 5480 } 5481 5482 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5483 // must follow. It's operand is the stack location 5484 // that holds the starting input index for the match of this [set]* 5485 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5486 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5487 int32_t stackLoc = URX_VAL(loopcOp); 5488 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5489 fp->fExtra[stackLoc] = fp->fInputIdx; 5490 fp->fInputIdx = ix; 5491 5492 // Save State to the URX_LOOP_C op that follows this one, 5493 // so that match failures in the following code will return to there. 5494 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5495 fp = StateSave(fp, fp->fPatIdx, status); 5496 fp->fPatIdx++; 5497 } 5498 break; 5499 5500 5501 case URX_LOOP_DOT_I: 5502 // Loop Initialization for the optimized implementation of .* 5503 // This op scans through all remaining input. 5504 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5505 { 5506 // Loop through input until the input is exhausted (we reach an end-of-line) 5507 // In DOTALL mode, we can just go straight to the end of the input. 5508 int32_t ix; 5509 if ((opValue & 1) == 1) { 5510 // Dot-matches-All mode. Jump straight to the end of the string. 5511 ix = (int32_t)fActiveLimit; 5512 fHitEnd = TRUE; 5513 } else { 5514 // NOT DOT ALL mode. Line endings do not match '.' 5515 // Scan forward until a line ending or end of input. 5516 ix = (int32_t)fp->fInputIdx; 5517 for (;;) { 5518 if (ix >= fActiveLimit) { 5519 fHitEnd = TRUE; 5520 break; 5521 } 5522 UChar32 c; 5523 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] 5524 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 5525 if ((c == 0x0a) || // 0x0a is newline in both modes. 5526 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 5527 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { 5528 // char is a line ending. Put the input pos back to the 5529 // line ending char, and exit the scanning loop. 5530 U16_BACK_1(inputBuf, 0, ix); 5531 break; 5532 } 5533 } 5534 } 5535 } 5536 5537 // If there were no matching characters, skip over the loop altogether. 5538 // The loop doesn't run at all, a * op always succeeds. 5539 if (ix == fp->fInputIdx) { 5540 fp->fPatIdx++; // skip the URX_LOOP_C op. 5541 break; 5542 } 5543 5544 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5545 // must follow. It's operand is the stack location 5546 // that holds the starting input index for the match of this .* 5547 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5548 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5549 int32_t stackLoc = URX_VAL(loopcOp); 5550 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5551 fp->fExtra[stackLoc] = fp->fInputIdx; 5552 fp->fInputIdx = ix; 5553 5554 // Save State to the URX_LOOP_C op that follows this one, 5555 // so that match failures in the following code will return to there. 5556 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5557 fp = StateSave(fp, fp->fPatIdx, status); 5558 fp->fPatIdx++; 5559 } 5560 break; 5561 5562 5563 case URX_LOOP_C: 5564 { 5565 U_ASSERT(opValue>=0 && opValue<fFrameSize); 5566 backSearchIndex = (int32_t)fp->fExtra[opValue]; 5567 U_ASSERT(backSearchIndex <= fp->fInputIdx); 5568 if (backSearchIndex == fp->fInputIdx) { 5569 // We've backed up the input idx to the point that the loop started. 5570 // The loop is done. Leave here without saving state. 5571 // Subsequent failures won't come back here. 5572 break; 5573 } 5574 // Set up for the next iteration of the loop, with input index 5575 // backed up by one from the last time through, 5576 // and a state save to this instruction in case the following code fails again. 5577 // (We're going backwards because this loop emulates stack unwinding, not 5578 // the initial scan forward.) 5579 U_ASSERT(fp->fInputIdx > 0); 5580 UChar32 prevC; 5581 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? 5582 5583 if (prevC == 0x0a && 5584 fp->fInputIdx > backSearchIndex && 5585 inputBuf[fp->fInputIdx-1] == 0x0d) { 5586 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 5587 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 5588 // .*, stepping back over CRLF pair. 5589 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 5590 } 5591 } 5592 5593 5594 fp = StateSave(fp, fp->fPatIdx-1, status); 5595 } 5596 break; 5597 5598 5599 5600 default: 5601 // Trouble. The compiled pattern contains an entry with an 5602 // unrecognized type tag. 5603 U_ASSERT(FALSE); 5604 } 5605 5606 if (U_FAILURE(status)) { 5607 isMatch = FALSE; 5608 break; 5609 } 5610 } 5611 5612 breakFromLoop: 5613 fMatch = isMatch; 5614 if (isMatch) { 5615 fLastMatchEnd = fMatchEnd; 5616 fMatchStart = startIdx; 5617 fMatchEnd = fp->fInputIdx; 5618 } 5619 5620 #ifdef REGEX_RUN_DEBUG 5621 if (fTraceDebug) { 5622 if (isMatch) { 5623 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); 5624 } else { 5625 printf("No match\n\n"); 5626 } 5627 } 5628 #endif 5629 5630 fFrame = fp; // The active stack frame when the engine stopped. 5631 // Contains the capture group results that we need to 5632 // access later. 5633 5634 return; 5635 } 5636 5637 5638 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 5639 5640 U_NAMESPACE_END 5641 5642 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 5643