1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ************************************************************************** 5 * Copyright (C) 2002-2016 International Business Machines Corporation 6 * and others. All rights reserved. 7 ************************************************************************** 8 */ 9 // 10 // file: rematch.cpp 11 // 12 // Contains the implementation of class RegexMatcher, 13 // which is one of the main API classes for the ICU regular expression package. 14 // 15 16 #include "unicode/utypes.h" 17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 18 19 #include "unicode/regex.h" 20 #include "unicode/uniset.h" 21 #include "unicode/uchar.h" 22 #include "unicode/ustring.h" 23 #include "unicode/rbbi.h" 24 #include "unicode/utf.h" 25 #include "unicode/utf16.h" 26 #include "uassert.h" 27 #include "cmemory.h" 28 #include "cstr.h" 29 #include "uvector.h" 30 #include "uvectr32.h" 31 #include "uvectr64.h" 32 #include "regeximp.h" 33 #include "regexst.h" 34 #include "regextxt.h" 35 #include "ucase.h" 36 37 // #include <malloc.h> // Needed for heapcheck testing 38 39 40 U_NAMESPACE_BEGIN 41 42 // Default limit for the size of the back track stack, to avoid system 43 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. 44 // This value puts ICU's limits higher than most other regexp implementations, 45 // which use recursion rather than the heap, and take more storage per 46 // backtrack point. 47 // 48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 49 50 // Time limit counter constant. 51 // Time limits for expression evaluation are in terms of quanta of work by 52 // the engine, each of which is 10,000 state saves. 53 // This constant determines that state saves per tick number. 54 static const int32_t TIMER_INITIAL_VALUE = 10000; 55 56 57 // Test for any of the Unicode line terminating characters. 58 static inline UBool isLineTerminator(UChar32 c) { 59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) { 60 return false; 61 } 62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029; 63 } 64 65 //----------------------------------------------------------------------------- 66 // 67 // Constructor and Destructor 68 // 69 //----------------------------------------------------------------------------- 70 RegexMatcher::RegexMatcher(const RegexPattern *pat) { 71 fDeferredStatus = U_ZERO_ERROR; 72 init(fDeferredStatus); 73 if (U_FAILURE(fDeferredStatus)) { 74 return; 75 } 76 if (pat==NULL) { 77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 78 return; 79 } 80 fPattern = pat; 81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); 82 } 83 84 85 86 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 87 uint32_t flags, UErrorCode &status) { 88 init(status); 89 if (U_FAILURE(status)) { 90 return; 91 } 92 UParseError pe; 93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 94 fPattern = fPatternOwned; 95 96 UText inputText = UTEXT_INITIALIZER; 97 utext_openConstUnicodeString(&inputText, &input, &status); 98 init2(&inputText, status); 99 utext_close(&inputText); 100 101 fInputUniStrMaybeMutable = TRUE; 102 } 103 104 105 RegexMatcher::RegexMatcher(UText *regexp, UText *input, 106 uint32_t flags, UErrorCode &status) { 107 init(status); 108 if (U_FAILURE(status)) { 109 return; 110 } 111 UParseError pe; 112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 113 if (U_FAILURE(status)) { 114 return; 115 } 116 117 fPattern = fPatternOwned; 118 init2(input, status); 119 } 120 121 122 RegexMatcher::RegexMatcher(const UnicodeString ®exp, 123 uint32_t flags, UErrorCode &status) { 124 init(status); 125 if (U_FAILURE(status)) { 126 return; 127 } 128 UParseError pe; 129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 130 if (U_FAILURE(status)) { 131 return; 132 } 133 fPattern = fPatternOwned; 134 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 135 } 136 137 RegexMatcher::RegexMatcher(UText *regexp, 138 uint32_t flags, UErrorCode &status) { 139 init(status); 140 if (U_FAILURE(status)) { 141 return; 142 } 143 UParseError pe; 144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 145 if (U_FAILURE(status)) { 146 return; 147 } 148 149 fPattern = fPatternOwned; 150 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 151 } 152 153 154 155 156 RegexMatcher::~RegexMatcher() { 157 delete fStack; 158 if (fData != fSmallData) { 159 uprv_free(fData); 160 fData = NULL; 161 } 162 if (fPatternOwned) { 163 delete fPatternOwned; 164 fPatternOwned = NULL; 165 fPattern = NULL; 166 } 167 168 if (fInput) { 169 delete fInput; 170 } 171 if (fInputText) { 172 utext_close(fInputText); 173 } 174 if (fAltInputText) { 175 utext_close(fAltInputText); 176 } 177 178 #if UCONFIG_NO_BREAK_ITERATION==0 179 delete fWordBreakItr; 180 #endif 181 } 182 183 // 184 // init() common initialization for use by all constructors. 185 // Initialize all fields, get the object into a consistent state. 186 // This must be done even when the initial status shows an error, 187 // so that the object is initialized sufficiently well for the destructor 188 // to run safely. 189 // 190 void RegexMatcher::init(UErrorCode &status) { 191 fPattern = NULL; 192 fPatternOwned = NULL; 193 fFrameSize = 0; 194 fRegionStart = 0; 195 fRegionLimit = 0; 196 fAnchorStart = 0; 197 fAnchorLimit = 0; 198 fLookStart = 0; 199 fLookLimit = 0; 200 fActiveStart = 0; 201 fActiveLimit = 0; 202 fTransparentBounds = FALSE; 203 fAnchoringBounds = TRUE; 204 fMatch = FALSE; 205 fMatchStart = 0; 206 fMatchEnd = 0; 207 fLastMatchEnd = -1; 208 fAppendPosition = 0; 209 fHitEnd = FALSE; 210 fRequireEnd = FALSE; 211 fStack = NULL; 212 fFrame = NULL; 213 fTimeLimit = 0; 214 fTime = 0; 215 fTickCounter = 0; 216 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 217 fCallbackFn = NULL; 218 fCallbackContext = NULL; 219 fFindProgressCallbackFn = NULL; 220 fFindProgressCallbackContext = NULL; 221 fTraceDebug = FALSE; 222 fDeferredStatus = status; 223 fData = fSmallData; 224 fWordBreakItr = NULL; 225 226 fStack = NULL; 227 fInputText = NULL; 228 fAltInputText = NULL; 229 fInput = NULL; 230 fInputLength = 0; 231 fInputUniStrMaybeMutable = FALSE; 232 } 233 234 // 235 // init2() Common initialization for use by RegexMatcher constructors, part 2. 236 // This handles the common setup to be done after the Pattern is available. 237 // 238 void RegexMatcher::init2(UText *input, UErrorCode &status) { 239 if (U_FAILURE(status)) { 240 fDeferredStatus = status; 241 return; 242 } 243 244 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) { 245 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 246 if (fData == NULL) { 247 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 248 return; 249 } 250 } 251 252 fStack = new UVector64(status); 253 if (fStack == NULL) { 254 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 255 return; 256 } 257 258 reset(input); 259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 260 if (U_FAILURE(status)) { 261 fDeferredStatus = status; 262 return; 263 } 264 } 265 266 267 static const UChar BACKSLASH = 0x5c; 268 static const UChar DOLLARSIGN = 0x24; 269 static const UChar LEFTBRACKET = 0x7b; 270 static const UChar RIGHTBRACKET = 0x7d; 271 272 //-------------------------------------------------------------------------------- 273 // 274 // appendReplacement 275 // 276 //-------------------------------------------------------------------------------- 277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 278 const UnicodeString &replacement, 279 UErrorCode &status) { 280 UText replacementText = UTEXT_INITIALIZER; 281 282 utext_openConstUnicodeString(&replacementText, &replacement, &status); 283 if (U_SUCCESS(status)) { 284 UText resultText = UTEXT_INITIALIZER; 285 utext_openUnicodeString(&resultText, &dest, &status); 286 287 if (U_SUCCESS(status)) { 288 appendReplacement(&resultText, &replacementText, status); 289 utext_close(&resultText); 290 } 291 utext_close(&replacementText); 292 } 293 294 return *this; 295 } 296 297 // 298 // appendReplacement, UText mode 299 // 300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, 301 UText *replacement, 302 UErrorCode &status) { 303 if (U_FAILURE(status)) { 304 return *this; 305 } 306 if (U_FAILURE(fDeferredStatus)) { 307 status = fDeferredStatus; 308 return *this; 309 } 310 if (fMatch == FALSE) { 311 status = U_REGEX_INVALID_STATE; 312 return *this; 313 } 314 315 // Copy input string from the end of previous match to start of current match 316 int64_t destLen = utext_nativeLength(dest); 317 if (fMatchStart > fAppendPosition) { 318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 319 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 320 (int32_t)(fMatchStart-fAppendPosition), &status); 321 } else { 322 int32_t len16; 323 if (UTEXT_USES_U16(fInputText)) { 324 len16 = (int32_t)(fMatchStart-fAppendPosition); 325 } else { 326 UErrorCode lengthStatus = U_ZERO_ERROR; 327 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); 328 } 329 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 330 if (inputChars == NULL) { 331 status = U_MEMORY_ALLOCATION_ERROR; 332 return *this; 333 } 334 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); 335 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); 336 uprv_free(inputChars); 337 } 338 } 339 fAppendPosition = fMatchEnd; 340 341 342 // scan the replacement text, looking for substitutions ($n) and \escapes. 343 // TODO: optimize this loop by efficiently scanning for '$' or '\', 344 // move entire ranges not containing substitutions. 345 UTEXT_SETNATIVEINDEX(replacement, 0); 346 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) { 347 if (c == BACKSLASH) { 348 // Backslash Escape. Copy the following char out without further checks. 349 // Note: Surrogate pairs don't need any special handling 350 // The second half wont be a '$' or a '\', and 351 // will move to the dest normally on the next 352 // loop iteration. 353 c = UTEXT_CURRENT32(replacement); 354 if (c == U_SENTINEL) { 355 break; 356 } 357 358 if (c==0x55/*U*/ || c==0x75/*u*/) { 359 // We have a \udddd or \Udddddddd escape sequence. 360 int32_t offset = 0; 361 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); 362 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 363 if (escapedChar != (UChar32)0xFFFFFFFF) { 364 if (U_IS_BMP(escapedChar)) { 365 UChar c16 = (UChar)escapedChar; 366 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 367 } else { 368 UChar surrogate[2]; 369 surrogate[0] = U16_LEAD(escapedChar); 370 surrogate[1] = U16_TRAIL(escapedChar); 371 if (U_SUCCESS(status)) { 372 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 373 } 374 } 375 // TODO: Report errors for mal-formed \u escapes? 376 // As this is, the original sequence is output, which may be OK. 377 if (context.lastOffset == offset) { 378 (void)UTEXT_PREVIOUS32(replacement); 379 } else if (context.lastOffset != offset-1) { 380 utext_moveIndex32(replacement, offset - context.lastOffset - 1); 381 } 382 } 383 } else { 384 (void)UTEXT_NEXT32(replacement); 385 // Plain backslash escape. Just put out the escaped character. 386 if (U_IS_BMP(c)) { 387 UChar c16 = (UChar)c; 388 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 389 } else { 390 UChar surrogate[2]; 391 surrogate[0] = U16_LEAD(c); 392 surrogate[1] = U16_TRAIL(c); 393 if (U_SUCCESS(status)) { 394 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 395 } 396 } 397 } 398 } else if (c != DOLLARSIGN) { 399 // Normal char, not a $. Copy it out without further checks. 400 if (U_IS_BMP(c)) { 401 UChar c16 = (UChar)c; 402 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 403 } else { 404 UChar surrogate[2]; 405 surrogate[0] = U16_LEAD(c); 406 surrogate[1] = U16_TRAIL(c); 407 if (U_SUCCESS(status)) { 408 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 409 } 410 } 411 } else { 412 // We've got a $. Pick up a capture group name or number if one follows. 413 // Consume digits so long as the resulting group number <= the number of 414 // number of capture groups in the pattern. 415 416 int32_t groupNum = 0; 417 int32_t numDigits = 0; 418 UChar32 nextChar = utext_current32(replacement); 419 if (nextChar == LEFTBRACKET) { 420 // Scan for a Named Capture Group, ${name}. 421 UnicodeString groupName; 422 utext_next32(replacement); 423 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) { 424 nextChar = utext_next32(replacement); 425 if (nextChar == U_SENTINEL) { 426 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 427 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z 428 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z 429 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9 430 groupName.append(nextChar); 431 } else if (nextChar == RIGHTBRACKET) { 432 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName); 433 if (groupNum == 0) { 434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 435 } 436 } else { 437 // Character was something other than a name char or a closing '}' 438 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 439 } 440 } 441 442 } else if (u_isdigit(nextChar)) { 443 // $n Scan for a capture group number 444 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 445 for (;;) { 446 nextChar = UTEXT_CURRENT32(replacement); 447 if (nextChar == U_SENTINEL) { 448 break; 449 } 450 if (u_isdigit(nextChar) == FALSE) { 451 break; 452 } 453 int32_t nextDigitVal = u_charDigitValue(nextChar); 454 if (groupNum*10 + nextDigitVal > numCaptureGroups) { 455 // Don't consume the next digit if it makes the capture group number too big. 456 if (numDigits == 0) { 457 status = U_INDEX_OUTOFBOUNDS_ERROR; 458 } 459 break; 460 } 461 (void)UTEXT_NEXT32(replacement); 462 groupNum=groupNum*10 + nextDigitVal; 463 ++numDigits; 464 } 465 } else { 466 // $ not followed by capture group name or number. 467 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 468 } 469 470 if (U_SUCCESS(status)) { 471 destLen += appendGroup(groupNum, dest, status); 472 } 473 } // End of $ capture group handling 474 } // End of per-character loop through the replacement string. 475 476 return *this; 477 } 478 479 480 481 //-------------------------------------------------------------------------------- 482 // 483 // appendTail Intended to be used in conjunction with appendReplacement() 484 // To the destination string, append everything following 485 // the last match position from the input string. 486 // 487 // Note: Match ranges do not affect appendTail or appendReplacement 488 // 489 //-------------------------------------------------------------------------------- 490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 491 UErrorCode status = U_ZERO_ERROR; 492 UText resultText = UTEXT_INITIALIZER; 493 utext_openUnicodeString(&resultText, &dest, &status); 494 495 if (U_SUCCESS(status)) { 496 appendTail(&resultText, status); 497 utext_close(&resultText); 498 } 499 500 return dest; 501 } 502 503 // 504 // appendTail, UText mode 505 // 506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { 507 if (U_FAILURE(status)) { 508 return dest; 509 } 510 if (U_FAILURE(fDeferredStatus)) { 511 status = fDeferredStatus; 512 return dest; 513 } 514 515 if (fInputLength > fAppendPosition) { 516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 517 int64_t destLen = utext_nativeLength(dest); 518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 519 (int32_t)(fInputLength-fAppendPosition), &status); 520 } else { 521 int32_t len16; 522 if (UTEXT_USES_U16(fInputText)) { 523 len16 = (int32_t)(fInputLength-fAppendPosition); 524 } else { 525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); 526 status = U_ZERO_ERROR; // buffer overflow 527 } 528 529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); 530 if (inputChars == NULL) { 531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 532 } else { 533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 534 int64_t destLen = utext_nativeLength(dest); 535 utext_replace(dest, destLen, destLen, inputChars, len16, &status); 536 uprv_free(inputChars); 537 } 538 } 539 } 540 return dest; 541 } 542 543 544 545 //-------------------------------------------------------------------------------- 546 // 547 // end 548 // 549 //-------------------------------------------------------------------------------- 550 int32_t RegexMatcher::end(UErrorCode &err) const { 551 return end(0, err); 552 } 553 554 int64_t RegexMatcher::end64(UErrorCode &err) const { 555 return end64(0, err); 556 } 557 558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { 559 if (U_FAILURE(err)) { 560 return -1; 561 } 562 if (fMatch == FALSE) { 563 err = U_REGEX_INVALID_STATE; 564 return -1; 565 } 566 if (group < 0 || group > fPattern->fGroupMap->size()) { 567 err = U_INDEX_OUTOFBOUNDS_ERROR; 568 return -1; 569 } 570 int64_t e = -1; 571 if (group == 0) { 572 e = fMatchEnd; 573 } else { 574 // Get the position within the stack frame of the variables for 575 // this capture group. 576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 577 U_ASSERT(groupOffset < fPattern->fFrameSize); 578 U_ASSERT(groupOffset >= 0); 579 e = fFrame->fExtra[groupOffset + 1]; 580 } 581 582 return e; 583 } 584 585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 586 return (int32_t)end64(group, err); 587 } 588 589 //-------------------------------------------------------------------------------- 590 // 591 // findProgressInterrupt This function is called once for each advance in the target 592 // string from the find() function, and calls the user progress callback 593 // function if there is one installed. 594 // 595 // Return: TRUE if the find operation is to be terminated. 596 // FALSE if the find operation is to continue running. 597 // 598 //-------------------------------------------------------------------------------- 599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) { 600 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) { 601 status = U_REGEX_STOPPED_BY_CALLER; 602 return TRUE; 603 } 604 return FALSE; 605 } 606 607 //-------------------------------------------------------------------------------- 608 // 609 // find() 610 // 611 //-------------------------------------------------------------------------------- 612 UBool RegexMatcher::find() { 613 if (U_FAILURE(fDeferredStatus)) { 614 return FALSE; 615 } 616 UErrorCode status = U_ZERO_ERROR; 617 UBool result = find(status); 618 return result; 619 } 620 621 //-------------------------------------------------------------------------------- 622 // 623 // find() 624 // 625 //-------------------------------------------------------------------------------- 626 UBool RegexMatcher::find(UErrorCode &status) { 627 // Start at the position of the last match end. (Will be zero if the 628 // matcher has been reset.) 629 // 630 if (U_FAILURE(status)) { 631 return FALSE; 632 } 633 if (U_FAILURE(fDeferredStatus)) { 634 status = fDeferredStatus; 635 return FALSE; 636 } 637 638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 639 return findUsingChunk(status); 640 } 641 642 int64_t startPos = fMatchEnd; 643 if (startPos==0) { 644 startPos = fActiveStart; 645 } 646 647 if (fMatch) { 648 // Save the position of any previous successful match. 649 fLastMatchEnd = fMatchEnd; 650 651 if (fMatchStart == fMatchEnd) { 652 // Previous match had zero length. Move start position up one position 653 // to avoid sending find() into a loop on zero-length matches. 654 if (startPos >= fActiveLimit) { 655 fMatch = FALSE; 656 fHitEnd = TRUE; 657 return FALSE; 658 } 659 UTEXT_SETNATIVEINDEX(fInputText, startPos); 660 (void)UTEXT_NEXT32(fInputText); 661 startPos = UTEXT_GETNATIVEINDEX(fInputText); 662 } 663 } else { 664 if (fLastMatchEnd >= 0) { 665 // A previous find() failed to match. Don't try again. 666 // (without this test, a pattern with a zero-length match 667 // could match again at the end of an input string.) 668 fHitEnd = TRUE; 669 return FALSE; 670 } 671 } 672 673 674 // Compute the position in the input string beyond which a match can not begin, because 675 // the minimum length match would extend past the end of the input. 676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 677 // Be aware of possible overflows if making changes here. 678 int64_t testStartLimit; 679 if (UTEXT_USES_U16(fInputText)) { 680 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; 681 if (startPos > testStartLimit) { 682 fMatch = FALSE; 683 fHitEnd = TRUE; 684 return FALSE; 685 } 686 } else { 687 // We don't know exactly how long the minimum match length is in native characters. 688 // Treat anything > 0 as 1. 689 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0); 690 } 691 692 UChar32 c; 693 U_ASSERT(startPos >= 0); 694 695 switch (fPattern->fStartType) { 696 case START_NO_INFO: 697 // No optimization was found. 698 // Try a match at each input position. 699 for (;;) { 700 MatchAt(startPos, FALSE, status); 701 if (U_FAILURE(status)) { 702 return FALSE; 703 } 704 if (fMatch) { 705 return TRUE; 706 } 707 if (startPos >= testStartLimit) { 708 fHitEnd = TRUE; 709 return FALSE; 710 } 711 UTEXT_SETNATIVEINDEX(fInputText, startPos); 712 (void)UTEXT_NEXT32(fInputText); 713 startPos = UTEXT_GETNATIVEINDEX(fInputText); 714 // Note that it's perfectly OK for a pattern to have a zero-length 715 // match at the end of a string, so we must make sure that the loop 716 // runs with startPos == testStartLimit the last time through. 717 if (findProgressInterrupt(startPos, status)) 718 return FALSE; 719 } 720 U_ASSERT(FALSE); 721 722 case START_START: 723 // Matches are only possible at the start of the input string 724 // (pattern begins with ^ or \A) 725 if (startPos > fActiveStart) { 726 fMatch = FALSE; 727 return FALSE; 728 } 729 MatchAt(startPos, FALSE, status); 730 if (U_FAILURE(status)) { 731 return FALSE; 732 } 733 return fMatch; 734 735 736 case START_SET: 737 { 738 // Match may start on any char from a pre-computed set. 739 U_ASSERT(fPattern->fMinMatchLen > 0); 740 UTEXT_SETNATIVEINDEX(fInputText, startPos); 741 for (;;) { 742 int64_t pos = startPos; 743 c = UTEXT_NEXT32(fInputText); 744 startPos = UTEXT_GETNATIVEINDEX(fInputText); 745 // c will be -1 (U_SENTINEL) at end of text, in which case we 746 // skip this next block (so we don't have a negative array index) 747 // and handle end of text in the following block. 748 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || 749 (c>=256 && fPattern->fInitialChars->contains(c)))) { 750 MatchAt(pos, FALSE, status); 751 if (U_FAILURE(status)) { 752 return FALSE; 753 } 754 if (fMatch) { 755 return TRUE; 756 } 757 UTEXT_SETNATIVEINDEX(fInputText, pos); 758 } 759 if (startPos > testStartLimit) { 760 fMatch = FALSE; 761 fHitEnd = TRUE; 762 return FALSE; 763 } 764 if (findProgressInterrupt(startPos, status)) 765 return FALSE; 766 } 767 } 768 U_ASSERT(FALSE); 769 770 case START_STRING: 771 case START_CHAR: 772 { 773 // Match starts on exactly one char. 774 U_ASSERT(fPattern->fMinMatchLen > 0); 775 UChar32 theChar = fPattern->fInitialChar; 776 UTEXT_SETNATIVEINDEX(fInputText, startPos); 777 for (;;) { 778 int64_t pos = startPos; 779 c = UTEXT_NEXT32(fInputText); 780 startPos = UTEXT_GETNATIVEINDEX(fInputText); 781 if (c == theChar) { 782 MatchAt(pos, FALSE, status); 783 if (U_FAILURE(status)) { 784 return FALSE; 785 } 786 if (fMatch) { 787 return TRUE; 788 } 789 UTEXT_SETNATIVEINDEX(fInputText, startPos); 790 } 791 if (startPos > testStartLimit) { 792 fMatch = FALSE; 793 fHitEnd = TRUE; 794 return FALSE; 795 } 796 if (findProgressInterrupt(startPos, status)) 797 return FALSE; 798 } 799 } 800 U_ASSERT(FALSE); 801 802 case START_LINE: 803 { 804 UChar32 ch; 805 if (startPos == fAnchorStart) { 806 MatchAt(startPos, FALSE, status); 807 if (U_FAILURE(status)) { 808 return FALSE; 809 } 810 if (fMatch) { 811 return TRUE; 812 } 813 UTEXT_SETNATIVEINDEX(fInputText, startPos); 814 ch = UTEXT_NEXT32(fInputText); 815 startPos = UTEXT_GETNATIVEINDEX(fInputText); 816 } else { 817 UTEXT_SETNATIVEINDEX(fInputText, startPos); 818 ch = UTEXT_PREVIOUS32(fInputText); 819 UTEXT_SETNATIVEINDEX(fInputText, startPos); 820 } 821 822 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 823 for (;;) { 824 if (ch == 0x0a) { 825 MatchAt(startPos, FALSE, status); 826 if (U_FAILURE(status)) { 827 return FALSE; 828 } 829 if (fMatch) { 830 return TRUE; 831 } 832 UTEXT_SETNATIVEINDEX(fInputText, startPos); 833 } 834 if (startPos >= testStartLimit) { 835 fMatch = FALSE; 836 fHitEnd = TRUE; 837 return FALSE; 838 } 839 ch = UTEXT_NEXT32(fInputText); 840 startPos = UTEXT_GETNATIVEINDEX(fInputText); 841 // Note that it's perfectly OK for a pattern to have a zero-length 842 // match at the end of a string, so we must make sure that the loop 843 // runs with startPos == testStartLimit the last time through. 844 if (findProgressInterrupt(startPos, status)) 845 return FALSE; 846 } 847 } else { 848 for (;;) { 849 if (isLineTerminator(ch)) { 850 if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 851 (void)UTEXT_NEXT32(fInputText); 852 startPos = UTEXT_GETNATIVEINDEX(fInputText); 853 } 854 MatchAt(startPos, FALSE, status); 855 if (U_FAILURE(status)) { 856 return FALSE; 857 } 858 if (fMatch) { 859 return TRUE; 860 } 861 UTEXT_SETNATIVEINDEX(fInputText, startPos); 862 } 863 if (startPos >= testStartLimit) { 864 fMatch = FALSE; 865 fHitEnd = TRUE; 866 return FALSE; 867 } 868 ch = UTEXT_NEXT32(fInputText); 869 startPos = UTEXT_GETNATIVEINDEX(fInputText); 870 // Note that it's perfectly OK for a pattern to have a zero-length 871 // match at the end of a string, so we must make sure that the loop 872 // runs with startPos == testStartLimit the last time through. 873 if (findProgressInterrupt(startPos, status)) 874 return FALSE; 875 } 876 } 877 } 878 879 default: 880 U_ASSERT(FALSE); 881 } 882 883 U_ASSERT(FALSE); 884 return FALSE; 885 } 886 887 888 889 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { 890 if (U_FAILURE(status)) { 891 return FALSE; 892 } 893 if (U_FAILURE(fDeferredStatus)) { 894 status = fDeferredStatus; 895 return FALSE; 896 } 897 this->reset(); // Note: Reset() is specified by Java Matcher documentation. 898 // This will reset the region to be the full input length. 899 if (start < 0) { 900 status = U_INDEX_OUTOFBOUNDS_ERROR; 901 return FALSE; 902 } 903 904 int64_t nativeStart = start; 905 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 906 status = U_INDEX_OUTOFBOUNDS_ERROR; 907 return FALSE; 908 } 909 fMatchEnd = nativeStart; 910 return find(status); 911 } 912 913 914 //-------------------------------------------------------------------------------- 915 // 916 // findUsingChunk() -- like find(), but with the advance knowledge that the 917 // entire string is available in the UText's chunk buffer. 918 // 919 //-------------------------------------------------------------------------------- 920 UBool RegexMatcher::findUsingChunk(UErrorCode &status) { 921 // Start at the position of the last match end. (Will be zero if the 922 // matcher has been reset. 923 // 924 925 int32_t startPos = (int32_t)fMatchEnd; 926 if (startPos==0) { 927 startPos = (int32_t)fActiveStart; 928 } 929 930 const UChar *inputBuf = fInputText->chunkContents; 931 932 if (fMatch) { 933 // Save the position of any previous successful match. 934 fLastMatchEnd = fMatchEnd; 935 936 if (fMatchStart == fMatchEnd) { 937 // Previous match had zero length. Move start position up one position 938 // to avoid sending find() into a loop on zero-length matches. 939 if (startPos >= fActiveLimit) { 940 fMatch = FALSE; 941 fHitEnd = TRUE; 942 return FALSE; 943 } 944 U16_FWD_1(inputBuf, startPos, fInputLength); 945 } 946 } else { 947 if (fLastMatchEnd >= 0) { 948 // A previous find() failed to match. Don't try again. 949 // (without this test, a pattern with a zero-length match 950 // could match again at the end of an input string.) 951 fHitEnd = TRUE; 952 return FALSE; 953 } 954 } 955 956 957 // Compute the position in the input string beyond which a match can not begin, because 958 // the minimum length match would extend past the end of the input. 959 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 960 // Be aware of possible overflows if making changes here. 961 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit. 962 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); 963 if (startPos > testLen) { 964 fMatch = FALSE; 965 fHitEnd = TRUE; 966 return FALSE; 967 } 968 969 UChar32 c; 970 U_ASSERT(startPos >= 0); 971 972 switch (fPattern->fStartType) { 973 case START_NO_INFO: 974 // No optimization was found. 975 // Try a match at each input position. 976 for (;;) { 977 MatchChunkAt(startPos, FALSE, status); 978 if (U_FAILURE(status)) { 979 return FALSE; 980 } 981 if (fMatch) { 982 return TRUE; 983 } 984 if (startPos >= testLen) { 985 fHitEnd = TRUE; 986 return FALSE; 987 } 988 U16_FWD_1(inputBuf, startPos, fActiveLimit); 989 // Note that it's perfectly OK for a pattern to have a zero-length 990 // match at the end of a string, so we must make sure that the loop 991 // runs with startPos == testLen the last time through. 992 if (findProgressInterrupt(startPos, status)) 993 return FALSE; 994 } 995 U_ASSERT(FALSE); 996 997 case START_START: 998 // Matches are only possible at the start of the input string 999 // (pattern begins with ^ or \A) 1000 if (startPos > fActiveStart) { 1001 fMatch = FALSE; 1002 return FALSE; 1003 } 1004 MatchChunkAt(startPos, FALSE, status); 1005 if (U_FAILURE(status)) { 1006 return FALSE; 1007 } 1008 return fMatch; 1009 1010 1011 case START_SET: 1012 { 1013 // Match may start on any char from a pre-computed set. 1014 U_ASSERT(fPattern->fMinMatchLen > 0); 1015 for (;;) { 1016 int32_t pos = startPos; 1017 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1018 if ((c<256 && fPattern->fInitialChars8->contains(c)) || 1019 (c>=256 && fPattern->fInitialChars->contains(c))) { 1020 MatchChunkAt(pos, FALSE, status); 1021 if (U_FAILURE(status)) { 1022 return FALSE; 1023 } 1024 if (fMatch) { 1025 return TRUE; 1026 } 1027 } 1028 if (startPos > testLen) { 1029 fMatch = FALSE; 1030 fHitEnd = TRUE; 1031 return FALSE; 1032 } 1033 if (findProgressInterrupt(startPos, status)) 1034 return FALSE; 1035 } 1036 } 1037 U_ASSERT(FALSE); 1038 1039 case START_STRING: 1040 case START_CHAR: 1041 { 1042 // Match starts on exactly one char. 1043 U_ASSERT(fPattern->fMinMatchLen > 0); 1044 UChar32 theChar = fPattern->fInitialChar; 1045 for (;;) { 1046 int32_t pos = startPos; 1047 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1048 if (c == theChar) { 1049 MatchChunkAt(pos, FALSE, status); 1050 if (U_FAILURE(status)) { 1051 return FALSE; 1052 } 1053 if (fMatch) { 1054 return TRUE; 1055 } 1056 } 1057 if (startPos > testLen) { 1058 fMatch = FALSE; 1059 fHitEnd = TRUE; 1060 return FALSE; 1061 } 1062 if (findProgressInterrupt(startPos, status)) 1063 return FALSE; 1064 } 1065 } 1066 U_ASSERT(FALSE); 1067 1068 case START_LINE: 1069 { 1070 UChar32 ch; 1071 if (startPos == fAnchorStart) { 1072 MatchChunkAt(startPos, FALSE, status); 1073 if (U_FAILURE(status)) { 1074 return FALSE; 1075 } 1076 if (fMatch) { 1077 return TRUE; 1078 } 1079 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1080 } 1081 1082 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1083 for (;;) { 1084 ch = inputBuf[startPos-1]; 1085 if (ch == 0x0a) { 1086 MatchChunkAt(startPos, FALSE, status); 1087 if (U_FAILURE(status)) { 1088 return FALSE; 1089 } 1090 if (fMatch) { 1091 return TRUE; 1092 } 1093 } 1094 if (startPos >= testLen) { 1095 fMatch = FALSE; 1096 fHitEnd = TRUE; 1097 return FALSE; 1098 } 1099 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1100 // Note that it's perfectly OK for a pattern to have a zero-length 1101 // match at the end of a string, so we must make sure that the loop 1102 // runs with startPos == testLen the last time through. 1103 if (findProgressInterrupt(startPos, status)) 1104 return FALSE; 1105 } 1106 } else { 1107 for (;;) { 1108 ch = inputBuf[startPos-1]; 1109 if (isLineTerminator(ch)) { 1110 if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { 1111 startPos++; 1112 } 1113 MatchChunkAt(startPos, FALSE, status); 1114 if (U_FAILURE(status)) { 1115 return FALSE; 1116 } 1117 if (fMatch) { 1118 return TRUE; 1119 } 1120 } 1121 if (startPos >= testLen) { 1122 fMatch = FALSE; 1123 fHitEnd = TRUE; 1124 return FALSE; 1125 } 1126 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1127 // Note that it's perfectly OK for a pattern to have a zero-length 1128 // match at the end of a string, so we must make sure that the loop 1129 // runs with startPos == testLen the last time through. 1130 if (findProgressInterrupt(startPos, status)) 1131 return FALSE; 1132 } 1133 } 1134 } 1135 1136 default: 1137 U_ASSERT(FALSE); 1138 } 1139 1140 U_ASSERT(FALSE); 1141 return FALSE; 1142 } 1143 1144 1145 1146 //-------------------------------------------------------------------------------- 1147 // 1148 // group() 1149 // 1150 //-------------------------------------------------------------------------------- 1151 UnicodeString RegexMatcher::group(UErrorCode &status) const { 1152 return group(0, status); 1153 } 1154 1155 // Return immutable shallow clone 1156 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { 1157 return group(0, dest, group_len, status); 1158 } 1159 1160 // Return immutable shallow clone 1161 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { 1162 group_len = 0; 1163 if (U_FAILURE(status)) { 1164 return dest; 1165 } 1166 if (U_FAILURE(fDeferredStatus)) { 1167 status = fDeferredStatus; 1168 } else if (fMatch == FALSE) { 1169 status = U_REGEX_INVALID_STATE; 1170 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1171 status = U_INDEX_OUTOFBOUNDS_ERROR; 1172 } 1173 1174 if (U_FAILURE(status)) { 1175 return dest; 1176 } 1177 1178 int64_t s, e; 1179 if (groupNum == 0) { 1180 s = fMatchStart; 1181 e = fMatchEnd; 1182 } else { 1183 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1184 U_ASSERT(groupOffset < fPattern->fFrameSize); 1185 U_ASSERT(groupOffset >= 0); 1186 s = fFrame->fExtra[groupOffset]; 1187 e = fFrame->fExtra[groupOffset+1]; 1188 } 1189 1190 if (s < 0) { 1191 // A capture group wasn't part of the match 1192 return utext_clone(dest, fInputText, FALSE, TRUE, &status); 1193 } 1194 U_ASSERT(s <= e); 1195 group_len = e - s; 1196 1197 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); 1198 if (dest) 1199 UTEXT_SETNATIVEINDEX(dest, s); 1200 return dest; 1201 } 1202 1203 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1204 UnicodeString result; 1205 int64_t groupStart = start64(groupNum, status); 1206 int64_t groupEnd = end64(groupNum, status); 1207 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) { 1208 return result; 1209 } 1210 1211 // Get the group length using a utext_extract preflight. 1212 // UText is actually pretty efficient at this when underlying encoding is UTF-16. 1213 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status); 1214 if (status != U_BUFFER_OVERFLOW_ERROR) { 1215 return result; 1216 } 1217 1218 status = U_ZERO_ERROR; 1219 UChar *buf = result.getBuffer(length); 1220 if (buf == NULL) { 1221 status = U_MEMORY_ALLOCATION_ERROR; 1222 } else { 1223 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status); 1224 result.releaseBuffer(extractLength); 1225 U_ASSERT(length == extractLength); 1226 } 1227 return result; 1228 } 1229 1230 1231 //-------------------------------------------------------------------------------- 1232 // 1233 // appendGroup() -- currently internal only, appends a group to a UText rather 1234 // than replacing its contents 1235 // 1236 //-------------------------------------------------------------------------------- 1237 1238 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { 1239 if (U_FAILURE(status)) { 1240 return 0; 1241 } 1242 if (U_FAILURE(fDeferredStatus)) { 1243 status = fDeferredStatus; 1244 return 0; 1245 } 1246 int64_t destLen = utext_nativeLength(dest); 1247 1248 if (fMatch == FALSE) { 1249 status = U_REGEX_INVALID_STATE; 1250 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1251 } 1252 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1253 status = U_INDEX_OUTOFBOUNDS_ERROR; 1254 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1255 } 1256 1257 int64_t s, e; 1258 if (groupNum == 0) { 1259 s = fMatchStart; 1260 e = fMatchEnd; 1261 } else { 1262 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1263 U_ASSERT(groupOffset < fPattern->fFrameSize); 1264 U_ASSERT(groupOffset >= 0); 1265 s = fFrame->fExtra[groupOffset]; 1266 e = fFrame->fExtra[groupOffset+1]; 1267 } 1268 1269 if (s < 0) { 1270 // A capture group wasn't part of the match 1271 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1272 } 1273 U_ASSERT(s <= e); 1274 1275 int64_t deltaLen; 1276 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1277 U_ASSERT(e <= fInputLength); 1278 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); 1279 } else { 1280 int32_t len16; 1281 if (UTEXT_USES_U16(fInputText)) { 1282 len16 = (int32_t)(e-s); 1283 } else { 1284 UErrorCode lengthStatus = U_ZERO_ERROR; 1285 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1286 } 1287 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1288 if (groupChars == NULL) { 1289 status = U_MEMORY_ALLOCATION_ERROR; 1290 return 0; 1291 } 1292 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1293 1294 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); 1295 uprv_free(groupChars); 1296 } 1297 return deltaLen; 1298 } 1299 1300 1301 1302 //-------------------------------------------------------------------------------- 1303 // 1304 // groupCount() 1305 // 1306 //-------------------------------------------------------------------------------- 1307 int32_t RegexMatcher::groupCount() const { 1308 return fPattern->fGroupMap->size(); 1309 } 1310 1311 //-------------------------------------------------------------------------------- 1312 // 1313 // hasAnchoringBounds() 1314 // 1315 //-------------------------------------------------------------------------------- 1316 UBool RegexMatcher::hasAnchoringBounds() const { 1317 return fAnchoringBounds; 1318 } 1319 1320 1321 //-------------------------------------------------------------------------------- 1322 // 1323 // hasTransparentBounds() 1324 // 1325 //-------------------------------------------------------------------------------- 1326 UBool RegexMatcher::hasTransparentBounds() const { 1327 return fTransparentBounds; 1328 } 1329 1330 1331 1332 //-------------------------------------------------------------------------------- 1333 // 1334 // hitEnd() 1335 // 1336 //-------------------------------------------------------------------------------- 1337 UBool RegexMatcher::hitEnd() const { 1338 return fHitEnd; 1339 } 1340 1341 1342 //-------------------------------------------------------------------------------- 1343 // 1344 // input() 1345 // 1346 //-------------------------------------------------------------------------------- 1347 const UnicodeString &RegexMatcher::input() const { 1348 if (!fInput) { 1349 UErrorCode status = U_ZERO_ERROR; 1350 int32_t len16; 1351 if (UTEXT_USES_U16(fInputText)) { 1352 len16 = (int32_t)fInputLength; 1353 } else { 1354 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); 1355 status = U_ZERO_ERROR; // overflow, length status 1356 } 1357 UnicodeString *result = new UnicodeString(len16, 0, 0); 1358 1359 UChar *inputChars = result->getBuffer(len16); 1360 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning 1361 result->releaseBuffer(len16); 1362 1363 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= 1364 } 1365 1366 return *fInput; 1367 } 1368 1369 //-------------------------------------------------------------------------------- 1370 // 1371 // inputText() 1372 // 1373 //-------------------------------------------------------------------------------- 1374 UText *RegexMatcher::inputText() const { 1375 return fInputText; 1376 } 1377 1378 1379 //-------------------------------------------------------------------------------- 1380 // 1381 // getInput() -- like inputText(), but makes a clone or copies into another UText 1382 // 1383 //-------------------------------------------------------------------------------- 1384 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { 1385 if (U_FAILURE(status)) { 1386 return dest; 1387 } 1388 if (U_FAILURE(fDeferredStatus)) { 1389 status = fDeferredStatus; 1390 return dest; 1391 } 1392 1393 if (dest) { 1394 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1395 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); 1396 } else { 1397 int32_t input16Len; 1398 if (UTEXT_USES_U16(fInputText)) { 1399 input16Len = (int32_t)fInputLength; 1400 } else { 1401 UErrorCode lengthStatus = U_ZERO_ERROR; 1402 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error 1403 } 1404 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); 1405 if (inputChars == NULL) { 1406 return dest; 1407 } 1408 1409 status = U_ZERO_ERROR; 1410 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning 1411 status = U_ZERO_ERROR; 1412 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); 1413 1414 uprv_free(inputChars); 1415 } 1416 return dest; 1417 } else { 1418 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1419 } 1420 } 1421 1422 1423 static UBool compat_SyncMutableUTextContents(UText *ut); 1424 static UBool compat_SyncMutableUTextContents(UText *ut) { 1425 UBool retVal = FALSE; 1426 1427 // In the following test, we're really only interested in whether the UText should switch 1428 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents 1429 // will still point to the correct data. 1430 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { 1431 UnicodeString *us=(UnicodeString *)ut->context; 1432 1433 // Update to the latest length. 1434 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). 1435 int32_t newLength = us->length(); 1436 1437 // Update the chunk description. 1438 // The buffer may have switched between stack- and heap-based. 1439 ut->chunkContents = us->getBuffer(); 1440 ut->chunkLength = newLength; 1441 ut->chunkNativeLimit = newLength; 1442 ut->nativeIndexingLimit = newLength; 1443 retVal = TRUE; 1444 } 1445 1446 return retVal; 1447 } 1448 1449 //-------------------------------------------------------------------------------- 1450 // 1451 // lookingAt() 1452 // 1453 //-------------------------------------------------------------------------------- 1454 UBool RegexMatcher::lookingAt(UErrorCode &status) { 1455 if (U_FAILURE(status)) { 1456 return FALSE; 1457 } 1458 if (U_FAILURE(fDeferredStatus)) { 1459 status = fDeferredStatus; 1460 return FALSE; 1461 } 1462 1463 if (fInputUniStrMaybeMutable) { 1464 if (compat_SyncMutableUTextContents(fInputText)) { 1465 fInputLength = utext_nativeLength(fInputText); 1466 reset(); 1467 } 1468 } 1469 else { 1470 resetPreserveRegion(); 1471 } 1472 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1473 MatchChunkAt((int32_t)fActiveStart, FALSE, status); 1474 } else { 1475 MatchAt(fActiveStart, FALSE, status); 1476 } 1477 return fMatch; 1478 } 1479 1480 1481 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { 1482 if (U_FAILURE(status)) { 1483 return FALSE; 1484 } 1485 if (U_FAILURE(fDeferredStatus)) { 1486 status = fDeferredStatus; 1487 return FALSE; 1488 } 1489 reset(); 1490 1491 if (start < 0) { 1492 status = U_INDEX_OUTOFBOUNDS_ERROR; 1493 return FALSE; 1494 } 1495 1496 if (fInputUniStrMaybeMutable) { 1497 if (compat_SyncMutableUTextContents(fInputText)) { 1498 fInputLength = utext_nativeLength(fInputText); 1499 reset(); 1500 } 1501 } 1502 1503 int64_t nativeStart; 1504 nativeStart = start; 1505 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1506 status = U_INDEX_OUTOFBOUNDS_ERROR; 1507 return FALSE; 1508 } 1509 1510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1511 MatchChunkAt((int32_t)nativeStart, FALSE, status); 1512 } else { 1513 MatchAt(nativeStart, FALSE, status); 1514 } 1515 return fMatch; 1516 } 1517 1518 1519 1520 //-------------------------------------------------------------------------------- 1521 // 1522 // matches() 1523 // 1524 //-------------------------------------------------------------------------------- 1525 UBool RegexMatcher::matches(UErrorCode &status) { 1526 if (U_FAILURE(status)) { 1527 return FALSE; 1528 } 1529 if (U_FAILURE(fDeferredStatus)) { 1530 status = fDeferredStatus; 1531 return FALSE; 1532 } 1533 1534 if (fInputUniStrMaybeMutable) { 1535 if (compat_SyncMutableUTextContents(fInputText)) { 1536 fInputLength = utext_nativeLength(fInputText); 1537 reset(); 1538 } 1539 } 1540 else { 1541 resetPreserveRegion(); 1542 } 1543 1544 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1545 MatchChunkAt((int32_t)fActiveStart, TRUE, status); 1546 } else { 1547 MatchAt(fActiveStart, TRUE, status); 1548 } 1549 return fMatch; 1550 } 1551 1552 1553 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { 1554 if (U_FAILURE(status)) { 1555 return FALSE; 1556 } 1557 if (U_FAILURE(fDeferredStatus)) { 1558 status = fDeferredStatus; 1559 return FALSE; 1560 } 1561 reset(); 1562 1563 if (start < 0) { 1564 status = U_INDEX_OUTOFBOUNDS_ERROR; 1565 return FALSE; 1566 } 1567 1568 if (fInputUniStrMaybeMutable) { 1569 if (compat_SyncMutableUTextContents(fInputText)) { 1570 fInputLength = utext_nativeLength(fInputText); 1571 reset(); 1572 } 1573 } 1574 1575 int64_t nativeStart; 1576 nativeStart = start; 1577 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1578 status = U_INDEX_OUTOFBOUNDS_ERROR; 1579 return FALSE; 1580 } 1581 1582 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1583 MatchChunkAt((int32_t)nativeStart, TRUE, status); 1584 } else { 1585 MatchAt(nativeStart, TRUE, status); 1586 } 1587 return fMatch; 1588 } 1589 1590 1591 1592 //-------------------------------------------------------------------------------- 1593 // 1594 // pattern 1595 // 1596 //-------------------------------------------------------------------------------- 1597 const RegexPattern &RegexMatcher::pattern() const { 1598 return *fPattern; 1599 } 1600 1601 1602 1603 //-------------------------------------------------------------------------------- 1604 // 1605 // region 1606 // 1607 //-------------------------------------------------------------------------------- 1608 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { 1609 if (U_FAILURE(status)) { 1610 return *this; 1611 } 1612 1613 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { 1614 status = U_ILLEGAL_ARGUMENT_ERROR; 1615 } 1616 1617 int64_t nativeStart = regionStart; 1618 int64_t nativeLimit = regionLimit; 1619 if (nativeStart > fInputLength || nativeLimit > fInputLength) { 1620 status = U_ILLEGAL_ARGUMENT_ERROR; 1621 } 1622 1623 if (startIndex == -1) 1624 this->reset(); 1625 else 1626 resetPreserveRegion(); 1627 1628 fRegionStart = nativeStart; 1629 fRegionLimit = nativeLimit; 1630 fActiveStart = nativeStart; 1631 fActiveLimit = nativeLimit; 1632 1633 if (startIndex != -1) { 1634 if (startIndex < fActiveStart || startIndex > fActiveLimit) { 1635 status = U_INDEX_OUTOFBOUNDS_ERROR; 1636 } 1637 fMatchEnd = startIndex; 1638 } 1639 1640 if (!fTransparentBounds) { 1641 fLookStart = nativeStart; 1642 fLookLimit = nativeLimit; 1643 } 1644 if (fAnchoringBounds) { 1645 fAnchorStart = nativeStart; 1646 fAnchorLimit = nativeLimit; 1647 } 1648 return *this; 1649 } 1650 1651 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { 1652 return region(start, limit, -1, status); 1653 } 1654 1655 //-------------------------------------------------------------------------------- 1656 // 1657 // regionEnd 1658 // 1659 //-------------------------------------------------------------------------------- 1660 int32_t RegexMatcher::regionEnd() const { 1661 return (int32_t)fRegionLimit; 1662 } 1663 1664 int64_t RegexMatcher::regionEnd64() const { 1665 return fRegionLimit; 1666 } 1667 1668 //-------------------------------------------------------------------------------- 1669 // 1670 // regionStart 1671 // 1672 //-------------------------------------------------------------------------------- 1673 int32_t RegexMatcher::regionStart() const { 1674 return (int32_t)fRegionStart; 1675 } 1676 1677 int64_t RegexMatcher::regionStart64() const { 1678 return fRegionStart; 1679 } 1680 1681 1682 //-------------------------------------------------------------------------------- 1683 // 1684 // replaceAll 1685 // 1686 //-------------------------------------------------------------------------------- 1687 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { 1688 UText replacementText = UTEXT_INITIALIZER; 1689 UText resultText = UTEXT_INITIALIZER; 1690 UnicodeString resultString; 1691 if (U_FAILURE(status)) { 1692 return resultString; 1693 } 1694 1695 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1696 utext_openUnicodeString(&resultText, &resultString, &status); 1697 1698 replaceAll(&replacementText, &resultText, status); 1699 1700 utext_close(&resultText); 1701 utext_close(&replacementText); 1702 1703 return resultString; 1704 } 1705 1706 1707 // 1708 // replaceAll, UText mode 1709 // 1710 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { 1711 if (U_FAILURE(status)) { 1712 return dest; 1713 } 1714 if (U_FAILURE(fDeferredStatus)) { 1715 status = fDeferredStatus; 1716 return dest; 1717 } 1718 1719 if (dest == NULL) { 1720 UnicodeString emptyString; 1721 UText empty = UTEXT_INITIALIZER; 1722 1723 utext_openUnicodeString(&empty, &emptyString, &status); 1724 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1725 utext_close(&empty); 1726 } 1727 1728 if (U_SUCCESS(status)) { 1729 reset(); 1730 while (find()) { 1731 appendReplacement(dest, replacement, status); 1732 if (U_FAILURE(status)) { 1733 break; 1734 } 1735 } 1736 appendTail(dest, status); 1737 } 1738 1739 return dest; 1740 } 1741 1742 1743 //-------------------------------------------------------------------------------- 1744 // 1745 // replaceFirst 1746 // 1747 //-------------------------------------------------------------------------------- 1748 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { 1749 UText replacementText = UTEXT_INITIALIZER; 1750 UText resultText = UTEXT_INITIALIZER; 1751 UnicodeString resultString; 1752 1753 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1754 utext_openUnicodeString(&resultText, &resultString, &status); 1755 1756 replaceFirst(&replacementText, &resultText, status); 1757 1758 utext_close(&resultText); 1759 utext_close(&replacementText); 1760 1761 return resultString; 1762 } 1763 1764 // 1765 // replaceFirst, UText mode 1766 // 1767 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { 1768 if (U_FAILURE(status)) { 1769 return dest; 1770 } 1771 if (U_FAILURE(fDeferredStatus)) { 1772 status = fDeferredStatus; 1773 return dest; 1774 } 1775 1776 reset(); 1777 if (!find()) { 1778 return getInput(dest, status); 1779 } 1780 1781 if (dest == NULL) { 1782 UnicodeString emptyString; 1783 UText empty = UTEXT_INITIALIZER; 1784 1785 utext_openUnicodeString(&empty, &emptyString, &status); 1786 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1787 utext_close(&empty); 1788 } 1789 1790 appendReplacement(dest, replacement, status); 1791 appendTail(dest, status); 1792 1793 return dest; 1794 } 1795 1796 1797 //-------------------------------------------------------------------------------- 1798 // 1799 // requireEnd 1800 // 1801 //-------------------------------------------------------------------------------- 1802 UBool RegexMatcher::requireEnd() const { 1803 return fRequireEnd; 1804 } 1805 1806 1807 //-------------------------------------------------------------------------------- 1808 // 1809 // reset 1810 // 1811 //-------------------------------------------------------------------------------- 1812 RegexMatcher &RegexMatcher::reset() { 1813 fRegionStart = 0; 1814 fRegionLimit = fInputLength; 1815 fActiveStart = 0; 1816 fActiveLimit = fInputLength; 1817 fAnchorStart = 0; 1818 fAnchorLimit = fInputLength; 1819 fLookStart = 0; 1820 fLookLimit = fInputLength; 1821 resetPreserveRegion(); 1822 return *this; 1823 } 1824 1825 1826 1827 void RegexMatcher::resetPreserveRegion() { 1828 fMatchStart = 0; 1829 fMatchEnd = 0; 1830 fLastMatchEnd = -1; 1831 fAppendPosition = 0; 1832 fMatch = FALSE; 1833 fHitEnd = FALSE; 1834 fRequireEnd = FALSE; 1835 fTime = 0; 1836 fTickCounter = TIMER_INITIAL_VALUE; 1837 //resetStack(); // more expensive than it looks... 1838 } 1839 1840 1841 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1842 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); 1843 if (fPattern->fNeedsAltInput) { 1844 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1845 } 1846 if (U_FAILURE(fDeferredStatus)) { 1847 return *this; 1848 } 1849 fInputLength = utext_nativeLength(fInputText); 1850 1851 reset(); 1852 delete fInput; 1853 fInput = NULL; 1854 1855 // Do the following for any UnicodeString. 1856 // This is for compatibility for those clients who modify the input string "live" during regex operations. 1857 fInputUniStrMaybeMutable = TRUE; 1858 1859 if (fWordBreakItr != NULL) { 1860 #if UCONFIG_NO_BREAK_ITERATION==0 1861 UErrorCode status = U_ZERO_ERROR; 1862 fWordBreakItr->setText(fInputText, status); 1863 #endif 1864 } 1865 return *this; 1866 } 1867 1868 1869 RegexMatcher &RegexMatcher::reset(UText *input) { 1870 if (fInputText != input) { 1871 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); 1872 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1873 if (U_FAILURE(fDeferredStatus)) { 1874 return *this; 1875 } 1876 fInputLength = utext_nativeLength(fInputText); 1877 1878 delete fInput; 1879 fInput = NULL; 1880 1881 if (fWordBreakItr != NULL) { 1882 #if UCONFIG_NO_BREAK_ITERATION==0 1883 UErrorCode status = U_ZERO_ERROR; 1884 fWordBreakItr->setText(input, status); 1885 #endif 1886 } 1887 } 1888 reset(); 1889 fInputUniStrMaybeMutable = FALSE; 1890 1891 return *this; 1892 } 1893 1894 /*RegexMatcher &RegexMatcher::reset(const UChar *) { 1895 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 1896 return *this; 1897 }*/ 1898 1899 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { 1900 if (U_FAILURE(status)) { 1901 return *this; 1902 } 1903 reset(); // Reset also resets the region to be the entire string. 1904 1905 if (position < 0 || position > fActiveLimit) { 1906 status = U_INDEX_OUTOFBOUNDS_ERROR; 1907 return *this; 1908 } 1909 fMatchEnd = position; 1910 return *this; 1911 } 1912 1913 1914 //-------------------------------------------------------------------------------- 1915 // 1916 // refresh 1917 // 1918 //-------------------------------------------------------------------------------- 1919 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { 1920 if (U_FAILURE(status)) { 1921 return *this; 1922 } 1923 if (input == NULL) { 1924 status = U_ILLEGAL_ARGUMENT_ERROR; 1925 return *this; 1926 } 1927 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { 1928 status = U_ILLEGAL_ARGUMENT_ERROR; 1929 return *this; 1930 } 1931 int64_t pos = utext_getNativeIndex(fInputText); 1932 // Shallow read-only clone of the new UText into the existing input UText 1933 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status); 1934 if (U_FAILURE(status)) { 1935 return *this; 1936 } 1937 utext_setNativeIndex(fInputText, pos); 1938 1939 if (fAltInputText != NULL) { 1940 pos = utext_getNativeIndex(fAltInputText); 1941 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status); 1942 if (U_FAILURE(status)) { 1943 return *this; 1944 } 1945 utext_setNativeIndex(fAltInputText, pos); 1946 } 1947 return *this; 1948 } 1949 1950 1951 1952 //-------------------------------------------------------------------------------- 1953 // 1954 // setTrace 1955 // 1956 //-------------------------------------------------------------------------------- 1957 void RegexMatcher::setTrace(UBool state) { 1958 fTraceDebug = state; 1959 } 1960 1961 1962 1963 /** 1964 * UText, replace entire contents of the destination UText with a substring of the source UText. 1965 * 1966 * @param src The source UText 1967 * @param dest The destination UText. Must be writable. 1968 * May be NULL, in which case a new UText will be allocated. 1969 * @param start Start index of source substring. 1970 * @param limit Limit index of source substring. 1971 * @param status An error code. 1972 */ 1973 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) { 1974 if (U_FAILURE(*status)) { 1975 return dest; 1976 } 1977 if (start == limit) { 1978 if (dest) { 1979 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status); 1980 return dest; 1981 } else { 1982 return utext_openUChars(NULL, NULL, 0, status); 1983 } 1984 } 1985 int32_t length = utext_extract(src, start, limit, NULL, 0, status); 1986 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { 1987 return dest; 1988 } 1989 *status = U_ZERO_ERROR; 1990 MaybeStackArray<UChar, 40> buffer; 1991 if (length >= buffer.getCapacity()) { 1992 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul. 1993 if (newBuf == NULL) { 1994 *status = U_MEMORY_ALLOCATION_ERROR; 1995 } 1996 } 1997 utext_extract(src, start, limit, buffer.getAlias(), length+1, status); 1998 if (dest) { 1999 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status); 2000 return dest; 2001 } 2002 2003 // Caller did not provide a prexisting UText. 2004 // Open a new one, and have it adopt the text buffer storage. 2005 if (U_FAILURE(*status)) { 2006 return NULL; 2007 } 2008 int32_t ownedLength = 0; 2009 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength); 2010 if (ownedBuf == NULL) { 2011 *status = U_MEMORY_ALLOCATION_ERROR; 2012 return NULL; 2013 } 2014 UText *result = utext_openUChars(NULL, ownedBuf, length, status); 2015 if (U_FAILURE(*status)) { 2016 uprv_free(ownedBuf); 2017 return NULL; 2018 } 2019 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT); 2020 return result; 2021 } 2022 2023 2024 //--------------------------------------------------------------------- 2025 // 2026 // split 2027 // 2028 //--------------------------------------------------------------------- 2029 int32_t RegexMatcher::split(const UnicodeString &input, 2030 UnicodeString dest[], 2031 int32_t destCapacity, 2032 UErrorCode &status) 2033 { 2034 UText inputText = UTEXT_INITIALIZER; 2035 utext_openConstUnicodeString(&inputText, &input, &status); 2036 if (U_FAILURE(status)) { 2037 return 0; 2038 } 2039 2040 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); 2041 if (destText == NULL) { 2042 status = U_MEMORY_ALLOCATION_ERROR; 2043 return 0; 2044 } 2045 int32_t i; 2046 for (i = 0; i < destCapacity; i++) { 2047 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); 2048 } 2049 2050 int32_t fieldCount = split(&inputText, destText, destCapacity, status); 2051 2052 for (i = 0; i < destCapacity; i++) { 2053 utext_close(destText[i]); 2054 } 2055 2056 uprv_free(destText); 2057 utext_close(&inputText); 2058 return fieldCount; 2059 } 2060 2061 // 2062 // split, UText mode 2063 // 2064 int32_t RegexMatcher::split(UText *input, 2065 UText *dest[], 2066 int32_t destCapacity, 2067 UErrorCode &status) 2068 { 2069 // 2070 // Check arguements for validity 2071 // 2072 if (U_FAILURE(status)) { 2073 return 0; 2074 }; 2075 2076 if (destCapacity < 1) { 2077 status = U_ILLEGAL_ARGUMENT_ERROR; 2078 return 0; 2079 } 2080 2081 // 2082 // Reset for the input text 2083 // 2084 reset(input); 2085 int64_t nextOutputStringStart = 0; 2086 if (fActiveLimit == 0) { 2087 return 0; 2088 } 2089 2090 // 2091 // Loop through the input text, searching for the delimiter pattern 2092 // 2093 int32_t i; 2094 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 2095 for (i=0; ; i++) { 2096 if (i>=destCapacity-1) { 2097 // There is one or zero output string left. 2098 // Fill the last output string with whatever is left from the input, then exit the loop. 2099 // ( i will be == destCapacity if we filled the output array while processing 2100 // capture groups of the delimiter expression, in which case we will discard the 2101 // last capture group saved in favor of the unprocessed remainder of the 2102 // input string.) 2103 i = destCapacity-1; 2104 if (fActiveLimit > nextOutputStringStart) { 2105 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2106 if (dest[i]) { 2107 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2108 input->chunkContents+nextOutputStringStart, 2109 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2110 } else { 2111 UText remainingText = UTEXT_INITIALIZER; 2112 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2113 fActiveLimit-nextOutputStringStart, &status); 2114 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2115 utext_close(&remainingText); 2116 } 2117 } else { 2118 UErrorCode lengthStatus = U_ZERO_ERROR; 2119 int32_t remaining16Length = 2120 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2121 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2122 if (remainingChars == NULL) { 2123 status = U_MEMORY_ALLOCATION_ERROR; 2124 break; 2125 } 2126 2127 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2128 if (dest[i]) { 2129 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2130 } else { 2131 UText remainingText = UTEXT_INITIALIZER; 2132 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2133 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2134 utext_close(&remainingText); 2135 } 2136 2137 uprv_free(remainingChars); 2138 } 2139 } 2140 break; 2141 } 2142 if (find()) { 2143 // We found another delimiter. Move everything from where we started looking 2144 // up until the start of the delimiter into the next output string. 2145 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2146 if (dest[i]) { 2147 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2148 input->chunkContents+nextOutputStringStart, 2149 (int32_t)(fMatchStart-nextOutputStringStart), &status); 2150 } else { 2151 UText remainingText = UTEXT_INITIALIZER; 2152 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2153 fMatchStart-nextOutputStringStart, &status); 2154 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2155 utext_close(&remainingText); 2156 } 2157 } else { 2158 UErrorCode lengthStatus = U_ZERO_ERROR; 2159 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); 2160 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2161 if (remainingChars == NULL) { 2162 status = U_MEMORY_ALLOCATION_ERROR; 2163 break; 2164 } 2165 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); 2166 if (dest[i]) { 2167 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2168 } else { 2169 UText remainingText = UTEXT_INITIALIZER; 2170 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2171 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2172 utext_close(&remainingText); 2173 } 2174 2175 uprv_free(remainingChars); 2176 } 2177 nextOutputStringStart = fMatchEnd; 2178 2179 // If the delimiter pattern has capturing parentheses, the captured 2180 // text goes out into the next n destination strings. 2181 int32_t groupNum; 2182 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 2183 if (i >= destCapacity-2) { 2184 // Never fill the last available output string with capture group text. 2185 // It will filled with the last field, the remainder of the 2186 // unsplit input text. 2187 break; 2188 } 2189 i++; 2190 dest[i] = utext_extract_replace(fInputText, dest[i], 2191 start64(groupNum, status), end64(groupNum, status), &status); 2192 } 2193 2194 if (nextOutputStringStart == fActiveLimit) { 2195 // The delimiter was at the end of the string. We're done, but first 2196 // we output one last empty string, for the empty field following 2197 // the delimiter at the end of input. 2198 if (i+1 < destCapacity) { 2199 ++i; 2200 if (dest[i] == NULL) { 2201 dest[i] = utext_openUChars(NULL, NULL, 0, &status); 2202 } else { 2203 static const UChar emptyString[] = {(UChar)0}; 2204 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status); 2205 } 2206 } 2207 break; 2208 2209 } 2210 } 2211 else 2212 { 2213 // We ran off the end of the input while looking for the next delimiter. 2214 // All the remaining text goes into the current output string. 2215 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2216 if (dest[i]) { 2217 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2218 input->chunkContents+nextOutputStringStart, 2219 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2220 } else { 2221 UText remainingText = UTEXT_INITIALIZER; 2222 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2223 fActiveLimit-nextOutputStringStart, &status); 2224 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2225 utext_close(&remainingText); 2226 } 2227 } else { 2228 UErrorCode lengthStatus = U_ZERO_ERROR; 2229 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2230 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2231 if (remainingChars == NULL) { 2232 status = U_MEMORY_ALLOCATION_ERROR; 2233 break; 2234 } 2235 2236 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2237 if (dest[i]) { 2238 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2239 } else { 2240 UText remainingText = UTEXT_INITIALIZER; 2241 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2242 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2243 utext_close(&remainingText); 2244 } 2245 2246 uprv_free(remainingChars); 2247 } 2248 break; 2249 } 2250 if (U_FAILURE(status)) { 2251 break; 2252 } 2253 } // end of for loop 2254 return i+1; 2255 } 2256 2257 2258 //-------------------------------------------------------------------------------- 2259 // 2260 // start 2261 // 2262 //-------------------------------------------------------------------------------- 2263 int32_t RegexMatcher::start(UErrorCode &status) const { 2264 return start(0, status); 2265 } 2266 2267 int64_t RegexMatcher::start64(UErrorCode &status) const { 2268 return start64(0, status); 2269 } 2270 2271 //-------------------------------------------------------------------------------- 2272 // 2273 // start(int32_t group, UErrorCode &status) 2274 // 2275 //-------------------------------------------------------------------------------- 2276 2277 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { 2278 if (U_FAILURE(status)) { 2279 return -1; 2280 } 2281 if (U_FAILURE(fDeferredStatus)) { 2282 status = fDeferredStatus; 2283 return -1; 2284 } 2285 if (fMatch == FALSE) { 2286 status = U_REGEX_INVALID_STATE; 2287 return -1; 2288 } 2289 if (group < 0 || group > fPattern->fGroupMap->size()) { 2290 status = U_INDEX_OUTOFBOUNDS_ERROR; 2291 return -1; 2292 } 2293 int64_t s; 2294 if (group == 0) { 2295 s = fMatchStart; 2296 } else { 2297 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 2298 U_ASSERT(groupOffset < fPattern->fFrameSize); 2299 U_ASSERT(groupOffset >= 0); 2300 s = fFrame->fExtra[groupOffset]; 2301 } 2302 2303 return s; 2304 } 2305 2306 2307 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { 2308 return (int32_t)start64(group, status); 2309 } 2310 2311 //-------------------------------------------------------------------------------- 2312 // 2313 // useAnchoringBounds 2314 // 2315 //-------------------------------------------------------------------------------- 2316 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { 2317 fAnchoringBounds = b; 2318 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); 2319 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); 2320 return *this; 2321 } 2322 2323 2324 //-------------------------------------------------------------------------------- 2325 // 2326 // useTransparentBounds 2327 // 2328 //-------------------------------------------------------------------------------- 2329 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { 2330 fTransparentBounds = b; 2331 fLookStart = (fTransparentBounds ? 0 : fRegionStart); 2332 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); 2333 return *this; 2334 } 2335 2336 //-------------------------------------------------------------------------------- 2337 // 2338 // setTimeLimit 2339 // 2340 //-------------------------------------------------------------------------------- 2341 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { 2342 if (U_FAILURE(status)) { 2343 return; 2344 } 2345 if (U_FAILURE(fDeferredStatus)) { 2346 status = fDeferredStatus; 2347 return; 2348 } 2349 if (limit < 0) { 2350 status = U_ILLEGAL_ARGUMENT_ERROR; 2351 return; 2352 } 2353 fTimeLimit = limit; 2354 } 2355 2356 2357 //-------------------------------------------------------------------------------- 2358 // 2359 // getTimeLimit 2360 // 2361 //-------------------------------------------------------------------------------- 2362 int32_t RegexMatcher::getTimeLimit() const { 2363 return fTimeLimit; 2364 } 2365 2366 2367 //-------------------------------------------------------------------------------- 2368 // 2369 // setStackLimit 2370 // 2371 //-------------------------------------------------------------------------------- 2372 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { 2373 if (U_FAILURE(status)) { 2374 return; 2375 } 2376 if (U_FAILURE(fDeferredStatus)) { 2377 status = fDeferredStatus; 2378 return; 2379 } 2380 if (limit < 0) { 2381 status = U_ILLEGAL_ARGUMENT_ERROR; 2382 return; 2383 } 2384 2385 // Reset the matcher. This is needed here in case there is a current match 2386 // whose final stack frame (containing the match results, pointed to by fFrame) 2387 // would be lost by resizing to a smaller stack size. 2388 reset(); 2389 2390 if (limit == 0) { 2391 // Unlimited stack expansion 2392 fStack->setMaxCapacity(0); 2393 } else { 2394 // Change the units of the limit from bytes to ints, and bump the size up 2395 // to be big enough to hold at least one stack frame for the pattern, 2396 // if it isn't there already. 2397 int32_t adjustedLimit = limit / sizeof(int32_t); 2398 if (adjustedLimit < fPattern->fFrameSize) { 2399 adjustedLimit = fPattern->fFrameSize; 2400 } 2401 fStack->setMaxCapacity(adjustedLimit); 2402 } 2403 fStackLimit = limit; 2404 } 2405 2406 2407 //-------------------------------------------------------------------------------- 2408 // 2409 // getStackLimit 2410 // 2411 //-------------------------------------------------------------------------------- 2412 int32_t RegexMatcher::getStackLimit() const { 2413 return fStackLimit; 2414 } 2415 2416 2417 //-------------------------------------------------------------------------------- 2418 // 2419 // setMatchCallback 2420 // 2421 //-------------------------------------------------------------------------------- 2422 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, 2423 const void *context, 2424 UErrorCode &status) { 2425 if (U_FAILURE(status)) { 2426 return; 2427 } 2428 fCallbackFn = callback; 2429 fCallbackContext = context; 2430 } 2431 2432 2433 //-------------------------------------------------------------------------------- 2434 // 2435 // getMatchCallback 2436 // 2437 //-------------------------------------------------------------------------------- 2438 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, 2439 const void *&context, 2440 UErrorCode &status) { 2441 if (U_FAILURE(status)) { 2442 return; 2443 } 2444 callback = fCallbackFn; 2445 context = fCallbackContext; 2446 } 2447 2448 2449 //-------------------------------------------------------------------------------- 2450 // 2451 // setMatchCallback 2452 // 2453 //-------------------------------------------------------------------------------- 2454 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, 2455 const void *context, 2456 UErrorCode &status) { 2457 if (U_FAILURE(status)) { 2458 return; 2459 } 2460 fFindProgressCallbackFn = callback; 2461 fFindProgressCallbackContext = context; 2462 } 2463 2464 2465 //-------------------------------------------------------------------------------- 2466 // 2467 // getMatchCallback 2468 // 2469 //-------------------------------------------------------------------------------- 2470 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, 2471 const void *&context, 2472 UErrorCode &status) { 2473 if (U_FAILURE(status)) { 2474 return; 2475 } 2476 callback = fFindProgressCallbackFn; 2477 context = fFindProgressCallbackContext; 2478 } 2479 2480 2481 //================================================================================ 2482 // 2483 // Code following this point in this file is the internal 2484 // Match Engine Implementation. 2485 // 2486 //================================================================================ 2487 2488 2489 //-------------------------------------------------------------------------------- 2490 // 2491 // resetStack 2492 // Discard any previous contents of the state save stack, and initialize a 2493 // new stack frame to all -1. The -1s are needed for capture group limits, 2494 // where they indicate that a group has not yet matched anything. 2495 //-------------------------------------------------------------------------------- 2496 REStackFrame *RegexMatcher::resetStack() { 2497 // Discard any previous contents of the state save stack, and initialize a 2498 // new stack frame with all -1 data. The -1s are needed for capture group limits, 2499 // where they indicate that a group has not yet matched anything. 2500 fStack->removeAllElements(); 2501 2502 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); 2503 if(U_FAILURE(fDeferredStatus)) { 2504 return NULL; 2505 } 2506 2507 int32_t i; 2508 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 2509 iFrame->fExtra[i] = -1; 2510 } 2511 return iFrame; 2512 } 2513 2514 2515 2516 //-------------------------------------------------------------------------------- 2517 // 2518 // isWordBoundary 2519 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 2520 // For us, 2521 // If the current char is a combining mark, 2522 // \b is FALSE. 2523 // Else Scan backwards to the first non-combining char. 2524 // We are at a boundary if the this char and the original chars are 2525 // opposite in membership in \w set 2526 // 2527 // parameters: pos - the current position in the input buffer 2528 // 2529 // TODO: double-check edge cases at region boundaries. 2530 // 2531 //-------------------------------------------------------------------------------- 2532 UBool RegexMatcher::isWordBoundary(int64_t pos) { 2533 UBool isBoundary = FALSE; 2534 UBool cIsWord = FALSE; 2535 2536 if (pos >= fLookLimit) { 2537 fHitEnd = TRUE; 2538 } else { 2539 // Determine whether char c at current position is a member of the word set of chars. 2540 // If we're off the end of the string, behave as though we're not at a word char. 2541 UTEXT_SETNATIVEINDEX(fInputText, pos); 2542 UChar32 c = UTEXT_CURRENT32(fInputText); 2543 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2544 // Current char is a combining one. Not a boundary. 2545 return FALSE; 2546 } 2547 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2548 } 2549 2550 // Back up until we come to a non-combining char, determine whether 2551 // that char is a word char. 2552 UBool prevCIsWord = FALSE; 2553 for (;;) { 2554 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { 2555 break; 2556 } 2557 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); 2558 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2559 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2560 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2561 break; 2562 } 2563 } 2564 isBoundary = cIsWord ^ prevCIsWord; 2565 return isBoundary; 2566 } 2567 2568 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { 2569 UBool isBoundary = FALSE; 2570 UBool cIsWord = FALSE; 2571 2572 const UChar *inputBuf = fInputText->chunkContents; 2573 2574 if (pos >= fLookLimit) { 2575 fHitEnd = TRUE; 2576 } else { 2577 // Determine whether char c at current position is a member of the word set of chars. 2578 // If we're off the end of the string, behave as though we're not at a word char. 2579 UChar32 c; 2580 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); 2581 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2582 // Current char is a combining one. Not a boundary. 2583 return FALSE; 2584 } 2585 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2586 } 2587 2588 // Back up until we come to a non-combining char, determine whether 2589 // that char is a word char. 2590 UBool prevCIsWord = FALSE; 2591 for (;;) { 2592 if (pos <= fLookStart) { 2593 break; 2594 } 2595 UChar32 prevChar; 2596 U16_PREV(inputBuf, fLookStart, pos, prevChar); 2597 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2598 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2599 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2600 break; 2601 } 2602 } 2603 isBoundary = cIsWord ^ prevCIsWord; 2604 return isBoundary; 2605 } 2606 2607 //-------------------------------------------------------------------------------- 2608 // 2609 // isUWordBoundary 2610 // 2611 // Test for a word boundary using RBBI word break. 2612 // 2613 // parameters: pos - the current position in the input buffer 2614 // 2615 //-------------------------------------------------------------------------------- 2616 UBool RegexMatcher::isUWordBoundary(int64_t pos) { 2617 UBool returnVal = FALSE; 2618 #if UCONFIG_NO_BREAK_ITERATION==0 2619 2620 // If we haven't yet created a break iterator for this matcher, do it now. 2621 if (fWordBreakItr == NULL) { 2622 fWordBreakItr = 2623 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); 2624 if (U_FAILURE(fDeferredStatus)) { 2625 return FALSE; 2626 } 2627 fWordBreakItr->setText(fInputText, fDeferredStatus); 2628 } 2629 2630 if (pos >= fLookLimit) { 2631 fHitEnd = TRUE; 2632 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" 2633 // words are not boundaries. All non-word chars stand by themselves, 2634 // with word boundaries on both sides. 2635 } else { 2636 if (!UTEXT_USES_U16(fInputText)) { 2637 // !!!: Would like a better way to do this! 2638 UErrorCode status = U_ZERO_ERROR; 2639 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); 2640 } 2641 returnVal = fWordBreakItr->isBoundary((int32_t)pos); 2642 } 2643 #endif 2644 return returnVal; 2645 } 2646 2647 //-------------------------------------------------------------------------------- 2648 // 2649 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state 2650 // saves. Increment the "time" counter, and call the 2651 // user callback function if there is one installed. 2652 // 2653 // If the match operation needs to be aborted, either for a time-out 2654 // or because the user callback asked for it, just set an error status. 2655 // The engine will pick that up and stop in its outer loop. 2656 // 2657 //-------------------------------------------------------------------------------- 2658 void RegexMatcher::IncrementTime(UErrorCode &status) { 2659 fTickCounter = TIMER_INITIAL_VALUE; 2660 fTime++; 2661 if (fCallbackFn != NULL) { 2662 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { 2663 status = U_REGEX_STOPPED_BY_CALLER; 2664 return; 2665 } 2666 } 2667 if (fTimeLimit > 0 && fTime >= fTimeLimit) { 2668 status = U_REGEX_TIME_OUT; 2669 } 2670 } 2671 2672 //-------------------------------------------------------------------------------- 2673 // 2674 // StateSave 2675 // Make a new stack frame, initialized as a copy of the current stack frame. 2676 // Set the pattern index in the original stack frame from the operand value 2677 // in the opcode. Execution of the engine continues with the state in 2678 // the newly created stack frame 2679 // 2680 // Note that reserveBlock() may grow the stack, resulting in the 2681 // whole thing being relocated in memory. 2682 // 2683 // Parameters: 2684 // fp The top frame pointer when called. At return, a new 2685 // fame will be present 2686 // savePatIdx An index into the compiled pattern. Goes into the original 2687 // (not new) frame. If execution ever back-tracks out of the 2688 // new frame, this will be where we continue from in the pattern. 2689 // Return 2690 // The new frame pointer. 2691 // 2692 //-------------------------------------------------------------------------------- 2693 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { 2694 if (U_FAILURE(status)) { 2695 return fp; 2696 } 2697 // push storage for a new frame. 2698 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); 2699 if (U_FAILURE(status)) { 2700 // Failure on attempted stack expansion. 2701 // Stack function set some other error code, change it to a more 2702 // specific one for regular expressions. 2703 status = U_REGEX_STACK_OVERFLOW; 2704 // We need to return a writable stack frame, so just return the 2705 // previous frame. The match operation will stop quickly 2706 // because of the error status, after which the frame will never 2707 // be looked at again. 2708 return fp; 2709 } 2710 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. 2711 2712 // New stack frame = copy of old top frame. 2713 int64_t *source = (int64_t *)fp; 2714 int64_t *dest = newFP; 2715 for (;;) { 2716 *dest++ = *source++; 2717 if (source == newFP) { 2718 break; 2719 } 2720 } 2721 2722 fTickCounter--; 2723 if (fTickCounter <= 0) { 2724 IncrementTime(status); // Re-initializes fTickCounter 2725 } 2726 fp->fPatIdx = savePatIdx; 2727 return (REStackFrame *)newFP; 2728 } 2729 2730 #if defined(REGEX_DEBUG) 2731 namespace { 2732 UnicodeString StringFromUText(UText *ut) { 2733 UnicodeString result; 2734 for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) { 2735 result.append(c); 2736 } 2737 return result; 2738 } 2739 } 2740 #endif // REGEX_DEBUG 2741 2742 2743 //-------------------------------------------------------------------------------- 2744 // 2745 // MatchAt This is the actual matching engine. 2746 // 2747 // startIdx: begin matching a this index. 2748 // toEnd: if true, match must extend to end of the input region 2749 // 2750 //-------------------------------------------------------------------------------- 2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { 2752 UBool isMatch = FALSE; // True if the we have a match. 2753 2754 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards 2755 2756 int32_t op; // Operation from the compiled pattern, split into 2757 int32_t opType; // the opcode 2758 int32_t opValue; // and the operand value. 2759 2760 #ifdef REGEX_RUN_DEBUG 2761 if (fTraceDebug) { 2762 printf("MatchAt(startIdx=%ld)\n", startIdx); 2763 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))()); 2764 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))()); 2765 } 2766 #endif 2767 2768 if (U_FAILURE(status)) { 2769 return; 2770 } 2771 2772 // Cache frequently referenced items from the compiled pattern 2773 // 2774 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 2775 2776 const UChar *litText = fPattern->fLiteralText.getBuffer(); 2777 UVector *fSets = fPattern->fSets; 2778 2779 fFrameSize = fPattern->fFrameSize; 2780 REStackFrame *fp = resetStack(); 2781 if (U_FAILURE(fDeferredStatus)) { 2782 status = fDeferredStatus; 2783 return; 2784 } 2785 2786 fp->fPatIdx = 0; 2787 fp->fInputIdx = startIdx; 2788 2789 // Zero out the pattern's static data 2790 int32_t i; 2791 for (i = 0; i<fPattern->fDataSize; i++) { 2792 fData[i] = 0; 2793 } 2794 2795 // 2796 // Main loop for interpreting the compiled pattern. 2797 // One iteration of the loop per pattern operation performed. 2798 // 2799 for (;;) { 2800 op = (int32_t)pat[fp->fPatIdx]; 2801 opType = URX_TYPE(op); 2802 opValue = URX_VAL(op); 2803 #ifdef REGEX_RUN_DEBUG 2804 if (fTraceDebug) { 2805 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2806 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 2807 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 2808 fPattern->dumpOp(fp->fPatIdx); 2809 } 2810 #endif 2811 fp->fPatIdx++; 2812 2813 switch (opType) { 2814 2815 2816 case URX_NOP: 2817 break; 2818 2819 2820 case URX_BACKTRACK: 2821 // Force a backtrack. In some circumstances, the pattern compiler 2822 // will notice that the pattern can't possibly match anything, and will 2823 // emit one of these at that point. 2824 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2825 break; 2826 2827 2828 case URX_ONECHAR: 2829 if (fp->fInputIdx < fActiveLimit) { 2830 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2831 UChar32 c = UTEXT_NEXT32(fInputText); 2832 if (c == opValue) { 2833 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2834 break; 2835 } 2836 } else { 2837 fHitEnd = TRUE; 2838 } 2839 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2840 break; 2841 2842 2843 case URX_STRING: 2844 { 2845 // Test input against a literal string. 2846 // Strings require two slots in the compiled pattern, one for the 2847 // offset to the string text, and one for the length. 2848 2849 int32_t stringStartIdx = opValue; 2850 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 2851 fp->fPatIdx++; 2852 opType = URX_TYPE(op); 2853 int32_t stringLen = URX_VAL(op); 2854 U_ASSERT(opType == URX_STRING_LEN); 2855 U_ASSERT(stringLen >= 2); 2856 2857 const UChar *patternString = litText+stringStartIdx; 2858 int32_t patternStringIndex = 0; 2859 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2860 UChar32 inputChar; 2861 UChar32 patternChar; 2862 UBool success = TRUE; 2863 while (patternStringIndex < stringLen) { 2864 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 2865 success = FALSE; 2866 fHitEnd = TRUE; 2867 break; 2868 } 2869 inputChar = UTEXT_NEXT32(fInputText); 2870 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar); 2871 if (patternChar != inputChar) { 2872 success = FALSE; 2873 break; 2874 } 2875 } 2876 2877 if (success) { 2878 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2879 } else { 2880 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2881 } 2882 } 2883 break; 2884 2885 2886 case URX_STATE_SAVE: 2887 fp = StateSave(fp, opValue, status); 2888 break; 2889 2890 2891 case URX_END: 2892 // The match loop will exit via this path on a successful match, 2893 // when we reach the end of the pattern. 2894 if (toEnd && fp->fInputIdx != fActiveLimit) { 2895 // The pattern matched, but not to the end of input. Try some more. 2896 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2897 break; 2898 } 2899 isMatch = TRUE; 2900 goto breakFromLoop; 2901 2902 // Start and End Capture stack frame variables are laid out out like this: 2903 // fp->fExtra[opValue] - The start of a completed capture group 2904 // opValue+1 - The end of a completed capture group 2905 // opValue+2 - the start of a capture group whose end 2906 // has not yet been reached (and might not ever be). 2907 case URX_START_CAPTURE: 2908 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2909 fp->fExtra[opValue+2] = fp->fInputIdx; 2910 break; 2911 2912 2913 case URX_END_CAPTURE: 2914 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2915 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 2916 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 2917 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 2918 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 2919 break; 2920 2921 2922 case URX_DOLLAR: // $, test for End of line 2923 // or for position before new line at end of input 2924 { 2925 if (fp->fInputIdx >= fAnchorLimit) { 2926 // We really are at the end of input. Success. 2927 fHitEnd = TRUE; 2928 fRequireEnd = TRUE; 2929 break; 2930 } 2931 2932 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2933 2934 // If we are positioned just before a new-line that is located at the 2935 // end of input, succeed. 2936 UChar32 c = UTEXT_NEXT32(fInputText); 2937 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2938 if (isLineTerminator(c)) { 2939 // If not in the middle of a CR/LF sequence 2940 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 2941 // At new-line at end of input. Success 2942 fHitEnd = TRUE; 2943 fRequireEnd = TRUE; 2944 2945 break; 2946 } 2947 } 2948 } else { 2949 UChar32 nextC = UTEXT_NEXT32(fInputText); 2950 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2951 fHitEnd = TRUE; 2952 fRequireEnd = TRUE; 2953 break; // At CR/LF at end of input. Success 2954 } 2955 } 2956 2957 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2958 } 2959 break; 2960 2961 2962 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 2963 if (fp->fInputIdx >= fAnchorLimit) { 2964 // Off the end of input. Success. 2965 fHitEnd = TRUE; 2966 fRequireEnd = TRUE; 2967 break; 2968 } else { 2969 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2970 UChar32 c = UTEXT_NEXT32(fInputText); 2971 // Either at the last character of input, or off the end. 2972 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { 2973 fHitEnd = TRUE; 2974 fRequireEnd = TRUE; 2975 break; 2976 } 2977 } 2978 2979 // Not at end of input. Back-track out. 2980 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2981 break; 2982 2983 2984 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 2985 { 2986 if (fp->fInputIdx >= fAnchorLimit) { 2987 // We really are at the end of input. Success. 2988 fHitEnd = TRUE; 2989 fRequireEnd = TRUE; 2990 break; 2991 } 2992 // If we are positioned just before a new-line, succeed. 2993 // It makes no difference where the new-line is within the input. 2994 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2995 UChar32 c = UTEXT_CURRENT32(fInputText); 2996 if (isLineTerminator(c)) { 2997 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 2998 // In multi-line mode, hitting a new-line just before the end of input does not 2999 // set the hitEnd or requireEnd flags 3000 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { 3001 break; 3002 } 3003 } 3004 // not at a new line. Fail. 3005 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3006 } 3007 break; 3008 3009 3010 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 3011 { 3012 if (fp->fInputIdx >= fAnchorLimit) { 3013 // We really are at the end of input. Success. 3014 fHitEnd = TRUE; 3015 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 3016 break; // adding a new-line would not lose the match. 3017 } 3018 // If we are not positioned just before a new-line, the test fails; backtrack out. 3019 // It makes no difference where the new-line is within the input. 3020 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3021 if (UTEXT_CURRENT32(fInputText) != 0x0a) { 3022 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3023 } 3024 } 3025 break; 3026 3027 3028 case URX_CARET: // ^, test for start of line 3029 if (fp->fInputIdx != fAnchorStart) { 3030 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3031 } 3032 break; 3033 3034 3035 case URX_CARET_M: // ^, test for start of line in mulit-line mode 3036 { 3037 if (fp->fInputIdx == fAnchorStart) { 3038 // We are at the start input. Success. 3039 break; 3040 } 3041 // Check whether character just before the current pos is a new-line 3042 // unless we are at the end of input 3043 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3044 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3045 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) { 3046 // It's a new-line. ^ is true. Success. 3047 // TODO: what should be done with positions between a CR and LF? 3048 break; 3049 } 3050 // Not at the start of a line. Fail. 3051 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3052 } 3053 break; 3054 3055 3056 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 3057 { 3058 U_ASSERT(fp->fInputIdx >= fAnchorStart); 3059 if (fp->fInputIdx <= fAnchorStart) { 3060 // We are at the start input. Success. 3061 break; 3062 } 3063 // Check whether character just before the current pos is a new-line 3064 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 3065 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3066 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3067 if (c != 0x0a) { 3068 // Not at the start of a line. Back-track out. 3069 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3070 } 3071 } 3072 break; 3073 3074 case URX_BACKSLASH_B: // Test for word boundaries 3075 { 3076 UBool success = isWordBoundary(fp->fInputIdx); 3077 success ^= (UBool)(opValue != 0); // flip sense for \B 3078 if (!success) { 3079 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3080 } 3081 } 3082 break; 3083 3084 3085 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 3086 { 3087 UBool success = isUWordBoundary(fp->fInputIdx); 3088 success ^= (UBool)(opValue != 0); // flip sense for \B 3089 if (!success) { 3090 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3091 } 3092 } 3093 break; 3094 3095 3096 case URX_BACKSLASH_D: // Test for decimal digit 3097 { 3098 if (fp->fInputIdx >= fActiveLimit) { 3099 fHitEnd = TRUE; 3100 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3101 break; 3102 } 3103 3104 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3105 3106 UChar32 c = UTEXT_NEXT32(fInputText); 3107 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 3108 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 3109 success ^= (UBool)(opValue != 0); // flip sense for \D 3110 if (success) { 3111 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3112 } else { 3113 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3114 } 3115 } 3116 break; 3117 3118 3119 case URX_BACKSLASH_G: // Test for position at end of previous match 3120 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 3121 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3122 } 3123 break; 3124 3125 3126 case URX_BACKSLASH_H: // Test for \h, horizontal white space. 3127 { 3128 if (fp->fInputIdx >= fActiveLimit) { 3129 fHitEnd = TRUE; 3130 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3131 break; 3132 } 3133 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3134 UChar32 c = UTEXT_NEXT32(fInputText); 3135 int8_t ctype = u_charType(c); 3136 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB 3137 success ^= (UBool)(opValue != 0); // flip sense for \H 3138 if (success) { 3139 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3140 } else { 3141 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3142 } 3143 } 3144 break; 3145 3146 3147 case URX_BACKSLASH_R: // Test for \R, any line break sequence. 3148 { 3149 if (fp->fInputIdx >= fActiveLimit) { 3150 fHitEnd = TRUE; 3151 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3152 break; 3153 } 3154 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3155 UChar32 c = UTEXT_NEXT32(fInputText); 3156 if (isLineTerminator(c)) { 3157 if (c == 0x0d && utext_current32(fInputText) == 0x0a) { 3158 utext_next32(fInputText); 3159 } 3160 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3161 } else { 3162 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3163 } 3164 } 3165 break; 3166 3167 3168 case URX_BACKSLASH_V: // \v, any single line ending character. 3169 { 3170 if (fp->fInputIdx >= fActiveLimit) { 3171 fHitEnd = TRUE; 3172 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3173 break; 3174 } 3175 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3176 UChar32 c = UTEXT_NEXT32(fInputText); 3177 UBool success = isLineTerminator(c); 3178 success ^= (UBool)(opValue != 0); // flip sense for \V 3179 if (success) { 3180 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3181 } else { 3182 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3183 } 3184 } 3185 break; 3186 3187 3188 case URX_BACKSLASH_X: 3189 // Match a Grapheme, as defined by Unicode TR 29. 3190 // Differs slightly from Perl, which consumes combining marks independently 3191 // of context. 3192 { 3193 3194 // Fail if at end of input 3195 if (fp->fInputIdx >= fActiveLimit) { 3196 fHitEnd = TRUE; 3197 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3198 break; 3199 } 3200 3201 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3202 3203 // Examine (and consume) the current char. 3204 // Dispatch into a little state machine, based on the char. 3205 UChar32 c; 3206 c = UTEXT_NEXT32(fInputText); 3207 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3208 UnicodeSet **sets = fPattern->fStaticSets; 3209 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 3210 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 3211 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3212 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3213 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3214 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3215 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3216 goto GC_Extend; 3217 3218 3219 3220 GC_L: 3221 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3222 c = UTEXT_NEXT32(fInputText); 3223 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3224 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3225 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3226 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3227 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3228 (void)UTEXT_PREVIOUS32(fInputText); 3229 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3230 goto GC_Extend; 3231 3232 GC_V: 3233 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3234 c = UTEXT_NEXT32(fInputText); 3235 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3236 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3237 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3238 (void)UTEXT_PREVIOUS32(fInputText); 3239 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3240 goto GC_Extend; 3241 3242 GC_T: 3243 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3244 c = UTEXT_NEXT32(fInputText); 3245 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3246 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3247 (void)UTEXT_PREVIOUS32(fInputText); 3248 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3249 goto GC_Extend; 3250 3251 GC_Extend: 3252 // Combining characters are consumed here 3253 for (;;) { 3254 if (fp->fInputIdx >= fActiveLimit) { 3255 break; 3256 } 3257 c = UTEXT_CURRENT32(fInputText); 3258 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 3259 break; 3260 } 3261 (void)UTEXT_NEXT32(fInputText); 3262 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3263 } 3264 goto GC_Done; 3265 3266 GC_Control: 3267 // Most control chars stand alone (don't combine with combining chars), 3268 // except for that CR/LF sequence is a single grapheme cluster. 3269 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 3270 c = UTEXT_NEXT32(fInputText); 3271 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3272 } 3273 3274 GC_Done: 3275 if (fp->fInputIdx >= fActiveLimit) { 3276 fHitEnd = TRUE; 3277 } 3278 break; 3279 } 3280 3281 3282 3283 3284 case URX_BACKSLASH_Z: // Test for end of Input 3285 if (fp->fInputIdx < fAnchorLimit) { 3286 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3287 } else { 3288 fHitEnd = TRUE; 3289 fRequireEnd = TRUE; 3290 } 3291 break; 3292 3293 3294 3295 case URX_STATIC_SETREF: 3296 { 3297 // Test input character against one of the predefined sets 3298 // (Word Characters, for example) 3299 // The high bit of the op value is a flag for the match polarity. 3300 // 0: success if input char is in set. 3301 // 1: success if input char is not in set. 3302 if (fp->fInputIdx >= fActiveLimit) { 3303 fHitEnd = TRUE; 3304 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3305 break; 3306 } 3307 3308 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 3309 opValue &= ~URX_NEG_SET; 3310 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3311 3312 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3313 UChar32 c = UTEXT_NEXT32(fInputText); 3314 if (c < 256) { 3315 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3316 if (s8->contains(c)) { 3317 success = !success; 3318 } 3319 } else { 3320 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3321 if (s->contains(c)) { 3322 success = !success; 3323 } 3324 } 3325 if (success) { 3326 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3327 } else { 3328 // the character wasn't in the set. 3329 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3330 } 3331 } 3332 break; 3333 3334 3335 case URX_STAT_SETREF_N: 3336 { 3337 // Test input character for NOT being a member of one of 3338 // the predefined sets (Word Characters, for example) 3339 if (fp->fInputIdx >= fActiveLimit) { 3340 fHitEnd = TRUE; 3341 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3342 break; 3343 } 3344 3345 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3346 3347 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3348 3349 UChar32 c = UTEXT_NEXT32(fInputText); 3350 if (c < 256) { 3351 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3352 if (s8->contains(c) == FALSE) { 3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3354 break; 3355 } 3356 } else { 3357 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3358 if (s->contains(c) == FALSE) { 3359 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3360 break; 3361 } 3362 } 3363 // the character wasn't in the set. 3364 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3365 } 3366 break; 3367 3368 3369 case URX_SETREF: 3370 if (fp->fInputIdx >= fActiveLimit) { 3371 fHitEnd = TRUE; 3372 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3373 break; 3374 } else { 3375 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3376 3377 // There is input left. Pick up one char and test it for set membership. 3378 UChar32 c = UTEXT_NEXT32(fInputText); 3379 U_ASSERT(opValue > 0 && opValue < fSets->size()); 3380 if (c<256) { 3381 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3382 if (s8->contains(c)) { 3383 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3384 break; 3385 } 3386 } else { 3387 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue); 3388 if (s->contains(c)) { 3389 // The character is in the set. A Match. 3390 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3391 break; 3392 } 3393 } 3394 3395 // the character wasn't in the set. 3396 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3397 } 3398 break; 3399 3400 3401 case URX_DOTANY: 3402 { 3403 // . matches anything, but stops at end-of-line. 3404 if (fp->fInputIdx >= fActiveLimit) { 3405 // At end of input. Match failed. Backtrack out. 3406 fHitEnd = TRUE; 3407 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3408 break; 3409 } 3410 3411 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3412 3413 // There is input left. Advance over one char, unless we've hit end-of-line 3414 UChar32 c = UTEXT_NEXT32(fInputText); 3415 if (isLineTerminator(c)) { 3416 // End of line in normal mode. . does not match. 3417 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3418 break; 3419 } 3420 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3421 } 3422 break; 3423 3424 3425 case URX_DOTANY_ALL: 3426 { 3427 // ., in dot-matches-all (including new lines) mode 3428 if (fp->fInputIdx >= fActiveLimit) { 3429 // At end of input. Match failed. Backtrack out. 3430 fHitEnd = TRUE; 3431 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3432 break; 3433 } 3434 3435 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3436 3437 // There is input left. Advance over one char, except if we are 3438 // at a cr/lf, advance over both of them. 3439 UChar32 c; 3440 c = UTEXT_NEXT32(fInputText); 3441 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3442 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 3443 // In the case of a CR/LF, we need to advance over both. 3444 UChar32 nextc = UTEXT_CURRENT32(fInputText); 3445 if (nextc == 0x0a) { 3446 (void)UTEXT_NEXT32(fInputText); 3447 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3448 } 3449 } 3450 } 3451 break; 3452 3453 3454 case URX_DOTANY_UNIX: 3455 { 3456 // '.' operator, matches all, but stops at end-of-line. 3457 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 3458 if (fp->fInputIdx >= fActiveLimit) { 3459 // At end of input. Match failed. Backtrack out. 3460 fHitEnd = TRUE; 3461 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3462 break; 3463 } 3464 3465 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3466 3467 // There is input left. Advance over one char, unless we've hit end-of-line 3468 UChar32 c = UTEXT_NEXT32(fInputText); 3469 if (c == 0x0a) { 3470 // End of line in normal mode. '.' does not match the \n 3471 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3472 } else { 3473 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3474 } 3475 } 3476 break; 3477 3478 3479 case URX_JMP: 3480 fp->fPatIdx = opValue; 3481 break; 3482 3483 case URX_FAIL: 3484 isMatch = FALSE; 3485 goto breakFromLoop; 3486 3487 case URX_JMP_SAV: 3488 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 3489 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3490 fp->fPatIdx = opValue; // Then JMP. 3491 break; 3492 3493 case URX_JMP_SAV_X: 3494 // This opcode is used with (x)+, when x can match a zero length string. 3495 // Same as JMP_SAV, except conditional on the match having made forward progress. 3496 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 3497 // data address of the input position at the start of the loop. 3498 { 3499 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 3500 int32_t stoOp = (int32_t)pat[opValue-1]; 3501 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 3502 int32_t frameLoc = URX_VAL(stoOp); 3503 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 3504 int64_t prevInputIdx = fp->fExtra[frameLoc]; 3505 U_ASSERT(prevInputIdx <= fp->fInputIdx); 3506 if (prevInputIdx < fp->fInputIdx) { 3507 // The match did make progress. Repeat the loop. 3508 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3509 fp->fPatIdx = opValue; 3510 fp->fExtra[frameLoc] = fp->fInputIdx; 3511 } 3512 // If the input position did not advance, we do nothing here, 3513 // execution will fall out of the loop. 3514 } 3515 break; 3516 3517 case URX_CTR_INIT: 3518 { 3519 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3520 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3521 3522 // Pick up the three extra operands that CTR_INIT has, and 3523 // skip the pattern location counter past 3524 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3525 fp->fPatIdx += 3; 3526 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3527 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3528 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3529 U_ASSERT(minCount>=0); 3530 U_ASSERT(maxCount>=minCount || maxCount==-1); 3531 U_ASSERT(loopLoc>=fp->fPatIdx); 3532 3533 if (minCount == 0) { 3534 fp = StateSave(fp, loopLoc+1, status); 3535 } 3536 if (maxCount == -1) { 3537 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 3538 } else if (maxCount == 0) { 3539 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3540 } 3541 } 3542 break; 3543 3544 case URX_CTR_LOOP: 3545 { 3546 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3547 int32_t initOp = (int32_t)pat[opValue]; 3548 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 3549 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3550 int32_t minCount = (int32_t)pat[opValue+2]; 3551 int32_t maxCount = (int32_t)pat[opValue+3]; 3552 (*pCounter)++; 3553 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 3554 U_ASSERT(*pCounter == maxCount); 3555 break; 3556 } 3557 if (*pCounter >= minCount) { 3558 if (maxCount == -1) { 3559 // Loop has no hard upper bound. 3560 // Check that it is progressing through the input, break if it is not. 3561 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 3562 if (fp->fInputIdx == *pLastInputIdx) { 3563 break; 3564 } else { 3565 *pLastInputIdx = fp->fInputIdx; 3566 } 3567 } 3568 fp = StateSave(fp, fp->fPatIdx, status); 3569 } else { 3570 // Increment time-out counter. (StateSave() does it if count >= minCount) 3571 fTickCounter--; 3572 if (fTickCounter <= 0) { 3573 IncrementTime(status); // Re-initializes fTickCounter 3574 } 3575 } 3576 3577 fp->fPatIdx = opValue + 4; // Loop back. 3578 } 3579 break; 3580 3581 case URX_CTR_INIT_NG: 3582 { 3583 // Initialize a non-greedy loop 3584 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3585 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3586 3587 // Pick up the three extra operands that CTR_INIT_NG has, and 3588 // skip the pattern location counter past 3589 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3590 fp->fPatIdx += 3; 3591 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3592 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3593 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3594 U_ASSERT(minCount>=0); 3595 U_ASSERT(maxCount>=minCount || maxCount==-1); 3596 U_ASSERT(loopLoc>fp->fPatIdx); 3597 if (maxCount == -1) { 3598 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 3599 } 3600 3601 if (minCount == 0) { 3602 if (maxCount != 0) { 3603 fp = StateSave(fp, fp->fPatIdx, status); 3604 } 3605 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 3606 } 3607 } 3608 break; 3609 3610 case URX_CTR_LOOP_NG: 3611 { 3612 // Non-greedy {min, max} loops 3613 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3614 int32_t initOp = (int32_t)pat[opValue]; 3615 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 3616 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3617 int32_t minCount = (int32_t)pat[opValue+2]; 3618 int32_t maxCount = (int32_t)pat[opValue+3]; 3619 3620 (*pCounter)++; 3621 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 3622 // The loop has matched the maximum permitted number of times. 3623 // Break out of here with no action. Matching will 3624 // continue with the following pattern. 3625 U_ASSERT(*pCounter == maxCount); 3626 break; 3627 } 3628 3629 if (*pCounter < minCount) { 3630 // We haven't met the minimum number of matches yet. 3631 // Loop back for another one. 3632 fp->fPatIdx = opValue + 4; // Loop back. 3633 // Increment time-out counter. (StateSave() does it if count >= minCount) 3634 fTickCounter--; 3635 if (fTickCounter <= 0) { 3636 IncrementTime(status); // Re-initializes fTickCounter 3637 } 3638 } else { 3639 // We do have the minimum number of matches. 3640 3641 // If there is no upper bound on the loop iterations, check that the input index 3642 // is progressing, and stop the loop if it is not. 3643 if (maxCount == -1) { 3644 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 3645 if (fp->fInputIdx == *pLastInputIdx) { 3646 break; 3647 } 3648 *pLastInputIdx = fp->fInputIdx; 3649 } 3650 3651 // Loop Continuation: we will fall into the pattern following the loop 3652 // (non-greedy, don't execute loop body first), but first do 3653 // a state save to the top of the loop, so that a match failure 3654 // in the following pattern will try another iteration of the loop. 3655 fp = StateSave(fp, opValue + 4, status); 3656 } 3657 } 3658 break; 3659 3660 case URX_STO_SP: 3661 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3662 fData[opValue] = fStack->size(); 3663 break; 3664 3665 case URX_LD_SP: 3666 { 3667 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3668 int32_t newStackSize = (int32_t)fData[opValue]; 3669 U_ASSERT(newStackSize <= fStack->size()); 3670 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3671 if (newFP == (int64_t *)fp) { 3672 break; 3673 } 3674 int32_t j; 3675 for (j=0; j<fFrameSize; j++) { 3676 newFP[j] = ((int64_t *)fp)[j]; 3677 } 3678 fp = (REStackFrame *)newFP; 3679 fStack->setSize(newStackSize); 3680 } 3681 break; 3682 3683 case URX_BACKREF: 3684 { 3685 U_ASSERT(opValue < fFrameSize); 3686 int64_t groupStartIdx = fp->fExtra[opValue]; 3687 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3688 U_ASSERT(groupStartIdx <= groupEndIdx); 3689 if (groupStartIdx < 0) { 3690 // This capture group has not participated in the match thus far, 3691 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3692 break; 3693 } 3694 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); 3695 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3696 3697 // Note: if the capture group match was of an empty string the backref 3698 // match succeeds. Verified by testing: Perl matches succeed 3699 // in this case, so we do too. 3700 3701 UBool success = TRUE; 3702 for (;;) { 3703 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 3704 success = TRUE; 3705 break; 3706 } 3707 if (utext_getNativeIndex(fInputText) >= fActiveLimit) { 3708 success = FALSE; 3709 fHitEnd = TRUE; 3710 break; 3711 } 3712 UChar32 captureGroupChar = utext_next32(fAltInputText); 3713 UChar32 inputChar = utext_next32(fInputText); 3714 if (inputChar != captureGroupChar) { 3715 success = FALSE; 3716 break; 3717 } 3718 } 3719 3720 if (success) { 3721 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3722 } else { 3723 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3724 } 3725 } 3726 break; 3727 3728 3729 3730 case URX_BACKREF_I: 3731 { 3732 U_ASSERT(opValue < fFrameSize); 3733 int64_t groupStartIdx = fp->fExtra[opValue]; 3734 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3735 U_ASSERT(groupStartIdx <= groupEndIdx); 3736 if (groupStartIdx < 0) { 3737 // This capture group has not participated in the match thus far, 3738 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3739 break; 3740 } 3741 utext_setNativeIndex(fAltInputText, groupStartIdx); 3742 utext_setNativeIndex(fInputText, fp->fInputIdx); 3743 CaseFoldingUTextIterator captureGroupItr(*fAltInputText); 3744 CaseFoldingUTextIterator inputItr(*fInputText); 3745 3746 // Note: if the capture group match was of an empty string the backref 3747 // match succeeds. Verified by testing: Perl matches succeed 3748 // in this case, so we do too. 3749 3750 UBool success = TRUE; 3751 for (;;) { 3752 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 3753 success = TRUE; 3754 break; 3755 } 3756 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) { 3757 success = FALSE; 3758 fHitEnd = TRUE; 3759 break; 3760 } 3761 UChar32 captureGroupChar = captureGroupItr.next(); 3762 UChar32 inputChar = inputItr.next(); 3763 if (inputChar != captureGroupChar) { 3764 success = FALSE; 3765 break; 3766 } 3767 } 3768 3769 if (success && inputItr.inExpansion()) { 3770 // We otained a match by consuming part of a string obtained from 3771 // case-folding a single code point of the input text. 3772 // This does not count as an overall match. 3773 success = FALSE; 3774 } 3775 3776 if (success) { 3777 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3778 } else { 3779 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3780 } 3781 3782 } 3783 break; 3784 3785 case URX_STO_INP_LOC: 3786 { 3787 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 3788 fp->fExtra[opValue] = fp->fInputIdx; 3789 } 3790 break; 3791 3792 case URX_JMPX: 3793 { 3794 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3795 fp->fPatIdx += 1; 3796 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 3797 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 3798 int64_t savedInputIdx = fp->fExtra[dataLoc]; 3799 U_ASSERT(savedInputIdx <= fp->fInputIdx); 3800 if (savedInputIdx < fp->fInputIdx) { 3801 fp->fPatIdx = opValue; // JMP 3802 } else { 3803 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 3804 } 3805 } 3806 break; 3807 3808 case URX_LA_START: 3809 { 3810 // Entering a lookahead block. 3811 // Save Stack Ptr, Input Pos. 3812 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3813 fData[opValue] = fStack->size(); 3814 fData[opValue+1] = fp->fInputIdx; 3815 fActiveStart = fLookStart; // Set the match region change for 3816 fActiveLimit = fLookLimit; // transparent bounds. 3817 } 3818 break; 3819 3820 case URX_LA_END: 3821 { 3822 // Leaving a look-ahead block. 3823 // restore Stack Ptr, Input Pos to positions they had on entry to block. 3824 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3825 int32_t stackSize = fStack->size(); 3826 int32_t newStackSize =(int32_t)fData[opValue]; 3827 U_ASSERT(stackSize >= newStackSize); 3828 if (stackSize > newStackSize) { 3829 // Copy the current top frame back to the new (cut back) top frame. 3830 // This makes the capture groups from within the look-ahead 3831 // expression available. 3832 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3833 int32_t j; 3834 for (j=0; j<fFrameSize; j++) { 3835 newFP[j] = ((int64_t *)fp)[j]; 3836 } 3837 fp = (REStackFrame *)newFP; 3838 fStack->setSize(newStackSize); 3839 } 3840 fp->fInputIdx = fData[opValue+1]; 3841 3842 // Restore the active region bounds in the input string; they may have 3843 // been changed because of transparent bounds on a Region. 3844 fActiveStart = fRegionStart; 3845 fActiveLimit = fRegionLimit; 3846 } 3847 break; 3848 3849 case URX_ONECHAR_I: 3850 // Case insensitive one char. The char from the pattern is already case folded. 3851 // Input text is not, but case folding the input can not reduce two or more code 3852 // points to one. 3853 if (fp->fInputIdx < fActiveLimit) { 3854 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3855 3856 UChar32 c = UTEXT_NEXT32(fInputText); 3857 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3858 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3859 break; 3860 } 3861 } else { 3862 fHitEnd = TRUE; 3863 } 3864 3865 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3866 break; 3867 3868 case URX_STRING_I: 3869 { 3870 // Case-insensitive test input against a literal string. 3871 // Strings require two slots in the compiled pattern, one for the 3872 // offset to the string text, and one for the length. 3873 // The compiled string has already been case folded. 3874 { 3875 const UChar *patternString = litText + opValue; 3876 int32_t patternStringIdx = 0; 3877 3878 op = (int32_t)pat[fp->fPatIdx]; 3879 fp->fPatIdx++; 3880 opType = URX_TYPE(op); 3881 opValue = URX_VAL(op); 3882 U_ASSERT(opType == URX_STRING_LEN); 3883 int32_t patternStringLen = opValue; // Length of the string from the pattern. 3884 3885 3886 UChar32 cPattern; 3887 UChar32 cText; 3888 UBool success = TRUE; 3889 3890 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3891 CaseFoldingUTextIterator inputIterator(*fInputText); 3892 while (patternStringIdx < patternStringLen) { 3893 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 3894 success = FALSE; 3895 fHitEnd = TRUE; 3896 break; 3897 } 3898 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 3899 cText = inputIterator.next(); 3900 if (cText != cPattern) { 3901 success = FALSE; 3902 break; 3903 } 3904 } 3905 if (inputIterator.inExpansion()) { 3906 success = FALSE; 3907 } 3908 3909 if (success) { 3910 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3911 } else { 3912 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3913 } 3914 } 3915 } 3916 break; 3917 3918 case URX_LB_START: 3919 { 3920 // Entering a look-behind block. 3921 // Save Stack Ptr, Input Pos. 3922 // TODO: implement transparent bounds. Ticket #6067 3923 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3924 fData[opValue] = fStack->size(); 3925 fData[opValue+1] = fp->fInputIdx; 3926 // Init the variable containing the start index for attempted matches. 3927 fData[opValue+2] = -1; 3928 // Save input string length, then reset to pin any matches to end at 3929 // the current position. 3930 fData[opValue+3] = fActiveLimit; 3931 fActiveLimit = fp->fInputIdx; 3932 } 3933 break; 3934 3935 3936 case URX_LB_CONT: 3937 { 3938 // Positive Look-Behind, at top of loop checking for matches of LB expression 3939 // at all possible input starting positions. 3940 3941 // Fetch the min and max possible match lengths. They are the operands 3942 // of this op in the pattern. 3943 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 3944 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 3945 if (!UTEXT_USES_U16(fInputText)) { 3946 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16. 3947 // The max length need not be exact; it just needs to be >= actual maximum. 3948 maxML *= 3; 3949 } 3950 U_ASSERT(minML <= maxML); 3951 U_ASSERT(minML >= 0); 3952 3953 // Fetch (from data) the last input index where a match was attempted. 3954 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3955 int64_t &lbStartIdx = fData[opValue+2]; 3956 if (lbStartIdx < 0) { 3957 // First time through loop. 3958 lbStartIdx = fp->fInputIdx - minML; 3959 if (lbStartIdx > 0) { 3960 // move index to a code point boudary, if it's not on one already. 3961 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); 3962 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 3963 } 3964 } else { 3965 // 2nd through nth time through the loop. 3966 // Back up start position for match by one. 3967 if (lbStartIdx == 0) { 3968 (lbStartIdx)--; 3969 } else { 3970 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); 3971 (void)UTEXT_PREVIOUS32(fInputText); 3972 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 3973 } 3974 } 3975 3976 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { 3977 // We have tried all potential match starting points without 3978 // getting a match. Backtrack out, and out of the 3979 // Look Behind altogether. 3980 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3981 int64_t restoreInputLen = fData[opValue+3]; 3982 U_ASSERT(restoreInputLen >= fActiveLimit); 3983 U_ASSERT(restoreInputLen <= fInputLength); 3984 fActiveLimit = restoreInputLen; 3985 break; 3986 } 3987 3988 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 3989 // (successful match will fall off the end of the loop.) 3990 fp = StateSave(fp, fp->fPatIdx-3, status); 3991 fp->fInputIdx = lbStartIdx; 3992 } 3993 break; 3994 3995 case URX_LB_END: 3996 // End of a look-behind block, after a successful match. 3997 { 3998 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3999 if (fp->fInputIdx != fActiveLimit) { 4000 // The look-behind expression matched, but the match did not 4001 // extend all the way to the point that we are looking behind from. 4002 // FAIL out of here, which will take us back to the LB_CONT, which 4003 // will retry the match starting at another position or fail 4004 // the look-behind altogether, whichever is appropriate. 4005 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4006 break; 4007 } 4008 4009 // Look-behind match is good. Restore the orignal input string length, 4010 // which had been truncated to pin the end of the lookbehind match to the 4011 // position being looked-behind. 4012 int64_t originalInputLen = fData[opValue+3]; 4013 U_ASSERT(originalInputLen >= fActiveLimit); 4014 U_ASSERT(originalInputLen <= fInputLength); 4015 fActiveLimit = originalInputLen; 4016 } 4017 break; 4018 4019 4020 case URX_LBN_CONT: 4021 { 4022 // Negative Look-Behind, at top of loop checking for matches of LB expression 4023 // at all possible input starting positions. 4024 4025 // Fetch the extra parameters of this op. 4026 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 4027 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 4028 if (!UTEXT_USES_U16(fInputText)) { 4029 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16. 4030 // The max length need not be exact; it just needs to be >= actual maximum. 4031 maxML *= 3; 4032 } 4033 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 4034 continueLoc = URX_VAL(continueLoc); 4035 U_ASSERT(minML <= maxML); 4036 U_ASSERT(minML >= 0); 4037 U_ASSERT(continueLoc > fp->fPatIdx); 4038 4039 // Fetch (from data) the last input index where a match was attempted. 4040 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4041 int64_t &lbStartIdx = fData[opValue+2]; 4042 if (lbStartIdx < 0) { 4043 // First time through loop. 4044 lbStartIdx = fp->fInputIdx - minML; 4045 if (lbStartIdx > 0) { 4046 // move index to a code point boudary, if it's not on one already. 4047 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); 4048 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4049 } 4050 } else { 4051 // 2nd through nth time through the loop. 4052 // Back up start position for match by one. 4053 if (lbStartIdx == 0) { 4054 (lbStartIdx)--; 4055 } else { 4056 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); 4057 (void)UTEXT_PREVIOUS32(fInputText); 4058 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4059 } 4060 } 4061 4062 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { 4063 // We have tried all potential match starting points without 4064 // getting a match, which means that the negative lookbehind as 4065 // a whole has succeeded. Jump forward to the continue location 4066 int64_t restoreInputLen = fData[opValue+3]; 4067 U_ASSERT(restoreInputLen >= fActiveLimit); 4068 U_ASSERT(restoreInputLen <= fInputLength); 4069 fActiveLimit = restoreInputLen; 4070 fp->fPatIdx = continueLoc; 4071 break; 4072 } 4073 4074 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 4075 // (successful match will cause a FAIL out of the loop altogether.) 4076 fp = StateSave(fp, fp->fPatIdx-4, status); 4077 fp->fInputIdx = lbStartIdx; 4078 } 4079 break; 4080 4081 case URX_LBN_END: 4082 // End of a negative look-behind block, after a successful match. 4083 { 4084 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4085 if (fp->fInputIdx != fActiveLimit) { 4086 // The look-behind expression matched, but the match did not 4087 // extend all the way to the point that we are looking behind from. 4088 // FAIL out of here, which will take us back to the LB_CONT, which 4089 // will retry the match starting at another position or succeed 4090 // the look-behind altogether, whichever is appropriate. 4091 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4092 break; 4093 } 4094 4095 // Look-behind expression matched, which means look-behind test as 4096 // a whole Fails 4097 4098 // Restore the orignal input string length, which had been truncated 4099 // inorder to pin the end of the lookbehind match 4100 // to the position being looked-behind. 4101 int64_t originalInputLen = fData[opValue+3]; 4102 U_ASSERT(originalInputLen >= fActiveLimit); 4103 U_ASSERT(originalInputLen <= fInputLength); 4104 fActiveLimit = originalInputLen; 4105 4106 // Restore original stack position, discarding any state saved 4107 // by the successful pattern match. 4108 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4109 int32_t newStackSize = (int32_t)fData[opValue]; 4110 U_ASSERT(fStack->size() > newStackSize); 4111 fStack->setSize(newStackSize); 4112 4113 // FAIL, which will take control back to someplace 4114 // prior to entering the look-behind test. 4115 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4116 } 4117 break; 4118 4119 4120 case URX_LOOP_SR_I: 4121 // Loop Initialization for the optimized implementation of 4122 // [some character set]* 4123 // This op scans through all matching input. 4124 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4125 { 4126 U_ASSERT(opValue > 0 && opValue < fSets->size()); 4127 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4128 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue); 4129 4130 // Loop through input, until either the input is exhausted or 4131 // we reach a character that is not a member of the set. 4132 int64_t ix = fp->fInputIdx; 4133 UTEXT_SETNATIVEINDEX(fInputText, ix); 4134 for (;;) { 4135 if (ix >= fActiveLimit) { 4136 fHitEnd = TRUE; 4137 break; 4138 } 4139 UChar32 c = UTEXT_NEXT32(fInputText); 4140 if (c<256) { 4141 if (s8->contains(c) == FALSE) { 4142 break; 4143 } 4144 } else { 4145 if (s->contains(c) == FALSE) { 4146 break; 4147 } 4148 } 4149 ix = UTEXT_GETNATIVEINDEX(fInputText); 4150 } 4151 4152 // If there were no matching characters, skip over the loop altogether. 4153 // The loop doesn't run at all, a * op always succeeds. 4154 if (ix == fp->fInputIdx) { 4155 fp->fPatIdx++; // skip the URX_LOOP_C op. 4156 break; 4157 } 4158 4159 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4160 // must follow. It's operand is the stack location 4161 // that holds the starting input index for the match of this [set]* 4162 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4163 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4164 int32_t stackLoc = URX_VAL(loopcOp); 4165 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4166 fp->fExtra[stackLoc] = fp->fInputIdx; 4167 fp->fInputIdx = ix; 4168 4169 // Save State to the URX_LOOP_C op that follows this one, 4170 // so that match failures in the following code will return to there. 4171 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4172 fp = StateSave(fp, fp->fPatIdx, status); 4173 fp->fPatIdx++; 4174 } 4175 break; 4176 4177 4178 case URX_LOOP_DOT_I: 4179 // Loop Initialization for the optimized implementation of .* 4180 // This op scans through all remaining input. 4181 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4182 { 4183 // Loop through input until the input is exhausted (we reach an end-of-line) 4184 // In DOTALL mode, we can just go straight to the end of the input. 4185 int64_t ix; 4186 if ((opValue & 1) == 1) { 4187 // Dot-matches-All mode. Jump straight to the end of the string. 4188 ix = fActiveLimit; 4189 fHitEnd = TRUE; 4190 } else { 4191 // NOT DOT ALL mode. Line endings do not match '.' 4192 // Scan forward until a line ending or end of input. 4193 ix = fp->fInputIdx; 4194 UTEXT_SETNATIVEINDEX(fInputText, ix); 4195 for (;;) { 4196 if (ix >= fActiveLimit) { 4197 fHitEnd = TRUE; 4198 break; 4199 } 4200 UChar32 c = UTEXT_NEXT32(fInputText); 4201 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 4202 if ((c == 0x0a) || // 0x0a is newline in both modes. 4203 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 4204 isLineTerminator(c))) { 4205 // char is a line ending. Exit the scanning loop. 4206 break; 4207 } 4208 } 4209 ix = UTEXT_GETNATIVEINDEX(fInputText); 4210 } 4211 } 4212 4213 // If there were no matching characters, skip over the loop altogether. 4214 // The loop doesn't run at all, a * op always succeeds. 4215 if (ix == fp->fInputIdx) { 4216 fp->fPatIdx++; // skip the URX_LOOP_C op. 4217 break; 4218 } 4219 4220 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4221 // must follow. It's operand is the stack location 4222 // that holds the starting input index for the match of this .* 4223 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4224 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4225 int32_t stackLoc = URX_VAL(loopcOp); 4226 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4227 fp->fExtra[stackLoc] = fp->fInputIdx; 4228 fp->fInputIdx = ix; 4229 4230 // Save State to the URX_LOOP_C op that follows this one, 4231 // so that match failures in the following code will return to there. 4232 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4233 fp = StateSave(fp, fp->fPatIdx, status); 4234 fp->fPatIdx++; 4235 } 4236 break; 4237 4238 4239 case URX_LOOP_C: 4240 { 4241 U_ASSERT(opValue>=0 && opValue<fFrameSize); 4242 backSearchIndex = fp->fExtra[opValue]; 4243 U_ASSERT(backSearchIndex <= fp->fInputIdx); 4244 if (backSearchIndex == fp->fInputIdx) { 4245 // We've backed up the input idx to the point that the loop started. 4246 // The loop is done. Leave here without saving state. 4247 // Subsequent failures won't come back here. 4248 break; 4249 } 4250 // Set up for the next iteration of the loop, with input index 4251 // backed up by one from the last time through, 4252 // and a state save to this instruction in case the following code fails again. 4253 // (We're going backwards because this loop emulates stack unwinding, not 4254 // the initial scan forward.) 4255 U_ASSERT(fp->fInputIdx > 0); 4256 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4257 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); 4258 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4259 4260 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); 4261 if (prevC == 0x0a && 4262 fp->fInputIdx > backSearchIndex && 4263 twoPrevC == 0x0d) { 4264 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 4265 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 4266 // .*, stepping back over CRLF pair. 4267 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4268 } 4269 } 4270 4271 4272 fp = StateSave(fp, fp->fPatIdx-1, status); 4273 } 4274 break; 4275 4276 4277 4278 default: 4279 // Trouble. The compiled pattern contains an entry with an 4280 // unrecognized type tag. 4281 U_ASSERT(FALSE); 4282 } 4283 4284 if (U_FAILURE(status)) { 4285 isMatch = FALSE; 4286 break; 4287 } 4288 } 4289 4290 breakFromLoop: 4291 fMatch = isMatch; 4292 if (isMatch) { 4293 fLastMatchEnd = fMatchEnd; 4294 fMatchStart = startIdx; 4295 fMatchEnd = fp->fInputIdx; 4296 } 4297 4298 #ifdef REGEX_RUN_DEBUG 4299 if (fTraceDebug) { 4300 if (isMatch) { 4301 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); 4302 } else { 4303 printf("No match\n\n"); 4304 } 4305 } 4306 #endif 4307 4308 fFrame = fp; // The active stack frame when the engine stopped. 4309 // Contains the capture group results that we need to 4310 // access later. 4311 return; 4312 } 4313 4314 4315 //-------------------------------------------------------------------------------- 4316 // 4317 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the 4318 // assumption that the entire string is available in the UText's 4319 // chunk buffer. For now, that means we can use int32_t indexes, 4320 // except for anything that needs to be saved (like group starts 4321 // and ends). 4322 // 4323 // startIdx: begin matching a this index. 4324 // toEnd: if true, match must extend to end of the input region 4325 // 4326 //-------------------------------------------------------------------------------- 4327 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { 4328 UBool isMatch = FALSE; // True if the we have a match. 4329 4330 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards 4331 4332 int32_t op; // Operation from the compiled pattern, split into 4333 int32_t opType; // the opcode 4334 int32_t opValue; // and the operand value. 4335 4336 #ifdef REGEX_RUN_DEBUG 4337 if (fTraceDebug) { 4338 printf("MatchAt(startIdx=%d)\n", startIdx); 4339 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))()); 4340 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))()); 4341 } 4342 #endif 4343 4344 if (U_FAILURE(status)) { 4345 return; 4346 } 4347 4348 // Cache frequently referenced items from the compiled pattern 4349 // 4350 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 4351 4352 const UChar *litText = fPattern->fLiteralText.getBuffer(); 4353 UVector *fSets = fPattern->fSets; 4354 4355 const UChar *inputBuf = fInputText->chunkContents; 4356 4357 fFrameSize = fPattern->fFrameSize; 4358 REStackFrame *fp = resetStack(); 4359 if (U_FAILURE(fDeferredStatus)) { 4360 status = fDeferredStatus; 4361 return; 4362 } 4363 4364 fp->fPatIdx = 0; 4365 fp->fInputIdx = startIdx; 4366 4367 // Zero out the pattern's static data 4368 int32_t i; 4369 for (i = 0; i<fPattern->fDataSize; i++) { 4370 fData[i] = 0; 4371 } 4372 4373 // 4374 // Main loop for interpreting the compiled pattern. 4375 // One iteration of the loop per pattern operation performed. 4376 // 4377 for (;;) { 4378 op = (int32_t)pat[fp->fPatIdx]; 4379 opType = URX_TYPE(op); 4380 opValue = URX_VAL(op); 4381 #ifdef REGEX_RUN_DEBUG 4382 if (fTraceDebug) { 4383 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4384 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 4385 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 4386 fPattern->dumpOp(fp->fPatIdx); 4387 } 4388 #endif 4389 fp->fPatIdx++; 4390 4391 switch (opType) { 4392 4393 4394 case URX_NOP: 4395 break; 4396 4397 4398 case URX_BACKTRACK: 4399 // Force a backtrack. In some circumstances, the pattern compiler 4400 // will notice that the pattern can't possibly match anything, and will 4401 // emit one of these at that point. 4402 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4403 break; 4404 4405 4406 case URX_ONECHAR: 4407 if (fp->fInputIdx < fActiveLimit) { 4408 UChar32 c; 4409 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4410 if (c == opValue) { 4411 break; 4412 } 4413 } else { 4414 fHitEnd = TRUE; 4415 } 4416 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4417 break; 4418 4419 4420 case URX_STRING: 4421 { 4422 // Test input against a literal string. 4423 // Strings require two slots in the compiled pattern, one for the 4424 // offset to the string text, and one for the length. 4425 int32_t stringStartIdx = opValue; 4426 int32_t stringLen; 4427 4428 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 4429 fp->fPatIdx++; 4430 opType = URX_TYPE(op); 4431 stringLen = URX_VAL(op); 4432 U_ASSERT(opType == URX_STRING_LEN); 4433 U_ASSERT(stringLen >= 2); 4434 4435 const UChar * pInp = inputBuf + fp->fInputIdx; 4436 const UChar * pInpLimit = inputBuf + fActiveLimit; 4437 const UChar * pPat = litText+stringStartIdx; 4438 const UChar * pEnd = pInp + stringLen; 4439 UBool success = TRUE; 4440 while (pInp < pEnd) { 4441 if (pInp >= pInpLimit) { 4442 fHitEnd = TRUE; 4443 success = FALSE; 4444 break; 4445 } 4446 if (*pInp++ != *pPat++) { 4447 success = FALSE; 4448 break; 4449 } 4450 } 4451 4452 if (success) { 4453 fp->fInputIdx += stringLen; 4454 } else { 4455 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4456 } 4457 } 4458 break; 4459 4460 4461 case URX_STATE_SAVE: 4462 fp = StateSave(fp, opValue, status); 4463 break; 4464 4465 4466 case URX_END: 4467 // The match loop will exit via this path on a successful match, 4468 // when we reach the end of the pattern. 4469 if (toEnd && fp->fInputIdx != fActiveLimit) { 4470 // The pattern matched, but not to the end of input. Try some more. 4471 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4472 break; 4473 } 4474 isMatch = TRUE; 4475 goto breakFromLoop; 4476 4477 // Start and End Capture stack frame variables are laid out out like this: 4478 // fp->fExtra[opValue] - The start of a completed capture group 4479 // opValue+1 - The end of a completed capture group 4480 // opValue+2 - the start of a capture group whose end 4481 // has not yet been reached (and might not ever be). 4482 case URX_START_CAPTURE: 4483 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4484 fp->fExtra[opValue+2] = fp->fInputIdx; 4485 break; 4486 4487 4488 case URX_END_CAPTURE: 4489 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4490 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 4491 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 4492 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 4493 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 4494 break; 4495 4496 4497 case URX_DOLLAR: // $, test for End of line 4498 // or for position before new line at end of input 4499 if (fp->fInputIdx < fAnchorLimit-2) { 4500 // We are no where near the end of input. Fail. 4501 // This is the common case. Keep it first. 4502 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4503 break; 4504 } 4505 if (fp->fInputIdx >= fAnchorLimit) { 4506 // We really are at the end of input. Success. 4507 fHitEnd = TRUE; 4508 fRequireEnd = TRUE; 4509 break; 4510 } 4511 4512 // If we are positioned just before a new-line that is located at the 4513 // end of input, succeed. 4514 if (fp->fInputIdx == fAnchorLimit-1) { 4515 UChar32 c; 4516 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 4517 4518 if (isLineTerminator(c)) { 4519 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4520 // At new-line at end of input. Success 4521 fHitEnd = TRUE; 4522 fRequireEnd = TRUE; 4523 break; 4524 } 4525 } 4526 } else if (fp->fInputIdx == fAnchorLimit-2 && 4527 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { 4528 fHitEnd = TRUE; 4529 fRequireEnd = TRUE; 4530 break; // At CR/LF at end of input. Success 4531 } 4532 4533 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4534 4535 break; 4536 4537 4538 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 4539 if (fp->fInputIdx >= fAnchorLimit-1) { 4540 // Either at the last character of input, or off the end. 4541 if (fp->fInputIdx == fAnchorLimit-1) { 4542 // At last char of input. Success if it's a new line. 4543 if (inputBuf[fp->fInputIdx] == 0x0a) { 4544 fHitEnd = TRUE; 4545 fRequireEnd = TRUE; 4546 break; 4547 } 4548 } else { 4549 // Off the end of input. Success. 4550 fHitEnd = TRUE; 4551 fRequireEnd = TRUE; 4552 break; 4553 } 4554 } 4555 4556 // Not at end of input. Back-track out. 4557 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4558 break; 4559 4560 4561 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 4562 { 4563 if (fp->fInputIdx >= fAnchorLimit) { 4564 // We really are at the end of input. Success. 4565 fHitEnd = TRUE; 4566 fRequireEnd = TRUE; 4567 break; 4568 } 4569 // If we are positioned just before a new-line, succeed. 4570 // It makes no difference where the new-line is within the input. 4571 UChar32 c = inputBuf[fp->fInputIdx]; 4572 if (isLineTerminator(c)) { 4573 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 4574 // In multi-line mode, hitting a new-line just before the end of input does not 4575 // set the hitEnd or requireEnd flags 4576 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4577 break; 4578 } 4579 } 4580 // not at a new line. Fail. 4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4582 } 4583 break; 4584 4585 4586 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 4587 { 4588 if (fp->fInputIdx >= fAnchorLimit) { 4589 // We really are at the end of input. Success. 4590 fHitEnd = TRUE; 4591 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 4592 break; // adding a new-line would not lose the match. 4593 } 4594 // If we are not positioned just before a new-line, the test fails; backtrack out. 4595 // It makes no difference where the new-line is within the input. 4596 if (inputBuf[fp->fInputIdx] != 0x0a) { 4597 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4598 } 4599 } 4600 break; 4601 4602 4603 case URX_CARET: // ^, test for start of line 4604 if (fp->fInputIdx != fAnchorStart) { 4605 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4606 } 4607 break; 4608 4609 4610 case URX_CARET_M: // ^, test for start of line in mulit-line mode 4611 { 4612 if (fp->fInputIdx == fAnchorStart) { 4613 // We are at the start input. Success. 4614 break; 4615 } 4616 // Check whether character just before the current pos is a new-line 4617 // unless we are at the end of input 4618 UChar c = inputBuf[fp->fInputIdx - 1]; 4619 if ((fp->fInputIdx < fAnchorLimit) && 4620 isLineTerminator(c)) { 4621 // It's a new-line. ^ is true. Success. 4622 // TODO: what should be done with positions between a CR and LF? 4623 break; 4624 } 4625 // Not at the start of a line. Fail. 4626 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4627 } 4628 break; 4629 4630 4631 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 4632 { 4633 U_ASSERT(fp->fInputIdx >= fAnchorStart); 4634 if (fp->fInputIdx <= fAnchorStart) { 4635 // We are at the start input. Success. 4636 break; 4637 } 4638 // Check whether character just before the current pos is a new-line 4639 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 4640 UChar c = inputBuf[fp->fInputIdx - 1]; 4641 if (c != 0x0a) { 4642 // Not at the start of a line. Back-track out. 4643 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4644 } 4645 } 4646 break; 4647 4648 case URX_BACKSLASH_B: // Test for word boundaries 4649 { 4650 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); 4651 success ^= (UBool)(opValue != 0); // flip sense for \B 4652 if (!success) { 4653 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4654 } 4655 } 4656 break; 4657 4658 4659 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 4660 { 4661 UBool success = isUWordBoundary(fp->fInputIdx); 4662 success ^= (UBool)(opValue != 0); // flip sense for \B 4663 if (!success) { 4664 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4665 } 4666 } 4667 break; 4668 4669 4670 case URX_BACKSLASH_D: // Test for decimal digit 4671 { 4672 if (fp->fInputIdx >= fActiveLimit) { 4673 fHitEnd = TRUE; 4674 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4675 break; 4676 } 4677 4678 UChar32 c; 4679 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4680 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 4681 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 4682 success ^= (UBool)(opValue != 0); // flip sense for \D 4683 if (!success) { 4684 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4685 } 4686 } 4687 break; 4688 4689 4690 case URX_BACKSLASH_G: // Test for position at end of previous match 4691 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 4692 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4693 } 4694 break; 4695 4696 4697 case URX_BACKSLASH_H: // Test for \h, horizontal white space. 4698 { 4699 if (fp->fInputIdx >= fActiveLimit) { 4700 fHitEnd = TRUE; 4701 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4702 break; 4703 } 4704 UChar32 c; 4705 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4706 int8_t ctype = u_charType(c); 4707 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB 4708 success ^= (UBool)(opValue != 0); // flip sense for \H 4709 if (!success) { 4710 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4711 } 4712 } 4713 break; 4714 4715 4716 case URX_BACKSLASH_R: // Test for \R, any line break sequence. 4717 { 4718 if (fp->fInputIdx >= fActiveLimit) { 4719 fHitEnd = TRUE; 4720 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4721 break; 4722 } 4723 UChar32 c; 4724 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4725 if (isLineTerminator(c)) { 4726 if (c == 0x0d && fp->fInputIdx < fActiveLimit) { 4727 // Check for CR/LF sequence. Consume both together when found. 4728 UChar c2; 4729 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2); 4730 if (c2 != 0x0a) { 4731 U16_PREV(inputBuf, 0, fp->fInputIdx, c2); 4732 } 4733 } 4734 } else { 4735 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4736 } 4737 } 4738 break; 4739 4740 4741 case URX_BACKSLASH_V: // Any single code point line ending. 4742 { 4743 if (fp->fInputIdx >= fActiveLimit) { 4744 fHitEnd = TRUE; 4745 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4746 break; 4747 } 4748 UChar32 c; 4749 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4750 UBool success = isLineTerminator(c); 4751 success ^= (UBool)(opValue != 0); // flip sense for \V 4752 if (!success) { 4753 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4754 } 4755 } 4756 break; 4757 4758 4759 4760 case URX_BACKSLASH_X: 4761 // Match a Grapheme, as defined by Unicode TR 29. 4762 // Differs slightly from Perl, which consumes combining marks independently 4763 // of context. 4764 { 4765 4766 // Fail if at end of input 4767 if (fp->fInputIdx >= fActiveLimit) { 4768 fHitEnd = TRUE; 4769 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4770 break; 4771 } 4772 4773 // Examine (and consume) the current char. 4774 // Dispatch into a little state machine, based on the char. 4775 UChar32 c; 4776 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4777 UnicodeSet **sets = fPattern->fStaticSets; 4778 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 4779 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 4780 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4781 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4782 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4783 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4784 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4785 goto GC_Extend; 4786 4787 4788 4789 GC_L: 4790 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4791 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4792 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4793 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4794 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4795 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4796 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4797 goto GC_Extend; 4798 4799 GC_V: 4800 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4801 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4802 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4803 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4804 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4805 goto GC_Extend; 4806 4807 GC_T: 4808 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4809 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4810 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4811 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4812 goto GC_Extend; 4813 4814 GC_Extend: 4815 // Combining characters are consumed here 4816 for (;;) { 4817 if (fp->fInputIdx >= fActiveLimit) { 4818 break; 4819 } 4820 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4821 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 4822 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 4823 break; 4824 } 4825 } 4826 goto GC_Done; 4827 4828 GC_Control: 4829 // Most control chars stand alone (don't combine with combining chars), 4830 // except for that CR/LF sequence is a single grapheme cluster. 4831 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { 4832 fp->fInputIdx++; 4833 } 4834 4835 GC_Done: 4836 if (fp->fInputIdx >= fActiveLimit) { 4837 fHitEnd = TRUE; 4838 } 4839 break; 4840 } 4841 4842 4843 4844 4845 case URX_BACKSLASH_Z: // Test for end of Input 4846 if (fp->fInputIdx < fAnchorLimit) { 4847 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4848 } else { 4849 fHitEnd = TRUE; 4850 fRequireEnd = TRUE; 4851 } 4852 break; 4853 4854 4855 4856 case URX_STATIC_SETREF: 4857 { 4858 // Test input character against one of the predefined sets 4859 // (Word Characters, for example) 4860 // The high bit of the op value is a flag for the match polarity. 4861 // 0: success if input char is in set. 4862 // 1: success if input char is not in set. 4863 if (fp->fInputIdx >= fActiveLimit) { 4864 fHitEnd = TRUE; 4865 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4866 break; 4867 } 4868 4869 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 4870 opValue &= ~URX_NEG_SET; 4871 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4872 4873 UChar32 c; 4874 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4875 if (c < 256) { 4876 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4877 if (s8->contains(c)) { 4878 success = !success; 4879 } 4880 } else { 4881 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4882 if (s->contains(c)) { 4883 success = !success; 4884 } 4885 } 4886 if (!success) { 4887 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4888 } 4889 } 4890 break; 4891 4892 4893 case URX_STAT_SETREF_N: 4894 { 4895 // Test input character for NOT being a member of one of 4896 // the predefined sets (Word Characters, for example) 4897 if (fp->fInputIdx >= fActiveLimit) { 4898 fHitEnd = TRUE; 4899 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4900 break; 4901 } 4902 4903 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4904 4905 UChar32 c; 4906 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4907 if (c < 256) { 4908 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4909 if (s8->contains(c) == FALSE) { 4910 break; 4911 } 4912 } else { 4913 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4914 if (s->contains(c) == FALSE) { 4915 break; 4916 } 4917 } 4918 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4919 } 4920 break; 4921 4922 4923 case URX_SETREF: 4924 { 4925 if (fp->fInputIdx >= fActiveLimit) { 4926 fHitEnd = TRUE; 4927 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4928 break; 4929 } 4930 4931 U_ASSERT(opValue > 0 && opValue < fSets->size()); 4932 4933 // There is input left. Pick up one char and test it for set membership. 4934 UChar32 c; 4935 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4936 if (c<256) { 4937 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4938 if (s8->contains(c)) { 4939 // The character is in the set. A Match. 4940 break; 4941 } 4942 } else { 4943 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue); 4944 if (s->contains(c)) { 4945 // The character is in the set. A Match. 4946 break; 4947 } 4948 } 4949 4950 // the character wasn't in the set. 4951 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4952 } 4953 break; 4954 4955 4956 case URX_DOTANY: 4957 { 4958 // . matches anything, but stops at end-of-line. 4959 if (fp->fInputIdx >= fActiveLimit) { 4960 // At end of input. Match failed. Backtrack out. 4961 fHitEnd = TRUE; 4962 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4963 break; 4964 } 4965 4966 // There is input left. Advance over one char, unless we've hit end-of-line 4967 UChar32 c; 4968 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4969 if (isLineTerminator(c)) { 4970 // End of line in normal mode. . does not match. 4971 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4972 break; 4973 } 4974 } 4975 break; 4976 4977 4978 case URX_DOTANY_ALL: 4979 { 4980 // . in dot-matches-all (including new lines) mode 4981 if (fp->fInputIdx >= fActiveLimit) { 4982 // At end of input. Match failed. Backtrack out. 4983 fHitEnd = TRUE; 4984 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4985 break; 4986 } 4987 4988 // There is input left. Advance over one char, except if we are 4989 // at a cr/lf, advance over both of them. 4990 UChar32 c; 4991 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4992 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 4993 // In the case of a CR/LF, we need to advance over both. 4994 if (inputBuf[fp->fInputIdx] == 0x0a) { 4995 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 4996 } 4997 } 4998 } 4999 break; 5000 5001 5002 case URX_DOTANY_UNIX: 5003 { 5004 // '.' operator, matches all, but stops at end-of-line. 5005 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 5006 if (fp->fInputIdx >= fActiveLimit) { 5007 // At end of input. Match failed. Backtrack out. 5008 fHitEnd = TRUE; 5009 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5010 break; 5011 } 5012 5013 // There is input left. Advance over one char, unless we've hit end-of-line 5014 UChar32 c; 5015 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5016 if (c == 0x0a) { 5017 // End of line in normal mode. '.' does not match the \n 5018 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5019 } 5020 } 5021 break; 5022 5023 5024 case URX_JMP: 5025 fp->fPatIdx = opValue; 5026 break; 5027 5028 case URX_FAIL: 5029 isMatch = FALSE; 5030 goto breakFromLoop; 5031 5032 case URX_JMP_SAV: 5033 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 5034 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5035 fp->fPatIdx = opValue; // Then JMP. 5036 break; 5037 5038 case URX_JMP_SAV_X: 5039 // This opcode is used with (x)+, when x can match a zero length string. 5040 // Same as JMP_SAV, except conditional on the match having made forward progress. 5041 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 5042 // data address of the input position at the start of the loop. 5043 { 5044 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 5045 int32_t stoOp = (int32_t)pat[opValue-1]; 5046 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 5047 int32_t frameLoc = URX_VAL(stoOp); 5048 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 5049 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; 5050 U_ASSERT(prevInputIdx <= fp->fInputIdx); 5051 if (prevInputIdx < fp->fInputIdx) { 5052 // The match did make progress. Repeat the loop. 5053 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5054 fp->fPatIdx = opValue; 5055 fp->fExtra[frameLoc] = fp->fInputIdx; 5056 } 5057 // If the input position did not advance, we do nothing here, 5058 // execution will fall out of the loop. 5059 } 5060 break; 5061 5062 case URX_CTR_INIT: 5063 { 5064 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5065 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5066 5067 // Pick up the three extra operands that CTR_INIT has, and 5068 // skip the pattern location counter past 5069 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5070 fp->fPatIdx += 3; 5071 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5072 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5073 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5074 U_ASSERT(minCount>=0); 5075 U_ASSERT(maxCount>=minCount || maxCount==-1); 5076 U_ASSERT(loopLoc>=fp->fPatIdx); 5077 5078 if (minCount == 0) { 5079 fp = StateSave(fp, loopLoc+1, status); 5080 } 5081 if (maxCount == -1) { 5082 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 5083 } else if (maxCount == 0) { 5084 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5085 } 5086 } 5087 break; 5088 5089 case URX_CTR_LOOP: 5090 { 5091 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5092 int32_t initOp = (int32_t)pat[opValue]; 5093 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 5094 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5095 int32_t minCount = (int32_t)pat[opValue+2]; 5096 int32_t maxCount = (int32_t)pat[opValue+3]; 5097 (*pCounter)++; 5098 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 5099 U_ASSERT(*pCounter == maxCount); 5100 break; 5101 } 5102 if (*pCounter >= minCount) { 5103 if (maxCount == -1) { 5104 // Loop has no hard upper bound. 5105 // Check that it is progressing through the input, break if it is not. 5106 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 5107 if (fp->fInputIdx == *pLastInputIdx) { 5108 break; 5109 } else { 5110 *pLastInputIdx = fp->fInputIdx; 5111 } 5112 } 5113 fp = StateSave(fp, fp->fPatIdx, status); 5114 } else { 5115 // Increment time-out counter. (StateSave() does it if count >= minCount) 5116 fTickCounter--; 5117 if (fTickCounter <= 0) { 5118 IncrementTime(status); // Re-initializes fTickCounter 5119 } 5120 } 5121 fp->fPatIdx = opValue + 4; // Loop back. 5122 } 5123 break; 5124 5125 case URX_CTR_INIT_NG: 5126 { 5127 // Initialize a non-greedy loop 5128 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5129 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5130 5131 // Pick up the three extra operands that CTR_INIT_NG has, and 5132 // skip the pattern location counter past 5133 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5134 fp->fPatIdx += 3; 5135 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5136 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5137 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5138 U_ASSERT(minCount>=0); 5139 U_ASSERT(maxCount>=minCount || maxCount==-1); 5140 U_ASSERT(loopLoc>fp->fPatIdx); 5141 if (maxCount == -1) { 5142 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 5143 } 5144 5145 if (minCount == 0) { 5146 if (maxCount != 0) { 5147 fp = StateSave(fp, fp->fPatIdx, status); 5148 } 5149 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 5150 } 5151 } 5152 break; 5153 5154 case URX_CTR_LOOP_NG: 5155 { 5156 // Non-greedy {min, max} loops 5157 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5158 int32_t initOp = (int32_t)pat[opValue]; 5159 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 5160 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5161 int32_t minCount = (int32_t)pat[opValue+2]; 5162 int32_t maxCount = (int32_t)pat[opValue+3]; 5163 5164 (*pCounter)++; 5165 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 5166 // The loop has matched the maximum permitted number of times. 5167 // Break out of here with no action. Matching will 5168 // continue with the following pattern. 5169 U_ASSERT(*pCounter == maxCount); 5170 break; 5171 } 5172 5173 if (*pCounter < minCount) { 5174 // We haven't met the minimum number of matches yet. 5175 // Loop back for another one. 5176 fp->fPatIdx = opValue + 4; // Loop back. 5177 fTickCounter--; 5178 if (fTickCounter <= 0) { 5179 IncrementTime(status); // Re-initializes fTickCounter 5180 } 5181 } else { 5182 // We do have the minimum number of matches. 5183 5184 // If there is no upper bound on the loop iterations, check that the input index 5185 // is progressing, and stop the loop if it is not. 5186 if (maxCount == -1) { 5187 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 5188 if (fp->fInputIdx == *pLastInputIdx) { 5189 break; 5190 } 5191 *pLastInputIdx = fp->fInputIdx; 5192 } 5193 5194 // Loop Continuation: we will fall into the pattern following the loop 5195 // (non-greedy, don't execute loop body first), but first do 5196 // a state save to the top of the loop, so that a match failure 5197 // in the following pattern will try another iteration of the loop. 5198 fp = StateSave(fp, opValue + 4, status); 5199 } 5200 } 5201 break; 5202 5203 case URX_STO_SP: 5204 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5205 fData[opValue] = fStack->size(); 5206 break; 5207 5208 case URX_LD_SP: 5209 { 5210 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5211 int32_t newStackSize = (int32_t)fData[opValue]; 5212 U_ASSERT(newStackSize <= fStack->size()); 5213 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5214 if (newFP == (int64_t *)fp) { 5215 break; 5216 } 5217 int32_t j; 5218 for (j=0; j<fFrameSize; j++) { 5219 newFP[j] = ((int64_t *)fp)[j]; 5220 } 5221 fp = (REStackFrame *)newFP; 5222 fStack->setSize(newStackSize); 5223 } 5224 break; 5225 5226 case URX_BACKREF: 5227 { 5228 U_ASSERT(opValue < fFrameSize); 5229 int64_t groupStartIdx = fp->fExtra[opValue]; 5230 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5231 U_ASSERT(groupStartIdx <= groupEndIdx); 5232 int64_t inputIndex = fp->fInputIdx; 5233 if (groupStartIdx < 0) { 5234 // This capture group has not participated in the match thus far, 5235 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5236 break; 5237 } 5238 UBool success = TRUE; 5239 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) { 5240 if (inputIndex >= fActiveLimit) { 5241 success = FALSE; 5242 fHitEnd = TRUE; 5243 break; 5244 } 5245 if (inputBuf[groupIndex] != inputBuf[inputIndex]) { 5246 success = FALSE; 5247 break; 5248 } 5249 } 5250 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) && 5251 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) { 5252 // Capture group ended with an unpaired lead surrogate. 5253 // Back reference is not permitted to match lead only of a surrogatge pair. 5254 success = FALSE; 5255 } 5256 if (success) { 5257 fp->fInputIdx = inputIndex; 5258 } else { 5259 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5260 } 5261 } 5262 break; 5263 5264 case URX_BACKREF_I: 5265 { 5266 U_ASSERT(opValue < fFrameSize); 5267 int64_t groupStartIdx = fp->fExtra[opValue]; 5268 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5269 U_ASSERT(groupStartIdx <= groupEndIdx); 5270 if (groupStartIdx < 0) { 5271 // This capture group has not participated in the match thus far, 5272 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5273 break; 5274 } 5275 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx); 5276 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit); 5277 5278 // Note: if the capture group match was of an empty string the backref 5279 // match succeeds. Verified by testing: Perl matches succeed 5280 // in this case, so we do too. 5281 5282 UBool success = TRUE; 5283 for (;;) { 5284 UChar32 captureGroupChar = captureGroupItr.next(); 5285 if (captureGroupChar == U_SENTINEL) { 5286 success = TRUE; 5287 break; 5288 } 5289 UChar32 inputChar = inputItr.next(); 5290 if (inputChar == U_SENTINEL) { 5291 success = FALSE; 5292 fHitEnd = TRUE; 5293 break; 5294 } 5295 if (inputChar != captureGroupChar) { 5296 success = FALSE; 5297 break; 5298 } 5299 } 5300 5301 if (success && inputItr.inExpansion()) { 5302 // We otained a match by consuming part of a string obtained from 5303 // case-folding a single code point of the input text. 5304 // This does not count as an overall match. 5305 success = FALSE; 5306 } 5307 5308 if (success) { 5309 fp->fInputIdx = inputItr.getIndex(); 5310 } else { 5311 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5312 } 5313 } 5314 break; 5315 5316 case URX_STO_INP_LOC: 5317 { 5318 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 5319 fp->fExtra[opValue] = fp->fInputIdx; 5320 } 5321 break; 5322 5323 case URX_JMPX: 5324 { 5325 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5326 fp->fPatIdx += 1; 5327 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 5328 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 5329 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; 5330 U_ASSERT(savedInputIdx <= fp->fInputIdx); 5331 if (savedInputIdx < fp->fInputIdx) { 5332 fp->fPatIdx = opValue; // JMP 5333 } else { 5334 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 5335 } 5336 } 5337 break; 5338 5339 case URX_LA_START: 5340 { 5341 // Entering a lookahead block. 5342 // Save Stack Ptr, Input Pos. 5343 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5344 fData[opValue] = fStack->size(); 5345 fData[opValue+1] = fp->fInputIdx; 5346 fActiveStart = fLookStart; // Set the match region change for 5347 fActiveLimit = fLookLimit; // transparent bounds. 5348 } 5349 break; 5350 5351 case URX_LA_END: 5352 { 5353 // Leaving a look-ahead block. 5354 // restore Stack Ptr, Input Pos to positions they had on entry to block. 5355 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5356 int32_t stackSize = fStack->size(); 5357 int32_t newStackSize = (int32_t)fData[opValue]; 5358 U_ASSERT(stackSize >= newStackSize); 5359 if (stackSize > newStackSize) { 5360 // Copy the current top frame back to the new (cut back) top frame. 5361 // This makes the capture groups from within the look-ahead 5362 // expression available. 5363 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5364 int32_t j; 5365 for (j=0; j<fFrameSize; j++) { 5366 newFP[j] = ((int64_t *)fp)[j]; 5367 } 5368 fp = (REStackFrame *)newFP; 5369 fStack->setSize(newStackSize); 5370 } 5371 fp->fInputIdx = fData[opValue+1]; 5372 5373 // Restore the active region bounds in the input string; they may have 5374 // been changed because of transparent bounds on a Region. 5375 fActiveStart = fRegionStart; 5376 fActiveLimit = fRegionLimit; 5377 } 5378 break; 5379 5380 case URX_ONECHAR_I: 5381 if (fp->fInputIdx < fActiveLimit) { 5382 UChar32 c; 5383 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5384 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5385 break; 5386 } 5387 } else { 5388 fHitEnd = TRUE; 5389 } 5390 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5391 break; 5392 5393 case URX_STRING_I: 5394 // Case-insensitive test input against a literal string. 5395 // Strings require two slots in the compiled pattern, one for the 5396 // offset to the string text, and one for the length. 5397 // The compiled string has already been case folded. 5398 { 5399 const UChar *patternString = litText + opValue; 5400 5401 op = (int32_t)pat[fp->fPatIdx]; 5402 fp->fPatIdx++; 5403 opType = URX_TYPE(op); 5404 opValue = URX_VAL(op); 5405 U_ASSERT(opType == URX_STRING_LEN); 5406 int32_t patternStringLen = opValue; // Length of the string from the pattern. 5407 5408 UChar32 cText; 5409 UChar32 cPattern; 5410 UBool success = TRUE; 5411 int32_t patternStringIdx = 0; 5412 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit); 5413 while (patternStringIdx < patternStringLen) { 5414 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 5415 cText = inputIterator.next(); 5416 if (cText != cPattern) { 5417 success = FALSE; 5418 if (cText == U_SENTINEL) { 5419 fHitEnd = TRUE; 5420 } 5421 break; 5422 } 5423 } 5424 if (inputIterator.inExpansion()) { 5425 success = FALSE; 5426 } 5427 5428 if (success) { 5429 fp->fInputIdx = inputIterator.getIndex(); 5430 } else { 5431 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5432 } 5433 } 5434 break; 5435 5436 case URX_LB_START: 5437 { 5438 // Entering a look-behind block. 5439 // Save Stack Ptr, Input Pos. 5440 // TODO: implement transparent bounds. Ticket #6067 5441 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5442 fData[opValue] = fStack->size(); 5443 fData[opValue+1] = fp->fInputIdx; 5444 // Init the variable containing the start index for attempted matches. 5445 fData[opValue+2] = -1; 5446 // Save input string length, then reset to pin any matches to end at 5447 // the current position. 5448 fData[opValue+3] = fActiveLimit; 5449 fActiveLimit = fp->fInputIdx; 5450 } 5451 break; 5452 5453 5454 case URX_LB_CONT: 5455 { 5456 // Positive Look-Behind, at top of loop checking for matches of LB expression 5457 // at all possible input starting positions. 5458 5459 // Fetch the min and max possible match lengths. They are the operands 5460 // of this op in the pattern. 5461 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5462 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5463 U_ASSERT(minML <= maxML); 5464 U_ASSERT(minML >= 0); 5465 5466 // Fetch (from data) the last input index where a match was attempted. 5467 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5468 int64_t &lbStartIdx = fData[opValue+2]; 5469 if (lbStartIdx < 0) { 5470 // First time through loop. 5471 lbStartIdx = fp->fInputIdx - minML; 5472 if (lbStartIdx > 0 && lbStartIdx < fInputLength) { 5473 U16_SET_CP_START(inputBuf, 0, lbStartIdx); 5474 } 5475 } else { 5476 // 2nd through nth time through the loop. 5477 // Back up start position for match by one. 5478 if (lbStartIdx == 0) { 5479 lbStartIdx--; 5480 } else { 5481 U16_BACK_1(inputBuf, 0, lbStartIdx); 5482 } 5483 } 5484 5485 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { 5486 // We have tried all potential match starting points without 5487 // getting a match. Backtrack out, and out of the 5488 // Look Behind altogether. 5489 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5490 int64_t restoreInputLen = fData[opValue+3]; 5491 U_ASSERT(restoreInputLen >= fActiveLimit); 5492 U_ASSERT(restoreInputLen <= fInputLength); 5493 fActiveLimit = restoreInputLen; 5494 break; 5495 } 5496 5497 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5498 // (successful match will fall off the end of the loop.) 5499 fp = StateSave(fp, fp->fPatIdx-3, status); 5500 fp->fInputIdx = lbStartIdx; 5501 } 5502 break; 5503 5504 case URX_LB_END: 5505 // End of a look-behind block, after a successful match. 5506 { 5507 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5508 if (fp->fInputIdx != fActiveLimit) { 5509 // The look-behind expression matched, but the match did not 5510 // extend all the way to the point that we are looking behind from. 5511 // FAIL out of here, which will take us back to the LB_CONT, which 5512 // will retry the match starting at another position or fail 5513 // the look-behind altogether, whichever is appropriate. 5514 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5515 break; 5516 } 5517 5518 // Look-behind match is good. Restore the orignal input string length, 5519 // which had been truncated to pin the end of the lookbehind match to the 5520 // position being looked-behind. 5521 int64_t originalInputLen = fData[opValue+3]; 5522 U_ASSERT(originalInputLen >= fActiveLimit); 5523 U_ASSERT(originalInputLen <= fInputLength); 5524 fActiveLimit = originalInputLen; 5525 } 5526 break; 5527 5528 5529 case URX_LBN_CONT: 5530 { 5531 // Negative Look-Behind, at top of loop checking for matches of LB expression 5532 // at all possible input starting positions. 5533 5534 // Fetch the extra parameters of this op. 5535 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5536 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5537 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 5538 continueLoc = URX_VAL(continueLoc); 5539 U_ASSERT(minML <= maxML); 5540 U_ASSERT(minML >= 0); 5541 U_ASSERT(continueLoc > fp->fPatIdx); 5542 5543 // Fetch (from data) the last input index where a match was attempted. 5544 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5545 int64_t &lbStartIdx = fData[opValue+2]; 5546 if (lbStartIdx < 0) { 5547 // First time through loop. 5548 lbStartIdx = fp->fInputIdx - minML; 5549 if (lbStartIdx > 0 && lbStartIdx < fInputLength) { 5550 U16_SET_CP_START(inputBuf, 0, lbStartIdx); 5551 } 5552 } else { 5553 // 2nd through nth time through the loop. 5554 // Back up start position for match by one. 5555 if (lbStartIdx == 0) { 5556 lbStartIdx--; // Because U16_BACK is unsafe starting at 0. 5557 } else { 5558 U16_BACK_1(inputBuf, 0, lbStartIdx); 5559 } 5560 } 5561 5562 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { 5563 // We have tried all potential match starting points without 5564 // getting a match, which means that the negative lookbehind as 5565 // a whole has succeeded. Jump forward to the continue location 5566 int64_t restoreInputLen = fData[opValue+3]; 5567 U_ASSERT(restoreInputLen >= fActiveLimit); 5568 U_ASSERT(restoreInputLen <= fInputLength); 5569 fActiveLimit = restoreInputLen; 5570 fp->fPatIdx = continueLoc; 5571 break; 5572 } 5573 5574 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5575 // (successful match will cause a FAIL out of the loop altogether.) 5576 fp = StateSave(fp, fp->fPatIdx-4, status); 5577 fp->fInputIdx = lbStartIdx; 5578 } 5579 break; 5580 5581 case URX_LBN_END: 5582 // End of a negative look-behind block, after a successful match. 5583 { 5584 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5585 if (fp->fInputIdx != fActiveLimit) { 5586 // The look-behind expression matched, but the match did not 5587 // extend all the way to the point that we are looking behind from. 5588 // FAIL out of here, which will take us back to the LB_CONT, which 5589 // will retry the match starting at another position or succeed 5590 // the look-behind altogether, whichever is appropriate. 5591 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5592 break; 5593 } 5594 5595 // Look-behind expression matched, which means look-behind test as 5596 // a whole Fails 5597 5598 // Restore the orignal input string length, which had been truncated 5599 // inorder to pin the end of the lookbehind match 5600 // to the position being looked-behind. 5601 int64_t originalInputLen = fData[opValue+3]; 5602 U_ASSERT(originalInputLen >= fActiveLimit); 5603 U_ASSERT(originalInputLen <= fInputLength); 5604 fActiveLimit = originalInputLen; 5605 5606 // Restore original stack position, discarding any state saved 5607 // by the successful pattern match. 5608 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5609 int32_t newStackSize = (int32_t)fData[opValue]; 5610 U_ASSERT(fStack->size() > newStackSize); 5611 fStack->setSize(newStackSize); 5612 5613 // FAIL, which will take control back to someplace 5614 // prior to entering the look-behind test. 5615 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5616 } 5617 break; 5618 5619 5620 case URX_LOOP_SR_I: 5621 // Loop Initialization for the optimized implementation of 5622 // [some character set]* 5623 // This op scans through all matching input. 5624 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5625 { 5626 U_ASSERT(opValue > 0 && opValue < fSets->size()); 5627 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5628 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue); 5629 5630 // Loop through input, until either the input is exhausted or 5631 // we reach a character that is not a member of the set. 5632 int32_t ix = (int32_t)fp->fInputIdx; 5633 for (;;) { 5634 if (ix >= fActiveLimit) { 5635 fHitEnd = TRUE; 5636 break; 5637 } 5638 UChar32 c; 5639 U16_NEXT(inputBuf, ix, fActiveLimit, c); 5640 if (c<256) { 5641 if (s8->contains(c) == FALSE) { 5642 U16_BACK_1(inputBuf, 0, ix); 5643 break; 5644 } 5645 } else { 5646 if (s->contains(c) == FALSE) { 5647 U16_BACK_1(inputBuf, 0, ix); 5648 break; 5649 } 5650 } 5651 } 5652 5653 // If there were no matching characters, skip over the loop altogether. 5654 // The loop doesn't run at all, a * op always succeeds. 5655 if (ix == fp->fInputIdx) { 5656 fp->fPatIdx++; // skip the URX_LOOP_C op. 5657 break; 5658 } 5659 5660 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5661 // must follow. It's operand is the stack location 5662 // that holds the starting input index for the match of this [set]* 5663 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5664 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5665 int32_t stackLoc = URX_VAL(loopcOp); 5666 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5667 fp->fExtra[stackLoc] = fp->fInputIdx; 5668 fp->fInputIdx = ix; 5669 5670 // Save State to the URX_LOOP_C op that follows this one, 5671 // so that match failures in the following code will return to there. 5672 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5673 fp = StateSave(fp, fp->fPatIdx, status); 5674 fp->fPatIdx++; 5675 } 5676 break; 5677 5678 5679 case URX_LOOP_DOT_I: 5680 // Loop Initialization for the optimized implementation of .* 5681 // This op scans through all remaining input. 5682 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5683 { 5684 // Loop through input until the input is exhausted (we reach an end-of-line) 5685 // In DOTALL mode, we can just go straight to the end of the input. 5686 int32_t ix; 5687 if ((opValue & 1) == 1) { 5688 // Dot-matches-All mode. Jump straight to the end of the string. 5689 ix = (int32_t)fActiveLimit; 5690 fHitEnd = TRUE; 5691 } else { 5692 // NOT DOT ALL mode. Line endings do not match '.' 5693 // Scan forward until a line ending or end of input. 5694 ix = (int32_t)fp->fInputIdx; 5695 for (;;) { 5696 if (ix >= fActiveLimit) { 5697 fHitEnd = TRUE; 5698 break; 5699 } 5700 UChar32 c; 5701 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] 5702 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 5703 if ((c == 0x0a) || // 0x0a is newline in both modes. 5704 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 5705 isLineTerminator(c))) { 5706 // char is a line ending. Put the input pos back to the 5707 // line ending char, and exit the scanning loop. 5708 U16_BACK_1(inputBuf, 0, ix); 5709 break; 5710 } 5711 } 5712 } 5713 } 5714 5715 // If there were no matching characters, skip over the loop altogether. 5716 // The loop doesn't run at all, a * op always succeeds. 5717 if (ix == fp->fInputIdx) { 5718 fp->fPatIdx++; // skip the URX_LOOP_C op. 5719 break; 5720 } 5721 5722 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5723 // must follow. It's operand is the stack location 5724 // that holds the starting input index for the match of this .* 5725 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5726 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5727 int32_t stackLoc = URX_VAL(loopcOp); 5728 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5729 fp->fExtra[stackLoc] = fp->fInputIdx; 5730 fp->fInputIdx = ix; 5731 5732 // Save State to the URX_LOOP_C op that follows this one, 5733 // so that match failures in the following code will return to there. 5734 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5735 fp = StateSave(fp, fp->fPatIdx, status); 5736 fp->fPatIdx++; 5737 } 5738 break; 5739 5740 5741 case URX_LOOP_C: 5742 { 5743 U_ASSERT(opValue>=0 && opValue<fFrameSize); 5744 backSearchIndex = (int32_t)fp->fExtra[opValue]; 5745 U_ASSERT(backSearchIndex <= fp->fInputIdx); 5746 if (backSearchIndex == fp->fInputIdx) { 5747 // We've backed up the input idx to the point that the loop started. 5748 // The loop is done. Leave here without saving state. 5749 // Subsequent failures won't come back here. 5750 break; 5751 } 5752 // Set up for the next iteration of the loop, with input index 5753 // backed up by one from the last time through, 5754 // and a state save to this instruction in case the following code fails again. 5755 // (We're going backwards because this loop emulates stack unwinding, not 5756 // the initial scan forward.) 5757 U_ASSERT(fp->fInputIdx > 0); 5758 UChar32 prevC; 5759 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? 5760 5761 if (prevC == 0x0a && 5762 fp->fInputIdx > backSearchIndex && 5763 inputBuf[fp->fInputIdx-1] == 0x0d) { 5764 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 5765 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 5766 // .*, stepping back over CRLF pair. 5767 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 5768 } 5769 } 5770 5771 5772 fp = StateSave(fp, fp->fPatIdx-1, status); 5773 } 5774 break; 5775 5776 5777 5778 default: 5779 // Trouble. The compiled pattern contains an entry with an 5780 // unrecognized type tag. 5781 U_ASSERT(FALSE); 5782 } 5783 5784 if (U_FAILURE(status)) { 5785 isMatch = FALSE; 5786 break; 5787 } 5788 } 5789 5790 breakFromLoop: 5791 fMatch = isMatch; 5792 if (isMatch) { 5793 fLastMatchEnd = fMatchEnd; 5794 fMatchStart = startIdx; 5795 fMatchEnd = fp->fInputIdx; 5796 } 5797 5798 #ifdef REGEX_RUN_DEBUG 5799 if (fTraceDebug) { 5800 if (isMatch) { 5801 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); 5802 } else { 5803 printf("No match\n\n"); 5804 } 5805 } 5806 #endif 5807 5808 fFrame = fp; // The active stack frame when the engine stopped. 5809 // Contains the capture group results that we need to 5810 // access later. 5811 5812 return; 5813 } 5814 5815 5816 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 5817 5818 U_NAMESPACE_END 5819 5820 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 5821 5822