1 /* 2 ************************************************************************** 3 * Copyright (C) 2002-2015 International Business Machines Corporation * 4 * and others. All rights reserved. * 5 ************************************************************************** 6 */ 7 // 8 // file: rematch.cpp 9 // 10 // Contains the implementation of class RegexMatcher, 11 // which is one of the main API classes for the ICU regular expression package. 12 // 13 14 #include "unicode/utypes.h" 15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 16 17 #include "unicode/regex.h" 18 #include "unicode/uniset.h" 19 #include "unicode/uchar.h" 20 #include "unicode/ustring.h" 21 #include "unicode/rbbi.h" 22 #include "unicode/utf.h" 23 #include "unicode/utf16.h" 24 #include "uassert.h" 25 #include "cmemory.h" 26 #include "uvector.h" 27 #include "uvectr32.h" 28 #include "uvectr64.h" 29 #include "regeximp.h" 30 #include "regexst.h" 31 #include "regextxt.h" 32 #include "ucase.h" 33 34 // #include <malloc.h> // Needed for heapcheck testing 35 36 U_NAMESPACE_BEGIN 37 38 // Default limit for the size of the back track stack, to avoid system 39 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. 40 // This value puts ICU's limits higher than most other regexp implementations, 41 // which use recursion rather than the heap, and take more storage per 42 // backtrack point. 43 // 44 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 45 46 // Time limit counter constant. 47 // Time limits for expression evaluation are in terms of quanta of work by 48 // the engine, each of which is 10,000 state saves. 49 // This constant determines that state saves per tick number. 50 static const int32_t TIMER_INITIAL_VALUE = 10000; 51 52 53 // Test for any of the Unicode line terminating characters. 54 static inline UBool isLineTerminator(UChar32 c) { 55 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) { 56 return false; 57 } 58 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029; 59 } 60 61 //----------------------------------------------------------------------------- 62 // 63 // Constructor and Destructor 64 // 65 //----------------------------------------------------------------------------- 66 RegexMatcher::RegexMatcher(const RegexPattern *pat) { 67 fDeferredStatus = U_ZERO_ERROR; 68 init(fDeferredStatus); 69 if (U_FAILURE(fDeferredStatus)) { 70 return; 71 } 72 if (pat==NULL) { 73 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 74 return; 75 } 76 fPattern = pat; 77 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); 78 } 79 80 81 82 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 83 uint32_t flags, UErrorCode &status) { 84 init(status); 85 if (U_FAILURE(status)) { 86 return; 87 } 88 UParseError pe; 89 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 90 fPattern = fPatternOwned; 91 92 UText inputText = UTEXT_INITIALIZER; 93 utext_openConstUnicodeString(&inputText, &input, &status); 94 init2(&inputText, status); 95 utext_close(&inputText); 96 97 fInputUniStrMaybeMutable = TRUE; 98 } 99 100 101 RegexMatcher::RegexMatcher(UText *regexp, UText *input, 102 uint32_t flags, UErrorCode &status) { 103 init(status); 104 if (U_FAILURE(status)) { 105 return; 106 } 107 UParseError pe; 108 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 109 if (U_FAILURE(status)) { 110 return; 111 } 112 113 fPattern = fPatternOwned; 114 init2(input, status); 115 } 116 117 118 RegexMatcher::RegexMatcher(const UnicodeString ®exp, 119 uint32_t flags, UErrorCode &status) { 120 init(status); 121 if (U_FAILURE(status)) { 122 return; 123 } 124 UParseError pe; 125 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 126 if (U_FAILURE(status)) { 127 return; 128 } 129 fPattern = fPatternOwned; 130 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 131 } 132 133 RegexMatcher::RegexMatcher(UText *regexp, 134 uint32_t flags, UErrorCode &status) { 135 init(status); 136 if (U_FAILURE(status)) { 137 return; 138 } 139 UParseError pe; 140 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 141 if (U_FAILURE(status)) { 142 return; 143 } 144 145 fPattern = fPatternOwned; 146 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 147 } 148 149 150 151 152 RegexMatcher::~RegexMatcher() { 153 delete fStack; 154 if (fData != fSmallData) { 155 uprv_free(fData); 156 fData = NULL; 157 } 158 if (fPatternOwned) { 159 delete fPatternOwned; 160 fPatternOwned = NULL; 161 fPattern = NULL; 162 } 163 164 if (fInput) { 165 delete fInput; 166 } 167 if (fInputText) { 168 utext_close(fInputText); 169 } 170 if (fAltInputText) { 171 utext_close(fAltInputText); 172 } 173 174 #if UCONFIG_NO_BREAK_ITERATION==0 175 delete fWordBreakItr; 176 #endif 177 } 178 179 // 180 // init() common initialization for use by all constructors. 181 // Initialize all fields, get the object into a consistent state. 182 // This must be done even when the initial status shows an error, 183 // so that the object is initialized sufficiently well for the destructor 184 // to run safely. 185 // 186 void RegexMatcher::init(UErrorCode &status) { 187 fPattern = NULL; 188 fPatternOwned = NULL; 189 fFrameSize = 0; 190 fRegionStart = 0; 191 fRegionLimit = 0; 192 fAnchorStart = 0; 193 fAnchorLimit = 0; 194 fLookStart = 0; 195 fLookLimit = 0; 196 fActiveStart = 0; 197 fActiveLimit = 0; 198 fTransparentBounds = FALSE; 199 fAnchoringBounds = TRUE; 200 fMatch = FALSE; 201 fMatchStart = 0; 202 fMatchEnd = 0; 203 fLastMatchEnd = -1; 204 fAppendPosition = 0; 205 fHitEnd = FALSE; 206 fRequireEnd = FALSE; 207 fStack = NULL; 208 fFrame = NULL; 209 fTimeLimit = 0; 210 fTime = 0; 211 fTickCounter = 0; 212 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 213 fCallbackFn = NULL; 214 fCallbackContext = NULL; 215 fFindProgressCallbackFn = NULL; 216 fFindProgressCallbackContext = NULL; 217 fTraceDebug = FALSE; 218 fDeferredStatus = status; 219 fData = fSmallData; 220 fWordBreakItr = NULL; 221 222 fStack = NULL; 223 fInputText = NULL; 224 fAltInputText = NULL; 225 fInput = NULL; 226 fInputLength = 0; 227 fInputUniStrMaybeMutable = FALSE; 228 } 229 230 // 231 // init2() Common initialization for use by RegexMatcher constructors, part 2. 232 // This handles the common setup to be done after the Pattern is available. 233 // 234 void RegexMatcher::init2(UText *input, UErrorCode &status) { 235 if (U_FAILURE(status)) { 236 fDeferredStatus = status; 237 return; 238 } 239 240 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { 241 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 242 if (fData == NULL) { 243 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 244 return; 245 } 246 } 247 248 fStack = new UVector64(status); 249 if (fStack == NULL) { 250 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 251 return; 252 } 253 254 reset(input); 255 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 256 if (U_FAILURE(status)) { 257 fDeferredStatus = status; 258 return; 259 } 260 } 261 262 263 static const UChar BACKSLASH = 0x5c; 264 static const UChar DOLLARSIGN = 0x24; 265 static const UChar LEFTBRACKET = 0x7b; 266 static const UChar RIGHTBRACKET = 0x7d; 267 268 //-------------------------------------------------------------------------------- 269 // 270 // appendReplacement 271 // 272 //-------------------------------------------------------------------------------- 273 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 274 const UnicodeString &replacement, 275 UErrorCode &status) { 276 UText replacementText = UTEXT_INITIALIZER; 277 278 utext_openConstUnicodeString(&replacementText, &replacement, &status); 279 if (U_SUCCESS(status)) { 280 UText resultText = UTEXT_INITIALIZER; 281 utext_openUnicodeString(&resultText, &dest, &status); 282 283 if (U_SUCCESS(status)) { 284 appendReplacement(&resultText, &replacementText, status); 285 utext_close(&resultText); 286 } 287 utext_close(&replacementText); 288 } 289 290 return *this; 291 } 292 293 // 294 // appendReplacement, UText mode 295 // 296 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, 297 UText *replacement, 298 UErrorCode &status) { 299 if (U_FAILURE(status)) { 300 return *this; 301 } 302 if (U_FAILURE(fDeferredStatus)) { 303 status = fDeferredStatus; 304 return *this; 305 } 306 if (fMatch == FALSE) { 307 status = U_REGEX_INVALID_STATE; 308 return *this; 309 } 310 311 // Copy input string from the end of previous match to start of current match 312 int64_t destLen = utext_nativeLength(dest); 313 if (fMatchStart > fAppendPosition) { 314 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 315 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 316 (int32_t)(fMatchStart-fAppendPosition), &status); 317 } else { 318 int32_t len16; 319 if (UTEXT_USES_U16(fInputText)) { 320 len16 = (int32_t)(fMatchStart-fAppendPosition); 321 } else { 322 UErrorCode lengthStatus = U_ZERO_ERROR; 323 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); 324 } 325 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 326 if (inputChars == NULL) { 327 status = U_MEMORY_ALLOCATION_ERROR; 328 return *this; 329 } 330 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); 331 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); 332 uprv_free(inputChars); 333 } 334 } 335 fAppendPosition = fMatchEnd; 336 337 338 // scan the replacement text, looking for substitutions ($n) and \escapes. 339 // TODO: optimize this loop by efficiently scanning for '$' or '\', 340 // move entire ranges not containing substitutions. 341 UTEXT_SETNATIVEINDEX(replacement, 0); 342 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) { 343 if (c == BACKSLASH) { 344 // Backslash Escape. Copy the following char out without further checks. 345 // Note: Surrogate pairs don't need any special handling 346 // The second half wont be a '$' or a '\', and 347 // will move to the dest normally on the next 348 // loop iteration. 349 c = UTEXT_CURRENT32(replacement); 350 if (c == U_SENTINEL) { 351 break; 352 } 353 354 if (c==0x55/*U*/ || c==0x75/*u*/) { 355 // We have a \udddd or \Udddddddd escape sequence. 356 int32_t offset = 0; 357 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); 358 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 359 if (escapedChar != (UChar32)0xFFFFFFFF) { 360 if (U_IS_BMP(escapedChar)) { 361 UChar c16 = (UChar)escapedChar; 362 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 363 } else { 364 UChar surrogate[2]; 365 surrogate[0] = U16_LEAD(escapedChar); 366 surrogate[1] = U16_TRAIL(escapedChar); 367 if (U_SUCCESS(status)) { 368 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 369 } 370 } 371 // TODO: Report errors for mal-formed \u escapes? 372 // As this is, the original sequence is output, which may be OK. 373 if (context.lastOffset == offset) { 374 (void)UTEXT_PREVIOUS32(replacement); 375 } else if (context.lastOffset != offset-1) { 376 utext_moveIndex32(replacement, offset - context.lastOffset - 1); 377 } 378 } 379 } else { 380 (void)UTEXT_NEXT32(replacement); 381 // Plain backslash escape. Just put out the escaped character. 382 if (U_IS_BMP(c)) { 383 UChar c16 = (UChar)c; 384 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 385 } else { 386 UChar surrogate[2]; 387 surrogate[0] = U16_LEAD(c); 388 surrogate[1] = U16_TRAIL(c); 389 if (U_SUCCESS(status)) { 390 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 391 } 392 } 393 } 394 } else if (c != DOLLARSIGN) { 395 // Normal char, not a $. Copy it out without further checks. 396 if (U_IS_BMP(c)) { 397 UChar c16 = (UChar)c; 398 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 399 } else { 400 UChar surrogate[2]; 401 surrogate[0] = U16_LEAD(c); 402 surrogate[1] = U16_TRAIL(c); 403 if (U_SUCCESS(status)) { 404 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 405 } 406 } 407 } else { 408 // We've got a $. Pick up a capture group name or number if one follows. 409 // Consume digits so long as the resulting group number <= the number of 410 // number of capture groups in the pattern. 411 412 int32_t groupNum = 0; 413 int32_t numDigits = 0; 414 UChar32 nextChar = utext_current32(replacement); 415 if (nextChar == LEFTBRACKET) { 416 // Scan for a Named Capture Group, ${name}. 417 UnicodeString groupName; 418 utext_next32(replacement); 419 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) { 420 nextChar = utext_next32(replacement); 421 if (nextChar == U_SENTINEL) { 422 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 423 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z 424 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z 425 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9 426 groupName.append(nextChar); 427 } else if (nextChar == RIGHTBRACKET) { 428 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName); 429 if (groupNum == 0) { 430 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 431 } 432 } else { 433 // Character was something other than a name char or a closing '}' 434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 435 } 436 } 437 438 } else if (u_isdigit(nextChar)) { 439 // $n Scan for a capture group number 440 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 441 for (;;) { 442 nextChar = UTEXT_CURRENT32(replacement); 443 if (nextChar == U_SENTINEL) { 444 break; 445 } 446 if (u_isdigit(nextChar) == FALSE) { 447 break; 448 } 449 int32_t nextDigitVal = u_charDigitValue(nextChar); 450 if (groupNum*10 + nextDigitVal > numCaptureGroups) { 451 // Don't consume the next digit if it makes the capture group number too big. 452 if (numDigits == 0) { 453 status = U_INDEX_OUTOFBOUNDS_ERROR; 454 } 455 break; 456 } 457 (void)UTEXT_NEXT32(replacement); 458 groupNum=groupNum*10 + nextDigitVal; 459 ++numDigits; 460 } 461 } else { 462 // $ not followed by capture group name or number. 463 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 464 } 465 466 if (U_SUCCESS(status)) { 467 destLen += appendGroup(groupNum, dest, status); 468 } 469 } // End of $ capture group handling 470 } // End of per-character loop through the replacement string. 471 472 return *this; 473 } 474 475 476 477 //-------------------------------------------------------------------------------- 478 // 479 // appendTail Intended to be used in conjunction with appendReplacement() 480 // To the destination string, append everything following 481 // the last match position from the input string. 482 // 483 // Note: Match ranges do not affect appendTail or appendReplacement 484 // 485 //-------------------------------------------------------------------------------- 486 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 487 UErrorCode status = U_ZERO_ERROR; 488 UText resultText = UTEXT_INITIALIZER; 489 utext_openUnicodeString(&resultText, &dest, &status); 490 491 if (U_SUCCESS(status)) { 492 appendTail(&resultText, status); 493 utext_close(&resultText); 494 } 495 496 return dest; 497 } 498 499 // 500 // appendTail, UText mode 501 // 502 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { 503 if (U_FAILURE(status)) { 504 return dest; 505 } 506 if (U_FAILURE(fDeferredStatus)) { 507 status = fDeferredStatus; 508 return dest; 509 } 510 511 if (fInputLength > fAppendPosition) { 512 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 513 int64_t destLen = utext_nativeLength(dest); 514 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 515 (int32_t)(fInputLength-fAppendPosition), &status); 516 } else { 517 int32_t len16; 518 if (UTEXT_USES_U16(fInputText)) { 519 len16 = (int32_t)(fInputLength-fAppendPosition); 520 } else { 521 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); 522 status = U_ZERO_ERROR; // buffer overflow 523 } 524 525 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); 526 if (inputChars == NULL) { 527 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 528 } else { 529 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 530 int64_t destLen = utext_nativeLength(dest); 531 utext_replace(dest, destLen, destLen, inputChars, len16, &status); 532 uprv_free(inputChars); 533 } 534 } 535 } 536 return dest; 537 } 538 539 540 541 //-------------------------------------------------------------------------------- 542 // 543 // end 544 // 545 //-------------------------------------------------------------------------------- 546 int32_t RegexMatcher::end(UErrorCode &err) const { 547 return end(0, err); 548 } 549 550 int64_t RegexMatcher::end64(UErrorCode &err) const { 551 return end64(0, err); 552 } 553 554 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { 555 if (U_FAILURE(err)) { 556 return -1; 557 } 558 if (fMatch == FALSE) { 559 err = U_REGEX_INVALID_STATE; 560 return -1; 561 } 562 if (group < 0 || group > fPattern->fGroupMap->size()) { 563 err = U_INDEX_OUTOFBOUNDS_ERROR; 564 return -1; 565 } 566 int64_t e = -1; 567 if (group == 0) { 568 e = fMatchEnd; 569 } else { 570 // Get the position within the stack frame of the variables for 571 // this capture group. 572 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 573 U_ASSERT(groupOffset < fPattern->fFrameSize); 574 U_ASSERT(groupOffset >= 0); 575 e = fFrame->fExtra[groupOffset + 1]; 576 } 577 578 return e; 579 } 580 581 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 582 return (int32_t)end64(group, err); 583 } 584 585 //-------------------------------------------------------------------------------- 586 // 587 // findProgressInterrupt This function is called once for each advance in the target 588 // string from the find() function, and calls the user progress callback 589 // function if there is one installed. 590 // 591 // Return: TRUE if the find operation is to be terminated. 592 // FALSE if the find operation is to continue running. 593 // 594 //-------------------------------------------------------------------------------- 595 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) { 596 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) { 597 status = U_REGEX_STOPPED_BY_CALLER; 598 return TRUE; 599 } 600 return FALSE; 601 } 602 603 //-------------------------------------------------------------------------------- 604 // 605 // find() 606 // 607 //-------------------------------------------------------------------------------- 608 UBool RegexMatcher::find() { 609 if (U_FAILURE(fDeferredStatus)) { 610 return FALSE; 611 } 612 UErrorCode status = U_ZERO_ERROR; 613 UBool result = find(status); 614 return result; 615 } 616 617 //-------------------------------------------------------------------------------- 618 // 619 // find() 620 // 621 //-------------------------------------------------------------------------------- 622 UBool RegexMatcher::find(UErrorCode &status) { 623 // Start at the position of the last match end. (Will be zero if the 624 // matcher has been reset.) 625 // 626 if (U_FAILURE(status)) { 627 return FALSE; 628 } 629 if (U_FAILURE(fDeferredStatus)) { 630 status = fDeferredStatus; 631 return FALSE; 632 } 633 634 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 635 return findUsingChunk(status); 636 } 637 638 int64_t startPos = fMatchEnd; 639 if (startPos==0) { 640 startPos = fActiveStart; 641 } 642 643 if (fMatch) { 644 // Save the position of any previous successful match. 645 fLastMatchEnd = fMatchEnd; 646 647 if (fMatchStart == fMatchEnd) { 648 // Previous match had zero length. Move start position up one position 649 // to avoid sending find() into a loop on zero-length matches. 650 if (startPos >= fActiveLimit) { 651 fMatch = FALSE; 652 fHitEnd = TRUE; 653 return FALSE; 654 } 655 UTEXT_SETNATIVEINDEX(fInputText, startPos); 656 (void)UTEXT_NEXT32(fInputText); 657 startPos = UTEXT_GETNATIVEINDEX(fInputText); 658 } 659 } else { 660 if (fLastMatchEnd >= 0) { 661 // A previous find() failed to match. Don't try again. 662 // (without this test, a pattern with a zero-length match 663 // could match again at the end of an input string.) 664 fHitEnd = TRUE; 665 return FALSE; 666 } 667 } 668 669 670 // Compute the position in the input string beyond which a match can not begin, because 671 // the minimum length match would extend past the end of the input. 672 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 673 // Be aware of possible overflows if making changes here. 674 int64_t testStartLimit; 675 if (UTEXT_USES_U16(fInputText)) { 676 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; 677 if (startPos > testStartLimit) { 678 fMatch = FALSE; 679 fHitEnd = TRUE; 680 return FALSE; 681 } 682 } else { 683 // We don't know exactly how long the minimum match length is in native characters. 684 // Treat anything > 0 as 1. 685 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0); 686 } 687 688 UChar32 c; 689 U_ASSERT(startPos >= 0); 690 691 switch (fPattern->fStartType) { 692 case START_NO_INFO: 693 // No optimization was found. 694 // Try a match at each input position. 695 for (;;) { 696 MatchAt(startPos, FALSE, status); 697 if (U_FAILURE(status)) { 698 return FALSE; 699 } 700 if (fMatch) { 701 return TRUE; 702 } 703 if (startPos >= testStartLimit) { 704 fHitEnd = TRUE; 705 return FALSE; 706 } 707 UTEXT_SETNATIVEINDEX(fInputText, startPos); 708 (void)UTEXT_NEXT32(fInputText); 709 startPos = UTEXT_GETNATIVEINDEX(fInputText); 710 // Note that it's perfectly OK for a pattern to have a zero-length 711 // match at the end of a string, so we must make sure that the loop 712 // runs with startPos == testStartLimit the last time through. 713 if (findProgressInterrupt(startPos, status)) 714 return FALSE; 715 } 716 U_ASSERT(FALSE); 717 718 case START_START: 719 // Matches are only possible at the start of the input string 720 // (pattern begins with ^ or \A) 721 if (startPos > fActiveStart) { 722 fMatch = FALSE; 723 return FALSE; 724 } 725 MatchAt(startPos, FALSE, status); 726 if (U_FAILURE(status)) { 727 return FALSE; 728 } 729 return fMatch; 730 731 732 case START_SET: 733 { 734 // Match may start on any char from a pre-computed set. 735 U_ASSERT(fPattern->fMinMatchLen > 0); 736 UTEXT_SETNATIVEINDEX(fInputText, startPos); 737 for (;;) { 738 int64_t pos = startPos; 739 c = UTEXT_NEXT32(fInputText); 740 startPos = UTEXT_GETNATIVEINDEX(fInputText); 741 // c will be -1 (U_SENTINEL) at end of text, in which case we 742 // skip this next block (so we don't have a negative array index) 743 // and handle end of text in the following block. 744 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || 745 (c>=256 && fPattern->fInitialChars->contains(c)))) { 746 MatchAt(pos, FALSE, status); 747 if (U_FAILURE(status)) { 748 return FALSE; 749 } 750 if (fMatch) { 751 return TRUE; 752 } 753 UTEXT_SETNATIVEINDEX(fInputText, pos); 754 } 755 if (startPos > testStartLimit) { 756 fMatch = FALSE; 757 fHitEnd = TRUE; 758 return FALSE; 759 } 760 if (findProgressInterrupt(startPos, status)) 761 return FALSE; 762 } 763 } 764 U_ASSERT(FALSE); 765 766 case START_STRING: 767 case START_CHAR: 768 { 769 // Match starts on exactly one char. 770 U_ASSERT(fPattern->fMinMatchLen > 0); 771 UChar32 theChar = fPattern->fInitialChar; 772 UTEXT_SETNATIVEINDEX(fInputText, startPos); 773 for (;;) { 774 int64_t pos = startPos; 775 c = UTEXT_NEXT32(fInputText); 776 startPos = UTEXT_GETNATIVEINDEX(fInputText); 777 if (c == theChar) { 778 MatchAt(pos, FALSE, status); 779 if (U_FAILURE(status)) { 780 return FALSE; 781 } 782 if (fMatch) { 783 return TRUE; 784 } 785 UTEXT_SETNATIVEINDEX(fInputText, pos); 786 } 787 if (startPos > testStartLimit) { 788 fMatch = FALSE; 789 fHitEnd = TRUE; 790 return FALSE; 791 } 792 if (findProgressInterrupt(startPos, status)) 793 return FALSE; 794 } 795 } 796 U_ASSERT(FALSE); 797 798 case START_LINE: 799 { 800 UChar32 c; 801 if (startPos == fAnchorStart) { 802 MatchAt(startPos, FALSE, status); 803 if (U_FAILURE(status)) { 804 return FALSE; 805 } 806 if (fMatch) { 807 return TRUE; 808 } 809 UTEXT_SETNATIVEINDEX(fInputText, startPos); 810 c = UTEXT_NEXT32(fInputText); 811 startPos = UTEXT_GETNATIVEINDEX(fInputText); 812 } else { 813 UTEXT_SETNATIVEINDEX(fInputText, startPos); 814 c = UTEXT_PREVIOUS32(fInputText); 815 UTEXT_SETNATIVEINDEX(fInputText, startPos); 816 } 817 818 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 819 for (;;) { 820 if (c == 0x0a) { 821 MatchAt(startPos, FALSE, status); 822 if (U_FAILURE(status)) { 823 return FALSE; 824 } 825 if (fMatch) { 826 return TRUE; 827 } 828 UTEXT_SETNATIVEINDEX(fInputText, startPos); 829 } 830 if (startPos >= testStartLimit) { 831 fMatch = FALSE; 832 fHitEnd = TRUE; 833 return FALSE; 834 } 835 c = UTEXT_NEXT32(fInputText); 836 startPos = UTEXT_GETNATIVEINDEX(fInputText); 837 // Note that it's perfectly OK for a pattern to have a zero-length 838 // match at the end of a string, so we must make sure that the loop 839 // runs with startPos == testStartLimit the last time through. 840 if (findProgressInterrupt(startPos, status)) 841 return FALSE; 842 } 843 } else { 844 for (;;) { 845 if (isLineTerminator(c)) { 846 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 847 (void)UTEXT_NEXT32(fInputText); 848 startPos = UTEXT_GETNATIVEINDEX(fInputText); 849 } 850 MatchAt(startPos, FALSE, status); 851 if (U_FAILURE(status)) { 852 return FALSE; 853 } 854 if (fMatch) { 855 return TRUE; 856 } 857 UTEXT_SETNATIVEINDEX(fInputText, startPos); 858 } 859 if (startPos >= testStartLimit) { 860 fMatch = FALSE; 861 fHitEnd = TRUE; 862 return FALSE; 863 } 864 c = UTEXT_NEXT32(fInputText); 865 startPos = UTEXT_GETNATIVEINDEX(fInputText); 866 // Note that it's perfectly OK for a pattern to have a zero-length 867 // match at the end of a string, so we must make sure that the loop 868 // runs with startPos == testStartLimit the last time through. 869 if (findProgressInterrupt(startPos, status)) 870 return FALSE; 871 } 872 } 873 } 874 875 default: 876 U_ASSERT(FALSE); 877 } 878 879 U_ASSERT(FALSE); 880 return FALSE; 881 } 882 883 884 885 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { 886 if (U_FAILURE(status)) { 887 return FALSE; 888 } 889 if (U_FAILURE(fDeferredStatus)) { 890 status = fDeferredStatus; 891 return FALSE; 892 } 893 this->reset(); // Note: Reset() is specified by Java Matcher documentation. 894 // This will reset the region to be the full input length. 895 if (start < 0) { 896 status = U_INDEX_OUTOFBOUNDS_ERROR; 897 return FALSE; 898 } 899 900 int64_t nativeStart = start; 901 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 902 status = U_INDEX_OUTOFBOUNDS_ERROR; 903 return FALSE; 904 } 905 fMatchEnd = nativeStart; 906 return find(status); 907 } 908 909 910 //-------------------------------------------------------------------------------- 911 // 912 // findUsingChunk() -- like find(), but with the advance knowledge that the 913 // entire string is available in the UText's chunk buffer. 914 // 915 //-------------------------------------------------------------------------------- 916 UBool RegexMatcher::findUsingChunk(UErrorCode &status) { 917 // Start at the position of the last match end. (Will be zero if the 918 // matcher has been reset. 919 // 920 921 int32_t startPos = (int32_t)fMatchEnd; 922 if (startPos==0) { 923 startPos = (int32_t)fActiveStart; 924 } 925 926 const UChar *inputBuf = fInputText->chunkContents; 927 928 if (fMatch) { 929 // Save the position of any previous successful match. 930 fLastMatchEnd = fMatchEnd; 931 932 if (fMatchStart == fMatchEnd) { 933 // Previous match had zero length. Move start position up one position 934 // to avoid sending find() into a loop on zero-length matches. 935 if (startPos >= fActiveLimit) { 936 fMatch = FALSE; 937 fHitEnd = TRUE; 938 return FALSE; 939 } 940 U16_FWD_1(inputBuf, startPos, fInputLength); 941 } 942 } else { 943 if (fLastMatchEnd >= 0) { 944 // A previous find() failed to match. Don't try again. 945 // (without this test, a pattern with a zero-length match 946 // could match again at the end of an input string.) 947 fHitEnd = TRUE; 948 return FALSE; 949 } 950 } 951 952 953 // Compute the position in the input string beyond which a match can not begin, because 954 // the minimum length match would extend past the end of the input. 955 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 956 // Be aware of possible overflows if making changes here. 957 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit. 958 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); 959 if (startPos > testLen) { 960 fMatch = FALSE; 961 fHitEnd = TRUE; 962 return FALSE; 963 } 964 965 UChar32 c; 966 U_ASSERT(startPos >= 0); 967 968 switch (fPattern->fStartType) { 969 case START_NO_INFO: 970 // No optimization was found. 971 // Try a match at each input position. 972 for (;;) { 973 MatchChunkAt(startPos, FALSE, status); 974 if (U_FAILURE(status)) { 975 return FALSE; 976 } 977 if (fMatch) { 978 return TRUE; 979 } 980 if (startPos >= testLen) { 981 fHitEnd = TRUE; 982 return FALSE; 983 } 984 U16_FWD_1(inputBuf, startPos, fActiveLimit); 985 // Note that it's perfectly OK for a pattern to have a zero-length 986 // match at the end of a string, so we must make sure that the loop 987 // runs with startPos == testLen the last time through. 988 if (findProgressInterrupt(startPos, status)) 989 return FALSE; 990 } 991 U_ASSERT(FALSE); 992 993 case START_START: 994 // Matches are only possible at the start of the input string 995 // (pattern begins with ^ or \A) 996 if (startPos > fActiveStart) { 997 fMatch = FALSE; 998 return FALSE; 999 } 1000 MatchChunkAt(startPos, FALSE, status); 1001 if (U_FAILURE(status)) { 1002 return FALSE; 1003 } 1004 return fMatch; 1005 1006 1007 case START_SET: 1008 { 1009 // Match may start on any char from a pre-computed set. 1010 U_ASSERT(fPattern->fMinMatchLen > 0); 1011 for (;;) { 1012 int32_t pos = startPos; 1013 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1014 if ((c<256 && fPattern->fInitialChars8->contains(c)) || 1015 (c>=256 && fPattern->fInitialChars->contains(c))) { 1016 MatchChunkAt(pos, FALSE, status); 1017 if (U_FAILURE(status)) { 1018 return FALSE; 1019 } 1020 if (fMatch) { 1021 return TRUE; 1022 } 1023 } 1024 if (startPos > testLen) { 1025 fMatch = FALSE; 1026 fHitEnd = TRUE; 1027 return FALSE; 1028 } 1029 if (findProgressInterrupt(startPos, status)) 1030 return FALSE; 1031 } 1032 } 1033 U_ASSERT(FALSE); 1034 1035 case START_STRING: 1036 case START_CHAR: 1037 { 1038 // Match starts on exactly one char. 1039 U_ASSERT(fPattern->fMinMatchLen > 0); 1040 UChar32 theChar = fPattern->fInitialChar; 1041 for (;;) { 1042 int32_t pos = startPos; 1043 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1044 if (c == theChar) { 1045 MatchChunkAt(pos, FALSE, status); 1046 if (U_FAILURE(status)) { 1047 return FALSE; 1048 } 1049 if (fMatch) { 1050 return TRUE; 1051 } 1052 } 1053 if (startPos > testLen) { 1054 fMatch = FALSE; 1055 fHitEnd = TRUE; 1056 return FALSE; 1057 } 1058 if (findProgressInterrupt(startPos, status)) 1059 return FALSE; 1060 } 1061 } 1062 U_ASSERT(FALSE); 1063 1064 case START_LINE: 1065 { 1066 UChar32 c; 1067 if (startPos == fAnchorStart) { 1068 MatchChunkAt(startPos, FALSE, status); 1069 if (U_FAILURE(status)) { 1070 return FALSE; 1071 } 1072 if (fMatch) { 1073 return TRUE; 1074 } 1075 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1076 } 1077 1078 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1079 for (;;) { 1080 c = inputBuf[startPos-1]; 1081 if (c == 0x0a) { 1082 MatchChunkAt(startPos, FALSE, status); 1083 if (U_FAILURE(status)) { 1084 return FALSE; 1085 } 1086 if (fMatch) { 1087 return TRUE; 1088 } 1089 } 1090 if (startPos >= testLen) { 1091 fMatch = FALSE; 1092 fHitEnd = TRUE; 1093 return FALSE; 1094 } 1095 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1096 // Note that it's perfectly OK for a pattern to have a zero-length 1097 // match at the end of a string, so we must make sure that the loop 1098 // runs with startPos == testLen the last time through. 1099 if (findProgressInterrupt(startPos, status)) 1100 return FALSE; 1101 } 1102 } else { 1103 for (;;) { 1104 c = inputBuf[startPos-1]; 1105 if (isLineTerminator(c)) { 1106 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { 1107 startPos++; 1108 } 1109 MatchChunkAt(startPos, FALSE, status); 1110 if (U_FAILURE(status)) { 1111 return FALSE; 1112 } 1113 if (fMatch) { 1114 return TRUE; 1115 } 1116 } 1117 if (startPos >= testLen) { 1118 fMatch = FALSE; 1119 fHitEnd = TRUE; 1120 return FALSE; 1121 } 1122 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1123 // Note that it's perfectly OK for a pattern to have a zero-length 1124 // match at the end of a string, so we must make sure that the loop 1125 // runs with startPos == testLen the last time through. 1126 if (findProgressInterrupt(startPos, status)) 1127 return FALSE; 1128 } 1129 } 1130 } 1131 1132 default: 1133 U_ASSERT(FALSE); 1134 } 1135 1136 U_ASSERT(FALSE); 1137 return FALSE; 1138 } 1139 1140 1141 1142 //-------------------------------------------------------------------------------- 1143 // 1144 // group() 1145 // 1146 //-------------------------------------------------------------------------------- 1147 UnicodeString RegexMatcher::group(UErrorCode &status) const { 1148 return group(0, status); 1149 } 1150 1151 // Return immutable shallow clone 1152 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { 1153 return group(0, dest, group_len, status); 1154 } 1155 1156 // Return immutable shallow clone 1157 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { 1158 group_len = 0; 1159 if (U_FAILURE(status)) { 1160 return dest; 1161 } 1162 if (U_FAILURE(fDeferredStatus)) { 1163 status = fDeferredStatus; 1164 } else if (fMatch == FALSE) { 1165 status = U_REGEX_INVALID_STATE; 1166 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1167 status = U_INDEX_OUTOFBOUNDS_ERROR; 1168 } 1169 1170 if (U_FAILURE(status)) { 1171 return dest; 1172 } 1173 1174 int64_t s, e; 1175 if (groupNum == 0) { 1176 s = fMatchStart; 1177 e = fMatchEnd; 1178 } else { 1179 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1180 U_ASSERT(groupOffset < fPattern->fFrameSize); 1181 U_ASSERT(groupOffset >= 0); 1182 s = fFrame->fExtra[groupOffset]; 1183 e = fFrame->fExtra[groupOffset+1]; 1184 } 1185 1186 if (s < 0) { 1187 // A capture group wasn't part of the match 1188 return utext_clone(dest, fInputText, FALSE, TRUE, &status); 1189 } 1190 U_ASSERT(s <= e); 1191 group_len = e - s; 1192 1193 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); 1194 if (dest) 1195 UTEXT_SETNATIVEINDEX(dest, s); 1196 return dest; 1197 } 1198 1199 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1200 UnicodeString result; 1201 int64_t groupStart = start64(groupNum, status); 1202 int64_t groupEnd = end64(groupNum, status); 1203 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) { 1204 return result; 1205 } 1206 1207 // Get the group length using a utext_extract preflight. 1208 // UText is actually pretty efficient at this when underlying encoding is UTF-16. 1209 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status); 1210 if (status != U_BUFFER_OVERFLOW_ERROR) { 1211 return result; 1212 } 1213 1214 status = U_ZERO_ERROR; 1215 UChar *buf = result.getBuffer(length); 1216 if (buf == NULL) { 1217 status = U_MEMORY_ALLOCATION_ERROR; 1218 } else { 1219 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status); 1220 result.releaseBuffer(extractLength); 1221 U_ASSERT(length == extractLength); 1222 } 1223 return result; 1224 } 1225 1226 1227 //-------------------------------------------------------------------------------- 1228 // 1229 // appendGroup() -- currently internal only, appends a group to a UText rather 1230 // than replacing its contents 1231 // 1232 //-------------------------------------------------------------------------------- 1233 1234 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { 1235 if (U_FAILURE(status)) { 1236 return 0; 1237 } 1238 if (U_FAILURE(fDeferredStatus)) { 1239 status = fDeferredStatus; 1240 return 0; 1241 } 1242 int64_t destLen = utext_nativeLength(dest); 1243 1244 if (fMatch == FALSE) { 1245 status = U_REGEX_INVALID_STATE; 1246 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1247 } 1248 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1249 status = U_INDEX_OUTOFBOUNDS_ERROR; 1250 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1251 } 1252 1253 int64_t s, e; 1254 if (groupNum == 0) { 1255 s = fMatchStart; 1256 e = fMatchEnd; 1257 } else { 1258 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1259 U_ASSERT(groupOffset < fPattern->fFrameSize); 1260 U_ASSERT(groupOffset >= 0); 1261 s = fFrame->fExtra[groupOffset]; 1262 e = fFrame->fExtra[groupOffset+1]; 1263 } 1264 1265 if (s < 0) { 1266 // A capture group wasn't part of the match 1267 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1268 } 1269 U_ASSERT(s <= e); 1270 1271 int64_t deltaLen; 1272 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1273 U_ASSERT(e <= fInputLength); 1274 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); 1275 } else { 1276 int32_t len16; 1277 if (UTEXT_USES_U16(fInputText)) { 1278 len16 = (int32_t)(e-s); 1279 } else { 1280 UErrorCode lengthStatus = U_ZERO_ERROR; 1281 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1282 } 1283 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1284 if (groupChars == NULL) { 1285 status = U_MEMORY_ALLOCATION_ERROR; 1286 return 0; 1287 } 1288 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1289 1290 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); 1291 uprv_free(groupChars); 1292 } 1293 return deltaLen; 1294 } 1295 1296 1297 1298 //-------------------------------------------------------------------------------- 1299 // 1300 // groupCount() 1301 // 1302 //-------------------------------------------------------------------------------- 1303 int32_t RegexMatcher::groupCount() const { 1304 return fPattern->fGroupMap->size(); 1305 } 1306 1307 //-------------------------------------------------------------------------------- 1308 // 1309 // hasAnchoringBounds() 1310 // 1311 //-------------------------------------------------------------------------------- 1312 UBool RegexMatcher::hasAnchoringBounds() const { 1313 return fAnchoringBounds; 1314 } 1315 1316 1317 //-------------------------------------------------------------------------------- 1318 // 1319 // hasTransparentBounds() 1320 // 1321 //-------------------------------------------------------------------------------- 1322 UBool RegexMatcher::hasTransparentBounds() const { 1323 return fTransparentBounds; 1324 } 1325 1326 1327 1328 //-------------------------------------------------------------------------------- 1329 // 1330 // hitEnd() 1331 // 1332 //-------------------------------------------------------------------------------- 1333 UBool RegexMatcher::hitEnd() const { 1334 return fHitEnd; 1335 } 1336 1337 1338 //-------------------------------------------------------------------------------- 1339 // 1340 // input() 1341 // 1342 //-------------------------------------------------------------------------------- 1343 const UnicodeString &RegexMatcher::input() const { 1344 if (!fInput) { 1345 UErrorCode status = U_ZERO_ERROR; 1346 int32_t len16; 1347 if (UTEXT_USES_U16(fInputText)) { 1348 len16 = (int32_t)fInputLength; 1349 } else { 1350 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); 1351 status = U_ZERO_ERROR; // overflow, length status 1352 } 1353 UnicodeString *result = new UnicodeString(len16, 0, 0); 1354 1355 UChar *inputChars = result->getBuffer(len16); 1356 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning 1357 result->releaseBuffer(len16); 1358 1359 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= 1360 } 1361 1362 return *fInput; 1363 } 1364 1365 //-------------------------------------------------------------------------------- 1366 // 1367 // inputText() 1368 // 1369 //-------------------------------------------------------------------------------- 1370 UText *RegexMatcher::inputText() const { 1371 return fInputText; 1372 } 1373 1374 1375 //-------------------------------------------------------------------------------- 1376 // 1377 // getInput() -- like inputText(), but makes a clone or copies into another UText 1378 // 1379 //-------------------------------------------------------------------------------- 1380 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { 1381 if (U_FAILURE(status)) { 1382 return dest; 1383 } 1384 if (U_FAILURE(fDeferredStatus)) { 1385 status = fDeferredStatus; 1386 return dest; 1387 } 1388 1389 if (dest) { 1390 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1391 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); 1392 } else { 1393 int32_t input16Len; 1394 if (UTEXT_USES_U16(fInputText)) { 1395 input16Len = (int32_t)fInputLength; 1396 } else { 1397 UErrorCode lengthStatus = U_ZERO_ERROR; 1398 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error 1399 } 1400 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); 1401 if (inputChars == NULL) { 1402 return dest; 1403 } 1404 1405 status = U_ZERO_ERROR; 1406 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning 1407 status = U_ZERO_ERROR; 1408 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); 1409 1410 uprv_free(inputChars); 1411 } 1412 return dest; 1413 } else { 1414 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1415 } 1416 } 1417 1418 1419 static UBool compat_SyncMutableUTextContents(UText *ut); 1420 static UBool compat_SyncMutableUTextContents(UText *ut) { 1421 UBool retVal = FALSE; 1422 1423 // In the following test, we're really only interested in whether the UText should switch 1424 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents 1425 // will still point to the correct data. 1426 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { 1427 UnicodeString *us=(UnicodeString *)ut->context; 1428 1429 // Update to the latest length. 1430 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). 1431 int32_t newLength = us->length(); 1432 1433 // Update the chunk description. 1434 // The buffer may have switched between stack- and heap-based. 1435 ut->chunkContents = us->getBuffer(); 1436 ut->chunkLength = newLength; 1437 ut->chunkNativeLimit = newLength; 1438 ut->nativeIndexingLimit = newLength; 1439 retVal = TRUE; 1440 } 1441 1442 return retVal; 1443 } 1444 1445 //-------------------------------------------------------------------------------- 1446 // 1447 // lookingAt() 1448 // 1449 //-------------------------------------------------------------------------------- 1450 UBool RegexMatcher::lookingAt(UErrorCode &status) { 1451 if (U_FAILURE(status)) { 1452 return FALSE; 1453 } 1454 if (U_FAILURE(fDeferredStatus)) { 1455 status = fDeferredStatus; 1456 return FALSE; 1457 } 1458 1459 if (fInputUniStrMaybeMutable) { 1460 if (compat_SyncMutableUTextContents(fInputText)) { 1461 fInputLength = utext_nativeLength(fInputText); 1462 reset(); 1463 } 1464 } 1465 else { 1466 resetPreserveRegion(); 1467 } 1468 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1469 MatchChunkAt((int32_t)fActiveStart, FALSE, status); 1470 } else { 1471 MatchAt(fActiveStart, FALSE, status); 1472 } 1473 return fMatch; 1474 } 1475 1476 1477 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { 1478 if (U_FAILURE(status)) { 1479 return FALSE; 1480 } 1481 if (U_FAILURE(fDeferredStatus)) { 1482 status = fDeferredStatus; 1483 return FALSE; 1484 } 1485 reset(); 1486 1487 if (start < 0) { 1488 status = U_INDEX_OUTOFBOUNDS_ERROR; 1489 return FALSE; 1490 } 1491 1492 if (fInputUniStrMaybeMutable) { 1493 if (compat_SyncMutableUTextContents(fInputText)) { 1494 fInputLength = utext_nativeLength(fInputText); 1495 reset(); 1496 } 1497 } 1498 1499 int64_t nativeStart; 1500 nativeStart = start; 1501 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1502 status = U_INDEX_OUTOFBOUNDS_ERROR; 1503 return FALSE; 1504 } 1505 1506 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1507 MatchChunkAt((int32_t)nativeStart, FALSE, status); 1508 } else { 1509 MatchAt(nativeStart, FALSE, status); 1510 } 1511 return fMatch; 1512 } 1513 1514 1515 1516 //-------------------------------------------------------------------------------- 1517 // 1518 // matches() 1519 // 1520 //-------------------------------------------------------------------------------- 1521 UBool RegexMatcher::matches(UErrorCode &status) { 1522 if (U_FAILURE(status)) { 1523 return FALSE; 1524 } 1525 if (U_FAILURE(fDeferredStatus)) { 1526 status = fDeferredStatus; 1527 return FALSE; 1528 } 1529 1530 if (fInputUniStrMaybeMutable) { 1531 if (compat_SyncMutableUTextContents(fInputText)) { 1532 fInputLength = utext_nativeLength(fInputText); 1533 reset(); 1534 } 1535 } 1536 else { 1537 resetPreserveRegion(); 1538 } 1539 1540 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1541 MatchChunkAt((int32_t)fActiveStart, TRUE, status); 1542 } else { 1543 MatchAt(fActiveStart, TRUE, status); 1544 } 1545 return fMatch; 1546 } 1547 1548 1549 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { 1550 if (U_FAILURE(status)) { 1551 return FALSE; 1552 } 1553 if (U_FAILURE(fDeferredStatus)) { 1554 status = fDeferredStatus; 1555 return FALSE; 1556 } 1557 reset(); 1558 1559 if (start < 0) { 1560 status = U_INDEX_OUTOFBOUNDS_ERROR; 1561 return FALSE; 1562 } 1563 1564 if (fInputUniStrMaybeMutable) { 1565 if (compat_SyncMutableUTextContents(fInputText)) { 1566 fInputLength = utext_nativeLength(fInputText); 1567 reset(); 1568 } 1569 } 1570 1571 int64_t nativeStart; 1572 nativeStart = start; 1573 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1574 status = U_INDEX_OUTOFBOUNDS_ERROR; 1575 return FALSE; 1576 } 1577 1578 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1579 MatchChunkAt((int32_t)nativeStart, TRUE, status); 1580 } else { 1581 MatchAt(nativeStart, TRUE, status); 1582 } 1583 return fMatch; 1584 } 1585 1586 1587 1588 //-------------------------------------------------------------------------------- 1589 // 1590 // pattern 1591 // 1592 //-------------------------------------------------------------------------------- 1593 const RegexPattern &RegexMatcher::pattern() const { 1594 return *fPattern; 1595 } 1596 1597 1598 1599 //-------------------------------------------------------------------------------- 1600 // 1601 // region 1602 // 1603 //-------------------------------------------------------------------------------- 1604 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { 1605 if (U_FAILURE(status)) { 1606 return *this; 1607 } 1608 1609 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { 1610 status = U_ILLEGAL_ARGUMENT_ERROR; 1611 } 1612 1613 int64_t nativeStart = regionStart; 1614 int64_t nativeLimit = regionLimit; 1615 if (nativeStart > fInputLength || nativeLimit > fInputLength) { 1616 status = U_ILLEGAL_ARGUMENT_ERROR; 1617 } 1618 1619 if (startIndex == -1) 1620 this->reset(); 1621 else 1622 resetPreserveRegion(); 1623 1624 fRegionStart = nativeStart; 1625 fRegionLimit = nativeLimit; 1626 fActiveStart = nativeStart; 1627 fActiveLimit = nativeLimit; 1628 1629 if (startIndex != -1) { 1630 if (startIndex < fActiveStart || startIndex > fActiveLimit) { 1631 status = U_INDEX_OUTOFBOUNDS_ERROR; 1632 } 1633 fMatchEnd = startIndex; 1634 } 1635 1636 if (!fTransparentBounds) { 1637 fLookStart = nativeStart; 1638 fLookLimit = nativeLimit; 1639 } 1640 if (fAnchoringBounds) { 1641 fAnchorStart = nativeStart; 1642 fAnchorLimit = nativeLimit; 1643 } 1644 return *this; 1645 } 1646 1647 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { 1648 return region(start, limit, -1, status); 1649 } 1650 1651 //-------------------------------------------------------------------------------- 1652 // 1653 // regionEnd 1654 // 1655 //-------------------------------------------------------------------------------- 1656 int32_t RegexMatcher::regionEnd() const { 1657 return (int32_t)fRegionLimit; 1658 } 1659 1660 int64_t RegexMatcher::regionEnd64() const { 1661 return fRegionLimit; 1662 } 1663 1664 //-------------------------------------------------------------------------------- 1665 // 1666 // regionStart 1667 // 1668 //-------------------------------------------------------------------------------- 1669 int32_t RegexMatcher::regionStart() const { 1670 return (int32_t)fRegionStart; 1671 } 1672 1673 int64_t RegexMatcher::regionStart64() const { 1674 return fRegionStart; 1675 } 1676 1677 1678 //-------------------------------------------------------------------------------- 1679 // 1680 // replaceAll 1681 // 1682 //-------------------------------------------------------------------------------- 1683 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { 1684 UText replacementText = UTEXT_INITIALIZER; 1685 UText resultText = UTEXT_INITIALIZER; 1686 UnicodeString resultString; 1687 if (U_FAILURE(status)) { 1688 return resultString; 1689 } 1690 1691 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1692 utext_openUnicodeString(&resultText, &resultString, &status); 1693 1694 replaceAll(&replacementText, &resultText, status); 1695 1696 utext_close(&resultText); 1697 utext_close(&replacementText); 1698 1699 return resultString; 1700 } 1701 1702 1703 // 1704 // replaceAll, UText mode 1705 // 1706 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { 1707 if (U_FAILURE(status)) { 1708 return dest; 1709 } 1710 if (U_FAILURE(fDeferredStatus)) { 1711 status = fDeferredStatus; 1712 return dest; 1713 } 1714 1715 if (dest == NULL) { 1716 UnicodeString emptyString; 1717 UText empty = UTEXT_INITIALIZER; 1718 1719 utext_openUnicodeString(&empty, &emptyString, &status); 1720 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1721 utext_close(&empty); 1722 } 1723 1724 if (U_SUCCESS(status)) { 1725 reset(); 1726 while (find()) { 1727 appendReplacement(dest, replacement, status); 1728 if (U_FAILURE(status)) { 1729 break; 1730 } 1731 } 1732 appendTail(dest, status); 1733 } 1734 1735 return dest; 1736 } 1737 1738 1739 //-------------------------------------------------------------------------------- 1740 // 1741 // replaceFirst 1742 // 1743 //-------------------------------------------------------------------------------- 1744 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { 1745 UText replacementText = UTEXT_INITIALIZER; 1746 UText resultText = UTEXT_INITIALIZER; 1747 UnicodeString resultString; 1748 1749 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1750 utext_openUnicodeString(&resultText, &resultString, &status); 1751 1752 replaceFirst(&replacementText, &resultText, status); 1753 1754 utext_close(&resultText); 1755 utext_close(&replacementText); 1756 1757 return resultString; 1758 } 1759 1760 // 1761 // replaceFirst, UText mode 1762 // 1763 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { 1764 if (U_FAILURE(status)) { 1765 return dest; 1766 } 1767 if (U_FAILURE(fDeferredStatus)) { 1768 status = fDeferredStatus; 1769 return dest; 1770 } 1771 1772 reset(); 1773 if (!find()) { 1774 return getInput(dest, status); 1775 } 1776 1777 if (dest == NULL) { 1778 UnicodeString emptyString; 1779 UText empty = UTEXT_INITIALIZER; 1780 1781 utext_openUnicodeString(&empty, &emptyString, &status); 1782 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1783 utext_close(&empty); 1784 } 1785 1786 appendReplacement(dest, replacement, status); 1787 appendTail(dest, status); 1788 1789 return dest; 1790 } 1791 1792 1793 //-------------------------------------------------------------------------------- 1794 // 1795 // requireEnd 1796 // 1797 //-------------------------------------------------------------------------------- 1798 UBool RegexMatcher::requireEnd() const { 1799 return fRequireEnd; 1800 } 1801 1802 1803 //-------------------------------------------------------------------------------- 1804 // 1805 // reset 1806 // 1807 //-------------------------------------------------------------------------------- 1808 RegexMatcher &RegexMatcher::reset() { 1809 fRegionStart = 0; 1810 fRegionLimit = fInputLength; 1811 fActiveStart = 0; 1812 fActiveLimit = fInputLength; 1813 fAnchorStart = 0; 1814 fAnchorLimit = fInputLength; 1815 fLookStart = 0; 1816 fLookLimit = fInputLength; 1817 resetPreserveRegion(); 1818 return *this; 1819 } 1820 1821 1822 1823 void RegexMatcher::resetPreserveRegion() { 1824 fMatchStart = 0; 1825 fMatchEnd = 0; 1826 fLastMatchEnd = -1; 1827 fAppendPosition = 0; 1828 fMatch = FALSE; 1829 fHitEnd = FALSE; 1830 fRequireEnd = FALSE; 1831 fTime = 0; 1832 fTickCounter = TIMER_INITIAL_VALUE; 1833 //resetStack(); // more expensive than it looks... 1834 } 1835 1836 1837 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1838 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); 1839 if (fPattern->fNeedsAltInput) { 1840 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1841 } 1842 if (U_FAILURE(fDeferredStatus)) { 1843 return *this; 1844 } 1845 fInputLength = utext_nativeLength(fInputText); 1846 1847 reset(); 1848 delete fInput; 1849 fInput = NULL; 1850 1851 // Do the following for any UnicodeString. 1852 // This is for compatibility for those clients who modify the input string "live" during regex operations. 1853 fInputUniStrMaybeMutable = TRUE; 1854 1855 if (fWordBreakItr != NULL) { 1856 #if UCONFIG_NO_BREAK_ITERATION==0 1857 UErrorCode status = U_ZERO_ERROR; 1858 fWordBreakItr->setText(fInputText, status); 1859 #endif 1860 } 1861 return *this; 1862 } 1863 1864 1865 RegexMatcher &RegexMatcher::reset(UText *input) { 1866 if (fInputText != input) { 1867 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); 1868 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1869 if (U_FAILURE(fDeferredStatus)) { 1870 return *this; 1871 } 1872 fInputLength = utext_nativeLength(fInputText); 1873 1874 delete fInput; 1875 fInput = NULL; 1876 1877 if (fWordBreakItr != NULL) { 1878 #if UCONFIG_NO_BREAK_ITERATION==0 1879 UErrorCode status = U_ZERO_ERROR; 1880 fWordBreakItr->setText(input, status); 1881 #endif 1882 } 1883 } 1884 reset(); 1885 fInputUniStrMaybeMutable = FALSE; 1886 1887 return *this; 1888 } 1889 1890 /*RegexMatcher &RegexMatcher::reset(const UChar *) { 1891 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 1892 return *this; 1893 }*/ 1894 1895 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { 1896 if (U_FAILURE(status)) { 1897 return *this; 1898 } 1899 reset(); // Reset also resets the region to be the entire string. 1900 1901 if (position < 0 || position > fActiveLimit) { 1902 status = U_INDEX_OUTOFBOUNDS_ERROR; 1903 return *this; 1904 } 1905 fMatchEnd = position; 1906 return *this; 1907 } 1908 1909 1910 //-------------------------------------------------------------------------------- 1911 // 1912 // refresh 1913 // 1914 //-------------------------------------------------------------------------------- 1915 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { 1916 if (U_FAILURE(status)) { 1917 return *this; 1918 } 1919 if (input == NULL) { 1920 status = U_ILLEGAL_ARGUMENT_ERROR; 1921 return *this; 1922 } 1923 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { 1924 status = U_ILLEGAL_ARGUMENT_ERROR; 1925 return *this; 1926 } 1927 int64_t pos = utext_getNativeIndex(fInputText); 1928 // Shallow read-only clone of the new UText into the existing input UText 1929 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status); 1930 if (U_FAILURE(status)) { 1931 return *this; 1932 } 1933 utext_setNativeIndex(fInputText, pos); 1934 1935 if (fAltInputText != NULL) { 1936 pos = utext_getNativeIndex(fAltInputText); 1937 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status); 1938 if (U_FAILURE(status)) { 1939 return *this; 1940 } 1941 utext_setNativeIndex(fAltInputText, pos); 1942 } 1943 return *this; 1944 } 1945 1946 1947 1948 //-------------------------------------------------------------------------------- 1949 // 1950 // setTrace 1951 // 1952 //-------------------------------------------------------------------------------- 1953 void RegexMatcher::setTrace(UBool state) { 1954 fTraceDebug = state; 1955 } 1956 1957 1958 1959 /** 1960 * UText, replace entire contents of the destination UText with a substring of the source UText. 1961 * 1962 * @param src The source UText 1963 * @param dest The destination UText. Must be writable. 1964 * May be NULL, in which case a new UText will be allocated. 1965 * @param start Start index of source substring. 1966 * @param limit Limit index of source substring. 1967 * @param status An error code. 1968 */ 1969 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) { 1970 if (U_FAILURE(*status)) { 1971 return dest; 1972 } 1973 if (start == limit) { 1974 if (dest) { 1975 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status); 1976 return dest; 1977 } else { 1978 return utext_openUChars(NULL, NULL, 0, status); 1979 } 1980 } 1981 int32_t length = utext_extract(src, start, limit, NULL, 0, status); 1982 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { 1983 return dest; 1984 } 1985 *status = U_ZERO_ERROR; 1986 MaybeStackArray<UChar, 40> buffer; 1987 if (length >= buffer.getCapacity()) { 1988 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul. 1989 if (newBuf == NULL) { 1990 *status = U_MEMORY_ALLOCATION_ERROR; 1991 } 1992 } 1993 utext_extract(src, start, limit, buffer.getAlias(), length+1, status); 1994 if (dest) { 1995 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status); 1996 return dest; 1997 } 1998 1999 // Caller did not provide a prexisting UText. 2000 // Open a new one, and have it adopt the text buffer storage. 2001 if (U_FAILURE(*status)) { 2002 return NULL; 2003 } 2004 int32_t ownedLength = 0; 2005 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength); 2006 if (ownedBuf == NULL) { 2007 *status = U_MEMORY_ALLOCATION_ERROR; 2008 return NULL; 2009 } 2010 UText *result = utext_openUChars(NULL, ownedBuf, length, status); 2011 if (U_FAILURE(*status)) { 2012 uprv_free(ownedBuf); 2013 return NULL; 2014 } 2015 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT); 2016 return result; 2017 } 2018 2019 2020 //--------------------------------------------------------------------- 2021 // 2022 // split 2023 // 2024 //--------------------------------------------------------------------- 2025 int32_t RegexMatcher::split(const UnicodeString &input, 2026 UnicodeString dest[], 2027 int32_t destCapacity, 2028 UErrorCode &status) 2029 { 2030 UText inputText = UTEXT_INITIALIZER; 2031 utext_openConstUnicodeString(&inputText, &input, &status); 2032 if (U_FAILURE(status)) { 2033 return 0; 2034 } 2035 2036 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); 2037 if (destText == NULL) { 2038 status = U_MEMORY_ALLOCATION_ERROR; 2039 return 0; 2040 } 2041 int32_t i; 2042 for (i = 0; i < destCapacity; i++) { 2043 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); 2044 } 2045 2046 int32_t fieldCount = split(&inputText, destText, destCapacity, status); 2047 2048 for (i = 0; i < destCapacity; i++) { 2049 utext_close(destText[i]); 2050 } 2051 2052 uprv_free(destText); 2053 utext_close(&inputText); 2054 return fieldCount; 2055 } 2056 2057 // 2058 // split, UText mode 2059 // 2060 int32_t RegexMatcher::split(UText *input, 2061 UText *dest[], 2062 int32_t destCapacity, 2063 UErrorCode &status) 2064 { 2065 // 2066 // Check arguements for validity 2067 // 2068 if (U_FAILURE(status)) { 2069 return 0; 2070 }; 2071 2072 if (destCapacity < 1) { 2073 status = U_ILLEGAL_ARGUMENT_ERROR; 2074 return 0; 2075 } 2076 2077 // 2078 // Reset for the input text 2079 // 2080 reset(input); 2081 int64_t nextOutputStringStart = 0; 2082 if (fActiveLimit == 0) { 2083 return 0; 2084 } 2085 2086 // 2087 // Loop through the input text, searching for the delimiter pattern 2088 // 2089 int32_t i; 2090 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 2091 for (i=0; ; i++) { 2092 if (i>=destCapacity-1) { 2093 // There is one or zero output string left. 2094 // Fill the last output string with whatever is left from the input, then exit the loop. 2095 // ( i will be == destCapacity if we filled the output array while processing 2096 // capture groups of the delimiter expression, in which case we will discard the 2097 // last capture group saved in favor of the unprocessed remainder of the 2098 // input string.) 2099 i = destCapacity-1; 2100 if (fActiveLimit > nextOutputStringStart) { 2101 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2102 if (dest[i]) { 2103 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2104 input->chunkContents+nextOutputStringStart, 2105 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2106 } else { 2107 UText remainingText = UTEXT_INITIALIZER; 2108 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2109 fActiveLimit-nextOutputStringStart, &status); 2110 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2111 utext_close(&remainingText); 2112 } 2113 } else { 2114 UErrorCode lengthStatus = U_ZERO_ERROR; 2115 int32_t remaining16Length = 2116 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2117 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2118 if (remainingChars == NULL) { 2119 status = U_MEMORY_ALLOCATION_ERROR; 2120 break; 2121 } 2122 2123 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2124 if (dest[i]) { 2125 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2126 } else { 2127 UText remainingText = UTEXT_INITIALIZER; 2128 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2129 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2130 utext_close(&remainingText); 2131 } 2132 2133 uprv_free(remainingChars); 2134 } 2135 } 2136 break; 2137 } 2138 if (find()) { 2139 // We found another delimiter. Move everything from where we started looking 2140 // up until the start of the delimiter into the next output string. 2141 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2142 if (dest[i]) { 2143 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2144 input->chunkContents+nextOutputStringStart, 2145 (int32_t)(fMatchStart-nextOutputStringStart), &status); 2146 } else { 2147 UText remainingText = UTEXT_INITIALIZER; 2148 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2149 fMatchStart-nextOutputStringStart, &status); 2150 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2151 utext_close(&remainingText); 2152 } 2153 } else { 2154 UErrorCode lengthStatus = U_ZERO_ERROR; 2155 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); 2156 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2157 if (remainingChars == NULL) { 2158 status = U_MEMORY_ALLOCATION_ERROR; 2159 break; 2160 } 2161 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); 2162 if (dest[i]) { 2163 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2164 } else { 2165 UText remainingText = UTEXT_INITIALIZER; 2166 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2167 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2168 utext_close(&remainingText); 2169 } 2170 2171 uprv_free(remainingChars); 2172 } 2173 nextOutputStringStart = fMatchEnd; 2174 2175 // If the delimiter pattern has capturing parentheses, the captured 2176 // text goes out into the next n destination strings. 2177 int32_t groupNum; 2178 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 2179 if (i >= destCapacity-2) { 2180 // Never fill the last available output string with capture group text. 2181 // It will filled with the last field, the remainder of the 2182 // unsplit input text. 2183 break; 2184 } 2185 i++; 2186 dest[i] = utext_extract_replace(fInputText, dest[i], 2187 start64(groupNum, status), end64(groupNum, status), &status); 2188 } 2189 2190 if (nextOutputStringStart == fActiveLimit) { 2191 // The delimiter was at the end of the string. We're done, but first 2192 // we output one last empty string, for the empty field following 2193 // the delimiter at the end of input. 2194 if (i+1 < destCapacity) { 2195 ++i; 2196 if (dest[i] == NULL) { 2197 dest[i] = utext_openUChars(NULL, NULL, 0, &status); 2198 } else { 2199 static UChar emptyString[] = {(UChar)0}; 2200 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status); 2201 } 2202 } 2203 break; 2204 2205 } 2206 } 2207 else 2208 { 2209 // We ran off the end of the input while looking for the next delimiter. 2210 // All the remaining text goes into the current output string. 2211 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2212 if (dest[i]) { 2213 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2214 input->chunkContents+nextOutputStringStart, 2215 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2216 } else { 2217 UText remainingText = UTEXT_INITIALIZER; 2218 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2219 fActiveLimit-nextOutputStringStart, &status); 2220 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2221 utext_close(&remainingText); 2222 } 2223 } else { 2224 UErrorCode lengthStatus = U_ZERO_ERROR; 2225 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2226 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2227 if (remainingChars == NULL) { 2228 status = U_MEMORY_ALLOCATION_ERROR; 2229 break; 2230 } 2231 2232 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2233 if (dest[i]) { 2234 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2235 } else { 2236 UText remainingText = UTEXT_INITIALIZER; 2237 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2238 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2239 utext_close(&remainingText); 2240 } 2241 2242 uprv_free(remainingChars); 2243 } 2244 break; 2245 } 2246 if (U_FAILURE(status)) { 2247 break; 2248 } 2249 } // end of for loop 2250 return i+1; 2251 } 2252 2253 2254 //-------------------------------------------------------------------------------- 2255 // 2256 // start 2257 // 2258 //-------------------------------------------------------------------------------- 2259 int32_t RegexMatcher::start(UErrorCode &status) const { 2260 return start(0, status); 2261 } 2262 2263 int64_t RegexMatcher::start64(UErrorCode &status) const { 2264 return start64(0, status); 2265 } 2266 2267 //-------------------------------------------------------------------------------- 2268 // 2269 // start(int32_t group, UErrorCode &status) 2270 // 2271 //-------------------------------------------------------------------------------- 2272 2273 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { 2274 if (U_FAILURE(status)) { 2275 return -1; 2276 } 2277 if (U_FAILURE(fDeferredStatus)) { 2278 status = fDeferredStatus; 2279 return -1; 2280 } 2281 if (fMatch == FALSE) { 2282 status = U_REGEX_INVALID_STATE; 2283 return -1; 2284 } 2285 if (group < 0 || group > fPattern->fGroupMap->size()) { 2286 status = U_INDEX_OUTOFBOUNDS_ERROR; 2287 return -1; 2288 } 2289 int64_t s; 2290 if (group == 0) { 2291 s = fMatchStart; 2292 } else { 2293 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 2294 U_ASSERT(groupOffset < fPattern->fFrameSize); 2295 U_ASSERT(groupOffset >= 0); 2296 s = fFrame->fExtra[groupOffset]; 2297 } 2298 2299 return s; 2300 } 2301 2302 2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { 2304 return (int32_t)start64(group, status); 2305 } 2306 2307 //-------------------------------------------------------------------------------- 2308 // 2309 // useAnchoringBounds 2310 // 2311 //-------------------------------------------------------------------------------- 2312 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { 2313 fAnchoringBounds = b; 2314 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); 2315 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); 2316 return *this; 2317 } 2318 2319 2320 //-------------------------------------------------------------------------------- 2321 // 2322 // useTransparentBounds 2323 // 2324 //-------------------------------------------------------------------------------- 2325 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { 2326 fTransparentBounds = b; 2327 fLookStart = (fTransparentBounds ? 0 : fRegionStart); 2328 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); 2329 return *this; 2330 } 2331 2332 //-------------------------------------------------------------------------------- 2333 // 2334 // setTimeLimit 2335 // 2336 //-------------------------------------------------------------------------------- 2337 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { 2338 if (U_FAILURE(status)) { 2339 return; 2340 } 2341 if (U_FAILURE(fDeferredStatus)) { 2342 status = fDeferredStatus; 2343 return; 2344 } 2345 if (limit < 0) { 2346 status = U_ILLEGAL_ARGUMENT_ERROR; 2347 return; 2348 } 2349 fTimeLimit = limit; 2350 } 2351 2352 2353 //-------------------------------------------------------------------------------- 2354 // 2355 // getTimeLimit 2356 // 2357 //-------------------------------------------------------------------------------- 2358 int32_t RegexMatcher::getTimeLimit() const { 2359 return fTimeLimit; 2360 } 2361 2362 2363 //-------------------------------------------------------------------------------- 2364 // 2365 // setStackLimit 2366 // 2367 //-------------------------------------------------------------------------------- 2368 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { 2369 if (U_FAILURE(status)) { 2370 return; 2371 } 2372 if (U_FAILURE(fDeferredStatus)) { 2373 status = fDeferredStatus; 2374 return; 2375 } 2376 if (limit < 0) { 2377 status = U_ILLEGAL_ARGUMENT_ERROR; 2378 return; 2379 } 2380 2381 // Reset the matcher. This is needed here in case there is a current match 2382 // whose final stack frame (containing the match results, pointed to by fFrame) 2383 // would be lost by resizing to a smaller stack size. 2384 reset(); 2385 2386 if (limit == 0) { 2387 // Unlimited stack expansion 2388 fStack->setMaxCapacity(0); 2389 } else { 2390 // Change the units of the limit from bytes to ints, and bump the size up 2391 // to be big enough to hold at least one stack frame for the pattern, 2392 // if it isn't there already. 2393 int32_t adjustedLimit = limit / sizeof(int32_t); 2394 if (adjustedLimit < fPattern->fFrameSize) { 2395 adjustedLimit = fPattern->fFrameSize; 2396 } 2397 fStack->setMaxCapacity(adjustedLimit); 2398 } 2399 fStackLimit = limit; 2400 } 2401 2402 2403 //-------------------------------------------------------------------------------- 2404 // 2405 // getStackLimit 2406 // 2407 //-------------------------------------------------------------------------------- 2408 int32_t RegexMatcher::getStackLimit() const { 2409 return fStackLimit; 2410 } 2411 2412 2413 //-------------------------------------------------------------------------------- 2414 // 2415 // setMatchCallback 2416 // 2417 //-------------------------------------------------------------------------------- 2418 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, 2419 const void *context, 2420 UErrorCode &status) { 2421 if (U_FAILURE(status)) { 2422 return; 2423 } 2424 fCallbackFn = callback; 2425 fCallbackContext = context; 2426 } 2427 2428 2429 //-------------------------------------------------------------------------------- 2430 // 2431 // getMatchCallback 2432 // 2433 //-------------------------------------------------------------------------------- 2434 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, 2435 const void *&context, 2436 UErrorCode &status) { 2437 if (U_FAILURE(status)) { 2438 return; 2439 } 2440 callback = fCallbackFn; 2441 context = fCallbackContext; 2442 } 2443 2444 2445 //-------------------------------------------------------------------------------- 2446 // 2447 // setMatchCallback 2448 // 2449 //-------------------------------------------------------------------------------- 2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, 2451 const void *context, 2452 UErrorCode &status) { 2453 if (U_FAILURE(status)) { 2454 return; 2455 } 2456 fFindProgressCallbackFn = callback; 2457 fFindProgressCallbackContext = context; 2458 } 2459 2460 2461 //-------------------------------------------------------------------------------- 2462 // 2463 // getMatchCallback 2464 // 2465 //-------------------------------------------------------------------------------- 2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, 2467 const void *&context, 2468 UErrorCode &status) { 2469 if (U_FAILURE(status)) { 2470 return; 2471 } 2472 callback = fFindProgressCallbackFn; 2473 context = fFindProgressCallbackContext; 2474 } 2475 2476 2477 //================================================================================ 2478 // 2479 // Code following this point in this file is the internal 2480 // Match Engine Implementation. 2481 // 2482 //================================================================================ 2483 2484 2485 //-------------------------------------------------------------------------------- 2486 // 2487 // resetStack 2488 // Discard any previous contents of the state save stack, and initialize a 2489 // new stack frame to all -1. The -1s are needed for capture group limits, 2490 // where they indicate that a group has not yet matched anything. 2491 //-------------------------------------------------------------------------------- 2492 REStackFrame *RegexMatcher::resetStack() { 2493 // Discard any previous contents of the state save stack, and initialize a 2494 // new stack frame with all -1 data. The -1s are needed for capture group limits, 2495 // where they indicate that a group has not yet matched anything. 2496 fStack->removeAllElements(); 2497 2498 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); 2499 if(U_FAILURE(fDeferredStatus)) { 2500 return NULL; 2501 } 2502 2503 int32_t i; 2504 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 2505 iFrame->fExtra[i] = -1; 2506 } 2507 return iFrame; 2508 } 2509 2510 2511 2512 //-------------------------------------------------------------------------------- 2513 // 2514 // isWordBoundary 2515 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 2516 // For us, 2517 // If the current char is a combining mark, 2518 // \b is FALSE. 2519 // Else Scan backwards to the first non-combining char. 2520 // We are at a boundary if the this char and the original chars are 2521 // opposite in membership in \w set 2522 // 2523 // parameters: pos - the current position in the input buffer 2524 // 2525 // TODO: double-check edge cases at region boundaries. 2526 // 2527 //-------------------------------------------------------------------------------- 2528 UBool RegexMatcher::isWordBoundary(int64_t pos) { 2529 UBool isBoundary = FALSE; 2530 UBool cIsWord = FALSE; 2531 2532 if (pos >= fLookLimit) { 2533 fHitEnd = TRUE; 2534 } else { 2535 // Determine whether char c at current position is a member of the word set of chars. 2536 // If we're off the end of the string, behave as though we're not at a word char. 2537 UTEXT_SETNATIVEINDEX(fInputText, pos); 2538 UChar32 c = UTEXT_CURRENT32(fInputText); 2539 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2540 // Current char is a combining one. Not a boundary. 2541 return FALSE; 2542 } 2543 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2544 } 2545 2546 // Back up until we come to a non-combining char, determine whether 2547 // that char is a word char. 2548 UBool prevCIsWord = FALSE; 2549 for (;;) { 2550 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { 2551 break; 2552 } 2553 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); 2554 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2555 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2556 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2557 break; 2558 } 2559 } 2560 isBoundary = cIsWord ^ prevCIsWord; 2561 return isBoundary; 2562 } 2563 2564 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { 2565 UBool isBoundary = FALSE; 2566 UBool cIsWord = FALSE; 2567 2568 const UChar *inputBuf = fInputText->chunkContents; 2569 2570 if (pos >= fLookLimit) { 2571 fHitEnd = TRUE; 2572 } else { 2573 // Determine whether char c at current position is a member of the word set of chars. 2574 // If we're off the end of the string, behave as though we're not at a word char. 2575 UChar32 c; 2576 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); 2577 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2578 // Current char is a combining one. Not a boundary. 2579 return FALSE; 2580 } 2581 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2582 } 2583 2584 // Back up until we come to a non-combining char, determine whether 2585 // that char is a word char. 2586 UBool prevCIsWord = FALSE; 2587 for (;;) { 2588 if (pos <= fLookStart) { 2589 break; 2590 } 2591 UChar32 prevChar; 2592 U16_PREV(inputBuf, fLookStart, pos, prevChar); 2593 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2594 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2595 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2596 break; 2597 } 2598 } 2599 isBoundary = cIsWord ^ prevCIsWord; 2600 return isBoundary; 2601 } 2602 2603 //-------------------------------------------------------------------------------- 2604 // 2605 // isUWordBoundary 2606 // 2607 // Test for a word boundary using RBBI word break. 2608 // 2609 // parameters: pos - the current position in the input buffer 2610 // 2611 //-------------------------------------------------------------------------------- 2612 UBool RegexMatcher::isUWordBoundary(int64_t pos) { 2613 UBool returnVal = FALSE; 2614 #if UCONFIG_NO_BREAK_ITERATION==0 2615 2616 // If we haven't yet created a break iterator for this matcher, do it now. 2617 if (fWordBreakItr == NULL) { 2618 fWordBreakItr = 2619 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); 2620 if (U_FAILURE(fDeferredStatus)) { 2621 return FALSE; 2622 } 2623 fWordBreakItr->setText(fInputText, fDeferredStatus); 2624 } 2625 2626 if (pos >= fLookLimit) { 2627 fHitEnd = TRUE; 2628 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" 2629 // words are not boundaries. All non-word chars stand by themselves, 2630 // with word boundaries on both sides. 2631 } else { 2632 if (!UTEXT_USES_U16(fInputText)) { 2633 // !!!: Would like a better way to do this! 2634 UErrorCode status = U_ZERO_ERROR; 2635 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); 2636 } 2637 returnVal = fWordBreakItr->isBoundary((int32_t)pos); 2638 } 2639 #endif 2640 return returnVal; 2641 } 2642 2643 //-------------------------------------------------------------------------------- 2644 // 2645 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state 2646 // saves. Increment the "time" counter, and call the 2647 // user callback function if there is one installed. 2648 // 2649 // If the match operation needs to be aborted, either for a time-out 2650 // or because the user callback asked for it, just set an error status. 2651 // The engine will pick that up and stop in its outer loop. 2652 // 2653 //-------------------------------------------------------------------------------- 2654 void RegexMatcher::IncrementTime(UErrorCode &status) { 2655 fTickCounter = TIMER_INITIAL_VALUE; 2656 fTime++; 2657 if (fCallbackFn != NULL) { 2658 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { 2659 status = U_REGEX_STOPPED_BY_CALLER; 2660 return; 2661 } 2662 } 2663 if (fTimeLimit > 0 && fTime >= fTimeLimit) { 2664 status = U_REGEX_TIME_OUT; 2665 } 2666 } 2667 2668 //-------------------------------------------------------------------------------- 2669 // 2670 // StateSave 2671 // Make a new stack frame, initialized as a copy of the current stack frame. 2672 // Set the pattern index in the original stack frame from the operand value 2673 // in the opcode. Execution of the engine continues with the state in 2674 // the newly created stack frame 2675 // 2676 // Note that reserveBlock() may grow the stack, resulting in the 2677 // whole thing being relocated in memory. 2678 // 2679 // Parameters: 2680 // fp The top frame pointer when called. At return, a new 2681 // fame will be present 2682 // savePatIdx An index into the compiled pattern. Goes into the original 2683 // (not new) frame. If execution ever back-tracks out of the 2684 // new frame, this will be where we continue from in the pattern. 2685 // Return 2686 // The new frame pointer. 2687 // 2688 //-------------------------------------------------------------------------------- 2689 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { 2690 if (U_FAILURE(status)) { 2691 return fp; 2692 } 2693 // push storage for a new frame. 2694 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); 2695 if (U_FAILURE(status)) { 2696 // Failure on attempted stack expansion. 2697 // Stack function set some other error code, change it to a more 2698 // specific one for regular expressions. 2699 status = U_REGEX_STACK_OVERFLOW; 2700 // We need to return a writable stack frame, so just return the 2701 // previous frame. The match operation will stop quickly 2702 // because of the error status, after which the frame will never 2703 // be looked at again. 2704 return fp; 2705 } 2706 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. 2707 2708 // New stack frame = copy of old top frame. 2709 int64_t *source = (int64_t *)fp; 2710 int64_t *dest = newFP; 2711 for (;;) { 2712 *dest++ = *source++; 2713 if (source == newFP) { 2714 break; 2715 } 2716 } 2717 2718 fTickCounter--; 2719 if (fTickCounter <= 0) { 2720 IncrementTime(status); // Re-initializes fTickCounter 2721 } 2722 fp->fPatIdx = savePatIdx; 2723 return (REStackFrame *)newFP; 2724 } 2725 2726 2727 //-------------------------------------------------------------------------------- 2728 // 2729 // MatchAt This is the actual matching engine. 2730 // 2731 // startIdx: begin matching a this index. 2732 // toEnd: if true, match must extend to end of the input region 2733 // 2734 //-------------------------------------------------------------------------------- 2735 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { 2736 UBool isMatch = FALSE; // True if the we have a match. 2737 2738 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards 2739 2740 int32_t op; // Operation from the compiled pattern, split into 2741 int32_t opType; // the opcode 2742 int32_t opValue; // and the operand value. 2743 2744 #ifdef REGEX_RUN_DEBUG 2745 if (fTraceDebug) 2746 { 2747 printf("MatchAt(startIdx=%ld)\n", startIdx); 2748 printf("Original Pattern: "); 2749 UChar32 c = utext_next32From(fPattern->fPattern, 0); 2750 while (c != U_SENTINEL) { 2751 if (c<32 || c>256) { 2752 c = '.'; 2753 } 2754 printf("%c", c); 2755 2756 c = UTEXT_NEXT32(fPattern->fPattern); 2757 } 2758 printf("\n"); 2759 printf("Input String: "); 2760 c = utext_next32From(fInputText, 0); 2761 while (c != U_SENTINEL) { 2762 if (c<32 || c>256) { 2763 c = '.'; 2764 } 2765 printf("%c", c); 2766 2767 c = UTEXT_NEXT32(fInputText); 2768 } 2769 printf("\n"); 2770 printf("\n"); 2771 } 2772 #endif 2773 2774 if (U_FAILURE(status)) { 2775 return; 2776 } 2777 2778 // Cache frequently referenced items from the compiled pattern 2779 // 2780 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 2781 2782 const UChar *litText = fPattern->fLiteralText.getBuffer(); 2783 UVector *sets = fPattern->fSets; 2784 2785 fFrameSize = fPattern->fFrameSize; 2786 REStackFrame *fp = resetStack(); 2787 if (U_FAILURE(fDeferredStatus)) { 2788 status = fDeferredStatus; 2789 return; 2790 } 2791 2792 fp->fPatIdx = 0; 2793 fp->fInputIdx = startIdx; 2794 2795 // Zero out the pattern's static data 2796 int32_t i; 2797 for (i = 0; i<fPattern->fDataSize; i++) { 2798 fData[i] = 0; 2799 } 2800 2801 // 2802 // Main loop for interpreting the compiled pattern. 2803 // One iteration of the loop per pattern operation performed. 2804 // 2805 for (;;) { 2806 op = (int32_t)pat[fp->fPatIdx]; 2807 opType = URX_TYPE(op); 2808 opValue = URX_VAL(op); 2809 #ifdef REGEX_RUN_DEBUG 2810 if (fTraceDebug) { 2811 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2812 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 2813 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 2814 fPattern->dumpOp(fp->fPatIdx); 2815 } 2816 #endif 2817 fp->fPatIdx++; 2818 2819 switch (opType) { 2820 2821 2822 case URX_NOP: 2823 break; 2824 2825 2826 case URX_BACKTRACK: 2827 // Force a backtrack. In some circumstances, the pattern compiler 2828 // will notice that the pattern can't possibly match anything, and will 2829 // emit one of these at that point. 2830 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2831 break; 2832 2833 2834 case URX_ONECHAR: 2835 if (fp->fInputIdx < fActiveLimit) { 2836 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2837 UChar32 c = UTEXT_NEXT32(fInputText); 2838 if (c == opValue) { 2839 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2840 break; 2841 } 2842 } else { 2843 fHitEnd = TRUE; 2844 } 2845 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2846 break; 2847 2848 2849 case URX_STRING: 2850 { 2851 // Test input against a literal string. 2852 // Strings require two slots in the compiled pattern, one for the 2853 // offset to the string text, and one for the length. 2854 2855 int32_t stringStartIdx = opValue; 2856 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 2857 fp->fPatIdx++; 2858 opType = URX_TYPE(op); 2859 int32_t stringLen = URX_VAL(op); 2860 U_ASSERT(opType == URX_STRING_LEN); 2861 U_ASSERT(stringLen >= 2); 2862 2863 const UChar *patternString = litText+stringStartIdx; 2864 int32_t patternStringIndex = 0; 2865 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2866 UChar32 inputChar; 2867 UChar32 patternChar; 2868 UBool success = TRUE; 2869 while (patternStringIndex < stringLen) { 2870 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 2871 success = FALSE; 2872 fHitEnd = TRUE; 2873 break; 2874 } 2875 inputChar = UTEXT_NEXT32(fInputText); 2876 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar); 2877 if (patternChar != inputChar) { 2878 success = FALSE; 2879 break; 2880 } 2881 } 2882 2883 if (success) { 2884 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2885 } else { 2886 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2887 } 2888 } 2889 break; 2890 2891 2892 case URX_STATE_SAVE: 2893 fp = StateSave(fp, opValue, status); 2894 break; 2895 2896 2897 case URX_END: 2898 // The match loop will exit via this path on a successful match, 2899 // when we reach the end of the pattern. 2900 if (toEnd && fp->fInputIdx != fActiveLimit) { 2901 // The pattern matched, but not to the end of input. Try some more. 2902 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2903 break; 2904 } 2905 isMatch = TRUE; 2906 goto breakFromLoop; 2907 2908 // Start and End Capture stack frame variables are laid out out like this: 2909 // fp->fExtra[opValue] - The start of a completed capture group 2910 // opValue+1 - The end of a completed capture group 2911 // opValue+2 - the start of a capture group whose end 2912 // has not yet been reached (and might not ever be). 2913 case URX_START_CAPTURE: 2914 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2915 fp->fExtra[opValue+2] = fp->fInputIdx; 2916 break; 2917 2918 2919 case URX_END_CAPTURE: 2920 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2921 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 2922 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 2923 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 2924 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 2925 break; 2926 2927 2928 case URX_DOLLAR: // $, test for End of line 2929 // or for position before new line at end of input 2930 { 2931 if (fp->fInputIdx >= fAnchorLimit) { 2932 // We really are at the end of input. Success. 2933 fHitEnd = TRUE; 2934 fRequireEnd = TRUE; 2935 break; 2936 } 2937 2938 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2939 2940 // If we are positioned just before a new-line that is located at the 2941 // end of input, succeed. 2942 UChar32 c = UTEXT_NEXT32(fInputText); 2943 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2944 if (isLineTerminator(c)) { 2945 // If not in the middle of a CR/LF sequence 2946 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 2947 // At new-line at end of input. Success 2948 fHitEnd = TRUE; 2949 fRequireEnd = TRUE; 2950 2951 break; 2952 } 2953 } 2954 } else { 2955 UChar32 nextC = UTEXT_NEXT32(fInputText); 2956 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2957 fHitEnd = TRUE; 2958 fRequireEnd = TRUE; 2959 break; // At CR/LF at end of input. Success 2960 } 2961 } 2962 2963 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2964 } 2965 break; 2966 2967 2968 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 2969 if (fp->fInputIdx >= fAnchorLimit) { 2970 // Off the end of input. Success. 2971 fHitEnd = TRUE; 2972 fRequireEnd = TRUE; 2973 break; 2974 } else { 2975 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2976 UChar32 c = UTEXT_NEXT32(fInputText); 2977 // Either at the last character of input, or off the end. 2978 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { 2979 fHitEnd = TRUE; 2980 fRequireEnd = TRUE; 2981 break; 2982 } 2983 } 2984 2985 // Not at end of input. Back-track out. 2986 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2987 break; 2988 2989 2990 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 2991 { 2992 if (fp->fInputIdx >= fAnchorLimit) { 2993 // We really are at the end of input. Success. 2994 fHitEnd = TRUE; 2995 fRequireEnd = TRUE; 2996 break; 2997 } 2998 // If we are positioned just before a new-line, succeed. 2999 // It makes no difference where the new-line is within the input. 3000 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3001 UChar32 c = UTEXT_CURRENT32(fInputText); 3002 if (isLineTerminator(c)) { 3003 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 3004 // In multi-line mode, hitting a new-line just before the end of input does not 3005 // set the hitEnd or requireEnd flags 3006 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { 3007 break; 3008 } 3009 } 3010 // not at a new line. Fail. 3011 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3012 } 3013 break; 3014 3015 3016 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 3017 { 3018 if (fp->fInputIdx >= fAnchorLimit) { 3019 // We really are at the end of input. Success. 3020 fHitEnd = TRUE; 3021 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 3022 break; // adding a new-line would not lose the match. 3023 } 3024 // If we are not positioned just before a new-line, the test fails; backtrack out. 3025 // It makes no difference where the new-line is within the input. 3026 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3027 if (UTEXT_CURRENT32(fInputText) != 0x0a) { 3028 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3029 } 3030 } 3031 break; 3032 3033 3034 case URX_CARET: // ^, test for start of line 3035 if (fp->fInputIdx != fAnchorStart) { 3036 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3037 } 3038 break; 3039 3040 3041 case URX_CARET_M: // ^, test for start of line in mulit-line mode 3042 { 3043 if (fp->fInputIdx == fAnchorStart) { 3044 // We are at the start input. Success. 3045 break; 3046 } 3047 // Check whether character just before the current pos is a new-line 3048 // unless we are at the end of input 3049 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3050 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3051 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) { 3052 // It's a new-line. ^ is true. Success. 3053 // TODO: what should be done with positions between a CR and LF? 3054 break; 3055 } 3056 // Not at the start of a line. Fail. 3057 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3058 } 3059 break; 3060 3061 3062 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 3063 { 3064 U_ASSERT(fp->fInputIdx >= fAnchorStart); 3065 if (fp->fInputIdx <= fAnchorStart) { 3066 // We are at the start input. Success. 3067 break; 3068 } 3069 // Check whether character just before the current pos is a new-line 3070 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 3071 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3072 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3073 if (c != 0x0a) { 3074 // Not at the start of a line. Back-track out. 3075 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3076 } 3077 } 3078 break; 3079 3080 case URX_BACKSLASH_B: // Test for word boundaries 3081 { 3082 UBool success = isWordBoundary(fp->fInputIdx); 3083 success ^= (UBool)(opValue != 0); // flip sense for \B 3084 if (!success) { 3085 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3086 } 3087 } 3088 break; 3089 3090 3091 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 3092 { 3093 UBool success = isUWordBoundary(fp->fInputIdx); 3094 success ^= (UBool)(opValue != 0); // flip sense for \B 3095 if (!success) { 3096 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3097 } 3098 } 3099 break; 3100 3101 3102 case URX_BACKSLASH_D: // Test for decimal digit 3103 { 3104 if (fp->fInputIdx >= fActiveLimit) { 3105 fHitEnd = TRUE; 3106 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3107 break; 3108 } 3109 3110 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3111 3112 UChar32 c = UTEXT_NEXT32(fInputText); 3113 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 3114 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 3115 success ^= (UBool)(opValue != 0); // flip sense for \D 3116 if (success) { 3117 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3118 } else { 3119 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3120 } 3121 } 3122 break; 3123 3124 3125 case URX_BACKSLASH_G: // Test for position at end of previous match 3126 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 3127 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3128 } 3129 break; 3130 3131 3132 case URX_BACKSLASH_H: // Test for \h, horizontal white space. 3133 { 3134 if (fp->fInputIdx >= fActiveLimit) { 3135 fHitEnd = TRUE; 3136 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3137 break; 3138 } 3139 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3140 UChar32 c = UTEXT_NEXT32(fInputText); 3141 int8_t ctype = u_charType(c); 3142 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB 3143 success ^= (UBool)(opValue != 0); // flip sense for \H 3144 if (success) { 3145 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3146 } else { 3147 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3148 } 3149 } 3150 break; 3151 3152 3153 case URX_BACKSLASH_R: // Test for \R, any line break sequence. 3154 { 3155 if (fp->fInputIdx >= fActiveLimit) { 3156 fHitEnd = TRUE; 3157 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3158 break; 3159 } 3160 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3161 UChar32 c = UTEXT_NEXT32(fInputText); 3162 if (isLineTerminator(c)) { 3163 if (c == 0x0d && utext_current32(fInputText) == 0x0a) { 3164 utext_next32(fInputText); 3165 } 3166 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3167 } else { 3168 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3169 } 3170 } 3171 break; 3172 3173 3174 case URX_BACKSLASH_V: // \v, any single line ending character. 3175 { 3176 if (fp->fInputIdx >= fActiveLimit) { 3177 fHitEnd = TRUE; 3178 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3179 break; 3180 } 3181 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3182 UChar32 c = UTEXT_NEXT32(fInputText); 3183 UBool success = isLineTerminator(c); 3184 success ^= (UBool)(opValue != 0); // flip sense for \V 3185 if (success) { 3186 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3187 } else { 3188 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3189 } 3190 } 3191 break; 3192 3193 3194 case URX_BACKSLASH_X: 3195 // Match a Grapheme, as defined by Unicode TR 29. 3196 // Differs slightly from Perl, which consumes combining marks independently 3197 // of context. 3198 { 3199 3200 // Fail if at end of input 3201 if (fp->fInputIdx >= fActiveLimit) { 3202 fHitEnd = TRUE; 3203 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3204 break; 3205 } 3206 3207 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3208 3209 // Examine (and consume) the current char. 3210 // Dispatch into a little state machine, based on the char. 3211 UChar32 c; 3212 c = UTEXT_NEXT32(fInputText); 3213 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3214 UnicodeSet **sets = fPattern->fStaticSets; 3215 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 3216 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 3217 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3218 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3219 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3220 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3221 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3222 goto GC_Extend; 3223 3224 3225 3226 GC_L: 3227 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3228 c = UTEXT_NEXT32(fInputText); 3229 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3230 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3231 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3232 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3233 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3234 (void)UTEXT_PREVIOUS32(fInputText); 3235 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3236 goto GC_Extend; 3237 3238 GC_V: 3239 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3240 c = UTEXT_NEXT32(fInputText); 3241 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3242 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3243 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3244 (void)UTEXT_PREVIOUS32(fInputText); 3245 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3246 goto GC_Extend; 3247 3248 GC_T: 3249 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3250 c = UTEXT_NEXT32(fInputText); 3251 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3252 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3253 (void)UTEXT_PREVIOUS32(fInputText); 3254 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3255 goto GC_Extend; 3256 3257 GC_Extend: 3258 // Combining characters are consumed here 3259 for (;;) { 3260 if (fp->fInputIdx >= fActiveLimit) { 3261 break; 3262 } 3263 c = UTEXT_CURRENT32(fInputText); 3264 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 3265 break; 3266 } 3267 (void)UTEXT_NEXT32(fInputText); 3268 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3269 } 3270 goto GC_Done; 3271 3272 GC_Control: 3273 // Most control chars stand alone (don't combine with combining chars), 3274 // except for that CR/LF sequence is a single grapheme cluster. 3275 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 3276 c = UTEXT_NEXT32(fInputText); 3277 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3278 } 3279 3280 GC_Done: 3281 if (fp->fInputIdx >= fActiveLimit) { 3282 fHitEnd = TRUE; 3283 } 3284 break; 3285 } 3286 3287 3288 3289 3290 case URX_BACKSLASH_Z: // Test for end of Input 3291 if (fp->fInputIdx < fAnchorLimit) { 3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3293 } else { 3294 fHitEnd = TRUE; 3295 fRequireEnd = TRUE; 3296 } 3297 break; 3298 3299 3300 3301 case URX_STATIC_SETREF: 3302 { 3303 // Test input character against one of the predefined sets 3304 // (Word Characters, for example) 3305 // The high bit of the op value is a flag for the match polarity. 3306 // 0: success if input char is in set. 3307 // 1: success if input char is not in set. 3308 if (fp->fInputIdx >= fActiveLimit) { 3309 fHitEnd = TRUE; 3310 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3311 break; 3312 } 3313 3314 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 3315 opValue &= ~URX_NEG_SET; 3316 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3317 3318 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3319 UChar32 c = UTEXT_NEXT32(fInputText); 3320 if (c < 256) { 3321 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3322 if (s8->contains(c)) { 3323 success = !success; 3324 } 3325 } else { 3326 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3327 if (s->contains(c)) { 3328 success = !success; 3329 } 3330 } 3331 if (success) { 3332 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3333 } else { 3334 // the character wasn't in the set. 3335 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3336 } 3337 } 3338 break; 3339 3340 3341 case URX_STAT_SETREF_N: 3342 { 3343 // Test input character for NOT being a member of one of 3344 // the predefined sets (Word Characters, for example) 3345 if (fp->fInputIdx >= fActiveLimit) { 3346 fHitEnd = TRUE; 3347 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3348 break; 3349 } 3350 3351 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3352 3353 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3354 3355 UChar32 c = UTEXT_NEXT32(fInputText); 3356 if (c < 256) { 3357 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3358 if (s8->contains(c) == FALSE) { 3359 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3360 break; 3361 } 3362 } else { 3363 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3364 if (s->contains(c) == FALSE) { 3365 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3366 break; 3367 } 3368 } 3369 // the character wasn't in the set. 3370 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3371 } 3372 break; 3373 3374 3375 case URX_SETREF: 3376 if (fp->fInputIdx >= fActiveLimit) { 3377 fHitEnd = TRUE; 3378 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3379 break; 3380 } else { 3381 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3382 3383 // There is input left. Pick up one char and test it for set membership. 3384 UChar32 c = UTEXT_NEXT32(fInputText); 3385 U_ASSERT(opValue > 0 && opValue < sets->size()); 3386 if (c<256) { 3387 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3388 if (s8->contains(c)) { 3389 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3390 break; 3391 } 3392 } else { 3393 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3394 if (s->contains(c)) { 3395 // The character is in the set. A Match. 3396 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3397 break; 3398 } 3399 } 3400 3401 // the character wasn't in the set. 3402 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3403 } 3404 break; 3405 3406 3407 case URX_DOTANY: 3408 { 3409 // . matches anything, but stops at end-of-line. 3410 if (fp->fInputIdx >= fActiveLimit) { 3411 // At end of input. Match failed. Backtrack out. 3412 fHitEnd = TRUE; 3413 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3414 break; 3415 } 3416 3417 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3418 3419 // There is input left. Advance over one char, unless we've hit end-of-line 3420 UChar32 c = UTEXT_NEXT32(fInputText); 3421 if (isLineTerminator(c)) { 3422 // End of line in normal mode. . does not match. 3423 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3424 break; 3425 } 3426 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3427 } 3428 break; 3429 3430 3431 case URX_DOTANY_ALL: 3432 { 3433 // ., in dot-matches-all (including new lines) mode 3434 if (fp->fInputIdx >= fActiveLimit) { 3435 // At end of input. Match failed. Backtrack out. 3436 fHitEnd = TRUE; 3437 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3438 break; 3439 } 3440 3441 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3442 3443 // There is input left. Advance over one char, except if we are 3444 // at a cr/lf, advance over both of them. 3445 UChar32 c; 3446 c = UTEXT_NEXT32(fInputText); 3447 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3448 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 3449 // In the case of a CR/LF, we need to advance over both. 3450 UChar32 nextc = UTEXT_CURRENT32(fInputText); 3451 if (nextc == 0x0a) { 3452 (void)UTEXT_NEXT32(fInputText); 3453 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3454 } 3455 } 3456 } 3457 break; 3458 3459 3460 case URX_DOTANY_UNIX: 3461 { 3462 // '.' operator, matches all, but stops at end-of-line. 3463 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 3464 if (fp->fInputIdx >= fActiveLimit) { 3465 // At end of input. Match failed. Backtrack out. 3466 fHitEnd = TRUE; 3467 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3468 break; 3469 } 3470 3471 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3472 3473 // There is input left. Advance over one char, unless we've hit end-of-line 3474 UChar32 c = UTEXT_NEXT32(fInputText); 3475 if (c == 0x0a) { 3476 // End of line in normal mode. '.' does not match the \n 3477 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3478 } else { 3479 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3480 } 3481 } 3482 break; 3483 3484 3485 case URX_JMP: 3486 fp->fPatIdx = opValue; 3487 break; 3488 3489 case URX_FAIL: 3490 isMatch = FALSE; 3491 goto breakFromLoop; 3492 3493 case URX_JMP_SAV: 3494 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 3495 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3496 fp->fPatIdx = opValue; // Then JMP. 3497 break; 3498 3499 case URX_JMP_SAV_X: 3500 // This opcode is used with (x)+, when x can match a zero length string. 3501 // Same as JMP_SAV, except conditional on the match having made forward progress. 3502 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 3503 // data address of the input position at the start of the loop. 3504 { 3505 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 3506 int32_t stoOp = (int32_t)pat[opValue-1]; 3507 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 3508 int32_t frameLoc = URX_VAL(stoOp); 3509 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 3510 int64_t prevInputIdx = fp->fExtra[frameLoc]; 3511 U_ASSERT(prevInputIdx <= fp->fInputIdx); 3512 if (prevInputIdx < fp->fInputIdx) { 3513 // The match did make progress. Repeat the loop. 3514 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3515 fp->fPatIdx = opValue; 3516 fp->fExtra[frameLoc] = fp->fInputIdx; 3517 } 3518 // If the input position did not advance, we do nothing here, 3519 // execution will fall out of the loop. 3520 } 3521 break; 3522 3523 case URX_CTR_INIT: 3524 { 3525 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3526 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3527 3528 // Pick up the three extra operands that CTR_INIT has, and 3529 // skip the pattern location counter past 3530 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3531 fp->fPatIdx += 3; 3532 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3533 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3534 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3535 U_ASSERT(minCount>=0); 3536 U_ASSERT(maxCount>=minCount || maxCount==-1); 3537 U_ASSERT(loopLoc>=fp->fPatIdx); 3538 3539 if (minCount == 0) { 3540 fp = StateSave(fp, loopLoc+1, status); 3541 } 3542 if (maxCount == -1) { 3543 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 3544 } else if (maxCount == 0) { 3545 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3546 } 3547 } 3548 break; 3549 3550 case URX_CTR_LOOP: 3551 { 3552 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3553 int32_t initOp = (int32_t)pat[opValue]; 3554 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 3555 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3556 int32_t minCount = (int32_t)pat[opValue+2]; 3557 int32_t maxCount = (int32_t)pat[opValue+3]; 3558 (*pCounter)++; 3559 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 3560 U_ASSERT(*pCounter == maxCount); 3561 break; 3562 } 3563 if (*pCounter >= minCount) { 3564 if (maxCount == -1) { 3565 // Loop has no hard upper bound. 3566 // Check that it is progressing through the input, break if it is not. 3567 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 3568 if (fp->fInputIdx == *pLastInputIdx) { 3569 break; 3570 } else { 3571 *pLastInputIdx = fp->fInputIdx; 3572 } 3573 } 3574 fp = StateSave(fp, fp->fPatIdx, status); 3575 } 3576 fp->fPatIdx = opValue + 4; // Loop back. 3577 } 3578 break; 3579 3580 case URX_CTR_INIT_NG: 3581 { 3582 // Initialize a non-greedy loop 3583 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3584 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3585 3586 // Pick up the three extra operands that CTR_INIT_NG has, and 3587 // skip the pattern location counter past 3588 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3589 fp->fPatIdx += 3; 3590 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3591 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3592 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3593 U_ASSERT(minCount>=0); 3594 U_ASSERT(maxCount>=minCount || maxCount==-1); 3595 U_ASSERT(loopLoc>fp->fPatIdx); 3596 if (maxCount == -1) { 3597 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 3598 } 3599 3600 if (minCount == 0) { 3601 if (maxCount != 0) { 3602 fp = StateSave(fp, fp->fPatIdx, status); 3603 } 3604 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 3605 } 3606 } 3607 break; 3608 3609 case URX_CTR_LOOP_NG: 3610 { 3611 // Non-greedy {min, max} loops 3612 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3613 int32_t initOp = (int32_t)pat[opValue]; 3614 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 3615 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3616 int32_t minCount = (int32_t)pat[opValue+2]; 3617 int32_t maxCount = (int32_t)pat[opValue+3]; 3618 3619 (*pCounter)++; 3620 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 3621 // The loop has matched the maximum permitted number of times. 3622 // Break out of here with no action. Matching will 3623 // continue with the following pattern. 3624 U_ASSERT(*pCounter == maxCount); 3625 break; 3626 } 3627 3628 if (*pCounter < minCount) { 3629 // We haven't met the minimum number of matches yet. 3630 // Loop back for another one. 3631 fp->fPatIdx = opValue + 4; // Loop back. 3632 } else { 3633 // We do have the minimum number of matches. 3634 3635 // If there is no upper bound on the loop iterations, check that the input index 3636 // is progressing, and stop the loop if it is not. 3637 if (maxCount == -1) { 3638 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 3639 if (fp->fInputIdx == *pLastInputIdx) { 3640 break; 3641 } 3642 *pLastInputIdx = fp->fInputIdx; 3643 } 3644 3645 // Loop Continuation: we will fall into the pattern following the loop 3646 // (non-greedy, don't execute loop body first), but first do 3647 // a state save to the top of the loop, so that a match failure 3648 // in the following pattern will try another iteration of the loop. 3649 fp = StateSave(fp, opValue + 4, status); 3650 } 3651 } 3652 break; 3653 3654 case URX_STO_SP: 3655 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3656 fData[opValue] = fStack->size(); 3657 break; 3658 3659 case URX_LD_SP: 3660 { 3661 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3662 int32_t newStackSize = (int32_t)fData[opValue]; 3663 U_ASSERT(newStackSize <= fStack->size()); 3664 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3665 if (newFP == (int64_t *)fp) { 3666 break; 3667 } 3668 int32_t i; 3669 for (i=0; i<fFrameSize; i++) { 3670 newFP[i] = ((int64_t *)fp)[i]; 3671 } 3672 fp = (REStackFrame *)newFP; 3673 fStack->setSize(newStackSize); 3674 } 3675 break; 3676 3677 case URX_BACKREF: 3678 { 3679 U_ASSERT(opValue < fFrameSize); 3680 int64_t groupStartIdx = fp->fExtra[opValue]; 3681 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3682 U_ASSERT(groupStartIdx <= groupEndIdx); 3683 if (groupStartIdx < 0) { 3684 // This capture group has not participated in the match thus far, 3685 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3686 break; 3687 } 3688 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); 3689 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3690 3691 // Note: if the capture group match was of an empty string the backref 3692 // match succeeds. Verified by testing: Perl matches succeed 3693 // in this case, so we do too. 3694 3695 UBool success = TRUE; 3696 for (;;) { 3697 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 3698 success = TRUE; 3699 break; 3700 } 3701 if (utext_getNativeIndex(fInputText) >= fActiveLimit) { 3702 success = FALSE; 3703 fHitEnd = TRUE; 3704 break; 3705 } 3706 UChar32 captureGroupChar = utext_next32(fAltInputText); 3707 UChar32 inputChar = utext_next32(fInputText); 3708 if (inputChar != captureGroupChar) { 3709 success = FALSE; 3710 break; 3711 } 3712 } 3713 3714 if (success) { 3715 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3716 } else { 3717 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3718 } 3719 } 3720 break; 3721 3722 3723 3724 case URX_BACKREF_I: 3725 { 3726 U_ASSERT(opValue < fFrameSize); 3727 int64_t groupStartIdx = fp->fExtra[opValue]; 3728 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3729 U_ASSERT(groupStartIdx <= groupEndIdx); 3730 if (groupStartIdx < 0) { 3731 // This capture group has not participated in the match thus far, 3732 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3733 break; 3734 } 3735 utext_setNativeIndex(fAltInputText, groupStartIdx); 3736 utext_setNativeIndex(fInputText, fp->fInputIdx); 3737 CaseFoldingUTextIterator captureGroupItr(*fAltInputText); 3738 CaseFoldingUTextIterator inputItr(*fInputText); 3739 3740 // Note: if the capture group match was of an empty string the backref 3741 // match succeeds. Verified by testing: Perl matches succeed 3742 // in this case, so we do too. 3743 3744 UBool success = TRUE; 3745 for (;;) { 3746 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 3747 success = TRUE; 3748 break; 3749 } 3750 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) { 3751 success = FALSE; 3752 fHitEnd = TRUE; 3753 break; 3754 } 3755 UChar32 captureGroupChar = captureGroupItr.next(); 3756 UChar32 inputChar = inputItr.next(); 3757 if (inputChar != captureGroupChar) { 3758 success = FALSE; 3759 break; 3760 } 3761 } 3762 3763 if (success && inputItr.inExpansion()) { 3764 // We otained a match by consuming part of a string obtained from 3765 // case-folding a single code point of the input text. 3766 // This does not count as an overall match. 3767 success = FALSE; 3768 } 3769 3770 if (success) { 3771 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3772 } else { 3773 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3774 } 3775 3776 } 3777 break; 3778 3779 case URX_STO_INP_LOC: 3780 { 3781 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 3782 fp->fExtra[opValue] = fp->fInputIdx; 3783 } 3784 break; 3785 3786 case URX_JMPX: 3787 { 3788 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3789 fp->fPatIdx += 1; 3790 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 3791 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 3792 int64_t savedInputIdx = fp->fExtra[dataLoc]; 3793 U_ASSERT(savedInputIdx <= fp->fInputIdx); 3794 if (savedInputIdx < fp->fInputIdx) { 3795 fp->fPatIdx = opValue; // JMP 3796 } else { 3797 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 3798 } 3799 } 3800 break; 3801 3802 case URX_LA_START: 3803 { 3804 // Entering a lookahead block. 3805 // Save Stack Ptr, Input Pos. 3806 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3807 fData[opValue] = fStack->size(); 3808 fData[opValue+1] = fp->fInputIdx; 3809 fActiveStart = fLookStart; // Set the match region change for 3810 fActiveLimit = fLookLimit; // transparent bounds. 3811 } 3812 break; 3813 3814 case URX_LA_END: 3815 { 3816 // Leaving a look-ahead block. 3817 // restore Stack Ptr, Input Pos to positions they had on entry to block. 3818 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3819 int32_t stackSize = fStack->size(); 3820 int32_t newStackSize =(int32_t)fData[opValue]; 3821 U_ASSERT(stackSize >= newStackSize); 3822 if (stackSize > newStackSize) { 3823 // Copy the current top frame back to the new (cut back) top frame. 3824 // This makes the capture groups from within the look-ahead 3825 // expression available. 3826 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3827 int32_t i; 3828 for (i=0; i<fFrameSize; i++) { 3829 newFP[i] = ((int64_t *)fp)[i]; 3830 } 3831 fp = (REStackFrame *)newFP; 3832 fStack->setSize(newStackSize); 3833 } 3834 fp->fInputIdx = fData[opValue+1]; 3835 3836 // Restore the active region bounds in the input string; they may have 3837 // been changed because of transparent bounds on a Region. 3838 fActiveStart = fRegionStart; 3839 fActiveLimit = fRegionLimit; 3840 } 3841 break; 3842 3843 case URX_ONECHAR_I: 3844 // Case insensitive one char. The char from the pattern is already case folded. 3845 // Input text is not, but case folding the input can not reduce two or more code 3846 // points to one. 3847 if (fp->fInputIdx < fActiveLimit) { 3848 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3849 3850 UChar32 c = UTEXT_NEXT32(fInputText); 3851 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3852 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3853 break; 3854 } 3855 } else { 3856 fHitEnd = TRUE; 3857 } 3858 3859 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3860 break; 3861 3862 case URX_STRING_I: 3863 { 3864 // Case-insensitive test input against a literal string. 3865 // Strings require two slots in the compiled pattern, one for the 3866 // offset to the string text, and one for the length. 3867 // The compiled string has already been case folded. 3868 { 3869 const UChar *patternString = litText + opValue; 3870 int32_t patternStringIdx = 0; 3871 3872 op = (int32_t)pat[fp->fPatIdx]; 3873 fp->fPatIdx++; 3874 opType = URX_TYPE(op); 3875 opValue = URX_VAL(op); 3876 U_ASSERT(opType == URX_STRING_LEN); 3877 int32_t patternStringLen = opValue; // Length of the string from the pattern. 3878 3879 3880 UChar32 cPattern; 3881 UChar32 cText; 3882 UBool success = TRUE; 3883 3884 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3885 CaseFoldingUTextIterator inputIterator(*fInputText); 3886 while (patternStringIdx < patternStringLen) { 3887 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 3888 success = FALSE; 3889 fHitEnd = TRUE; 3890 break; 3891 } 3892 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 3893 cText = inputIterator.next(); 3894 if (cText != cPattern) { 3895 success = FALSE; 3896 break; 3897 } 3898 } 3899 if (inputIterator.inExpansion()) { 3900 success = FALSE; 3901 } 3902 3903 if (success) { 3904 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3905 } else { 3906 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3907 } 3908 } 3909 } 3910 break; 3911 3912 case URX_LB_START: 3913 { 3914 // Entering a look-behind block. 3915 // Save Stack Ptr, Input Pos. 3916 // TODO: implement transparent bounds. Ticket #6067 3917 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3918 fData[opValue] = fStack->size(); 3919 fData[opValue+1] = fp->fInputIdx; 3920 // Init the variable containing the start index for attempted matches. 3921 fData[opValue+2] = -1; 3922 // Save input string length, then reset to pin any matches to end at 3923 // the current position. 3924 fData[opValue+3] = fActiveLimit; 3925 fActiveLimit = fp->fInputIdx; 3926 } 3927 break; 3928 3929 3930 case URX_LB_CONT: 3931 { 3932 // Positive Look-Behind, at top of loop checking for matches of LB expression 3933 // at all possible input starting positions. 3934 3935 // Fetch the min and max possible match lengths. They are the operands 3936 // of this op in the pattern. 3937 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 3938 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 3939 U_ASSERT(minML <= maxML); 3940 U_ASSERT(minML >= 0); 3941 3942 // Fetch (from data) the last input index where a match was attempted. 3943 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3944 int64_t *lbStartIdx = &fData[opValue+2]; 3945 if (*lbStartIdx < 0) { 3946 // First time through loop. 3947 *lbStartIdx = fp->fInputIdx - minML; 3948 } else { 3949 // 2nd through nth time through the loop. 3950 // Back up start position for match by one. 3951 if (*lbStartIdx == 0) { 3952 (*lbStartIdx)--; 3953 } else { 3954 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 3955 (void)UTEXT_PREVIOUS32(fInputText); 3956 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 3957 } 3958 } 3959 3960 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 3961 // We have tried all potential match starting points without 3962 // getting a match. Backtrack out, and out of the 3963 // Look Behind altogether. 3964 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3965 int64_t restoreInputLen = fData[opValue+3]; 3966 U_ASSERT(restoreInputLen >= fActiveLimit); 3967 U_ASSERT(restoreInputLen <= fInputLength); 3968 fActiveLimit = restoreInputLen; 3969 break; 3970 } 3971 3972 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 3973 // (successful match will fall off the end of the loop.) 3974 fp = StateSave(fp, fp->fPatIdx-3, status); 3975 fp->fInputIdx = *lbStartIdx; 3976 } 3977 break; 3978 3979 case URX_LB_END: 3980 // End of a look-behind block, after a successful match. 3981 { 3982 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3983 if (fp->fInputIdx != fActiveLimit) { 3984 // The look-behind expression matched, but the match did not 3985 // extend all the way to the point that we are looking behind from. 3986 // FAIL out of here, which will take us back to the LB_CONT, which 3987 // will retry the match starting at another position or fail 3988 // the look-behind altogether, whichever is appropriate. 3989 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3990 break; 3991 } 3992 3993 // Look-behind match is good. Restore the orignal input string length, 3994 // which had been truncated to pin the end of the lookbehind match to the 3995 // position being looked-behind. 3996 int64_t originalInputLen = fData[opValue+3]; 3997 U_ASSERT(originalInputLen >= fActiveLimit); 3998 U_ASSERT(originalInputLen <= fInputLength); 3999 fActiveLimit = originalInputLen; 4000 } 4001 break; 4002 4003 4004 case URX_LBN_CONT: 4005 { 4006 // Negative Look-Behind, at top of loop checking for matches of LB expression 4007 // at all possible input starting positions. 4008 4009 // Fetch the extra parameters of this op. 4010 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 4011 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 4012 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 4013 continueLoc = URX_VAL(continueLoc); 4014 U_ASSERT(minML <= maxML); 4015 U_ASSERT(minML >= 0); 4016 U_ASSERT(continueLoc > fp->fPatIdx); 4017 4018 // Fetch (from data) the last input index where a match was attempted. 4019 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4020 int64_t *lbStartIdx = &fData[opValue+2]; 4021 if (*lbStartIdx < 0) { 4022 // First time through loop. 4023 *lbStartIdx = fp->fInputIdx - minML; 4024 } else { 4025 // 2nd through nth time through the loop. 4026 // Back up start position for match by one. 4027 if (*lbStartIdx == 0) { 4028 (*lbStartIdx)--; 4029 } else { 4030 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 4031 (void)UTEXT_PREVIOUS32(fInputText); 4032 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4033 } 4034 } 4035 4036 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 4037 // We have tried all potential match starting points without 4038 // getting a match, which means that the negative lookbehind as 4039 // a whole has succeeded. Jump forward to the continue location 4040 int64_t restoreInputLen = fData[opValue+3]; 4041 U_ASSERT(restoreInputLen >= fActiveLimit); 4042 U_ASSERT(restoreInputLen <= fInputLength); 4043 fActiveLimit = restoreInputLen; 4044 fp->fPatIdx = continueLoc; 4045 break; 4046 } 4047 4048 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 4049 // (successful match will cause a FAIL out of the loop altogether.) 4050 fp = StateSave(fp, fp->fPatIdx-4, status); 4051 fp->fInputIdx = *lbStartIdx; 4052 } 4053 break; 4054 4055 case URX_LBN_END: 4056 // End of a negative look-behind block, after a successful match. 4057 { 4058 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4059 if (fp->fInputIdx != fActiveLimit) { 4060 // The look-behind expression matched, but the match did not 4061 // extend all the way to the point that we are looking behind from. 4062 // FAIL out of here, which will take us back to the LB_CONT, which 4063 // will retry the match starting at another position or succeed 4064 // the look-behind altogether, whichever is appropriate. 4065 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4066 break; 4067 } 4068 4069 // Look-behind expression matched, which means look-behind test as 4070 // a whole Fails 4071 4072 // Restore the orignal input string length, which had been truncated 4073 // inorder to pin the end of the lookbehind match 4074 // to the position being looked-behind. 4075 int64_t originalInputLen = fData[opValue+3]; 4076 U_ASSERT(originalInputLen >= fActiveLimit); 4077 U_ASSERT(originalInputLen <= fInputLength); 4078 fActiveLimit = originalInputLen; 4079 4080 // Restore original stack position, discarding any state saved 4081 // by the successful pattern match. 4082 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4083 int32_t newStackSize = (int32_t)fData[opValue]; 4084 U_ASSERT(fStack->size() > newStackSize); 4085 fStack->setSize(newStackSize); 4086 4087 // FAIL, which will take control back to someplace 4088 // prior to entering the look-behind test. 4089 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4090 } 4091 break; 4092 4093 4094 case URX_LOOP_SR_I: 4095 // Loop Initialization for the optimized implementation of 4096 // [some character set]* 4097 // This op scans through all matching input. 4098 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4099 { 4100 U_ASSERT(opValue > 0 && opValue < sets->size()); 4101 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4102 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 4103 4104 // Loop through input, until either the input is exhausted or 4105 // we reach a character that is not a member of the set. 4106 int64_t ix = fp->fInputIdx; 4107 UTEXT_SETNATIVEINDEX(fInputText, ix); 4108 for (;;) { 4109 if (ix >= fActiveLimit) { 4110 fHitEnd = TRUE; 4111 break; 4112 } 4113 UChar32 c = UTEXT_NEXT32(fInputText); 4114 if (c<256) { 4115 if (s8->contains(c) == FALSE) { 4116 break; 4117 } 4118 } else { 4119 if (s->contains(c) == FALSE) { 4120 break; 4121 } 4122 } 4123 ix = UTEXT_GETNATIVEINDEX(fInputText); 4124 } 4125 4126 // If there were no matching characters, skip over the loop altogether. 4127 // The loop doesn't run at all, a * op always succeeds. 4128 if (ix == fp->fInputIdx) { 4129 fp->fPatIdx++; // skip the URX_LOOP_C op. 4130 break; 4131 } 4132 4133 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4134 // must follow. It's operand is the stack location 4135 // that holds the starting input index for the match of this [set]* 4136 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4137 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4138 int32_t stackLoc = URX_VAL(loopcOp); 4139 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4140 fp->fExtra[stackLoc] = fp->fInputIdx; 4141 fp->fInputIdx = ix; 4142 4143 // Save State to the URX_LOOP_C op that follows this one, 4144 // so that match failures in the following code will return to there. 4145 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4146 fp = StateSave(fp, fp->fPatIdx, status); 4147 fp->fPatIdx++; 4148 } 4149 break; 4150 4151 4152 case URX_LOOP_DOT_I: 4153 // Loop Initialization for the optimized implementation of .* 4154 // This op scans through all remaining input. 4155 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4156 { 4157 // Loop through input until the input is exhausted (we reach an end-of-line) 4158 // In DOTALL mode, we can just go straight to the end of the input. 4159 int64_t ix; 4160 if ((opValue & 1) == 1) { 4161 // Dot-matches-All mode. Jump straight to the end of the string. 4162 ix = fActiveLimit; 4163 fHitEnd = TRUE; 4164 } else { 4165 // NOT DOT ALL mode. Line endings do not match '.' 4166 // Scan forward until a line ending or end of input. 4167 ix = fp->fInputIdx; 4168 UTEXT_SETNATIVEINDEX(fInputText, ix); 4169 for (;;) { 4170 if (ix >= fActiveLimit) { 4171 fHitEnd = TRUE; 4172 break; 4173 } 4174 UChar32 c = UTEXT_NEXT32(fInputText); 4175 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 4176 if ((c == 0x0a) || // 0x0a is newline in both modes. 4177 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 4178 isLineTerminator(c))) { 4179 // char is a line ending. Exit the scanning loop. 4180 break; 4181 } 4182 } 4183 ix = UTEXT_GETNATIVEINDEX(fInputText); 4184 } 4185 } 4186 4187 // If there were no matching characters, skip over the loop altogether. 4188 // The loop doesn't run at all, a * op always succeeds. 4189 if (ix == fp->fInputIdx) { 4190 fp->fPatIdx++; // skip the URX_LOOP_C op. 4191 break; 4192 } 4193 4194 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4195 // must follow. It's operand is the stack location 4196 // that holds the starting input index for the match of this .* 4197 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4198 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4199 int32_t stackLoc = URX_VAL(loopcOp); 4200 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4201 fp->fExtra[stackLoc] = fp->fInputIdx; 4202 fp->fInputIdx = ix; 4203 4204 // Save State to the URX_LOOP_C op that follows this one, 4205 // so that match failures in the following code will return to there. 4206 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4207 fp = StateSave(fp, fp->fPatIdx, status); 4208 fp->fPatIdx++; 4209 } 4210 break; 4211 4212 4213 case URX_LOOP_C: 4214 { 4215 U_ASSERT(opValue>=0 && opValue<fFrameSize); 4216 backSearchIndex = fp->fExtra[opValue]; 4217 U_ASSERT(backSearchIndex <= fp->fInputIdx); 4218 if (backSearchIndex == fp->fInputIdx) { 4219 // We've backed up the input idx to the point that the loop started. 4220 // The loop is done. Leave here without saving state. 4221 // Subsequent failures won't come back here. 4222 break; 4223 } 4224 // Set up for the next iteration of the loop, with input index 4225 // backed up by one from the last time through, 4226 // and a state save to this instruction in case the following code fails again. 4227 // (We're going backwards because this loop emulates stack unwinding, not 4228 // the initial scan forward.) 4229 U_ASSERT(fp->fInputIdx > 0); 4230 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4231 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); 4232 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4233 4234 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); 4235 if (prevC == 0x0a && 4236 fp->fInputIdx > backSearchIndex && 4237 twoPrevC == 0x0d) { 4238 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 4239 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 4240 // .*, stepping back over CRLF pair. 4241 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4242 } 4243 } 4244 4245 4246 fp = StateSave(fp, fp->fPatIdx-1, status); 4247 } 4248 break; 4249 4250 4251 4252 default: 4253 // Trouble. The compiled pattern contains an entry with an 4254 // unrecognized type tag. 4255 U_ASSERT(FALSE); 4256 } 4257 4258 if (U_FAILURE(status)) { 4259 isMatch = FALSE; 4260 break; 4261 } 4262 } 4263 4264 breakFromLoop: 4265 fMatch = isMatch; 4266 if (isMatch) { 4267 fLastMatchEnd = fMatchEnd; 4268 fMatchStart = startIdx; 4269 fMatchEnd = fp->fInputIdx; 4270 } 4271 4272 #ifdef REGEX_RUN_DEBUG 4273 if (fTraceDebug) { 4274 if (isMatch) { 4275 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); 4276 } else { 4277 printf("No match\n\n"); 4278 } 4279 } 4280 #endif 4281 4282 fFrame = fp; // The active stack frame when the engine stopped. 4283 // Contains the capture group results that we need to 4284 // access later. 4285 return; 4286 } 4287 4288 4289 //-------------------------------------------------------------------------------- 4290 // 4291 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the 4292 // assumption that the entire string is available in the UText's 4293 // chunk buffer. For now, that means we can use int32_t indexes, 4294 // except for anything that needs to be saved (like group starts 4295 // and ends). 4296 // 4297 // startIdx: begin matching a this index. 4298 // toEnd: if true, match must extend to end of the input region 4299 // 4300 //-------------------------------------------------------------------------------- 4301 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { 4302 UBool isMatch = FALSE; // True if the we have a match. 4303 4304 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards 4305 4306 int32_t op; // Operation from the compiled pattern, split into 4307 int32_t opType; // the opcode 4308 int32_t opValue; // and the operand value. 4309 4310 #ifdef REGEX_RUN_DEBUG 4311 if (fTraceDebug) { 4312 printf("MatchAt(startIdx=%d)\n", startIdx); 4313 printf("Original Pattern: "); 4314 UChar32 c = utext_next32From(fPattern->fPattern, 0); 4315 while (c != U_SENTINEL) { 4316 if (c<32 || c>256) { 4317 c = '.'; 4318 } 4319 printf("%c", c); 4320 4321 c = UTEXT_NEXT32(fPattern->fPattern); 4322 } 4323 printf("\n"); 4324 printf("Input String: "); 4325 c = utext_next32From(fInputText, 0); 4326 while (c != U_SENTINEL) { 4327 if (c<32 || c>256) { 4328 c = '.'; 4329 } 4330 printf("%c", c); 4331 4332 c = UTEXT_NEXT32(fInputText); 4333 } 4334 printf("\n"); 4335 printf("\n"); 4336 } 4337 #endif 4338 4339 if (U_FAILURE(status)) { 4340 return; 4341 } 4342 4343 // Cache frequently referenced items from the compiled pattern 4344 // 4345 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 4346 4347 const UChar *litText = fPattern->fLiteralText.getBuffer(); 4348 UVector *sets = fPattern->fSets; 4349 4350 const UChar *inputBuf = fInputText->chunkContents; 4351 4352 fFrameSize = fPattern->fFrameSize; 4353 REStackFrame *fp = resetStack(); 4354 if (U_FAILURE(fDeferredStatus)) { 4355 status = fDeferredStatus; 4356 return; 4357 } 4358 4359 fp->fPatIdx = 0; 4360 fp->fInputIdx = startIdx; 4361 4362 // Zero out the pattern's static data 4363 int32_t i; 4364 for (i = 0; i<fPattern->fDataSize; i++) { 4365 fData[i] = 0; 4366 } 4367 4368 // 4369 // Main loop for interpreting the compiled pattern. 4370 // One iteration of the loop per pattern operation performed. 4371 // 4372 for (;;) { 4373 op = (int32_t)pat[fp->fPatIdx]; 4374 opType = URX_TYPE(op); 4375 opValue = URX_VAL(op); 4376 #ifdef REGEX_RUN_DEBUG 4377 if (fTraceDebug) { 4378 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4379 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 4380 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 4381 fPattern->dumpOp(fp->fPatIdx); 4382 } 4383 #endif 4384 fp->fPatIdx++; 4385 4386 switch (opType) { 4387 4388 4389 case URX_NOP: 4390 break; 4391 4392 4393 case URX_BACKTRACK: 4394 // Force a backtrack. In some circumstances, the pattern compiler 4395 // will notice that the pattern can't possibly match anything, and will 4396 // emit one of these at that point. 4397 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4398 break; 4399 4400 4401 case URX_ONECHAR: 4402 if (fp->fInputIdx < fActiveLimit) { 4403 UChar32 c; 4404 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4405 if (c == opValue) { 4406 break; 4407 } 4408 } else { 4409 fHitEnd = TRUE; 4410 } 4411 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4412 break; 4413 4414 4415 case URX_STRING: 4416 { 4417 // Test input against a literal string. 4418 // Strings require two slots in the compiled pattern, one for the 4419 // offset to the string text, and one for the length. 4420 int32_t stringStartIdx = opValue; 4421 int32_t stringLen; 4422 4423 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 4424 fp->fPatIdx++; 4425 opType = URX_TYPE(op); 4426 stringLen = URX_VAL(op); 4427 U_ASSERT(opType == URX_STRING_LEN); 4428 U_ASSERT(stringLen >= 2); 4429 4430 const UChar * pInp = inputBuf + fp->fInputIdx; 4431 const UChar * pInpLimit = inputBuf + fActiveLimit; 4432 const UChar * pPat = litText+stringStartIdx; 4433 const UChar * pEnd = pInp + stringLen; 4434 UBool success = TRUE; 4435 while (pInp < pEnd) { 4436 if (pInp >= pInpLimit) { 4437 fHitEnd = TRUE; 4438 success = FALSE; 4439 break; 4440 } 4441 if (*pInp++ != *pPat++) { 4442 success = FALSE; 4443 break; 4444 } 4445 } 4446 4447 if (success) { 4448 fp->fInputIdx += stringLen; 4449 } else { 4450 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4451 } 4452 } 4453 break; 4454 4455 4456 case URX_STATE_SAVE: 4457 fp = StateSave(fp, opValue, status); 4458 break; 4459 4460 4461 case URX_END: 4462 // The match loop will exit via this path on a successful match, 4463 // when we reach the end of the pattern. 4464 if (toEnd && fp->fInputIdx != fActiveLimit) { 4465 // The pattern matched, but not to the end of input. Try some more. 4466 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4467 break; 4468 } 4469 isMatch = TRUE; 4470 goto breakFromLoop; 4471 4472 // Start and End Capture stack frame variables are laid out out like this: 4473 // fp->fExtra[opValue] - The start of a completed capture group 4474 // opValue+1 - The end of a completed capture group 4475 // opValue+2 - the start of a capture group whose end 4476 // has not yet been reached (and might not ever be). 4477 case URX_START_CAPTURE: 4478 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4479 fp->fExtra[opValue+2] = fp->fInputIdx; 4480 break; 4481 4482 4483 case URX_END_CAPTURE: 4484 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4485 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 4486 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 4487 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 4488 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 4489 break; 4490 4491 4492 case URX_DOLLAR: // $, test for End of line 4493 // or for position before new line at end of input 4494 if (fp->fInputIdx < fAnchorLimit-2) { 4495 // We are no where near the end of input. Fail. 4496 // This is the common case. Keep it first. 4497 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4498 break; 4499 } 4500 if (fp->fInputIdx >= fAnchorLimit) { 4501 // We really are at the end of input. Success. 4502 fHitEnd = TRUE; 4503 fRequireEnd = TRUE; 4504 break; 4505 } 4506 4507 // If we are positioned just before a new-line that is located at the 4508 // end of input, succeed. 4509 if (fp->fInputIdx == fAnchorLimit-1) { 4510 UChar32 c; 4511 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 4512 4513 if (isLineTerminator(c)) { 4514 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4515 // At new-line at end of input. Success 4516 fHitEnd = TRUE; 4517 fRequireEnd = TRUE; 4518 break; 4519 } 4520 } 4521 } else if (fp->fInputIdx == fAnchorLimit-2 && 4522 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { 4523 fHitEnd = TRUE; 4524 fRequireEnd = TRUE; 4525 break; // At CR/LF at end of input. Success 4526 } 4527 4528 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4529 4530 break; 4531 4532 4533 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 4534 if (fp->fInputIdx >= fAnchorLimit-1) { 4535 // Either at the last character of input, or off the end. 4536 if (fp->fInputIdx == fAnchorLimit-1) { 4537 // At last char of input. Success if it's a new line. 4538 if (inputBuf[fp->fInputIdx] == 0x0a) { 4539 fHitEnd = TRUE; 4540 fRequireEnd = TRUE; 4541 break; 4542 } 4543 } else { 4544 // Off the end of input. Success. 4545 fHitEnd = TRUE; 4546 fRequireEnd = TRUE; 4547 break; 4548 } 4549 } 4550 4551 // Not at end of input. Back-track out. 4552 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4553 break; 4554 4555 4556 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 4557 { 4558 if (fp->fInputIdx >= fAnchorLimit) { 4559 // We really are at the end of input. Success. 4560 fHitEnd = TRUE; 4561 fRequireEnd = TRUE; 4562 break; 4563 } 4564 // If we are positioned just before a new-line, succeed. 4565 // It makes no difference where the new-line is within the input. 4566 UChar32 c = inputBuf[fp->fInputIdx]; 4567 if (isLineTerminator(c)) { 4568 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 4569 // In multi-line mode, hitting a new-line just before the end of input does not 4570 // set the hitEnd or requireEnd flags 4571 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4572 break; 4573 } 4574 } 4575 // not at a new line. Fail. 4576 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4577 } 4578 break; 4579 4580 4581 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 4582 { 4583 if (fp->fInputIdx >= fAnchorLimit) { 4584 // We really are at the end of input. Success. 4585 fHitEnd = TRUE; 4586 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 4587 break; // adding a new-line would not lose the match. 4588 } 4589 // If we are not positioned just before a new-line, the test fails; backtrack out. 4590 // It makes no difference where the new-line is within the input. 4591 if (inputBuf[fp->fInputIdx] != 0x0a) { 4592 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4593 } 4594 } 4595 break; 4596 4597 4598 case URX_CARET: // ^, test for start of line 4599 if (fp->fInputIdx != fAnchorStart) { 4600 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4601 } 4602 break; 4603 4604 4605 case URX_CARET_M: // ^, test for start of line in mulit-line mode 4606 { 4607 if (fp->fInputIdx == fAnchorStart) { 4608 // We are at the start input. Success. 4609 break; 4610 } 4611 // Check whether character just before the current pos is a new-line 4612 // unless we are at the end of input 4613 UChar c = inputBuf[fp->fInputIdx - 1]; 4614 if ((fp->fInputIdx < fAnchorLimit) && 4615 isLineTerminator(c)) { 4616 // It's a new-line. ^ is true. Success. 4617 // TODO: what should be done with positions between a CR and LF? 4618 break; 4619 } 4620 // Not at the start of a line. Fail. 4621 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4622 } 4623 break; 4624 4625 4626 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 4627 { 4628 U_ASSERT(fp->fInputIdx >= fAnchorStart); 4629 if (fp->fInputIdx <= fAnchorStart) { 4630 // We are at the start input. Success. 4631 break; 4632 } 4633 // Check whether character just before the current pos is a new-line 4634 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 4635 UChar c = inputBuf[fp->fInputIdx - 1]; 4636 if (c != 0x0a) { 4637 // Not at the start of a line. Back-track out. 4638 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4639 } 4640 } 4641 break; 4642 4643 case URX_BACKSLASH_B: // Test for word boundaries 4644 { 4645 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); 4646 success ^= (UBool)(opValue != 0); // flip sense for \B 4647 if (!success) { 4648 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4649 } 4650 } 4651 break; 4652 4653 4654 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 4655 { 4656 UBool success = isUWordBoundary(fp->fInputIdx); 4657 success ^= (UBool)(opValue != 0); // flip sense for \B 4658 if (!success) { 4659 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4660 } 4661 } 4662 break; 4663 4664 4665 case URX_BACKSLASH_D: // Test for decimal digit 4666 { 4667 if (fp->fInputIdx >= fActiveLimit) { 4668 fHitEnd = TRUE; 4669 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4670 break; 4671 } 4672 4673 UChar32 c; 4674 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4675 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 4676 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 4677 success ^= (UBool)(opValue != 0); // flip sense for \D 4678 if (!success) { 4679 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4680 } 4681 } 4682 break; 4683 4684 4685 case URX_BACKSLASH_G: // Test for position at end of previous match 4686 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 4687 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4688 } 4689 break; 4690 4691 4692 case URX_BACKSLASH_H: // Test for \h, horizontal white space. 4693 { 4694 if (fp->fInputIdx >= fActiveLimit) { 4695 fHitEnd = TRUE; 4696 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4697 break; 4698 } 4699 UChar32 c; 4700 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4701 int8_t ctype = u_charType(c); 4702 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB 4703 success ^= (UBool)(opValue != 0); // flip sense for \H 4704 if (!success) { 4705 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4706 } 4707 } 4708 break; 4709 4710 4711 case URX_BACKSLASH_R: // Test for \R, any line break sequence. 4712 { 4713 if (fp->fInputIdx >= fActiveLimit) { 4714 fHitEnd = TRUE; 4715 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4716 break; 4717 } 4718 UChar32 c; 4719 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4720 if (isLineTerminator(c)) { 4721 if (c == 0x0d && fp->fInputIdx < fActiveLimit) { 4722 // Check for CR/LF sequence. Consume both together when found. 4723 UChar c2; 4724 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2); 4725 if (c2 != 0x0a) { 4726 U16_PREV(inputBuf, 0, fp->fInputIdx, c2); 4727 } 4728 } 4729 } else { 4730 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4731 } 4732 } 4733 break; 4734 4735 4736 case URX_BACKSLASH_V: // Any single code point line ending. 4737 { 4738 if (fp->fInputIdx >= fActiveLimit) { 4739 fHitEnd = TRUE; 4740 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4741 break; 4742 } 4743 UChar32 c; 4744 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4745 UBool success = isLineTerminator(c); 4746 success ^= (UBool)(opValue != 0); // flip sense for \V 4747 if (!success) { 4748 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4749 } 4750 } 4751 break; 4752 4753 4754 4755 case URX_BACKSLASH_X: 4756 // Match a Grapheme, as defined by Unicode TR 29. 4757 // Differs slightly from Perl, which consumes combining marks independently 4758 // of context. 4759 { 4760 4761 // Fail if at end of input 4762 if (fp->fInputIdx >= fActiveLimit) { 4763 fHitEnd = TRUE; 4764 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4765 break; 4766 } 4767 4768 // Examine (and consume) the current char. 4769 // Dispatch into a little state machine, based on the char. 4770 UChar32 c; 4771 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4772 UnicodeSet **sets = fPattern->fStaticSets; 4773 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 4774 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 4775 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4776 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4777 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4778 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4779 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4780 goto GC_Extend; 4781 4782 4783 4784 GC_L: 4785 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4786 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4787 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4788 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4789 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4790 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4791 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4792 goto GC_Extend; 4793 4794 GC_V: 4795 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4796 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4797 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4798 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4799 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4800 goto GC_Extend; 4801 4802 GC_T: 4803 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4804 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4805 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4806 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4807 goto GC_Extend; 4808 4809 GC_Extend: 4810 // Combining characters are consumed here 4811 for (;;) { 4812 if (fp->fInputIdx >= fActiveLimit) { 4813 break; 4814 } 4815 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4816 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 4817 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 4818 break; 4819 } 4820 } 4821 goto GC_Done; 4822 4823 GC_Control: 4824 // Most control chars stand alone (don't combine with combining chars), 4825 // except for that CR/LF sequence is a single grapheme cluster. 4826 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { 4827 fp->fInputIdx++; 4828 } 4829 4830 GC_Done: 4831 if (fp->fInputIdx >= fActiveLimit) { 4832 fHitEnd = TRUE; 4833 } 4834 break; 4835 } 4836 4837 4838 4839 4840 case URX_BACKSLASH_Z: // Test for end of Input 4841 if (fp->fInputIdx < fAnchorLimit) { 4842 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4843 } else { 4844 fHitEnd = TRUE; 4845 fRequireEnd = TRUE; 4846 } 4847 break; 4848 4849 4850 4851 case URX_STATIC_SETREF: 4852 { 4853 // Test input character against one of the predefined sets 4854 // (Word Characters, for example) 4855 // The high bit of the op value is a flag for the match polarity. 4856 // 0: success if input char is in set. 4857 // 1: success if input char is not in set. 4858 if (fp->fInputIdx >= fActiveLimit) { 4859 fHitEnd = TRUE; 4860 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4861 break; 4862 } 4863 4864 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 4865 opValue &= ~URX_NEG_SET; 4866 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4867 4868 UChar32 c; 4869 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4870 if (c < 256) { 4871 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4872 if (s8->contains(c)) { 4873 success = !success; 4874 } 4875 } else { 4876 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4877 if (s->contains(c)) { 4878 success = !success; 4879 } 4880 } 4881 if (!success) { 4882 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4883 } 4884 } 4885 break; 4886 4887 4888 case URX_STAT_SETREF_N: 4889 { 4890 // Test input character for NOT being a member of one of 4891 // the predefined sets (Word Characters, for example) 4892 if (fp->fInputIdx >= fActiveLimit) { 4893 fHitEnd = TRUE; 4894 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4895 break; 4896 } 4897 4898 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4899 4900 UChar32 c; 4901 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4902 if (c < 256) { 4903 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4904 if (s8->contains(c) == FALSE) { 4905 break; 4906 } 4907 } else { 4908 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4909 if (s->contains(c) == FALSE) { 4910 break; 4911 } 4912 } 4913 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4914 } 4915 break; 4916 4917 4918 case URX_SETREF: 4919 { 4920 if (fp->fInputIdx >= fActiveLimit) { 4921 fHitEnd = TRUE; 4922 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4923 break; 4924 } 4925 4926 U_ASSERT(opValue > 0 && opValue < sets->size()); 4927 4928 // There is input left. Pick up one char and test it for set membership. 4929 UChar32 c; 4930 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4931 if (c<256) { 4932 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4933 if (s8->contains(c)) { 4934 // The character is in the set. A Match. 4935 break; 4936 } 4937 } else { 4938 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 4939 if (s->contains(c)) { 4940 // The character is in the set. A Match. 4941 break; 4942 } 4943 } 4944 4945 // the character wasn't in the set. 4946 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4947 } 4948 break; 4949 4950 4951 case URX_DOTANY: 4952 { 4953 // . matches anything, but stops at end-of-line. 4954 if (fp->fInputIdx >= fActiveLimit) { 4955 // At end of input. Match failed. Backtrack out. 4956 fHitEnd = TRUE; 4957 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4958 break; 4959 } 4960 4961 // There is input left. Advance over one char, unless we've hit end-of-line 4962 UChar32 c; 4963 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4964 if (isLineTerminator(c)) { 4965 // End of line in normal mode. . does not match. 4966 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4967 break; 4968 } 4969 } 4970 break; 4971 4972 4973 case URX_DOTANY_ALL: 4974 { 4975 // . in dot-matches-all (including new lines) mode 4976 if (fp->fInputIdx >= fActiveLimit) { 4977 // At end of input. Match failed. Backtrack out. 4978 fHitEnd = TRUE; 4979 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4980 break; 4981 } 4982 4983 // There is input left. Advance over one char, except if we are 4984 // at a cr/lf, advance over both of them. 4985 UChar32 c; 4986 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4987 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 4988 // In the case of a CR/LF, we need to advance over both. 4989 if (inputBuf[fp->fInputIdx] == 0x0a) { 4990 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 4991 } 4992 } 4993 } 4994 break; 4995 4996 4997 case URX_DOTANY_UNIX: 4998 { 4999 // '.' operator, matches all, but stops at end-of-line. 5000 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 5001 if (fp->fInputIdx >= fActiveLimit) { 5002 // At end of input. Match failed. Backtrack out. 5003 fHitEnd = TRUE; 5004 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5005 break; 5006 } 5007 5008 // There is input left. Advance over one char, unless we've hit end-of-line 5009 UChar32 c; 5010 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5011 if (c == 0x0a) { 5012 // End of line in normal mode. '.' does not match the \n 5013 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5014 } 5015 } 5016 break; 5017 5018 5019 case URX_JMP: 5020 fp->fPatIdx = opValue; 5021 break; 5022 5023 case URX_FAIL: 5024 isMatch = FALSE; 5025 goto breakFromLoop; 5026 5027 case URX_JMP_SAV: 5028 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 5029 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5030 fp->fPatIdx = opValue; // Then JMP. 5031 break; 5032 5033 case URX_JMP_SAV_X: 5034 // This opcode is used with (x)+, when x can match a zero length string. 5035 // Same as JMP_SAV, except conditional on the match having made forward progress. 5036 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 5037 // data address of the input position at the start of the loop. 5038 { 5039 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 5040 int32_t stoOp = (int32_t)pat[opValue-1]; 5041 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 5042 int32_t frameLoc = URX_VAL(stoOp); 5043 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 5044 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; 5045 U_ASSERT(prevInputIdx <= fp->fInputIdx); 5046 if (prevInputIdx < fp->fInputIdx) { 5047 // The match did make progress. Repeat the loop. 5048 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5049 fp->fPatIdx = opValue; 5050 fp->fExtra[frameLoc] = fp->fInputIdx; 5051 } 5052 // If the input position did not advance, we do nothing here, 5053 // execution will fall out of the loop. 5054 } 5055 break; 5056 5057 case URX_CTR_INIT: 5058 { 5059 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5060 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5061 5062 // Pick up the three extra operands that CTR_INIT has, and 5063 // skip the pattern location counter past 5064 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5065 fp->fPatIdx += 3; 5066 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5067 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5068 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5069 U_ASSERT(minCount>=0); 5070 U_ASSERT(maxCount>=minCount || maxCount==-1); 5071 U_ASSERT(loopLoc>=fp->fPatIdx); 5072 5073 if (minCount == 0) { 5074 fp = StateSave(fp, loopLoc+1, status); 5075 } 5076 if (maxCount == -1) { 5077 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 5078 } else if (maxCount == 0) { 5079 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5080 } 5081 } 5082 break; 5083 5084 case URX_CTR_LOOP: 5085 { 5086 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5087 int32_t initOp = (int32_t)pat[opValue]; 5088 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 5089 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5090 int32_t minCount = (int32_t)pat[opValue+2]; 5091 int32_t maxCount = (int32_t)pat[opValue+3]; 5092 (*pCounter)++; 5093 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 5094 U_ASSERT(*pCounter == maxCount); 5095 break; 5096 } 5097 if (*pCounter >= minCount) { 5098 if (maxCount == -1) { 5099 // Loop has no hard upper bound. 5100 // Check that it is progressing through the input, break if it is not. 5101 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 5102 if (fp->fInputIdx == *pLastInputIdx) { 5103 break; 5104 } else { 5105 *pLastInputIdx = fp->fInputIdx; 5106 } 5107 } 5108 fp = StateSave(fp, fp->fPatIdx, status); 5109 } 5110 fp->fPatIdx = opValue + 4; // Loop back. 5111 } 5112 break; 5113 5114 case URX_CTR_INIT_NG: 5115 { 5116 // Initialize a non-greedy loop 5117 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5118 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5119 5120 // Pick up the three extra operands that CTR_INIT_NG has, and 5121 // skip the pattern location counter past 5122 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5123 fp->fPatIdx += 3; 5124 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5125 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5126 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5127 U_ASSERT(minCount>=0); 5128 U_ASSERT(maxCount>=minCount || maxCount==-1); 5129 U_ASSERT(loopLoc>fp->fPatIdx); 5130 if (maxCount == -1) { 5131 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 5132 } 5133 5134 if (minCount == 0) { 5135 if (maxCount != 0) { 5136 fp = StateSave(fp, fp->fPatIdx, status); 5137 } 5138 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 5139 } 5140 } 5141 break; 5142 5143 case URX_CTR_LOOP_NG: 5144 { 5145 // Non-greedy {min, max} loops 5146 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5147 int32_t initOp = (int32_t)pat[opValue]; 5148 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 5149 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5150 int32_t minCount = (int32_t)pat[opValue+2]; 5151 int32_t maxCount = (int32_t)pat[opValue+3]; 5152 5153 (*pCounter)++; 5154 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 5155 // The loop has matched the maximum permitted number of times. 5156 // Break out of here with no action. Matching will 5157 // continue with the following pattern. 5158 U_ASSERT(*pCounter == maxCount); 5159 break; 5160 } 5161 5162 if (*pCounter < minCount) { 5163 // We haven't met the minimum number of matches yet. 5164 // Loop back for another one. 5165 fp->fPatIdx = opValue + 4; // Loop back. 5166 } else { 5167 // We do have the minimum number of matches. 5168 5169 // If there is no upper bound on the loop iterations, check that the input index 5170 // is progressing, and stop the loop if it is not. 5171 if (maxCount == -1) { 5172 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 5173 if (fp->fInputIdx == *pLastInputIdx) { 5174 break; 5175 } 5176 *pLastInputIdx = fp->fInputIdx; 5177 } 5178 5179 // Loop Continuation: we will fall into the pattern following the loop 5180 // (non-greedy, don't execute loop body first), but first do 5181 // a state save to the top of the loop, so that a match failure 5182 // in the following pattern will try another iteration of the loop. 5183 fp = StateSave(fp, opValue + 4, status); 5184 } 5185 } 5186 break; 5187 5188 case URX_STO_SP: 5189 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5190 fData[opValue] = fStack->size(); 5191 break; 5192 5193 case URX_LD_SP: 5194 { 5195 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5196 int32_t newStackSize = (int32_t)fData[opValue]; 5197 U_ASSERT(newStackSize <= fStack->size()); 5198 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5199 if (newFP == (int64_t *)fp) { 5200 break; 5201 } 5202 int32_t i; 5203 for (i=0; i<fFrameSize; i++) { 5204 newFP[i] = ((int64_t *)fp)[i]; 5205 } 5206 fp = (REStackFrame *)newFP; 5207 fStack->setSize(newStackSize); 5208 } 5209 break; 5210 5211 case URX_BACKREF: 5212 { 5213 U_ASSERT(opValue < fFrameSize); 5214 int64_t groupStartIdx = fp->fExtra[opValue]; 5215 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5216 U_ASSERT(groupStartIdx <= groupEndIdx); 5217 int64_t inputIndex = fp->fInputIdx; 5218 if (groupStartIdx < 0) { 5219 // This capture group has not participated in the match thus far, 5220 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5221 break; 5222 } 5223 UBool success = TRUE; 5224 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) { 5225 if (inputIndex >= fActiveLimit) { 5226 success = FALSE; 5227 fHitEnd = TRUE; 5228 break; 5229 } 5230 if (inputBuf[groupIndex] != inputBuf[inputIndex]) { 5231 success = FALSE; 5232 break; 5233 } 5234 } 5235 if (success) { 5236 fp->fInputIdx = inputIndex; 5237 } else { 5238 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5239 } 5240 } 5241 break; 5242 5243 case URX_BACKREF_I: 5244 { 5245 U_ASSERT(opValue < fFrameSize); 5246 int64_t groupStartIdx = fp->fExtra[opValue]; 5247 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5248 U_ASSERT(groupStartIdx <= groupEndIdx); 5249 if (groupStartIdx < 0) { 5250 // This capture group has not participated in the match thus far, 5251 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5252 break; 5253 } 5254 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx); 5255 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit); 5256 5257 // Note: if the capture group match was of an empty string the backref 5258 // match succeeds. Verified by testing: Perl matches succeed 5259 // in this case, so we do too. 5260 5261 UBool success = TRUE; 5262 for (;;) { 5263 UChar32 captureGroupChar = captureGroupItr.next(); 5264 if (captureGroupChar == U_SENTINEL) { 5265 success = TRUE; 5266 break; 5267 } 5268 UChar32 inputChar = inputItr.next(); 5269 if (inputChar == U_SENTINEL) { 5270 success = FALSE; 5271 fHitEnd = TRUE; 5272 break; 5273 } 5274 if (inputChar != captureGroupChar) { 5275 success = FALSE; 5276 break; 5277 } 5278 } 5279 5280 if (success && inputItr.inExpansion()) { 5281 // We otained a match by consuming part of a string obtained from 5282 // case-folding a single code point of the input text. 5283 // This does not count as an overall match. 5284 success = FALSE; 5285 } 5286 5287 if (success) { 5288 fp->fInputIdx = inputItr.getIndex(); 5289 } else { 5290 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5291 } 5292 } 5293 break; 5294 5295 case URX_STO_INP_LOC: 5296 { 5297 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 5298 fp->fExtra[opValue] = fp->fInputIdx; 5299 } 5300 break; 5301 5302 case URX_JMPX: 5303 { 5304 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5305 fp->fPatIdx += 1; 5306 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 5307 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 5308 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; 5309 U_ASSERT(savedInputIdx <= fp->fInputIdx); 5310 if (savedInputIdx < fp->fInputIdx) { 5311 fp->fPatIdx = opValue; // JMP 5312 } else { 5313 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 5314 } 5315 } 5316 break; 5317 5318 case URX_LA_START: 5319 { 5320 // Entering a lookahead block. 5321 // Save Stack Ptr, Input Pos. 5322 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5323 fData[opValue] = fStack->size(); 5324 fData[opValue+1] = fp->fInputIdx; 5325 fActiveStart = fLookStart; // Set the match region change for 5326 fActiveLimit = fLookLimit; // transparent bounds. 5327 } 5328 break; 5329 5330 case URX_LA_END: 5331 { 5332 // Leaving a look-ahead block. 5333 // restore Stack Ptr, Input Pos to positions they had on entry to block. 5334 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5335 int32_t stackSize = fStack->size(); 5336 int32_t newStackSize = (int32_t)fData[opValue]; 5337 U_ASSERT(stackSize >= newStackSize); 5338 if (stackSize > newStackSize) { 5339 // Copy the current top frame back to the new (cut back) top frame. 5340 // This makes the capture groups from within the look-ahead 5341 // expression available. 5342 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5343 int32_t i; 5344 for (i=0; i<fFrameSize; i++) { 5345 newFP[i] = ((int64_t *)fp)[i]; 5346 } 5347 fp = (REStackFrame *)newFP; 5348 fStack->setSize(newStackSize); 5349 } 5350 fp->fInputIdx = fData[opValue+1]; 5351 5352 // Restore the active region bounds in the input string; they may have 5353 // been changed because of transparent bounds on a Region. 5354 fActiveStart = fRegionStart; 5355 fActiveLimit = fRegionLimit; 5356 } 5357 break; 5358 5359 case URX_ONECHAR_I: 5360 if (fp->fInputIdx < fActiveLimit) { 5361 UChar32 c; 5362 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5363 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5364 break; 5365 } 5366 } else { 5367 fHitEnd = TRUE; 5368 } 5369 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5370 break; 5371 5372 case URX_STRING_I: 5373 // Case-insensitive test input against a literal string. 5374 // Strings require two slots in the compiled pattern, one for the 5375 // offset to the string text, and one for the length. 5376 // The compiled string has already been case folded. 5377 { 5378 const UChar *patternString = litText + opValue; 5379 5380 op = (int32_t)pat[fp->fPatIdx]; 5381 fp->fPatIdx++; 5382 opType = URX_TYPE(op); 5383 opValue = URX_VAL(op); 5384 U_ASSERT(opType == URX_STRING_LEN); 5385 int32_t patternStringLen = opValue; // Length of the string from the pattern. 5386 5387 UChar32 cText; 5388 UChar32 cPattern; 5389 UBool success = TRUE; 5390 int32_t patternStringIdx = 0; 5391 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit); 5392 while (patternStringIdx < patternStringLen) { 5393 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 5394 cText = inputIterator.next(); 5395 if (cText != cPattern) { 5396 success = FALSE; 5397 if (cText == U_SENTINEL) { 5398 fHitEnd = TRUE; 5399 } 5400 break; 5401 } 5402 } 5403 if (inputIterator.inExpansion()) { 5404 success = FALSE; 5405 } 5406 5407 if (success) { 5408 fp->fInputIdx = inputIterator.getIndex(); 5409 } else { 5410 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5411 } 5412 } 5413 break; 5414 5415 case URX_LB_START: 5416 { 5417 // Entering a look-behind block. 5418 // Save Stack Ptr, Input Pos. 5419 // TODO: implement transparent bounds. Ticket #6067 5420 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5421 fData[opValue] = fStack->size(); 5422 fData[opValue+1] = fp->fInputIdx; 5423 // Init the variable containing the start index for attempted matches. 5424 fData[opValue+2] = -1; 5425 // Save input string length, then reset to pin any matches to end at 5426 // the current position. 5427 fData[opValue+3] = fActiveLimit; 5428 fActiveLimit = fp->fInputIdx; 5429 } 5430 break; 5431 5432 5433 case URX_LB_CONT: 5434 { 5435 // Positive Look-Behind, at top of loop checking for matches of LB expression 5436 // at all possible input starting positions. 5437 5438 // Fetch the min and max possible match lengths. They are the operands 5439 // of this op in the pattern. 5440 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5441 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5442 U_ASSERT(minML <= maxML); 5443 U_ASSERT(minML >= 0); 5444 5445 // Fetch (from data) the last input index where a match was attempted. 5446 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5447 int64_t *lbStartIdx = &fData[opValue+2]; 5448 if (*lbStartIdx < 0) { 5449 // First time through loop. 5450 *lbStartIdx = fp->fInputIdx - minML; 5451 } else { 5452 // 2nd through nth time through the loop. 5453 // Back up start position for match by one. 5454 if (*lbStartIdx == 0) { 5455 (*lbStartIdx)--; 5456 } else { 5457 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5458 } 5459 } 5460 5461 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5462 // We have tried all potential match starting points without 5463 // getting a match. Backtrack out, and out of the 5464 // Look Behind altogether. 5465 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5466 int64_t restoreInputLen = fData[opValue+3]; 5467 U_ASSERT(restoreInputLen >= fActiveLimit); 5468 U_ASSERT(restoreInputLen <= fInputLength); 5469 fActiveLimit = restoreInputLen; 5470 break; 5471 } 5472 5473 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5474 // (successful match will fall off the end of the loop.) 5475 fp = StateSave(fp, fp->fPatIdx-3, status); 5476 fp->fInputIdx = *lbStartIdx; 5477 } 5478 break; 5479 5480 case URX_LB_END: 5481 // End of a look-behind block, after a successful match. 5482 { 5483 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5484 if (fp->fInputIdx != fActiveLimit) { 5485 // The look-behind expression matched, but the match did not 5486 // extend all the way to the point that we are looking behind from. 5487 // FAIL out of here, which will take us back to the LB_CONT, which 5488 // will retry the match starting at another position or fail 5489 // the look-behind altogether, whichever is appropriate. 5490 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5491 break; 5492 } 5493 5494 // Look-behind match is good. Restore the orignal input string length, 5495 // which had been truncated to pin the end of the lookbehind match to the 5496 // position being looked-behind. 5497 int64_t originalInputLen = fData[opValue+3]; 5498 U_ASSERT(originalInputLen >= fActiveLimit); 5499 U_ASSERT(originalInputLen <= fInputLength); 5500 fActiveLimit = originalInputLen; 5501 } 5502 break; 5503 5504 5505 case URX_LBN_CONT: 5506 { 5507 // Negative Look-Behind, at top of loop checking for matches of LB expression 5508 // at all possible input starting positions. 5509 5510 // Fetch the extra parameters of this op. 5511 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5512 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5513 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 5514 continueLoc = URX_VAL(continueLoc); 5515 U_ASSERT(minML <= maxML); 5516 U_ASSERT(minML >= 0); 5517 U_ASSERT(continueLoc > fp->fPatIdx); 5518 5519 // Fetch (from data) the last input index where a match was attempted. 5520 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5521 int64_t *lbStartIdx = &fData[opValue+2]; 5522 if (*lbStartIdx < 0) { 5523 // First time through loop. 5524 *lbStartIdx = fp->fInputIdx - minML; 5525 } else { 5526 // 2nd through nth time through the loop. 5527 // Back up start position for match by one. 5528 if (*lbStartIdx == 0) { 5529 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. 5530 } else { 5531 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5532 } 5533 } 5534 5535 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5536 // We have tried all potential match starting points without 5537 // getting a match, which means that the negative lookbehind as 5538 // a whole has succeeded. Jump forward to the continue location 5539 int64_t restoreInputLen = fData[opValue+3]; 5540 U_ASSERT(restoreInputLen >= fActiveLimit); 5541 U_ASSERT(restoreInputLen <= fInputLength); 5542 fActiveLimit = restoreInputLen; 5543 fp->fPatIdx = continueLoc; 5544 break; 5545 } 5546 5547 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5548 // (successful match will cause a FAIL out of the loop altogether.) 5549 fp = StateSave(fp, fp->fPatIdx-4, status); 5550 fp->fInputIdx = *lbStartIdx; 5551 } 5552 break; 5553 5554 case URX_LBN_END: 5555 // End of a negative look-behind block, after a successful match. 5556 { 5557 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5558 if (fp->fInputIdx != fActiveLimit) { 5559 // The look-behind expression matched, but the match did not 5560 // extend all the way to the point that we are looking behind from. 5561 // FAIL out of here, which will take us back to the LB_CONT, which 5562 // will retry the match starting at another position or succeed 5563 // the look-behind altogether, whichever is appropriate. 5564 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5565 break; 5566 } 5567 5568 // Look-behind expression matched, which means look-behind test as 5569 // a whole Fails 5570 5571 // Restore the orignal input string length, which had been truncated 5572 // inorder to pin the end of the lookbehind match 5573 // to the position being looked-behind. 5574 int64_t originalInputLen = fData[opValue+3]; 5575 U_ASSERT(originalInputLen >= fActiveLimit); 5576 U_ASSERT(originalInputLen <= fInputLength); 5577 fActiveLimit = originalInputLen; 5578 5579 // Restore original stack position, discarding any state saved 5580 // by the successful pattern match. 5581 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5582 int32_t newStackSize = (int32_t)fData[opValue]; 5583 U_ASSERT(fStack->size() > newStackSize); 5584 fStack->setSize(newStackSize); 5585 5586 // FAIL, which will take control back to someplace 5587 // prior to entering the look-behind test. 5588 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5589 } 5590 break; 5591 5592 5593 case URX_LOOP_SR_I: 5594 // Loop Initialization for the optimized implementation of 5595 // [some character set]* 5596 // This op scans through all matching input. 5597 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5598 { 5599 U_ASSERT(opValue > 0 && opValue < sets->size()); 5600 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5601 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5602 5603 // Loop through input, until either the input is exhausted or 5604 // we reach a character that is not a member of the set. 5605 int32_t ix = (int32_t)fp->fInputIdx; 5606 for (;;) { 5607 if (ix >= fActiveLimit) { 5608 fHitEnd = TRUE; 5609 break; 5610 } 5611 UChar32 c; 5612 U16_NEXT(inputBuf, ix, fActiveLimit, c); 5613 if (c<256) { 5614 if (s8->contains(c) == FALSE) { 5615 U16_BACK_1(inputBuf, 0, ix); 5616 break; 5617 } 5618 } else { 5619 if (s->contains(c) == FALSE) { 5620 U16_BACK_1(inputBuf, 0, ix); 5621 break; 5622 } 5623 } 5624 } 5625 5626 // If there were no matching characters, skip over the loop altogether. 5627 // The loop doesn't run at all, a * op always succeeds. 5628 if (ix == fp->fInputIdx) { 5629 fp->fPatIdx++; // skip the URX_LOOP_C op. 5630 break; 5631 } 5632 5633 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5634 // must follow. It's operand is the stack location 5635 // that holds the starting input index for the match of this [set]* 5636 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5637 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5638 int32_t stackLoc = URX_VAL(loopcOp); 5639 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5640 fp->fExtra[stackLoc] = fp->fInputIdx; 5641 fp->fInputIdx = ix; 5642 5643 // Save State to the URX_LOOP_C op that follows this one, 5644 // so that match failures in the following code will return to there. 5645 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5646 fp = StateSave(fp, fp->fPatIdx, status); 5647 fp->fPatIdx++; 5648 } 5649 break; 5650 5651 5652 case URX_LOOP_DOT_I: 5653 // Loop Initialization for the optimized implementation of .* 5654 // This op scans through all remaining input. 5655 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5656 { 5657 // Loop through input until the input is exhausted (we reach an end-of-line) 5658 // In DOTALL mode, we can just go straight to the end of the input. 5659 int32_t ix; 5660 if ((opValue & 1) == 1) { 5661 // Dot-matches-All mode. Jump straight to the end of the string. 5662 ix = (int32_t)fActiveLimit; 5663 fHitEnd = TRUE; 5664 } else { 5665 // NOT DOT ALL mode. Line endings do not match '.' 5666 // Scan forward until a line ending or end of input. 5667 ix = (int32_t)fp->fInputIdx; 5668 for (;;) { 5669 if (ix >= fActiveLimit) { 5670 fHitEnd = TRUE; 5671 break; 5672 } 5673 UChar32 c; 5674 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] 5675 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 5676 if ((c == 0x0a) || // 0x0a is newline in both modes. 5677 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 5678 isLineTerminator(c))) { 5679 // char is a line ending. Put the input pos back to the 5680 // line ending char, and exit the scanning loop. 5681 U16_BACK_1(inputBuf, 0, ix); 5682 break; 5683 } 5684 } 5685 } 5686 } 5687 5688 // If there were no matching characters, skip over the loop altogether. 5689 // The loop doesn't run at all, a * op always succeeds. 5690 if (ix == fp->fInputIdx) { 5691 fp->fPatIdx++; // skip the URX_LOOP_C op. 5692 break; 5693 } 5694 5695 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5696 // must follow. It's operand is the stack location 5697 // that holds the starting input index for the match of this .* 5698 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5699 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5700 int32_t stackLoc = URX_VAL(loopcOp); 5701 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5702 fp->fExtra[stackLoc] = fp->fInputIdx; 5703 fp->fInputIdx = ix; 5704 5705 // Save State to the URX_LOOP_C op that follows this one, 5706 // so that match failures in the following code will return to there. 5707 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5708 fp = StateSave(fp, fp->fPatIdx, status); 5709 fp->fPatIdx++; 5710 } 5711 break; 5712 5713 5714 case URX_LOOP_C: 5715 { 5716 U_ASSERT(opValue>=0 && opValue<fFrameSize); 5717 backSearchIndex = (int32_t)fp->fExtra[opValue]; 5718 U_ASSERT(backSearchIndex <= fp->fInputIdx); 5719 if (backSearchIndex == fp->fInputIdx) { 5720 // We've backed up the input idx to the point that the loop started. 5721 // The loop is done. Leave here without saving state. 5722 // Subsequent failures won't come back here. 5723 break; 5724 } 5725 // Set up for the next iteration of the loop, with input index 5726 // backed up by one from the last time through, 5727 // and a state save to this instruction in case the following code fails again. 5728 // (We're going backwards because this loop emulates stack unwinding, not 5729 // the initial scan forward.) 5730 U_ASSERT(fp->fInputIdx > 0); 5731 UChar32 prevC; 5732 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? 5733 5734 if (prevC == 0x0a && 5735 fp->fInputIdx > backSearchIndex && 5736 inputBuf[fp->fInputIdx-1] == 0x0d) { 5737 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 5738 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 5739 // .*, stepping back over CRLF pair. 5740 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 5741 } 5742 } 5743 5744 5745 fp = StateSave(fp, fp->fPatIdx-1, status); 5746 } 5747 break; 5748 5749 5750 5751 default: 5752 // Trouble. The compiled pattern contains an entry with an 5753 // unrecognized type tag. 5754 U_ASSERT(FALSE); 5755 } 5756 5757 if (U_FAILURE(status)) { 5758 isMatch = FALSE; 5759 break; 5760 } 5761 } 5762 5763 breakFromLoop: 5764 fMatch = isMatch; 5765 if (isMatch) { 5766 fLastMatchEnd = fMatchEnd; 5767 fMatchStart = startIdx; 5768 fMatchEnd = fp->fInputIdx; 5769 } 5770 5771 #ifdef REGEX_RUN_DEBUG 5772 if (fTraceDebug) { 5773 if (isMatch) { 5774 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); 5775 } else { 5776 printf("No match\n\n"); 5777 } 5778 } 5779 #endif 5780 5781 fFrame = fp; // The active stack frame when the engine stopped. 5782 // Contains the capture group results that we need to 5783 // access later. 5784 5785 return; 5786 } 5787 5788 5789 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 5790 5791 U_NAMESPACE_END 5792 5793 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 5794