Home | History | Annotate | Download | only in i18n
      1 /*
      2 **************************************************************************
      3 *   Copyright (C) 2002-2008 International Business Machines Corporation  *
      4 *   and others. All rights reserved.                                     *
      5 **************************************************************************
      6 */
      7 //
      8 //  file:  rematch.cpp
      9 //
     10 //         Contains the implementation of class RegexMatcher,
     11 //         which is one of the main API classes for the ICU regular expression package.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     16 
     17 #include "unicode/regex.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/rbbi.h"
     22 #include "uassert.h"
     23 #include "cmemory.h"
     24 #include "uvector.h"
     25 #include "uvectr32.h"
     26 #include "regeximp.h"
     27 #include "regexst.h"
     28 
     29 // #include <malloc.h>        // Needed for heapcheck testing
     30 
     31 U_NAMESPACE_BEGIN
     32 
     33 // Default limit for the size of the back track stack, to avoid system
     34 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
     35 // This value puts ICU's limits higher than most other regexp implementations,
     36 //    which use recursion rather than the heap, and take more storage per
     37 //    backtrack point.
     38 //
     39 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
     40 
     41 // Time limit counter constant.
     42 //   Time limits for expression evaluation are in terms of quanta of work by
     43 //   the engine, each of which is 10,000 state saves.
     44 //   This constant determines that state saves per tick number.
     45 static const int32_t TIMER_INITIAL_VALUE = 10000;
     46 
     47 //-----------------------------------------------------------------------------
     48 //
     49 //   Constructor and Destructor
     50 //
     51 //-----------------------------------------------------------------------------
     52 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
     53     fDeferredStatus = U_ZERO_ERROR;
     54     init(fDeferredStatus);
     55     if (U_FAILURE(fDeferredStatus)) {
     56         return;
     57     }
     58     if (pat==NULL) {
     59         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
     60         return;
     61     }
     62     fPattern = pat;
     63     init2(RegexStaticSets::gStaticSets->fEmptyString, fDeferredStatus);
     64 }
     65 
     66 
     67 
     68 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
     69                            uint32_t flags, UErrorCode &status) {
     70     init(status);
     71     if (U_FAILURE(status)) {
     72         return;
     73     }
     74     UParseError    pe;
     75     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
     76     fPattern           = fPatternOwned;
     77     init2(input, status);
     78 }
     79 
     80 
     81 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
     82                            uint32_t flags, UErrorCode &status) {
     83     init(status);
     84     if (U_FAILURE(status)) {
     85         return;
     86     }
     87     UParseError    pe;
     88     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
     89     fPattern           = fPatternOwned;
     90     init2(RegexStaticSets::gStaticSets->fEmptyString, status);
     91 }
     92 
     93 
     94 
     95 
     96 RegexMatcher::~RegexMatcher() {
     97     delete fStack;
     98     if (fData != fSmallData) {
     99         uprv_free(fData);
    100         fData = NULL;
    101     }
    102     if (fPatternOwned) {
    103         delete fPatternOwned;
    104         fPatternOwned = NULL;
    105         fPattern = NULL;
    106     }
    107     #if UCONFIG_NO_BREAK_ITERATION==0
    108     delete fWordBreakItr;
    109     #endif
    110 }
    111 
    112 //
    113 //   init()   common initialization for use by all constructors.
    114 //            Initialize all fields, get the object into a consistent state.
    115 //            This must be done even when the initial status shows an error,
    116 //            so that the object is initialized sufficiently well for the destructor
    117 //            to run safely.
    118 //
    119 void RegexMatcher::init(UErrorCode &status) {
    120     fPattern           = NULL;
    121     fPatternOwned      = NULL;
    122     fInput             = NULL;
    123     fFrameSize         = 0;
    124     fRegionStart       = 0;
    125     fRegionLimit       = 0;
    126     fAnchorStart       = 0;
    127     fAnchorLimit       = 0;
    128     fLookStart         = 0;
    129     fLookLimit         = 0;
    130     fActiveStart       = 0;
    131     fActiveLimit       = 0;
    132     fTransparentBounds = FALSE;
    133     fAnchoringBounds   = TRUE;
    134     fMatch             = FALSE;
    135     fMatchStart        = 0;
    136     fMatchEnd          = 0;
    137     fLastMatchEnd      = -1;
    138     fAppendPosition    = 0;
    139     fHitEnd            = FALSE;
    140     fRequireEnd        = FALSE;
    141     fStack             = NULL;
    142     fFrame             = NULL;
    143     fTimeLimit         = 0;
    144     fTime              = 0;
    145     fTickCounter       = 0;
    146     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    147     fCallbackFn        = NULL;
    148     fCallbackContext   = NULL;
    149     fTraceDebug        = FALSE;
    150     fDeferredStatus    = status;
    151     fData              = fSmallData;
    152     fWordBreakItr      = NULL;
    153 
    154     fStack             = new UVector32(status);
    155     if (U_FAILURE(status)) {
    156         fDeferredStatus = status;
    157     }
    158 }
    159 
    160 //
    161 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
    162 //            This handles the common setup to be done after the Pattern is available.
    163 //
    164 void RegexMatcher::init2(const UnicodeString &input, UErrorCode &status) {
    165     if (U_FAILURE(status)) {
    166         fDeferredStatus = status;
    167         return;
    168     }
    169 
    170     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
    171         fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
    172         if (fData == NULL) {
    173             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    174             return;
    175         }
    176     }
    177 
    178     reset(input);
    179     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
    180     if (U_FAILURE(status)) {
    181         fDeferredStatus = status;
    182         return;
    183     }
    184 }
    185 
    186 
    187 static const UChar BACKSLASH  = 0x5c;
    188 static const UChar DOLLARSIGN = 0x24;
    189 //--------------------------------------------------------------------------------
    190 //
    191 //    appendReplacement
    192 //
    193 //--------------------------------------------------------------------------------
    194 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
    195                                               const UnicodeString &replacement,
    196                                               UErrorCode &status) {
    197     if (U_FAILURE(status)) {
    198         return *this;
    199     }
    200     if (U_FAILURE(fDeferredStatus)) {
    201         status = fDeferredStatus;
    202         return *this;
    203     }
    204     if (fMatch == FALSE) {
    205         status = U_REGEX_INVALID_STATE;
    206         return *this;
    207     }
    208 
    209     // Copy input string from the end of previous match to start of current match
    210     int32_t  len = fMatchStart-fAppendPosition;
    211     if (len > 0) {
    212         dest.append(*fInput, fAppendPosition, len);
    213     }
    214     fAppendPosition = fMatchEnd;
    215 
    216 
    217     // scan the replacement text, looking for substitutions ($n) and \escapes.
    218     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
    219     //         move entire ranges not containing substitutions.
    220     int32_t  replLen = replacement.length();
    221     int32_t  replIdx = 0;
    222     while (replIdx<replLen) {
    223         UChar  c = replacement.charAt(replIdx);
    224         replIdx++;
    225         if (c == BACKSLASH) {
    226             // Backslash Escape.  Copy the following char out without further checks.
    227             //                    Note:  Surrogate pairs don't need any special handling
    228             //                           The second half wont be a '$' or a '\', and
    229             //                           will move to the dest normally on the next
    230             //                           loop iteration.
    231             if (replIdx >= replLen) {
    232                 break;
    233             }
    234             c = replacement.charAt(replIdx);
    235 
    236             if (c==0x55/*U*/ || c==0x75/*u*/) {
    237                 // We have a \udddd or \Udddddddd escape sequence.
    238                 UChar32 escapedChar = replacement.unescapeAt(replIdx);
    239                 if (escapedChar != (UChar32)0xFFFFFFFF) {
    240                     dest.append(escapedChar);
    241                     // TODO:  Report errors for mal-formed \u escapes?
    242                     //        As this is, the original sequence is output, which may be OK.
    243                     continue;
    244                 }
    245             }
    246 
    247             // Plain backslash escape.  Just put out the escaped character.
    248             dest.append(c);
    249             replIdx++;
    250             continue;
    251         }
    252 
    253         if (c != DOLLARSIGN) {
    254             // Normal char, not a $.  Copy it out without further checks.
    255             dest.append(c);
    256             continue;
    257         }
    258 
    259         // We've got a $.  Pick up a capture group number if one follows.
    260         // Consume at most the number of digits necessary for the largest capture
    261         // number that is valid for this pattern.
    262 
    263         int32_t numDigits = 0;
    264         int32_t groupNum  = 0;
    265         UChar32 digitC;
    266         for (;;) {
    267             if (replIdx >= replLen) {
    268                 break;
    269             }
    270             digitC = replacement.char32At(replIdx);
    271             if (u_isdigit(digitC) == FALSE) {
    272                 break;
    273             }
    274             replIdx = replacement.moveIndex32(replIdx, 1);
    275             groupNum=groupNum*10 + u_charDigitValue(digitC);
    276             numDigits++;
    277             if (numDigits >= fPattern->fMaxCaptureDigits) {
    278                 break;
    279             }
    280         }
    281 
    282 
    283         if (numDigits == 0) {
    284             // The $ didn't introduce a group number at all.
    285             // Treat it as just part of the substitution text.
    286             dest.append(DOLLARSIGN);
    287             continue;
    288         }
    289 
    290         // Finally, append the capture group data to the destination.
    291         dest.append(group(groupNum, status));
    292         if (U_FAILURE(status)) {
    293             // Can fail if group number is out of range.
    294             break;
    295         }
    296 
    297     }
    298 
    299     return *this;
    300 }
    301 
    302 
    303 
    304 //--------------------------------------------------------------------------------
    305 //
    306 //    appendTail     Intended to be used in conjunction with appendReplacement()
    307 //                   To the destination string, append everything following
    308 //                   the last match position from the input string.
    309 //
    310 //                   Note:  Match ranges do not affect appendTail or appendReplacement
    311 //
    312 //--------------------------------------------------------------------------------
    313 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    314     int32_t  len = fInput->length() - fAppendPosition;
    315     if (len > 0) {
    316         dest.append(*fInput, fAppendPosition, len);
    317     }
    318     return dest;
    319 }
    320 
    321 
    322 
    323 //--------------------------------------------------------------------------------
    324 //
    325 //   end
    326 //
    327 //--------------------------------------------------------------------------------
    328 int32_t RegexMatcher::end(UErrorCode &err) const {
    329     return end(0, err);
    330 }
    331 
    332 
    333 
    334 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    335     if (U_FAILURE(err)) {
    336         return -1;
    337     }
    338     if (fMatch == FALSE) {
    339         err = U_REGEX_INVALID_STATE;
    340         return -1;
    341     }
    342     if (group < 0 || group > fPattern->fGroupMap->size()) {
    343         err = U_INDEX_OUTOFBOUNDS_ERROR;
    344         return -1;
    345     }
    346     int32_t e = -1;
    347     if (group == 0) {
    348         e = fMatchEnd;
    349     } else {
    350         // Get the position within the stack frame of the variables for
    351         //    this capture group.
    352         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
    353         U_ASSERT(groupOffset < fPattern->fFrameSize);
    354         U_ASSERT(groupOffset >= 0);
    355         e = fFrame->fExtra[groupOffset + 1];
    356     }
    357     return e;
    358 }
    359 
    360 
    361 
    362 //--------------------------------------------------------------------------------
    363 //
    364 //   find()
    365 //
    366 //--------------------------------------------------------------------------------
    367 UBool RegexMatcher::find() {
    368     // Start at the position of the last match end.  (Will be zero if the
    369     //   matcher has been reset.
    370     //
    371     if (U_FAILURE(fDeferredStatus)) {
    372         return FALSE;
    373     }
    374 
    375     int32_t startPos = fMatchEnd;
    376     if (startPos==0) {
    377         startPos = fActiveStart;
    378     }
    379 
    380     if (fMatch) {
    381         // Save the position of any previous successful match.
    382         fLastMatchEnd = fMatchEnd;
    383 
    384         if (fMatchStart == fMatchEnd) {
    385             // Previous match had zero length.  Move start position up one position
    386             //  to avoid sending find() into a loop on zero-length matches.
    387             if (startPos >= fActiveLimit) {
    388                 fMatch = FALSE;
    389                 fHitEnd = TRUE;
    390                 return FALSE;
    391             }
    392             startPos = fInput->moveIndex32(startPos, 1);
    393         }
    394     } else {
    395         if (fLastMatchEnd >= 0) {
    396             // A previous find() failed to match.  Don't try again.
    397             //   (without this test, a pattern with a zero-length match
    398             //    could match again at the end of an input string.)
    399             fHitEnd = TRUE;
    400             return FALSE;
    401         }
    402     }
    403 
    404 
    405     // Compute the position in the input string beyond which a match can not begin, because
    406     //   the minimum length match would extend past the end of the input.
    407     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    408     //          Be aware of possible overflows if making changes here.
    409     int32_t testLen  = fActiveLimit - fPattern->fMinMatchLen;
    410     if (startPos > testLen) {
    411         fMatch = FALSE;
    412         fHitEnd = TRUE;
    413         return FALSE;
    414     }
    415 
    416     const UChar *inputBuf = fInput->getBuffer();
    417     UChar32  c;
    418     U_ASSERT(startPos >= 0);
    419 
    420     switch (fPattern->fStartType) {
    421     case START_NO_INFO:
    422         // No optimization was found.
    423         //  Try a match at each input position.
    424         for (;;) {
    425             MatchAt(startPos, FALSE, fDeferredStatus);
    426             if (U_FAILURE(fDeferredStatus)) {
    427                 return FALSE;
    428             }
    429             if (fMatch) {
    430                 return TRUE;
    431             }
    432             if (startPos >= testLen) {
    433                 fHitEnd = TRUE;
    434                 return FALSE;
    435             }
    436             U16_FWD_1(inputBuf, startPos, fActiveLimit);
    437             // Note that it's perfectly OK for a pattern to have a zero-length
    438             //   match at the end of a string, so we must make sure that the loop
    439             //   runs with startPos == testLen the last time through.
    440         }
    441         U_ASSERT(FALSE);
    442 
    443     case START_START:
    444         // Matches are only possible at the start of the input string
    445         //   (pattern begins with ^ or \A)
    446         if (startPos > fActiveStart) {
    447             fMatch = FALSE;
    448             return FALSE;
    449         }
    450         MatchAt(startPos, FALSE, fDeferredStatus);
    451         if (U_FAILURE(fDeferredStatus)) {
    452             return FALSE;
    453         }
    454         return fMatch;
    455 
    456 
    457     case START_SET:
    458         {
    459             // Match may start on any char from a pre-computed set.
    460             U_ASSERT(fPattern->fMinMatchLen > 0);
    461             for (;;) {
    462                 int32_t pos = startPos;
    463                 U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    464                 if (c<256 && fPattern->fInitialChars8->contains(c) ||
    465                     c>=256 && fPattern->fInitialChars->contains(c)) {
    466                     MatchAt(pos, FALSE, fDeferredStatus);
    467                     if (U_FAILURE(fDeferredStatus)) {
    468                         return FALSE;
    469                     }
    470                     if (fMatch) {
    471                         return TRUE;
    472                     }
    473                 }
    474                 if (pos >= testLen) {
    475                     fMatch = FALSE;
    476                     fHitEnd = TRUE;
    477                     return FALSE;
    478                 }
    479             }
    480         }
    481         U_ASSERT(FALSE);
    482 
    483     case START_STRING:
    484     case START_CHAR:
    485         {
    486             // Match starts on exactly one char.
    487             U_ASSERT(fPattern->fMinMatchLen > 0);
    488             UChar32 theChar = fPattern->fInitialChar;
    489             for (;;) {
    490                 int32_t pos = startPos;
    491                 U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    492                 if (c == theChar) {
    493                     MatchAt(pos, FALSE, fDeferredStatus);
    494                     if (U_FAILURE(fDeferredStatus)) {
    495                         return FALSE;
    496                     }
    497                     if (fMatch) {
    498                         return TRUE;
    499                     }
    500                 }
    501                 if (pos >= testLen) {
    502                     fMatch = FALSE;
    503                     fHitEnd = TRUE;
    504                     return FALSE;
    505                 }
    506             }
    507         }
    508         U_ASSERT(FALSE);
    509 
    510     case START_LINE:
    511         {
    512             UChar32  c;
    513             if (startPos == fAnchorStart) {
    514                 MatchAt(startPos, FALSE, fDeferredStatus);
    515                 if (U_FAILURE(fDeferredStatus)) {
    516                     return FALSE;
    517                 }
    518                 if (fMatch) {
    519                     return TRUE;
    520                 }
    521                 U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    522             }
    523 
    524             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
    525                for (;;) {
    526                     c = inputBuf[startPos-1];
    527                     if (c == 0x0a) {
    528                             MatchAt(startPos, FALSE, fDeferredStatus);
    529                             if (U_FAILURE(fDeferredStatus)) {
    530                                 return FALSE;
    531                             }
    532                             if (fMatch) {
    533                                 return TRUE;
    534                             }
    535                     }
    536                     if (startPos >= testLen) {
    537                         fMatch = FALSE;
    538                         fHitEnd = TRUE;
    539                         return FALSE;
    540                     }
    541                     U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    542                     // Note that it's perfectly OK for a pattern to have a zero-length
    543                     //   match at the end of a string, so we must make sure that the loop
    544                     //   runs with startPos == testLen the last time through.
    545                 }
    546             } else {
    547                 for (;;) {
    548                     c = inputBuf[startPos-1];
    549                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
    550                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
    551                             if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
    552                                 startPos++;
    553                             }
    554                             MatchAt(startPos, FALSE, fDeferredStatus);
    555                             if (U_FAILURE(fDeferredStatus)) {
    556                                 return FALSE;
    557                             }
    558                             if (fMatch) {
    559                                 return TRUE;
    560                             }
    561                     }
    562                     if (startPos >= testLen) {
    563                         fMatch = FALSE;
    564                         fHitEnd = TRUE;
    565                         return FALSE;
    566                     }
    567                     U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    568                     // Note that it's perfectly OK for a pattern to have a zero-length
    569                     //   match at the end of a string, so we must make sure that the loop
    570                     //   runs with startPos == testLen the last time through.
    571                 }
    572             }
    573         }
    574 
    575     default:
    576         U_ASSERT(FALSE);
    577     }
    578 
    579     U_ASSERT(FALSE);
    580     return FALSE;
    581 }
    582 
    583 
    584 
    585 UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
    586     if (U_FAILURE(status)) {
    587         return FALSE;
    588     }
    589     if (U_FAILURE(fDeferredStatus)) {
    590         status = fDeferredStatus;
    591         return FALSE;
    592     }
    593     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
    594                                           //        This will reset the region to be the full input length.
    595     if (start < fActiveStart || start > fActiveLimit) {
    596         status = U_INDEX_OUTOFBOUNDS_ERROR;
    597         return FALSE;
    598     }
    599     fMatchEnd = start;
    600     return find();
    601 }
    602 
    603 
    604 
    605 //--------------------------------------------------------------------------------
    606 //
    607 //  group()
    608 //
    609 //--------------------------------------------------------------------------------
    610 UnicodeString RegexMatcher::group(UErrorCode &status) const {
    611     return group(0, status);
    612 }
    613 
    614 
    615 
    616 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
    617     int32_t  s = start(groupNum, status);
    618     int32_t  e = end(groupNum, status);
    619 
    620     // Note:  calling start() and end() above will do all necessary checking that
    621     //        the group number is OK and that a match exists.  status will be set.
    622     if (U_FAILURE(status)) {
    623         return UnicodeString();
    624     }
    625     if (U_FAILURE(fDeferredStatus)) {
    626         status = fDeferredStatus;
    627         return UnicodeString();
    628     }
    629 
    630     if (s < 0) {
    631         // A capture group wasn't part of the match
    632         return UnicodeString();
    633     }
    634     U_ASSERT(s <= e);
    635     return UnicodeString(*fInput, s, e-s);
    636 }
    637 
    638 
    639 
    640 
    641 int32_t RegexMatcher::groupCount() const {
    642     return fPattern->fGroupMap->size();
    643 }
    644 
    645 
    646 
    647 const UnicodeString &RegexMatcher::input() const {
    648     return *fInput;
    649 }
    650 
    651 
    652 //--------------------------------------------------------------------------------
    653 //
    654 //  hasAnchoringBounds()
    655 //
    656 //--------------------------------------------------------------------------------
    657 UBool RegexMatcher::hasAnchoringBounds() const {
    658     return fAnchoringBounds;
    659 }
    660 
    661 
    662 //--------------------------------------------------------------------------------
    663 //
    664 //  hasTransparentBounds()
    665 //
    666 //--------------------------------------------------------------------------------
    667 UBool RegexMatcher::hasTransparentBounds() const {
    668     return fTransparentBounds;
    669 }
    670 
    671 
    672 
    673 //--------------------------------------------------------------------------------
    674 //
    675 //  hitEnd()
    676 //
    677 //--------------------------------------------------------------------------------
    678 UBool RegexMatcher::hitEnd() const {
    679     return fHitEnd;
    680 }
    681 
    682 //--------------------------------------------------------------------------------
    683 //
    684 //  lookingAt()
    685 //
    686 //--------------------------------------------------------------------------------
    687 UBool RegexMatcher::lookingAt(UErrorCode &status) {
    688     if (U_FAILURE(status)) {
    689         return FALSE;
    690     }
    691     if (U_FAILURE(fDeferredStatus)) {
    692         status = fDeferredStatus;
    693         return FALSE;
    694     }
    695     resetPreserveRegion();
    696     MatchAt(fActiveStart, FALSE, status);
    697     return fMatch;
    698 }
    699 
    700 
    701 UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) {
    702     if (U_FAILURE(status)) {
    703         return FALSE;
    704     }
    705     if (U_FAILURE(fDeferredStatus)) {
    706         status = fDeferredStatus;
    707         return FALSE;
    708     }
    709     reset();
    710     if (start < fActiveStart || start > fActiveLimit) {
    711         status = U_INDEX_OUTOFBOUNDS_ERROR;
    712         return FALSE;
    713     }
    714     MatchAt(start, FALSE, status);
    715     return fMatch;
    716 }
    717 
    718 
    719 
    720 //--------------------------------------------------------------------------------
    721 //
    722 //  matches()
    723 //
    724 //--------------------------------------------------------------------------------
    725 UBool RegexMatcher::matches(UErrorCode &status) {
    726     if (U_FAILURE(status)) {
    727         return FALSE;
    728     }
    729     if (U_FAILURE(fDeferredStatus)) {
    730         status = fDeferredStatus;
    731         return FALSE;
    732     }
    733     resetPreserveRegion();
    734     MatchAt(fActiveStart, TRUE, status);
    735     return fMatch;
    736 }
    737 
    738 
    739 UBool RegexMatcher::matches(int32_t start, UErrorCode &status) {
    740     if (U_FAILURE(status)) {
    741         return FALSE;
    742     }
    743     if (U_FAILURE(fDeferredStatus)) {
    744         status = fDeferredStatus;
    745         return FALSE;
    746     }
    747     reset();
    748     if (start < fActiveStart || start > fActiveLimit) {
    749         status = U_INDEX_OUTOFBOUNDS_ERROR;
    750         return FALSE;
    751     }
    752     MatchAt(start, TRUE, status);
    753     return fMatch;
    754 }
    755 
    756 
    757 
    758 //--------------------------------------------------------------------------------
    759 //
    760 //    pattern
    761 //
    762 //--------------------------------------------------------------------------------
    763 const RegexPattern &RegexMatcher::pattern() const {
    764     return *fPattern;
    765 }
    766 
    767 
    768 
    769 //--------------------------------------------------------------------------------
    770 //
    771 //    region
    772 //
    773 //--------------------------------------------------------------------------------
    774 RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &status) {
    775     if (U_FAILURE(status)) {
    776         return *this;
    777     }
    778     if (start>limit || start<0 || limit<0 || limit>fInput->length()) {
    779         status = U_ILLEGAL_ARGUMENT_ERROR;
    780     }
    781     this->reset();
    782     fRegionStart = start;
    783     fRegionLimit = limit;
    784     fActiveStart = start;
    785     fActiveLimit = limit;
    786     if (!fTransparentBounds) {
    787         fLookStart = start;
    788         fLookLimit = limit;
    789     }
    790     if (fAnchoringBounds) {
    791         fAnchorStart = start;
    792         fAnchorLimit = limit;
    793     }
    794     return *this;
    795 }
    796 
    797 
    798 
    799 //--------------------------------------------------------------------------------
    800 //
    801 //    regionEnd
    802 //
    803 //--------------------------------------------------------------------------------
    804 int32_t RegexMatcher::regionEnd() const {
    805     return fRegionLimit;
    806 }
    807 
    808 
    809 //--------------------------------------------------------------------------------
    810 //
    811 //    regionStart
    812 //
    813 //--------------------------------------------------------------------------------
    814 int32_t RegexMatcher::regionStart() const {
    815     return fRegionStart;
    816 }
    817 
    818 
    819 //--------------------------------------------------------------------------------
    820 //
    821 //    replaceAll
    822 //
    823 //--------------------------------------------------------------------------------
    824 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
    825     if (U_FAILURE(status)) {
    826         return *fInput;
    827     }
    828     if (U_FAILURE(fDeferredStatus)) {
    829         status = fDeferredStatus;
    830         return *fInput;
    831     }
    832     UnicodeString destString;
    833     reset();
    834     while (find()) {
    835         appendReplacement(destString, replacement, status);
    836         if (U_FAILURE(status)) {
    837             break;
    838         }
    839     }
    840     appendTail(destString);
    841     return destString;
    842 }
    843 
    844 
    845 
    846 
    847 //--------------------------------------------------------------------------------
    848 //
    849 //    replaceFirst
    850 //
    851 //--------------------------------------------------------------------------------
    852 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
    853     if (U_FAILURE(status)) {
    854         return *fInput;
    855     }
    856     if (U_FAILURE(fDeferredStatus)) {
    857         status = fDeferredStatus;
    858         return *fInput;
    859     }
    860 
    861     reset();
    862     if (!find()) {
    863         return *fInput;
    864     }
    865 
    866     UnicodeString destString;
    867     appendReplacement(destString, replacement, status);
    868     appendTail(destString);
    869     return destString;
    870 }
    871 
    872 
    873 //--------------------------------------------------------------------------------
    874 //
    875 //     requireEnd
    876 //
    877 //--------------------------------------------------------------------------------
    878 UBool RegexMatcher::requireEnd() const {
    879     return fRequireEnd;
    880 }
    881 
    882 
    883 //--------------------------------------------------------------------------------
    884 //
    885 //     reset
    886 //
    887 //--------------------------------------------------------------------------------
    888 RegexMatcher &RegexMatcher::reset() {
    889     fRegionStart    = 0;
    890     fRegionLimit    = fInput->length();
    891     fActiveStart    = 0;
    892     fActiveLimit    = fRegionLimit;
    893     fAnchorStart    = 0;
    894     fAnchorLimit    = fRegionLimit;
    895     fLookStart      = 0;
    896     fLookLimit      = fRegionLimit;
    897     resetPreserveRegion();
    898     return *this;
    899 }
    900 
    901 
    902 
    903 void RegexMatcher::resetPreserveRegion() {
    904     fMatchStart     = 0;
    905     fMatchEnd       = 0;
    906     fLastMatchEnd   = -1;
    907     fAppendPosition = 0;
    908     fMatch          = FALSE;
    909     fHitEnd         = FALSE;
    910     fRequireEnd     = FALSE;
    911     fTime           = 0;
    912     fTickCounter    = TIMER_INITIAL_VALUE;
    913     resetStack();
    914 }
    915 
    916 
    917 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
    918     fInput          = &input;
    919     reset();
    920     if (fWordBreakItr != NULL) {
    921         #if UCONFIG_NO_BREAK_ITERATION==0
    922         fWordBreakItr->setText(input);
    923         #endif
    924     }
    925     return *this;
    926 }
    927 
    928 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
    929     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
    930     return *this;
    931 }*/
    932 
    933 
    934 RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
    935     if (U_FAILURE(status)) {
    936         return *this;
    937     }
    938     reset();       // Reset also resets the region to be the entire string.
    939     if (position < 0 || position >= fActiveLimit) {
    940         status = U_INDEX_OUTOFBOUNDS_ERROR;
    941         return *this;
    942     }
    943     fMatchEnd = position;
    944     return *this;
    945 }
    946 
    947 
    948 
    949 
    950 
    951 //--------------------------------------------------------------------------------
    952 //
    953 //    setTrace
    954 //
    955 //--------------------------------------------------------------------------------
    956 void RegexMatcher::setTrace(UBool state) {
    957     fTraceDebug = state;
    958 }
    959 
    960 
    961 
    962 //---------------------------------------------------------------------
    963 //
    964 //   split
    965 //
    966 //---------------------------------------------------------------------
    967 int32_t  RegexMatcher::split(const UnicodeString &input,
    968         UnicodeString    dest[],
    969         int32_t          destCapacity,
    970         UErrorCode       &status)
    971 {
    972     //
    973     // Check arguements for validity
    974     //
    975     if (U_FAILURE(status)) {
    976         return 0;
    977     };
    978 
    979     if (destCapacity < 1) {
    980         status = U_ILLEGAL_ARGUMENT_ERROR;
    981         return 0;
    982     }
    983 
    984     //
    985     // Reset for the input text
    986     //
    987     reset(input);
    988     int32_t   nextOutputStringStart = 0;
    989     if (fActiveLimit == 0) {
    990         return 0;
    991     }
    992 
    993     //
    994     // Loop through the input text, searching for the delimiter pattern
    995     //
    996     int32_t i;
    997     int32_t numCaptureGroups = fPattern->fGroupMap->size();
    998     for (i=0; ; i++) {
    999         if (i>=destCapacity-1) {
   1000             // There is one or zero output string left.
   1001             // Fill the last output string with whatever is left from the input, then exit the loop.
   1002             //  ( i will be == destCapicity if we filled the output array while processing
   1003             //    capture groups of the delimiter expression, in which case we will discard the
   1004             //    last capture group saved in favor of the unprocessed remainder of the
   1005             //    input string.)
   1006             i = destCapacity-1;
   1007             int32_t remainingLength = fActiveLimit-nextOutputStringStart;
   1008             if (remainingLength > 0) {
   1009                 dest[i].setTo(input, nextOutputStringStart, remainingLength);
   1010             }
   1011             break;
   1012         }
   1013         if (find()) {
   1014             // We found another delimiter.  Move everything from where we started looking
   1015             //  up until the start of the delimiter into the next output string.
   1016             int32_t fieldLen = fMatchStart - nextOutputStringStart;
   1017             dest[i].setTo(input, nextOutputStringStart, fieldLen);
   1018             nextOutputStringStart = fMatchEnd;
   1019 
   1020             // If the delimiter pattern has capturing parentheses, the captured
   1021             //  text goes out into the next n destination strings.
   1022             int32_t groupNum;
   1023             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   1024                 if (i==destCapacity-1) {
   1025                     break;
   1026                 }
   1027                 i++;
   1028                 dest[i] = group(groupNum, status);
   1029             }
   1030 
   1031             if (nextOutputStringStart == fActiveLimit) {
   1032                 // The delimiter was at the end of the string.  We're done.
   1033                 break;
   1034             }
   1035 
   1036         }
   1037         else
   1038         {
   1039             // We ran off the end of the input while looking for the next delimiter.
   1040             // All the remaining text goes into the current output string.
   1041             dest[i].setTo(input, nextOutputStringStart, fActiveLimit-nextOutputStringStart);
   1042             break;
   1043         }
   1044     }
   1045     return i+1;
   1046 }
   1047 
   1048 
   1049 
   1050 //--------------------------------------------------------------------------------
   1051 //
   1052 //     start
   1053 //
   1054 //--------------------------------------------------------------------------------
   1055 int32_t RegexMatcher::start(UErrorCode &status) const {
   1056     return start(0, status);
   1057 }
   1058 
   1059 
   1060 
   1061 
   1062 //--------------------------------------------------------------------------------
   1063 //
   1064 //     start(int32_t group, UErrorCode &status)
   1065 //
   1066 //--------------------------------------------------------------------------------
   1067 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
   1068     if (U_FAILURE(status)) {
   1069         return -1;
   1070     }
   1071     if (U_FAILURE(fDeferredStatus)) {
   1072         status = fDeferredStatus;
   1073         return -1;
   1074     }
   1075     if (fMatch == FALSE) {
   1076         status = U_REGEX_INVALID_STATE;
   1077         return -1;
   1078     }
   1079     if (group < 0 || group > fPattern->fGroupMap->size()) {
   1080         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1081         return -1;
   1082     }
   1083     int32_t s;
   1084     if (group == 0) {
   1085         s = fMatchStart;
   1086     } else {
   1087         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   1088         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1089         U_ASSERT(groupOffset >= 0);
   1090         s = fFrame->fExtra[groupOffset];
   1091     }
   1092     return s;
   1093 }
   1094 
   1095 
   1096 
   1097 //--------------------------------------------------------------------------------
   1098 //
   1099 //     useAnchoringBounds
   1100 //
   1101 //--------------------------------------------------------------------------------
   1102 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
   1103     fAnchoringBounds = b;
   1104     UErrorCode status = U_ZERO_ERROR;
   1105     region(fRegionStart, fRegionLimit, status);
   1106     U_ASSERT(U_SUCCESS(status));
   1107     return *this;
   1108 }
   1109 
   1110 
   1111 //--------------------------------------------------------------------------------
   1112 //
   1113 //     useTransparentBounds
   1114 //
   1115 //--------------------------------------------------------------------------------
   1116 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
   1117     fTransparentBounds = b;
   1118     UErrorCode status = U_ZERO_ERROR;
   1119     region(fRegionStart, fRegionLimit, status);
   1120     U_ASSERT(U_SUCCESS(status));
   1121     return *this;
   1122 }
   1123 
   1124 //--------------------------------------------------------------------------------
   1125 //
   1126 //     setTimeLimit
   1127 //
   1128 //--------------------------------------------------------------------------------
   1129 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
   1130     if (U_FAILURE(status)) {
   1131         return;
   1132     }
   1133     if (U_FAILURE(fDeferredStatus)) {
   1134         status = fDeferredStatus;
   1135         return;
   1136     }
   1137     if (limit < 0) {
   1138         status = U_ILLEGAL_ARGUMENT_ERROR;
   1139         return;
   1140     }
   1141     fTimeLimit = limit;
   1142 }
   1143 
   1144 
   1145 //--------------------------------------------------------------------------------
   1146 //
   1147 //     getTimeLimit
   1148 //
   1149 //--------------------------------------------------------------------------------
   1150 int32_t RegexMatcher::getTimeLimit() const {
   1151     return fTimeLimit;
   1152 }
   1153 
   1154 
   1155 //--------------------------------------------------------------------------------
   1156 //
   1157 //     setStackLimit
   1158 //
   1159 //--------------------------------------------------------------------------------
   1160 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
   1161     if (U_FAILURE(status)) {
   1162         return;
   1163     }
   1164     if (U_FAILURE(fDeferredStatus)) {
   1165         status = fDeferredStatus;
   1166         return;
   1167     }
   1168     if (limit < 0) {
   1169         status = U_ILLEGAL_ARGUMENT_ERROR;
   1170         return;
   1171     }
   1172 
   1173     // Reset the matcher.  This is needed here in case there is a current match
   1174     //    whose final stack frame (containing the match results, pointed to by fFrame)
   1175     //    would be lost by resizing to a smaller stack size.
   1176     reset();
   1177 
   1178     if (limit == 0) {
   1179         // Unlimited stack expansion
   1180         fStack->setMaxCapacity(0);
   1181     } else {
   1182         // Change the units of the limit  from bytes to ints, and bump the size up
   1183         //   to be big enough to hold at least one stack frame for the pattern,
   1184         //   if it isn't there already.
   1185         int32_t adjustedLimit = limit / sizeof(int32_t);
   1186         if (adjustedLimit < fPattern->fFrameSize) {
   1187             adjustedLimit = fPattern->fFrameSize;
   1188         }
   1189         fStack->setMaxCapacity(adjustedLimit);
   1190     }
   1191     fStackLimit = limit;
   1192 }
   1193 
   1194 
   1195 //--------------------------------------------------------------------------------
   1196 //
   1197 //     getStackLimit
   1198 //
   1199 //--------------------------------------------------------------------------------
   1200 int32_t RegexMatcher::getStackLimit() const {
   1201     return fStackLimit;
   1202 }
   1203 
   1204 
   1205 //--------------------------------------------------------------------------------
   1206 //
   1207 //     setMatchCallback
   1208 //
   1209 //--------------------------------------------------------------------------------
   1210 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
   1211                                     const void              *context,
   1212                                     UErrorCode              &status) {
   1213      if (U_FAILURE(status)) {
   1214          return;
   1215      }
   1216      fCallbackFn = callback;
   1217      fCallbackContext = context;
   1218 }
   1219 
   1220 
   1221 //--------------------------------------------------------------------------------
   1222 //
   1223 //     getMatchCallback
   1224 //
   1225 //--------------------------------------------------------------------------------
   1226 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
   1227                                   const void              *&context,
   1228                                   UErrorCode              &status) {
   1229     if (U_FAILURE(status)) {
   1230        return;
   1231     }
   1232     callback = fCallbackFn;
   1233     context  = fCallbackContext;
   1234 }
   1235 
   1236 
   1237 //================================================================================
   1238 //
   1239 //    Code following this point in this file is the internal
   1240 //    Match Engine Implementation.
   1241 //
   1242 //================================================================================
   1243 
   1244 
   1245 //--------------------------------------------------------------------------------
   1246 //
   1247 //   resetStack
   1248 //           Discard any previous contents of the state save stack, and initialize a
   1249 //           new stack frame to all -1.  The -1s are needed for capture group limits,
   1250 //           where they indicate that a group has not yet matched anything.
   1251 //--------------------------------------------------------------------------------
   1252 REStackFrame *RegexMatcher::resetStack() {
   1253     // Discard any previous contents of the state save stack, and initialize a
   1254     //  new stack frame to all -1.  The -1s are needed for capture group limits, where
   1255     //  they indicate that a group has not yet matched anything.
   1256     fStack->removeAllElements();
   1257 
   1258     int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
   1259     int32_t i;
   1260     for (i=0; i<fPattern->fFrameSize; i++) {
   1261         iFrame[i] = -1;
   1262     }
   1263     return (REStackFrame *)iFrame;
   1264 }
   1265 
   1266 
   1267 
   1268 //--------------------------------------------------------------------------------
   1269 //
   1270 //   isWordBoundary
   1271 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
   1272 //                     For us,
   1273 //                       If the current char is a combining mark,
   1274 //                          \b is FALSE.
   1275 //                       Else Scan backwards to the first non-combining char.
   1276 //                            We are at a boundary if the this char and the original chars are
   1277 //                               opposite in membership in \w set
   1278 //
   1279 //          parameters:   pos   - the current position in the input buffer
   1280 //
   1281 //              TODO:  double-check edge cases at region boundaries.
   1282 //
   1283 //--------------------------------------------------------------------------------
   1284 UBool RegexMatcher::isWordBoundary(int32_t pos) {
   1285     UBool isBoundary = FALSE;
   1286     UBool cIsWord    = FALSE;
   1287 
   1288     if (pos >= fLookLimit) {
   1289         fHitEnd = TRUE;
   1290     } else {
   1291         // Determine whether char c at current position is a member of the word set of chars.
   1292         // If we're off the end of the string, behave as though we're not at a word char.
   1293         UChar32  c = fInput->char32At(pos);
   1294         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   1295             // Current char is a combining one.  Not a boundary.
   1296             return FALSE;
   1297         }
   1298         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   1299     }
   1300 
   1301     // Back up until we come to a non-combining char, determine whether
   1302     //  that char is a word char.
   1303     UBool prevCIsWord = FALSE;
   1304     int32_t prevPos = pos;
   1305     for (;;) {
   1306         if (prevPos <= fLookStart) {
   1307             break;
   1308         }
   1309         prevPos = fInput->moveIndex32(prevPos, -1);
   1310         UChar32 prevChar = fInput->char32At(prevPos);
   1311         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   1312               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   1313             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   1314             break;
   1315         }
   1316     }
   1317     isBoundary = cIsWord ^ prevCIsWord;
   1318     return isBoundary;
   1319 }
   1320 
   1321 //--------------------------------------------------------------------------------
   1322 //
   1323 //   isUWordBoundary
   1324 //
   1325 //         Test for a word boundary using RBBI word break.
   1326 //
   1327 //          parameters:   pos   - the current position in the input buffer
   1328 //
   1329 //--------------------------------------------------------------------------------
   1330 UBool RegexMatcher::isUWordBoundary(int32_t pos) {
   1331     UBool       returnVal = FALSE;
   1332 #if UCONFIG_NO_BREAK_ITERATION==0
   1333 
   1334     // If we haven't yet created a break iterator for this matcher, do it now.
   1335     if (fWordBreakItr == NULL) {
   1336         fWordBreakItr =
   1337             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
   1338         if (U_FAILURE(fDeferredStatus)) {
   1339             return FALSE;
   1340         }
   1341         fWordBreakItr->setText(*fInput);
   1342     }
   1343 
   1344     if (pos >= fLookLimit) {
   1345         fHitEnd = TRUE;
   1346         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
   1347                             //    words are not boundaries.  All non-word chars stand by themselves,
   1348                             //    with word boundaries on both sides.
   1349     } else {
   1350         returnVal = fWordBreakItr->isBoundary(pos);
   1351     }
   1352 #endif
   1353     return   returnVal;
   1354 }
   1355 
   1356 //--------------------------------------------------------------------------------
   1357 //
   1358 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
   1359 //                     saves. Increment the "time" counter, and call the
   1360 //                     user callback function if there is one installed.
   1361 //
   1362 //                     If the match operation needs to be aborted, either for a time-out
   1363 //                     or because the user callback asked for it, just set an error status.
   1364 //                     The engine will pick that up and stop in its outer loop.
   1365 //
   1366 //--------------------------------------------------------------------------------
   1367 void RegexMatcher::IncrementTime(UErrorCode &status) {
   1368     fTickCounter = TIMER_INITIAL_VALUE;
   1369     fTime++;
   1370     if (fCallbackFn != NULL) {
   1371         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
   1372             status = U_REGEX_STOPPED_BY_CALLER;
   1373             return;
   1374         }
   1375     }
   1376     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
   1377         status = U_REGEX_TIME_OUT;
   1378     }
   1379 }
   1380 
   1381 //--------------------------------------------------------------------------------
   1382 //
   1383 //   StateSave
   1384 //       Make a new stack frame, initialized as a copy of the current stack frame.
   1385 //       Set the pattern index in the original stack frame from the operand value
   1386 //       in the opcode.  Execution of the engine continues with the state in
   1387 //       the newly created stack frame
   1388 //
   1389 //       Note that reserveBlock() may grow the stack, resulting in the
   1390 //       whole thing being relocated in memory.
   1391 //
   1392 //    Parameters:
   1393 //       fp           The top frame pointer when called.  At return, a new
   1394 //                    fame will be present
   1395 //       savePatIdx   An index into the compiled pattern.  Goes into the original
   1396 //                    (not new) frame.  If execution ever back-tracks out of the
   1397 //                    new frame, this will be where we continue from in the pattern.
   1398 //    Return
   1399 //                    The new frame pointer.
   1400 //
   1401 //--------------------------------------------------------------------------------
   1402 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status) {
   1403     // push storage for a new frame.
   1404     int32_t *newFP = fStack->reserveBlock(fFrameSize, status);
   1405     if (newFP == NULL) {
   1406         // Failure on attempted stack expansion.
   1407         //   Stack function set some other error code, change it to a more
   1408         //   specific one for regular expressions.
   1409         status = U_REGEX_STACK_OVERFLOW;
   1410         // We need to return a writable stack frame, so just return the
   1411         //    previous frame.  The match operation will stop quickly
   1412         //    because of the error status, after which the frame will never
   1413         //    be looked at again.
   1414         return fp;
   1415     }
   1416     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
   1417 
   1418     // New stack frame = copy of old top frame.
   1419     int32_t *source = (int32_t *)fp;
   1420     int32_t *dest   = newFP;
   1421     for (;;) {
   1422         *dest++ = *source++;
   1423         if (source == newFP) {
   1424             break;
   1425         }
   1426     }
   1427 
   1428     fTickCounter--;
   1429     if (fTickCounter <= 0) {
   1430        IncrementTime(status);    // Re-initializes fTickCounter
   1431     }
   1432     fp->fPatIdx = savePatIdx;
   1433     return (REStackFrame *)newFP;
   1434 }
   1435 
   1436 
   1437 //--------------------------------------------------------------------------------
   1438 //
   1439 //   MatchAt      This is the actual matching engine.
   1440 //
   1441 //                  startIdx:    begin matching a this index.
   1442 //                  toEnd:       if true, match must extend to end of the input region
   1443 //
   1444 //--------------------------------------------------------------------------------
   1445 void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
   1446     UBool       isMatch  = FALSE;      // True if the we have a match.
   1447 
   1448     int32_t     op;                    // Operation from the compiled pattern, split into
   1449     int32_t     opType;                //    the opcode
   1450     int32_t     opValue;               //    and the operand value.
   1451 
   1452     #ifdef REGEX_RUN_DEBUG
   1453     if (fTraceDebug)
   1454     {
   1455         printf("MatchAt(startIdx=%d)\n", startIdx);
   1456         printf("Original Pattern: ");
   1457         int32_t i;
   1458         for (i=0; i<fPattern->fPattern.length(); i++) {
   1459             printf("%c", fPattern->fPattern.charAt(i));
   1460         }
   1461         printf("\n");
   1462         printf("Input String: ");
   1463         for (i=0; i<fInput->length(); i++) {
   1464             UChar c = fInput->charAt(i);
   1465             if (c<32 || c>256) {
   1466                 c = '.';
   1467             }
   1468             printf("%c", c);
   1469         }
   1470         printf("\n");
   1471         printf("\n");
   1472     }
   1473     #endif
   1474 
   1475     if (U_FAILURE(status)) {
   1476         return;
   1477     }
   1478 
   1479     //  Cache frequently referenced items from the compiled pattern
   1480     //
   1481     int32_t             *pat           = fPattern->fCompiledPat->getBuffer();
   1482 
   1483     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   1484     UVector             *sets          = fPattern->fSets;
   1485 
   1486     const UChar         *inputBuf      = fInput->getBuffer();
   1487 
   1488     fFrameSize = fPattern->fFrameSize;
   1489     REStackFrame        *fp            = resetStack();
   1490 
   1491     fp->fPatIdx   = 0;
   1492     fp->fInputIdx = startIdx;
   1493 
   1494     // Zero out the pattern's static data
   1495     int32_t i;
   1496     for (i = 0; i<fPattern->fDataSize; i++) {
   1497         fData[i] = 0;
   1498     }
   1499 
   1500     //
   1501     //  Main loop for interpreting the compiled pattern.
   1502     //  One iteration of the loop per pattern operation performed.
   1503     //
   1504     for (;;) {
   1505 #if 0
   1506         if (_heapchk() != _HEAPOK) {
   1507             fprintf(stderr, "Heap Trouble\n");
   1508         }
   1509 #endif
   1510         op      = pat[fp->fPatIdx];
   1511         opType  = URX_TYPE(op);
   1512         opValue = URX_VAL(op);
   1513         #ifdef REGEX_RUN_DEBUG
   1514         if (fTraceDebug) {
   1515             printf("inputIdx=%d   inputChar=%c   sp=%3d  ", fp->fInputIdx,
   1516                 fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer());
   1517             fPattern->dumpOp(fp->fPatIdx);
   1518         }
   1519         #endif
   1520         fp->fPatIdx++;
   1521 
   1522         switch (opType) {
   1523 
   1524 
   1525         case URX_NOP:
   1526             break;
   1527 
   1528 
   1529         case URX_BACKTRACK:
   1530             // Force a backtrack.  In some circumstances, the pattern compiler
   1531             //   will notice that the pattern can't possibly match anything, and will
   1532             //   emit one of these at that point.
   1533             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1534             break;
   1535 
   1536 
   1537         case URX_ONECHAR:
   1538             if (fp->fInputIdx < fActiveLimit) {
   1539                 UChar32   c;
   1540                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   1541                 if (c == opValue) {
   1542                     break;
   1543                 }
   1544             } else {
   1545                 fHitEnd = TRUE;
   1546             }
   1547             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1548             break;
   1549 
   1550 
   1551         case URX_STRING:
   1552             {
   1553                 // Test input against a literal string.
   1554                 // Strings require two slots in the compiled pattern, one for the
   1555                 //   offset to the string text, and one for the length.
   1556                 int32_t   stringStartIdx = opValue;
   1557                 int32_t   stringLen;
   1558 
   1559                 op      = pat[fp->fPatIdx];     // Fetch the second operand
   1560                 fp->fPatIdx++;
   1561                 opType    = URX_TYPE(op);
   1562                 stringLen = URX_VAL(op);
   1563                 U_ASSERT(opType == URX_STRING_LEN);
   1564                 U_ASSERT(stringLen >= 2);
   1565 
   1566                 if (fp->fInputIdx + stringLen > fActiveLimit) {
   1567                     // No match.  String is longer than the remaining input text.
   1568                     fHitEnd = TRUE;          //   TODO:  See ticket 6074
   1569                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1570                     break;
   1571                 }
   1572 
   1573                 const UChar * pInp = inputBuf + fp->fInputIdx;
   1574                 const UChar * pPat = litText+stringStartIdx;
   1575                 const UChar * pEnd = pInp + stringLen;
   1576                 for(;;) {
   1577                     if (*pInp == *pPat) {
   1578                         pInp++;
   1579                         pPat++;
   1580                         if (pInp == pEnd) {
   1581                             // Successful Match.
   1582                             fp->fInputIdx += stringLen;
   1583                             break;
   1584                         }
   1585                     } else {
   1586                         // Match failed.
   1587                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1588                         break;
   1589                     }
   1590                 }
   1591             }
   1592             break;
   1593 
   1594 
   1595 
   1596         case URX_STATE_SAVE:
   1597             fp = StateSave(fp, opValue, status);
   1598             break;
   1599 
   1600 
   1601         case URX_END:
   1602             // The match loop will exit via this path on a successful match,
   1603             //   when we reach the end of the pattern.
   1604             if (toEnd && fp->fInputIdx != fActiveLimit) {
   1605                 // The pattern matched, but not to the end of input.  Try some more.
   1606                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1607                 break;
   1608             }
   1609             isMatch = TRUE;
   1610             goto  breakFromLoop;
   1611 
   1612         // Start and End Capture stack frame variables are layout out like this:
   1613             //  fp->fExtra[opValue]  - The start of a completed capture group
   1614             //             opValue+1 - The end   of a completed capture group
   1615             //             opValue+2 - the start of a capture group whose end
   1616             //                          has not yet been reached (and might not ever be).
   1617         case URX_START_CAPTURE:
   1618             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   1619             fp->fExtra[opValue+2] = fp->fInputIdx;
   1620             break;
   1621 
   1622 
   1623         case URX_END_CAPTURE:
   1624             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   1625             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   1626             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   1627             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   1628             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   1629             break;
   1630 
   1631 
   1632         case URX_DOLLAR:                   //  $, test for End of line
   1633                                            //     or for position before new line at end of input
   1634             if (fp->fInputIdx < fAnchorLimit-2) {
   1635                 // We are no where near the end of input.  Fail.
   1636                 //   This is the common case.  Keep it first.
   1637                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1638                 break;
   1639             }
   1640             if (fp->fInputIdx >= fAnchorLimit) {
   1641                 // We really are at the end of input.  Success.
   1642                 fHitEnd = TRUE;
   1643                 fRequireEnd = TRUE;
   1644                 break;
   1645             }
   1646             // If we are positioned just before a new-line that is located at the
   1647             //   end of input, succeed.
   1648             if (fp->fInputIdx == fAnchorLimit-1) {
   1649                 UChar32 c = fInput->char32At(fp->fInputIdx);
   1650                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   1651                     // If not in the middle of a CR/LF sequence
   1652                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   1653                         // At new-line at end of input. Success
   1654                         fHitEnd = TRUE;
   1655                         fRequireEnd = TRUE;
   1656                         break;
   1657                     }
   1658                 }
   1659             }
   1660 
   1661             if (fp->fInputIdx == fAnchorLimit-2 &&
   1662                  fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) {
   1663                     fHitEnd = TRUE;
   1664                     fRequireEnd = TRUE;
   1665                     break;                         // At CR/LF at end of input.  Success
   1666             }
   1667 
   1668             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1669 
   1670             break;
   1671 
   1672 
   1673          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   1674             if (fp->fInputIdx >= fAnchorLimit-1) {
   1675                 // Either at the last character of input, or off the end.
   1676                 if (fp->fInputIdx == fAnchorLimit-1) {
   1677                     // At last char of input.  Success if it's a new line.
   1678                     if (fInput->char32At(fp->fInputIdx) == 0x0a) {
   1679                         fHitEnd = TRUE;
   1680                         fRequireEnd = TRUE;
   1681                         break;
   1682                     }
   1683                 } else {
   1684                     // Off the end of input.  Success.
   1685                     fHitEnd = TRUE;
   1686                     fRequireEnd = TRUE;
   1687                     break;
   1688                 }
   1689             }
   1690 
   1691             // Not at end of input.  Back-track out.
   1692             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1693             break;
   1694 
   1695 
   1696          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   1697              {
   1698                  if (fp->fInputIdx >= fAnchorLimit) {
   1699                      // We really are at the end of input.  Success.
   1700                      fHitEnd = TRUE;
   1701                      fRequireEnd = TRUE;
   1702                      break;
   1703                  }
   1704                  // If we are positioned just before a new-line, succeed.
   1705                  // It makes no difference where the new-line is within the input.
   1706                  UChar32 c = inputBuf[fp->fInputIdx];
   1707                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   1708                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   1709                      //  In multi-line mode, hitting a new-line just before the end of input does not
   1710                      //   set the hitEnd or requireEnd flags
   1711                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   1712                         break;
   1713                      }
   1714                  }
   1715                  // not at a new line.  Fail.
   1716                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1717              }
   1718              break;
   1719 
   1720 
   1721          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   1722              {
   1723                  if (fp->fInputIdx >= fAnchorLimit) {
   1724                      // We really are at the end of input.  Success.
   1725                      fHitEnd = TRUE;
   1726                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   1727                      break;               //   adding a new-line would not lose the match.
   1728                  }
   1729                  // If we are not positioned just before a new-line, the test fails; backtrack out.
   1730                  // It makes no difference where the new-line is within the input.
   1731                  if (inputBuf[fp->fInputIdx] != 0x0a) {
   1732                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1733                  }
   1734              }
   1735              break;
   1736 
   1737 
   1738        case URX_CARET:                    //  ^, test for start of line
   1739             if (fp->fInputIdx != fAnchorStart) {
   1740                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1741             }
   1742             break;
   1743 
   1744 
   1745        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   1746            {
   1747                if (fp->fInputIdx == fAnchorStart) {
   1748                    // We are at the start input.  Success.
   1749                    break;
   1750                }
   1751                // Check whether character just before the current pos is a new-line
   1752                //   unless we are at the end of input
   1753                UChar  c = inputBuf[fp->fInputIdx - 1];
   1754                if ((fp->fInputIdx < fAnchorLimit) &&
   1755                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   1756                    //  It's a new-line.  ^ is true.  Success.
   1757                    //  TODO:  what should be done with positions between a CR and LF?
   1758                    break;
   1759                }
   1760                // Not at the start of a line.  Fail.
   1761                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1762            }
   1763            break;
   1764 
   1765 
   1766        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   1767            {
   1768                U_ASSERT(fp->fInputIdx >= fAnchorStart);
   1769                if (fp->fInputIdx <= fAnchorStart) {
   1770                    // We are at the start input.  Success.
   1771                    break;
   1772                }
   1773                // Check whether character just before the current pos is a new-line
   1774                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   1775                UChar  c = inputBuf[fp->fInputIdx - 1];
   1776                if (c != 0x0a) {
   1777                    // Not at the start of a line.  Back-track out.
   1778                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1779                }
   1780            }
   1781            break;
   1782 
   1783         case URX_BACKSLASH_B:          // Test for word boundaries
   1784             {
   1785                 UBool success = isWordBoundary(fp->fInputIdx);
   1786                 success ^= (opValue != 0);     // flip sense for \B
   1787                 if (!success) {
   1788                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1789                 }
   1790             }
   1791             break;
   1792 
   1793 
   1794         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   1795             {
   1796                 UBool success = isUWordBoundary(fp->fInputIdx);
   1797                 success ^= (opValue != 0);     // flip sense for \B
   1798                 if (!success) {
   1799                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1800                 }
   1801             }
   1802             break;
   1803 
   1804 
   1805         case URX_BACKSLASH_D:            // Test for decimal digit
   1806             {
   1807                 if (fp->fInputIdx >= fActiveLimit) {
   1808                     fHitEnd = TRUE;
   1809                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1810                     break;
   1811                 }
   1812 
   1813                 UChar32 c = fInput->char32At(fp->fInputIdx);
   1814                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   1815                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   1816                 success ^= (opValue != 0);        // flip sense for \D
   1817                 if (success) {
   1818                     fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
   1819                 } else {
   1820                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1821                 }
   1822             }
   1823             break;
   1824 
   1825 
   1826         case URX_BACKSLASH_G:          // Test for position at end of previous match
   1827             if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) {
   1828                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1829             }
   1830             break;
   1831 
   1832 
   1833         case URX_BACKSLASH_X:
   1834             //  Match a Grapheme, as defined by Unicode TR 29.
   1835             //  Differs slightly from Perl, which consumes combining marks independently
   1836             //    of context.
   1837             {
   1838 
   1839                 // Fail if at end of input
   1840                 if (fp->fInputIdx >= fActiveLimit) {
   1841                     fHitEnd = TRUE;
   1842                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1843                     break;
   1844                 }
   1845 
   1846                 // Examine (and consume) the current char.
   1847                 //   Dispatch into a little state machine, based on the char.
   1848                 UChar32  c;
   1849                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   1850                 UnicodeSet **sets = fPattern->fStaticSets;
   1851                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   1852                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   1853                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   1854                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   1855                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   1856                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   1857                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   1858                 goto GC_Extend;
   1859 
   1860 
   1861 
   1862 GC_L:
   1863                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   1864                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   1865                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   1866                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   1867                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   1868                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   1869                 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   1870                 goto GC_Extend;
   1871 
   1872 GC_V:
   1873                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   1874                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   1875                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   1876                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   1877                 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   1878                 goto GC_Extend;
   1879 
   1880 GC_T:
   1881                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   1882                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   1883                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   1884                 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   1885                 goto GC_Extend;
   1886 
   1887 GC_Extend:
   1888                 // Combining characters are consumed here
   1889                 for (;;) {
   1890                     if (fp->fInputIdx >= fActiveLimit) {
   1891                         break;
   1892                     }
   1893                     U16_GET(inputBuf, 0, fp->fInputIdx, fActiveLimit, c);
   1894                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   1895                         break;
   1896                     }
   1897                     U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
   1898                 }
   1899                 goto GC_Done;
   1900 
   1901 GC_Control:
   1902                 // Most control chars stand alone (don't combine with combining chars),
   1903                 //   except for that CR/LF sequence is a single grapheme cluster.
   1904                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
   1905                     fp->fInputIdx++;
   1906                 }
   1907 
   1908 GC_Done:
   1909                 if (fp->fInputIdx >= fActiveLimit) {
   1910                     fHitEnd = TRUE;
   1911                 }
   1912                 break;
   1913             }
   1914 
   1915 
   1916 
   1917 
   1918         case URX_BACKSLASH_Z:          // Test for end of Input
   1919             if (fp->fInputIdx < fAnchorLimit) {
   1920                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1921             } else {
   1922                 fHitEnd = TRUE;
   1923                 fRequireEnd = TRUE;
   1924             }
   1925             break;
   1926 
   1927 
   1928 
   1929         case URX_STATIC_SETREF:
   1930             {
   1931                 // Test input character against one of the predefined sets
   1932                 //    (Word Characters, for example)
   1933                 // The high bit of the op value is a flag for the match polarity.
   1934                 //    0:   success if input char is in set.
   1935                 //    1:   success if input char is not in set.
   1936                 if (fp->fInputIdx >= fActiveLimit) {
   1937                     fHitEnd = TRUE;
   1938                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1939                     break;
   1940                 }
   1941 
   1942                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   1943                 opValue &= ~URX_NEG_SET;
   1944                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   1945                 UChar32  c;
   1946                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   1947                 if (c < 256) {
   1948                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   1949                     if (s8->contains(c)) {
   1950                         success = !success;
   1951                     }
   1952                 } else {
   1953                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   1954                     if (s->contains(c)) {
   1955                         success = !success;
   1956                     }
   1957                 }
   1958                 if (!success) {
   1959                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1960                 }
   1961             }
   1962             break;
   1963 
   1964 
   1965         case URX_STAT_SETREF_N:
   1966             {
   1967                 // Test input character for NOT being a member of  one of
   1968                 //    the predefined sets (Word Characters, for example)
   1969                 if (fp->fInputIdx >= fActiveLimit) {
   1970                     fHitEnd = TRUE;
   1971                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1972                     break;
   1973                 }
   1974 
   1975                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   1976                 UChar32  c;
   1977                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   1978                 if (c < 256) {
   1979                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   1980                     if (s8->contains(c) == FALSE) {
   1981                         break;
   1982                     }
   1983                 } else {
   1984                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   1985                     if (s->contains(c) == FALSE) {
   1986                         break;
   1987                     }
   1988                 }
   1989 
   1990                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1991             }
   1992             break;
   1993 
   1994 
   1995         case URX_SETREF:
   1996             if (fp->fInputIdx >= fActiveLimit) {
   1997                 fHitEnd = TRUE;
   1998                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   1999                 break;
   2000             }
   2001             // There is input left.  Pick up one char and test it for set membership.
   2002             UChar32   c;
   2003             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   2004             U_ASSERT(opValue > 0 && opValue < sets->size());
   2005             if (c<256) {
   2006                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   2007                 if (s8->contains(c)) {
   2008                     break;
   2009                 }
   2010             } else {
   2011                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   2012                 if (s->contains(c)) {
   2013                     // The character is in the set.  A Match.
   2014                     break;
   2015                 }
   2016             }
   2017             // the character wasn't in the set. Back track out.
   2018             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2019             break;
   2020 
   2021 
   2022         case URX_DOTANY:
   2023             {
   2024                 // . matches anything, but stops at end-of-line.
   2025                 if (fp->fInputIdx >= fActiveLimit) {
   2026                     // At end of input.  Match failed.  Backtrack out.
   2027                     fHitEnd = TRUE;
   2028                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2029                     break;
   2030                 }
   2031                 // There is input left.  Advance over one char, unless we've hit end-of-line
   2032                 UChar32 c;
   2033                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   2034                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   2035                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   2036                     // End of line in normal mode.   . does not match.
   2037                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2038                     break;
   2039                 }
   2040             }
   2041             break;
   2042 
   2043 
   2044         case URX_DOTANY_ALL:
   2045             {
   2046                 // ., in dot-matches-all (including new lines) mode
   2047                 if (fp->fInputIdx >= fActiveLimit) {
   2048                     // At end of input.  Match failed.  Backtrack out.
   2049                     fHitEnd = TRUE;
   2050                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2051                     break;
   2052                 }
   2053                 // There is input left.  Advance over one char, except if we are
   2054                 //   at a cr/lf, advance over both of them.
   2055                 UChar32 c;
   2056                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   2057                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   2058                     // In the case of a CR/LF, we need to advance over both.
   2059                     UChar nextc = inputBuf[fp->fInputIdx];
   2060                     if (nextc == 0x0a) {
   2061                         fp->fInputIdx++;
   2062                     }
   2063                 }
   2064             }
   2065             break;
   2066 
   2067 
   2068         case URX_DOTANY_UNIX:
   2069             {
   2070                 // '.' operator, matches all, but stops at end-of-line.
   2071                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   2072                 if (fp->fInputIdx >= fActiveLimit) {
   2073                     // At end of input.  Match failed.  Backtrack out.
   2074                     fHitEnd = TRUE;
   2075                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2076                     break;
   2077                 }
   2078                 // There is input left.  Advance over one char, unless we've hit end-of-line
   2079                 UChar32 c;
   2080                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   2081                 if (c == 0x0a) {
   2082                     // End of line in normal mode.   '.' does not match the \n
   2083                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2084                 }
   2085             }
   2086             break;
   2087 
   2088 
   2089         case URX_JMP:
   2090             fp->fPatIdx = opValue;
   2091             break;
   2092 
   2093         case URX_FAIL:
   2094             isMatch = FALSE;
   2095             goto breakFromLoop;
   2096 
   2097         case URX_JMP_SAV:
   2098             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   2099             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   2100             fp->fPatIdx = opValue;                         // Then JMP.
   2101             break;
   2102 
   2103         case URX_JMP_SAV_X:
   2104             // This opcode is used with (x)+, when x can match a zero length string.
   2105             // Same as JMP_SAV, except conditional on the match having made forward progress.
   2106             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   2107             //   data address of the input position at the start of the loop.
   2108             {
   2109                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   2110                 int32_t  stoOp = pat[opValue-1];
   2111                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   2112                 int32_t  frameLoc = URX_VAL(stoOp);
   2113                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   2114                 int32_t prevInputIdx = fp->fExtra[frameLoc];
   2115                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   2116                 if (prevInputIdx < fp->fInputIdx) {
   2117                     // The match did make progress.  Repeat the loop.
   2118                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   2119                     fp->fPatIdx = opValue;
   2120                     fp->fExtra[frameLoc] = fp->fInputIdx;
   2121                 }
   2122                 // If the input position did not advance, we do nothing here,
   2123                 //   execution will fall out of the loop.
   2124             }
   2125             break;
   2126 
   2127         case URX_CTR_INIT:
   2128             {
   2129                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   2130                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   2131 
   2132                 // Pick up the three extra operands that CTR_INIT has, and
   2133                 //    skip the pattern location counter past
   2134                 int32_t instrOperandLoc = fp->fPatIdx;
   2135                 fp->fPatIdx += 3;
   2136                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   2137                 int32_t minCount = pat[instrOperandLoc+1];
   2138                 int32_t maxCount = pat[instrOperandLoc+2];
   2139                 U_ASSERT(minCount>=0);
   2140                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   2141                 U_ASSERT(loopLoc>fp->fPatIdx);
   2142 
   2143                 if (minCount == 0) {
   2144                     fp = StateSave(fp, loopLoc+1, status);
   2145                 }
   2146                 if (maxCount == 0) {
   2147                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2148                 }
   2149             }
   2150             break;
   2151 
   2152         case URX_CTR_LOOP:
   2153             {
   2154                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   2155                 int32_t initOp = pat[opValue];
   2156                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   2157                 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   2158                 int32_t minCount  = pat[opValue+2];
   2159                 int32_t maxCount = pat[opValue+3];
   2160                 // Increment the counter.  Note: we're not worrying about counter
   2161                 //   overflow, since the data comes from UnicodeStrings, which
   2162                 //   stores its length in an int32_t.
   2163                 (*pCounter)++;
   2164                 U_ASSERT(*pCounter > 0);
   2165                 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
   2166                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   2167                     break;
   2168                 }
   2169                 if (*pCounter >= minCount) {
   2170                     fp = StateSave(fp, fp->fPatIdx, status);
   2171                 }
   2172                 fp->fPatIdx = opValue + 4;    // Loop back.
   2173             }
   2174             break;
   2175 
   2176         case URX_CTR_INIT_NG:
   2177             {
   2178                 // Initialize a non-greedy loop
   2179                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   2180                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   2181 
   2182                 // Pick up the three extra operands that CTR_INIT has, and
   2183                 //    skip the pattern location counter past
   2184                 int32_t instrOperandLoc = fp->fPatIdx;
   2185                 fp->fPatIdx += 3;
   2186                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   2187                 int32_t minCount = pat[instrOperandLoc+1];
   2188                 int32_t maxCount = pat[instrOperandLoc+2];
   2189                 U_ASSERT(minCount>=0);
   2190                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   2191                 U_ASSERT(loopLoc>fp->fPatIdx);
   2192 
   2193                 if (minCount == 0) {
   2194                     if (maxCount != 0) {
   2195                         fp = StateSave(fp, fp->fPatIdx, status);
   2196                     }
   2197                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   2198                 }
   2199             }
   2200             break;
   2201 
   2202         case URX_CTR_LOOP_NG:
   2203             {
   2204                 // Non-greedy {min, max} loops
   2205                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   2206                 int32_t initOp = pat[opValue];
   2207                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   2208                 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   2209                 int32_t minCount  = pat[opValue+2];
   2210                 int32_t maxCount = pat[opValue+3];
   2211                 // Increment the counter.  Note: we're not worrying about counter
   2212                 //   overflow, since the data comes from UnicodeStrings, which
   2213                 //   stores its length in an int32_t.
   2214                 (*pCounter)++;
   2215                 U_ASSERT(*pCounter > 0);
   2216 
   2217                 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
   2218                     // The loop has matched the maximum permitted number of times.
   2219                     //   Break out of here with no action.  Matching will
   2220                     //   continue with the following pattern.
   2221                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   2222                     break;
   2223                 }
   2224 
   2225                 if (*pCounter < minCount) {
   2226                     // We haven't met the minimum number of matches yet.
   2227                     //   Loop back for another one.
   2228                     fp->fPatIdx = opValue + 4;    // Loop back.
   2229                 } else {
   2230                     // We do have the minimum number of matches.
   2231                     //   Fall into the following pattern, but first do
   2232                     //   a state save to the top of the loop, so that a failure
   2233                     //   in the following pattern will try another iteration of the loop.
   2234                     fp = StateSave(fp, opValue + 4, status);
   2235                 }
   2236             }
   2237             break;
   2238 
   2239         case URX_STO_SP:
   2240             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   2241             fData[opValue] = fStack->size();
   2242             break;
   2243 
   2244         case URX_LD_SP:
   2245             {
   2246                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   2247                 int32_t newStackSize = fData[opValue];
   2248                 U_ASSERT(newStackSize <= fStack->size());
   2249                 int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   2250                 if (newFP == (int32_t *)fp) {
   2251                     break;
   2252                 }
   2253                 int32_t i;
   2254                 for (i=0; i<fFrameSize; i++) {
   2255                     newFP[i] = ((int32_t *)fp)[i];
   2256                 }
   2257                 fp = (REStackFrame *)newFP;
   2258                 fStack->setSize(newStackSize);
   2259             }
   2260             break;
   2261 
   2262         case URX_BACKREF:
   2263         case URX_BACKREF_I:
   2264             {
   2265                 U_ASSERT(opValue < fFrameSize);
   2266                 int32_t groupStartIdx = fp->fExtra[opValue];
   2267                 int32_t groupEndIdx   = fp->fExtra[opValue+1];
   2268                 U_ASSERT(groupStartIdx <= groupEndIdx);
   2269                 int32_t len = groupEndIdx-groupStartIdx;
   2270                 if (groupStartIdx < 0) {
   2271                     // This capture group has not participated in the match thus far,
   2272                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   2273                 }
   2274 
   2275                 if (len == 0) {
   2276                         //   The capture group match was of an empty string.
   2277                         //   Verified by testing:  Perl matches succeed in this case, so
   2278                         //   we do too.
   2279                         break;
   2280                     }
   2281 
   2282                 UBool  haveMatch = FALSE;
   2283                 if (fp->fInputIdx + len <= fActiveLimit) {
   2284                     if (opType == URX_BACKREF) {
   2285                         if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) {
   2286                             haveMatch = TRUE;
   2287                         }
   2288                     } else {
   2289                         if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
   2290                                   len, U_FOLD_CASE_DEFAULT) == 0) {
   2291                             haveMatch = TRUE;
   2292                         }
   2293                     }
   2294                 } else {
   2295                     // TODO: probably need to do a partial string comparison, and only
   2296                     //       set HitEnd if the available input matched.  Ticket #6074
   2297                     fHitEnd = TRUE;
   2298                 }
   2299                 if (haveMatch) {
   2300                     fp->fInputIdx += len;     // Match.  Advance current input position.
   2301                 } else {
   2302                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   2303                 }
   2304             }
   2305             break;
   2306 
   2307         case URX_STO_INP_LOC:
   2308             {
   2309                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   2310                 fp->fExtra[opValue] = fp->fInputIdx;
   2311             }
   2312             break;
   2313 
   2314         case URX_JMPX:
   2315             {
   2316                 int32_t instrOperandLoc = fp->fPatIdx;
   2317                 fp->fPatIdx += 1;
   2318                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   2319                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   2320                 int32_t savedInputIdx = fp->fExtra[dataLoc];
   2321                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   2322                 if (savedInputIdx < fp->fInputIdx) {
   2323                     fp->fPatIdx = opValue;                               // JMP
   2324                 } else {
   2325                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   2326                 }
   2327             }
   2328             break;
   2329 
   2330         case URX_LA_START:
   2331             {
   2332                 // Entering a lookahead block.
   2333                 // Save Stack Ptr, Input Pos.
   2334                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   2335                 fData[opValue]   = fStack->size();
   2336                 fData[opValue+1] = fp->fInputIdx;
   2337                 fActiveStart     = fLookStart;          // Set the match region change for
   2338                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   2339             }
   2340             break;
   2341 
   2342         case URX_LA_END:
   2343             {
   2344                 // Leaving a look-ahead block.
   2345                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   2346                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   2347                 int32_t stackSize = fStack->size();
   2348                 int32_t newStackSize = fData[opValue];
   2349                 U_ASSERT(stackSize >= newStackSize);
   2350                 if (stackSize > newStackSize) {
   2351                     // Copy the current top frame back to the new (cut back) top frame.
   2352                     //   This makes the capture groups from within the look-ahead
   2353                     //   expression available.
   2354                     int32_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   2355                     int32_t i;
   2356                     for (i=0; i<fFrameSize; i++) {
   2357                         newFP[i] = ((int32_t *)fp)[i];
   2358                     }
   2359                     fp = (REStackFrame *)newFP;
   2360                     fStack->setSize(newStackSize);
   2361                 }
   2362                 fp->fInputIdx = fData[opValue+1];
   2363 
   2364                 // Restore the active region bounds in the input string; they may have
   2365                 //    been changed because of transparent bounds on a Region.
   2366                 fActiveStart = fRegionStart;
   2367                 fActiveLimit = fRegionLimit;
   2368             }
   2369             break;
   2370 
   2371         case URX_ONECHAR_I:
   2372             if (fp->fInputIdx < fActiveLimit) {
   2373                 UChar32   c;
   2374                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   2375                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   2376                     break;
   2377                 }
   2378             } else {
   2379                 fHitEnd = TRUE;
   2380             }
   2381             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2382             break;
   2383 
   2384         case URX_STRING_I:
   2385             {
   2386                 // Test input against a literal string.
   2387                 // Strings require two slots in the compiled pattern, one for the
   2388                 //   offset to the string text, and one for the length.
   2389                 int32_t stringStartIdx, stringLen;
   2390                 stringStartIdx = opValue;
   2391 
   2392                 op      = pat[fp->fPatIdx];
   2393                 fp->fPatIdx++;
   2394                 opType  = URX_TYPE(op);
   2395                 opValue = URX_VAL(op);
   2396                 U_ASSERT(opType == URX_STRING_LEN);
   2397                 stringLen = opValue;
   2398 
   2399                 int32_t stringEndIndex = fp->fInputIdx + stringLen;
   2400                 if (stringEndIndex <= fActiveLimit) {
   2401                     if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
   2402                         stringLen, U_FOLD_CASE_DEFAULT) == 0) {
   2403                         // Success.  Advance the current input position.
   2404                         fp->fInputIdx = stringEndIndex;
   2405                         break;
   2406                     }
   2407                 } else {
   2408                     // Insufficent input left for a match.
   2409                     fHitEnd = TRUE;    // See ticket 6074
   2410                 }
   2411                 // No match.  Back up matching to a saved state
   2412                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2413             }
   2414             break;
   2415 
   2416         case URX_LB_START:
   2417             {
   2418                 // Entering a look-behind block.
   2419                 // Save Stack Ptr, Input Pos.
   2420                 //   TODO:  implement transparent bounds.  Ticket #6067
   2421                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   2422                 fData[opValue]   = fStack->size();
   2423                 fData[opValue+1] = fp->fInputIdx;
   2424                 // Init the variable containing the start index for attempted matches.
   2425                 fData[opValue+2] = -1;
   2426                 // Save input string length, then reset to pin any matches to end at
   2427                 //   the current position.
   2428                 fData[opValue+3] = fActiveLimit;
   2429                 fActiveLimit     = fp->fInputIdx;
   2430             }
   2431             break;
   2432 
   2433 
   2434         case URX_LB_CONT:
   2435             {
   2436                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   2437                 //    at all possible input starting positions.
   2438 
   2439                 // Fetch the min and max possible match lengths.  They are the operands
   2440                 //   of this op in the pattern.
   2441                 int32_t minML = pat[fp->fPatIdx++];
   2442                 int32_t maxML = pat[fp->fPatIdx++];
   2443                 U_ASSERT(minML <= maxML);
   2444                 U_ASSERT(minML >= 0);
   2445 
   2446                 // Fetch (from data) the last input index where a match was attempted.
   2447                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   2448                 int32_t  *lbStartIdx = &fData[opValue+2];
   2449                 if (*lbStartIdx < 0) {
   2450                     // First time through loop.
   2451                     *lbStartIdx = fp->fInputIdx - minML;
   2452                 } else {
   2453                     // 2nd through nth time through the loop.
   2454                     // Back up start position for match by one.
   2455                     if (*lbStartIdx == 0) {
   2456                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
   2457                     } else {
   2458                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   2459                     }
   2460                 }
   2461 
   2462                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   2463                     // We have tried all potential match starting points without
   2464                     //  getting a match.  Backtrack out, and out of the
   2465                     //   Look Behind altogether.
   2466                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2467                     int32_t restoreInputLen = fData[opValue+3];
   2468                     U_ASSERT(restoreInputLen >= fActiveLimit);
   2469                     U_ASSERT(restoreInputLen <= fInput->length());
   2470                     fActiveLimit = restoreInputLen;
   2471                     break;
   2472                 }
   2473 
   2474                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   2475                 //      (successful match will fall off the end of the loop.)
   2476                 fp = StateSave(fp, fp->fPatIdx-3, status);
   2477                 fp->fInputIdx =  *lbStartIdx;
   2478             }
   2479             break;
   2480 
   2481         case URX_LB_END:
   2482             // End of a look-behind block, after a successful match.
   2483             {
   2484                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   2485                 if (fp->fInputIdx != fActiveLimit) {
   2486                     //  The look-behind expression matched, but the match did not
   2487                     //    extend all the way to the point that we are looking behind from.
   2488                     //  FAIL out of here, which will take us back to the LB_CONT, which
   2489                     //     will retry the match starting at another position or fail
   2490                     //     the look-behind altogether, whichever is appropriate.
   2491                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2492                     break;
   2493                 }
   2494 
   2495                 // Look-behind match is good.  Restore the orignal input string length,
   2496                 //   which had been truncated to pin the end of the lookbehind match to the
   2497                 //   position being looked-behind.
   2498                 int32_t originalInputLen = fData[opValue+3];
   2499                 U_ASSERT(originalInputLen >= fActiveLimit);
   2500                 U_ASSERT(originalInputLen <= fInput->length());
   2501                 fActiveLimit = originalInputLen;
   2502             }
   2503             break;
   2504 
   2505 
   2506         case URX_LBN_CONT:
   2507             {
   2508                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   2509                 //    at all possible input starting positions.
   2510 
   2511                 // Fetch the extra parameters of this op.
   2512                 int32_t minML       = pat[fp->fPatIdx++];
   2513                 int32_t maxML       = pat[fp->fPatIdx++];
   2514                 int32_t continueLoc = pat[fp->fPatIdx++];
   2515                         continueLoc = URX_VAL(continueLoc);
   2516                 U_ASSERT(minML <= maxML);
   2517                 U_ASSERT(minML >= 0);
   2518                 U_ASSERT(continueLoc > fp->fPatIdx);
   2519 
   2520                 // Fetch (from data) the last input index where a match was attempted.
   2521                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   2522                 int32_t  *lbStartIdx = &fData[opValue+2];
   2523                 if (*lbStartIdx < 0) {
   2524                     // First time through loop.
   2525                     *lbStartIdx = fp->fInputIdx - minML;
   2526                 } else {
   2527                     // 2nd through nth time through the loop.
   2528                     // Back up start position for match by one.
   2529                     if (*lbStartIdx == 0) {
   2530                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
   2531                     } else {
   2532                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   2533                     }
   2534                 }
   2535 
   2536                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   2537                     // We have tried all potential match starting points without
   2538                     //  getting a match, which means that the negative lookbehind as
   2539                     //  a whole has succeeded.  Jump forward to the continue location
   2540                     int32_t restoreInputLen = fData[opValue+3];
   2541                     U_ASSERT(restoreInputLen >= fActiveLimit);
   2542                     U_ASSERT(restoreInputLen <= fInput->length());
   2543                     fActiveLimit = restoreInputLen;
   2544                     fp->fPatIdx = continueLoc;
   2545                     break;
   2546                 }
   2547 
   2548                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   2549                 //      (successful match will cause a FAIL out of the loop altogether.)
   2550                 fp = StateSave(fp, fp->fPatIdx-4, status);
   2551                 fp->fInputIdx =  *lbStartIdx;
   2552             }
   2553             break;
   2554 
   2555         case URX_LBN_END:
   2556             // End of a negative look-behind block, after a successful match.
   2557             {
   2558                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   2559                 if (fp->fInputIdx != fActiveLimit) {
   2560                     //  The look-behind expression matched, but the match did not
   2561                     //    extend all the way to the point that we are looking behind from.
   2562                     //  FAIL out of here, which will take us back to the LB_CONT, which
   2563                     //     will retry the match starting at another position or succeed
   2564                     //     the look-behind altogether, whichever is appropriate.
   2565                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2566                     break;
   2567                 }
   2568 
   2569                 // Look-behind expression matched, which means look-behind test as
   2570                 //   a whole Fails
   2571 
   2572                 //   Restore the orignal input string length, which had been truncated
   2573                 //   inorder to pin the end of the lookbehind match
   2574                 //   to the position being looked-behind.
   2575                 int32_t originalInputLen = fData[opValue+3];
   2576                 U_ASSERT(originalInputLen >= fActiveLimit);
   2577                 U_ASSERT(originalInputLen <= fInput->length());
   2578                 fActiveLimit = originalInputLen;
   2579 
   2580                 // Restore original stack position, discarding any state saved
   2581                 //   by the successful pattern match.
   2582                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   2583                 int32_t newStackSize = fData[opValue];
   2584                 U_ASSERT(fStack->size() > newStackSize);
   2585                 fStack->setSize(newStackSize);
   2586 
   2587                 //  FAIL, which will take control back to someplace
   2588                 //  prior to entering the look-behind test.
   2589                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2590             }
   2591             break;
   2592 
   2593 
   2594         case URX_LOOP_SR_I:
   2595             // Loop Initialization for the optimized implementation of
   2596             //     [some character set]*
   2597             //   This op scans through all matching input.
   2598             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   2599             {
   2600                 U_ASSERT(opValue > 0 && opValue < sets->size());
   2601                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   2602                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   2603 
   2604                 // Loop through input, until either the input is exhausted or
   2605                 //   we reach a character that is not a member of the set.
   2606                 int32_t ix = fp->fInputIdx;
   2607                 for (;;) {
   2608                     if (ix >= fActiveLimit) {
   2609                         fHitEnd = TRUE;
   2610                         break;
   2611                     }
   2612                     UChar32   c;
   2613                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
   2614                     if (c<256) {
   2615                         if (s8->contains(c) == FALSE) {
   2616                             U16_BACK_1(inputBuf, 0, ix);
   2617                             break;
   2618                         }
   2619                     } else {
   2620                         if (s->contains(c) == FALSE) {
   2621                             U16_BACK_1(inputBuf, 0, ix);
   2622                             break;
   2623                         }
   2624                     }
   2625                 }
   2626 
   2627                 // If there were no matching characters, skip over the loop altogether.
   2628                 //   The loop doesn't run at all, a * op always succeeds.
   2629                 if (ix == fp->fInputIdx) {
   2630                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   2631                     break;
   2632                 }
   2633 
   2634                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   2635                 //   must follow.  It's operand is the stack location
   2636                 //   that holds the starting input index for the match of this [set]*
   2637                 int32_t loopcOp = pat[fp->fPatIdx];
   2638                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   2639                 int32_t stackLoc = URX_VAL(loopcOp);
   2640                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   2641                 fp->fExtra[stackLoc] = fp->fInputIdx;
   2642                 fp->fInputIdx = ix;
   2643 
   2644                 // Save State to the URX_LOOP_C op that follows this one,
   2645                 //   so that match failures in the following code will return to there.
   2646                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   2647                 fp = StateSave(fp, fp->fPatIdx, status);
   2648                 fp->fPatIdx++;
   2649             }
   2650             break;
   2651 
   2652 
   2653         case URX_LOOP_DOT_I:
   2654             // Loop Initialization for the optimized implementation of .*
   2655             //   This op scans through all remaining input.
   2656             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   2657             {
   2658                 // Loop through input until the input is exhausted (we reach an end-of-line)
   2659                 // In DOTALL mode, we can just go straight to the end of the input.
   2660                 int32_t ix;
   2661                 if ((opValue & 1) == 1) {
   2662                     // Dot-matches-All mode.  Jump straight to the end of the string.
   2663                     ix = fActiveLimit;
   2664                     fHitEnd = TRUE;
   2665                 } else {
   2666                     // NOT DOT ALL mode.  Line endings do not match '.'
   2667                     // Scan forward until a line ending or end of input.
   2668                     ix = fp->fInputIdx;
   2669                     for (;;) {
   2670                         if (ix >= fActiveLimit) {
   2671                             fHitEnd = TRUE;
   2672                             ix = fActiveLimit;
   2673                             break;
   2674                         }
   2675                         UChar32   c;
   2676                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
   2677                         if ((c & 0x7f) <= 0x29) {        // Fast filter of non-new-line-s
   2678                             if ((c == 0x0a) ||            //  0x0a is newline in both modes.
   2679                                ((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   2680                                     (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) {
   2681                                 //  char is a line ending.  Put the input pos back to the
   2682                                 //    line ending char, and exit the scanning loop.
   2683                                 U16_BACK_1(inputBuf, 0, ix);
   2684                                 break;
   2685                             }
   2686                         }
   2687                     }
   2688                 }
   2689 
   2690                 // If there were no matching characters, skip over the loop altogether.
   2691                 //   The loop doesn't run at all, a * op always succeeds.
   2692                 if (ix == fp->fInputIdx) {
   2693                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   2694                     break;
   2695                 }
   2696 
   2697                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   2698                 //   must follow.  It's operand is the stack location
   2699                 //   that holds the starting input index for the match of this .*
   2700                 int32_t loopcOp = pat[fp->fPatIdx];
   2701                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   2702                 int32_t stackLoc = URX_VAL(loopcOp);
   2703                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   2704                 fp->fExtra[stackLoc] = fp->fInputIdx;
   2705                 fp->fInputIdx = ix;
   2706 
   2707                 // Save State to the URX_LOOP_C op that follows this one,
   2708                 //   so that match failures in the following code will return to there.
   2709                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   2710                 fp = StateSave(fp, fp->fPatIdx, status);
   2711                 fp->fPatIdx++;
   2712             }
   2713             break;
   2714 
   2715 
   2716         case URX_LOOP_C:
   2717             {
   2718                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   2719                 int32_t   terminalIdx =  fp->fExtra[opValue];
   2720                 U_ASSERT(terminalIdx <= fp->fInputIdx);
   2721                 if (terminalIdx == fp->fInputIdx) {
   2722                     // We've backed up the input idx to the point that the loop started.
   2723                     // The loop is done.  Leave here without saving state.
   2724                     //  Subsequent failures won't come back here.
   2725                     break;
   2726                 }
   2727                 // Set up for the next iteration of the loop, with input index
   2728                 //   backed up by one from the last time through,
   2729                 //   and a state save to this instruction in case the following code fails again.
   2730                 //   (We're going backwards because this loop emulates stack unwinding, not
   2731                 //    the initial scan forward.)
   2732                 U_ASSERT(fp->fInputIdx > 0);
   2733                 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   2734                 if (inputBuf[fp->fInputIdx] == 0x0a &&
   2735                     fp->fInputIdx > terminalIdx &&
   2736                     inputBuf[fp->fInputIdx-1] == 0x0d) {
   2737                     int32_t prevOp = pat[fp->fPatIdx-2];
   2738                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   2739                         // .*, stepping back over CRLF pair.
   2740                         fp->fInputIdx--;
   2741                     }
   2742                 }
   2743 
   2744 
   2745                 fp = StateSave(fp, fp->fPatIdx-1, status);
   2746             }
   2747             break;
   2748 
   2749 
   2750 
   2751         default:
   2752             // Trouble.  The compiled pattern contains an entry with an
   2753             //           unrecognized type tag.
   2754             U_ASSERT(FALSE);
   2755         }
   2756 
   2757         if (U_FAILURE(status)) {
   2758             isMatch = FALSE;
   2759             break;
   2760         }
   2761     }
   2762 
   2763 breakFromLoop:
   2764     fMatch = isMatch;
   2765     if (isMatch) {
   2766         fLastMatchEnd = fMatchEnd;
   2767         fMatchStart   = startIdx;
   2768         fMatchEnd     = fp->fInputIdx;
   2769         if (fTraceDebug) {
   2770             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
   2771         }
   2772     }
   2773     else
   2774     {
   2775         if (fTraceDebug) {
   2776             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   2777         }
   2778     }
   2779 
   2780     fFrame = fp;                // The active stack frame when the engine stopped.
   2781                                 //   Contains the capture group results that we need to
   2782                                 //    access later.
   2783 
   2784     return;
   2785 }
   2786 
   2787 
   2788 
   2789 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
   2790 
   2791 U_NAMESPACE_END
   2792 
   2793 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   2794 
   2795