Home | History | Annotate | Download | only in i18n
      1 /*
      2 **************************************************************************
      3 *   Copyright (C) 2002-2010 International Business Machines Corporation  *
      4 *   and others. All rights reserved.                                     *
      5 **************************************************************************
      6 */
      7 //
      8 //  file:  rematch.cpp
      9 //
     10 //         Contains the implementation of class RegexMatcher,
     11 //         which is one of the main API classes for the ICU regular expression package.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     16 
     17 #include "unicode/regex.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/rbbi.h"
     22 #include "uassert.h"
     23 #include "cmemory.h"
     24 #include "uvector.h"
     25 #include "uvectr32.h"
     26 #include "uvectr64.h"
     27 #include "regeximp.h"
     28 #include "regexst.h"
     29 #include "regextxt.h"
     30 #include "ucase.h"
     31 
     32 // #include <malloc.h>        // Needed for heapcheck testing
     33 
     34 
     35 // Find progress callback
     36 // ----------------------
     37 // Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
     38 //
     39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
     40     (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
     41 
     42 
     43 // Smart Backtracking
     44 // ------------------
     45 // When a failure would go back to a LOOP_C instruction,
     46 // strings, characters, and setrefs scan backwards for a valid start
     47 // character themselves, pop the stack, and save state, emulating the
     48 // LOOP_C's effect but assured that the next character of input is a
     49 // possible matching character.
     50 //
     51 // Good idea in theory; unfortunately it only helps out a few specific
     52 // cases and slows the engine down a little in the rest.
     53 
     54 //#define REGEX_SMART_BACKTRACKING 1
     55 
     56 U_NAMESPACE_BEGIN
     57 
     58 // Default limit for the size of the back track stack, to avoid system
     59 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
     60 // This value puts ICU's limits higher than most other regexp implementations,
     61 //    which use recursion rather than the heap, and take more storage per
     62 //    backtrack point.
     63 //
     64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
     65 
     66 // Time limit counter constant.
     67 //   Time limits for expression evaluation are in terms of quanta of work by
     68 //   the engine, each of which is 10,000 state saves.
     69 //   This constant determines that state saves per tick number.
     70 static const int32_t TIMER_INITIAL_VALUE = 10000;
     71 
     72 //-----------------------------------------------------------------------------
     73 //
     74 //   Constructor and Destructor
     75 //
     76 //-----------------------------------------------------------------------------
     77 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
     78     fDeferredStatus = U_ZERO_ERROR;
     79     init(fDeferredStatus);
     80     if (U_FAILURE(fDeferredStatus)) {
     81         return;
     82     }
     83     if (pat==NULL) {
     84         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
     85         return;
     86     }
     87     fPattern = pat;
     88     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
     89 }
     90 
     91 
     92 
     93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
     94                            uint32_t flags, UErrorCode &status) {
     95     init(status);
     96     if (U_FAILURE(status)) {
     97         return;
     98     }
     99     UParseError    pe;
    100     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    101     fPattern           = fPatternOwned;
    102 
    103     UText inputText = UTEXT_INITIALIZER;
    104     utext_openConstUnicodeString(&inputText, &input, &status);
    105     init2(&inputText, status);
    106     utext_close(&inputText);
    107 
    108     fInputUniStrMaybeMutable = TRUE;
    109 }
    110 
    111 
    112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
    113                            uint32_t flags, UErrorCode &status) {
    114     init(status);
    115     if (U_FAILURE(status)) {
    116         return;
    117     }
    118     UParseError    pe;
    119     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    120     if (U_FAILURE(status)) {
    121         return;
    122     }
    123 
    124     fPattern           = fPatternOwned;
    125     init2(input, status);
    126 }
    127 
    128 
    129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
    130                            uint32_t flags, UErrorCode &status) {
    131     init(status);
    132     if (U_FAILURE(status)) {
    133         return;
    134     }
    135     UParseError    pe;
    136     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    137     if (U_FAILURE(status)) {
    138         return;
    139     }
    140     fPattern           = fPatternOwned;
    141     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    142 }
    143 
    144 RegexMatcher::RegexMatcher(UText *regexp,
    145                            uint32_t flags, UErrorCode &status) {
    146     init(status);
    147     if (U_FAILURE(status)) {
    148         return;
    149     }
    150     UParseError    pe;
    151     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    152         if (U_FAILURE(status)) {
    153         return;
    154     }
    155 
    156     fPattern           = fPatternOwned;
    157     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    158 }
    159 
    160 
    161 
    162 
    163 RegexMatcher::~RegexMatcher() {
    164     delete fStack;
    165     if (fData != fSmallData) {
    166         uprv_free(fData);
    167         fData = NULL;
    168     }
    169     if (fPatternOwned) {
    170         delete fPatternOwned;
    171         fPatternOwned = NULL;
    172         fPattern = NULL;
    173     }
    174 
    175     if (fInput) {
    176         delete fInput;
    177     }
    178     if (fInputText) {
    179         utext_close(fInputText);
    180     }
    181     if (fAltInputText) {
    182         utext_close(fAltInputText);
    183     }
    184 
    185     #if UCONFIG_NO_BREAK_ITERATION==0
    186     delete fWordBreakItr;
    187     #endif
    188 }
    189 
    190 //
    191 //   init()   common initialization for use by all constructors.
    192 //            Initialize all fields, get the object into a consistent state.
    193 //            This must be done even when the initial status shows an error,
    194 //            so that the object is initialized sufficiently well for the destructor
    195 //            to run safely.
    196 //
    197 void RegexMatcher::init(UErrorCode &status) {
    198     fPattern           = NULL;
    199     fPatternOwned      = NULL;
    200     fFrameSize         = 0;
    201     fRegionStart       = 0;
    202     fRegionLimit       = 0;
    203     fAnchorStart       = 0;
    204     fAnchorLimit       = 0;
    205     fLookStart         = 0;
    206     fLookLimit         = 0;
    207     fActiveStart       = 0;
    208     fActiveLimit       = 0;
    209     fTransparentBounds = FALSE;
    210     fAnchoringBounds   = TRUE;
    211     fMatch             = FALSE;
    212     fMatchStart        = 0;
    213     fMatchEnd          = 0;
    214     fLastMatchEnd      = -1;
    215     fAppendPosition    = 0;
    216     fHitEnd            = FALSE;
    217     fRequireEnd        = FALSE;
    218     fStack             = NULL;
    219     fFrame             = NULL;
    220     fTimeLimit         = 0;
    221     fTime              = 0;
    222     fTickCounter       = 0;
    223     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    224     fCallbackFn        = NULL;
    225     fCallbackContext   = NULL;
    226     fFindProgressCallbackFn      = NULL;
    227     fFindProgressCallbackContext = NULL;
    228     fTraceDebug        = FALSE;
    229     fDeferredStatus    = status;
    230     fData              = fSmallData;
    231     fWordBreakItr      = NULL;
    232 
    233     fStack             = new UVector64(status);
    234     fInputText         = NULL;
    235     fAltInputText      = NULL;
    236     fInput             = NULL;
    237     fInputLength       = 0;
    238     fInputUniStrMaybeMutable = FALSE;
    239 
    240     if (U_FAILURE(status)) {
    241         fDeferredStatus = status;
    242     }
    243 }
    244 
    245 //
    246 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
    247 //            This handles the common setup to be done after the Pattern is available.
    248 //
    249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
    250     if (U_FAILURE(status)) {
    251         fDeferredStatus = status;
    252         return;
    253     }
    254 
    255     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
    256         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
    257         if (fData == NULL) {
    258             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    259             return;
    260         }
    261     }
    262 
    263     reset(input);
    264     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
    265     if (U_FAILURE(status)) {
    266         fDeferredStatus = status;
    267         return;
    268     }
    269 }
    270 
    271 
    272 static const UChar BACKSLASH  = 0x5c;
    273 static const UChar DOLLARSIGN = 0x24;
    274 //--------------------------------------------------------------------------------
    275 //
    276 //    appendReplacement
    277 //
    278 //--------------------------------------------------------------------------------
    279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
    280                                               const UnicodeString &replacement,
    281                                               UErrorCode &status) {
    282     UText replacementText = UTEXT_INITIALIZER;
    283 
    284     utext_openConstUnicodeString(&replacementText, &replacement, &status);
    285     if (U_SUCCESS(status)) {
    286         UText resultText = UTEXT_INITIALIZER;
    287         utext_openUnicodeString(&resultText, &dest, &status);
    288 
    289         if (U_SUCCESS(status)) {
    290             appendReplacement(&resultText, &replacementText, status);
    291             utext_close(&resultText);
    292         }
    293         utext_close(&replacementText);
    294     }
    295 
    296     return *this;
    297 }
    298 
    299 //
    300 //    appendReplacement, UText mode
    301 //
    302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
    303                                               UText *replacement,
    304                                               UErrorCode &status) {
    305     if (U_FAILURE(status)) {
    306         return *this;
    307     }
    308     if (U_FAILURE(fDeferredStatus)) {
    309         status = fDeferredStatus;
    310         return *this;
    311     }
    312     if (fMatch == FALSE) {
    313         status = U_REGEX_INVALID_STATE;
    314         return *this;
    315     }
    316 
    317     // Copy input string from the end of previous match to start of current match
    318     int64_t  destLen = utext_nativeLength(dest);
    319     if (fMatchStart > fAppendPosition) {
    320         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    321             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    322                                      (int32_t)(fMatchStart-fAppendPosition), &status);
    323         } else {
    324             int32_t len16;
    325             if (UTEXT_USES_U16(fInputText)) {
    326                 len16 = (int32_t)(fMatchStart-fAppendPosition);
    327             } else {
    328                 UErrorCode lengthStatus = U_ZERO_ERROR;
    329                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
    330             }
    331             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
    332             if (inputChars == NULL) {
    333                 status = U_MEMORY_ALLOCATION_ERROR;
    334                 return *this;
    335             }
    336             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
    337             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    338             uprv_free(inputChars);
    339         }
    340     }
    341     fAppendPosition = fMatchEnd;
    342 
    343 
    344     // scan the replacement text, looking for substitutions ($n) and \escapes.
    345     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
    346     //         move entire ranges not containing substitutions.
    347     UTEXT_SETNATIVEINDEX(replacement, 0);
    348     UChar32 c = UTEXT_NEXT32(replacement);
    349     while (c != U_SENTINEL) {
    350         if (c == BACKSLASH) {
    351             // Backslash Escape.  Copy the following char out without further checks.
    352             //                    Note:  Surrogate pairs don't need any special handling
    353             //                           The second half wont be a '$' or a '\', and
    354             //                           will move to the dest normally on the next
    355             //                           loop iteration.
    356             c = UTEXT_CURRENT32(replacement);
    357             if (c == U_SENTINEL) {
    358                 break;
    359             }
    360 
    361             if (c==0x55/*U*/ || c==0x75/*u*/) {
    362                 // We have a \udddd or \Udddddddd escape sequence.
    363                 int32_t offset = 0;
    364                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
    365                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
    366                 if (escapedChar != (UChar32)0xFFFFFFFF) {
    367                     if (U_IS_BMP(escapedChar)) {
    368                         UChar c16 = (UChar)escapedChar;
    369                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    370                     } else {
    371                         UChar surrogate[2];
    372                         surrogate[0] = U16_LEAD(escapedChar);
    373                         surrogate[1] = U16_TRAIL(escapedChar);
    374                         if (U_SUCCESS(status)) {
    375                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    376                         }
    377                     }
    378                     // TODO:  Report errors for mal-formed \u escapes?
    379                     //        As this is, the original sequence is output, which may be OK.
    380                     if (context.lastOffset == offset) {
    381                         UTEXT_PREVIOUS32(replacement);
    382                     } else if (context.lastOffset != offset-1) {
    383                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
    384                     }
    385                 }
    386             } else {
    387                 UTEXT_NEXT32(replacement);
    388                 // Plain backslash escape.  Just put out the escaped character.
    389                 if (U_IS_BMP(c)) {
    390                     UChar c16 = (UChar)c;
    391                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    392                 } else {
    393                     UChar surrogate[2];
    394                     surrogate[0] = U16_LEAD(c);
    395                     surrogate[1] = U16_TRAIL(c);
    396                     if (U_SUCCESS(status)) {
    397                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    398                     }
    399                 }
    400             }
    401         } else if (c != DOLLARSIGN) {
    402             // Normal char, not a $.  Copy it out without further checks.
    403             if (U_IS_BMP(c)) {
    404                 UChar c16 = (UChar)c;
    405                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    406             } else {
    407                 UChar surrogate[2];
    408                 surrogate[0] = U16_LEAD(c);
    409                 surrogate[1] = U16_TRAIL(c);
    410                 if (U_SUCCESS(status)) {
    411                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    412                 }
    413             }
    414         } else {
    415             // We've got a $.  Pick up a capture group number if one follows.
    416             // Consume at most the number of digits necessary for the largest capture
    417             // number that is valid for this pattern.
    418 
    419             int32_t numDigits = 0;
    420             int32_t groupNum  = 0;
    421             UChar32 digitC;
    422             for (;;) {
    423                 digitC = UTEXT_CURRENT32(replacement);
    424                 if (digitC == U_SENTINEL) {
    425                     break;
    426                 }
    427                 if (u_isdigit(digitC) == FALSE) {
    428                     break;
    429                 }
    430                 UTEXT_NEXT32(replacement);
    431                 groupNum=groupNum*10 + u_charDigitValue(digitC);
    432                 numDigits++;
    433                 if (numDigits >= fPattern->fMaxCaptureDigits) {
    434                     break;
    435                 }
    436             }
    437 
    438 
    439             if (numDigits == 0) {
    440                 // The $ didn't introduce a group number at all.
    441                 // Treat it as just part of the substitution text.
    442                 UChar c16 = DOLLARSIGN;
    443                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    444             } else {
    445                 // Finally, append the capture group data to the destination.
    446                 destLen += appendGroup(groupNum, dest, status);
    447                 if (U_FAILURE(status)) {
    448                     // Can fail if group number is out of range.
    449                     break;
    450                 }
    451             }
    452         }
    453 
    454         if (U_FAILURE(status)) {
    455             break;
    456         } else {
    457             c = UTEXT_NEXT32(replacement);
    458         }
    459     }
    460 
    461     return *this;
    462 }
    463 
    464 
    465 
    466 //--------------------------------------------------------------------------------
    467 //
    468 //    appendTail     Intended to be used in conjunction with appendReplacement()
    469 //                   To the destination string, append everything following
    470 //                   the last match position from the input string.
    471 //
    472 //                   Note:  Match ranges do not affect appendTail or appendReplacement
    473 //
    474 //--------------------------------------------------------------------------------
    475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    476     UErrorCode status = U_ZERO_ERROR;
    477     UText resultText = UTEXT_INITIALIZER;
    478     utext_openUnicodeString(&resultText, &dest, &status);
    479 
    480     if (U_SUCCESS(status)) {
    481         appendTail(&resultText, status);
    482         utext_close(&resultText);
    483     }
    484 
    485     return dest;
    486 }
    487 
    488 //
    489 //   appendTail, UText mode
    490 //
    491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
    492     UBool bailOut = FALSE;
    493     if (U_FAILURE(status)) {
    494         bailOut = TRUE;
    495     }
    496     if (U_FAILURE(fDeferredStatus)) {
    497         status = fDeferredStatus;
    498         bailOut = TRUE;
    499     }
    500 
    501     if (bailOut) {
    502         //  dest must not be NULL
    503         if (dest) {
    504             utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
    505             return dest;
    506         }
    507     }
    508 
    509     if (fInputLength > fAppendPosition) {
    510         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    511             int64_t destLen = utext_nativeLength(dest);
    512             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    513                           (int32_t)(fInputLength-fAppendPosition), &status);
    514         } else {
    515             int32_t len16;
    516             if (UTEXT_USES_U16(fInputText)) {
    517                 len16 = (int32_t)(fInputLength-fAppendPosition);
    518             } else {
    519                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
    520                 status = U_ZERO_ERROR; // buffer overflow
    521             }
    522 
    523             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
    524             if (inputChars == NULL) {
    525                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    526             } else {
    527                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
    528                 int64_t destLen = utext_nativeLength(dest);
    529                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    530                 uprv_free(inputChars);
    531             }
    532         }
    533     }
    534     return dest;
    535 }
    536 
    537 
    538 
    539 //--------------------------------------------------------------------------------
    540 //
    541 //   end
    542 //
    543 //--------------------------------------------------------------------------------
    544 int32_t RegexMatcher::end(UErrorCode &err) const {
    545     return end(0, err);
    546 }
    547 
    548 int64_t RegexMatcher::end64(UErrorCode &err) const {
    549     return end64(0, err);
    550 }
    551 
    552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
    553     if (U_FAILURE(err)) {
    554         return -1;
    555     }
    556     if (fMatch == FALSE) {
    557         err = U_REGEX_INVALID_STATE;
    558         return -1;
    559     }
    560     if (group < 0 || group > fPattern->fGroupMap->size()) {
    561         err = U_INDEX_OUTOFBOUNDS_ERROR;
    562         return -1;
    563     }
    564     int64_t e = -1;
    565     if (group == 0) {
    566         e = fMatchEnd;
    567     } else {
    568         // Get the position within the stack frame of the variables for
    569         //    this capture group.
    570         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
    571         U_ASSERT(groupOffset < fPattern->fFrameSize);
    572         U_ASSERT(groupOffset >= 0);
    573         e = fFrame->fExtra[groupOffset + 1];
    574     }
    575 
    576         return e;
    577 }
    578 
    579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    580     return (int32_t)end64(group, err);
    581 }
    582 
    583 
    584 //--------------------------------------------------------------------------------
    585 //
    586 //   find()
    587 //
    588 //--------------------------------------------------------------------------------
    589 UBool RegexMatcher::find() {
    590     // Start at the position of the last match end.  (Will be zero if the
    591     //   matcher has been reset.)
    592     //
    593     if (U_FAILURE(fDeferredStatus)) {
    594         return FALSE;
    595     }
    596 
    597     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    598         return findUsingChunk();
    599     }
    600 
    601     int64_t startPos = fMatchEnd;
    602     if (startPos==0) {
    603         startPos = fActiveStart;
    604     }
    605 
    606     if (fMatch) {
    607         // Save the position of any previous successful match.
    608         fLastMatchEnd = fMatchEnd;
    609 
    610         if (fMatchStart == fMatchEnd) {
    611             // Previous match had zero length.  Move start position up one position
    612             //  to avoid sending find() into a loop on zero-length matches.
    613             if (startPos >= fActiveLimit) {
    614                 fMatch = FALSE;
    615                 fHitEnd = TRUE;
    616                 return FALSE;
    617             }
    618             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    619             UTEXT_NEXT32(fInputText);
    620             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    621         }
    622     } else {
    623         if (fLastMatchEnd >= 0) {
    624             // A previous find() failed to match.  Don't try again.
    625             //   (without this test, a pattern with a zero-length match
    626             //    could match again at the end of an input string.)
    627             fHitEnd = TRUE;
    628             return FALSE;
    629         }
    630     }
    631 
    632 
    633     // Compute the position in the input string beyond which a match can not begin, because
    634     //   the minimum length match would extend past the end of the input.
    635     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    636     //          Be aware of possible overflows if making changes here.
    637     int64_t testStartLimit;
    638     if (UTEXT_USES_U16(fInputText)) {
    639         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
    640         if (startPos > testStartLimit) {
    641             fMatch = FALSE;
    642             fHitEnd = TRUE;
    643             return FALSE;
    644         }
    645     } else {
    646         // For now, let the matcher discover that it can't match on its own
    647         // We don't know how long the match len is in native characters
    648         testStartLimit = fActiveLimit;
    649     }
    650 
    651     UChar32  c;
    652     U_ASSERT(startPos >= 0);
    653 
    654     switch (fPattern->fStartType) {
    655     case START_NO_INFO:
    656         // No optimization was found.
    657         //  Try a match at each input position.
    658         for (;;) {
    659             MatchAt(startPos, FALSE, fDeferredStatus);
    660             if (U_FAILURE(fDeferredStatus)) {
    661                 return FALSE;
    662             }
    663             if (fMatch) {
    664                 return TRUE;
    665             }
    666             if (startPos >= testStartLimit) {
    667                 fHitEnd = TRUE;
    668                 return FALSE;
    669             }
    670             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    671             UTEXT_NEXT32(fInputText);
    672             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    673             // Note that it's perfectly OK for a pattern to have a zero-length
    674             //   match at the end of a string, so we must make sure that the loop
    675             //   runs with startPos == testStartLimit the last time through.
    676             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    677                 return FALSE;
    678         }
    679         U_ASSERT(FALSE);
    680 
    681     case START_START:
    682         // Matches are only possible at the start of the input string
    683         //   (pattern begins with ^ or \A)
    684         if (startPos > fActiveStart) {
    685             fMatch = FALSE;
    686             return FALSE;
    687         }
    688         MatchAt(startPos, FALSE, fDeferredStatus);
    689         if (U_FAILURE(fDeferredStatus)) {
    690             return FALSE;
    691         }
    692         return fMatch;
    693 
    694 
    695     case START_SET:
    696         {
    697             // Match may start on any char from a pre-computed set.
    698             U_ASSERT(fPattern->fMinMatchLen > 0);
    699             int64_t pos;
    700             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    701             for (;;) {
    702                 c = UTEXT_NEXT32(fInputText);
    703                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    704                 // c will be -1 (U_SENTINEL) at end of text, in which case we
    705                 // skip this next block (so we don't have a negative array index)
    706                 // and handle end of text in the following block.
    707                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    708                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
    709                     MatchAt(startPos, FALSE, fDeferredStatus);
    710                     if (U_FAILURE(fDeferredStatus)) {
    711                         return FALSE;
    712                     }
    713                     if (fMatch) {
    714                         return TRUE;
    715                     }
    716                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    717                 }
    718                 if (startPos >= testStartLimit) {
    719                     fMatch = FALSE;
    720                     fHitEnd = TRUE;
    721                     return FALSE;
    722                 }
    723                 startPos = pos;
    724 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    725                     return FALSE;
    726             }
    727         }
    728         U_ASSERT(FALSE);
    729 
    730     case START_STRING:
    731     case START_CHAR:
    732         {
    733             // Match starts on exactly one char.
    734             U_ASSERT(fPattern->fMinMatchLen > 0);
    735             UChar32 theChar = fPattern->fInitialChar;
    736             int64_t pos;
    737             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    738             for (;;) {
    739                 c = UTEXT_NEXT32(fInputText);
    740                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    741                 if (c == theChar) {
    742                     MatchAt(startPos, FALSE, fDeferredStatus);
    743                     if (U_FAILURE(fDeferredStatus)) {
    744                         return FALSE;
    745                     }
    746                     if (fMatch) {
    747                         return TRUE;
    748                     }
    749                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    750                 }
    751                 if (startPos >= testStartLimit) {
    752                     fMatch = FALSE;
    753                     fHitEnd = TRUE;
    754                     return FALSE;
    755                 }
    756                 startPos = pos;
    757 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    758                     return FALSE;
    759            }
    760         }
    761         U_ASSERT(FALSE);
    762 
    763     case START_LINE:
    764         {
    765             UChar32  c;
    766             if (startPos == fAnchorStart) {
    767                 MatchAt(startPos, FALSE, fDeferredStatus);
    768                 if (U_FAILURE(fDeferredStatus)) {
    769                     return FALSE;
    770                 }
    771                 if (fMatch) {
    772                     return TRUE;
    773                 }
    774                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    775                 c = UTEXT_NEXT32(fInputText);
    776                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    777             } else {
    778                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    779                 c = UTEXT_PREVIOUS32(fInputText);
    780                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    781             }
    782 
    783             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
    784                 for (;;) {
    785                     if (c == 0x0a) {
    786                             MatchAt(startPos, FALSE, fDeferredStatus);
    787                             if (U_FAILURE(fDeferredStatus)) {
    788                                 return FALSE;
    789                             }
    790                             if (fMatch) {
    791                                 return TRUE;
    792                             }
    793                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    794                     }
    795                     if (startPos >= testStartLimit) {
    796                         fMatch = FALSE;
    797                         fHitEnd = TRUE;
    798                         return FALSE;
    799                     }
    800                     c = UTEXT_NEXT32(fInputText);
    801                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    802                     // Note that it's perfectly OK for a pattern to have a zero-length
    803                     //   match at the end of a string, so we must make sure that the loop
    804                     //   runs with startPos == testStartLimit the last time through.
    805 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    806                         return FALSE;
    807                 }
    808             } else {
    809                 for (;;) {
    810                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
    811                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
    812                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
    813                                 UTEXT_NEXT32(fInputText);
    814                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    815                             }
    816                             MatchAt(startPos, FALSE, fDeferredStatus);
    817                             if (U_FAILURE(fDeferredStatus)) {
    818                                 return FALSE;
    819                             }
    820                             if (fMatch) {
    821                                 return TRUE;
    822                             }
    823                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    824                     }
    825                     if (startPos >= testStartLimit) {
    826                         fMatch = FALSE;
    827                         fHitEnd = TRUE;
    828                         return FALSE;
    829                     }
    830                     c = UTEXT_NEXT32(fInputText);
    831                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    832                     // Note that it's perfectly OK for a pattern to have a zero-length
    833                     //   match at the end of a string, so we must make sure that the loop
    834                     //   runs with startPos == testStartLimit the last time through.
    835 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    836                         return FALSE;
    837                 }
    838             }
    839         }
    840 
    841     default:
    842         U_ASSERT(FALSE);
    843     }
    844 
    845     U_ASSERT(FALSE);
    846     return FALSE;
    847 }
    848 
    849 
    850 
    851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
    852     if (U_FAILURE(status)) {
    853         return FALSE;
    854     }
    855     if (U_FAILURE(fDeferredStatus)) {
    856         status = fDeferredStatus;
    857         return FALSE;
    858     }
    859     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
    860                                           //        This will reset the region to be the full input length.
    861     if (start < 0) {
    862         status = U_INDEX_OUTOFBOUNDS_ERROR;
    863         return FALSE;
    864     }
    865 
    866     int64_t nativeStart = start;
    867     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
    868         status = U_INDEX_OUTOFBOUNDS_ERROR;
    869         return FALSE;
    870     }
    871     fMatchEnd = nativeStart;
    872     return find();
    873 }
    874 
    875 
    876 //--------------------------------------------------------------------------------
    877 //
    878 //   findUsingChunk() -- like find(), but with the advance knowledge that the
    879 //                       entire string is available in the UText's chunk buffer.
    880 //
    881 //--------------------------------------------------------------------------------
    882 UBool RegexMatcher::findUsingChunk() {
    883     // Start at the position of the last match end.  (Will be zero if the
    884     //   matcher has been reset.
    885     //
    886 
    887     int32_t startPos = (int32_t)fMatchEnd;
    888     if (startPos==0) {
    889         startPos = (int32_t)fActiveStart;
    890     }
    891 
    892     const UChar *inputBuf = fInputText->chunkContents;
    893 
    894     if (fMatch) {
    895         // Save the position of any previous successful match.
    896         fLastMatchEnd = fMatchEnd;
    897 
    898         if (fMatchStart == fMatchEnd) {
    899             // Previous match had zero length.  Move start position up one position
    900             //  to avoid sending find() into a loop on zero-length matches.
    901             if (startPos >= fActiveLimit) {
    902                 fMatch = FALSE;
    903                 fHitEnd = TRUE;
    904                 return FALSE;
    905             }
    906             U16_FWD_1(inputBuf, startPos, fInputLength);
    907         }
    908     } else {
    909         if (fLastMatchEnd >= 0) {
    910             // A previous find() failed to match.  Don't try again.
    911             //   (without this test, a pattern with a zero-length match
    912             //    could match again at the end of an input string.)
    913             fHitEnd = TRUE;
    914             return FALSE;
    915         }
    916     }
    917 
    918 
    919     // Compute the position in the input string beyond which a match can not begin, because
    920     //   the minimum length match would extend past the end of the input.
    921     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    922     //          Be aware of possible overflows if making changes here.
    923     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
    924     if (startPos > testLen) {
    925         fMatch = FALSE;
    926         fHitEnd = TRUE;
    927         return FALSE;
    928     }
    929 
    930     UChar32  c;
    931     U_ASSERT(startPos >= 0);
    932 
    933     switch (fPattern->fStartType) {
    934     case START_NO_INFO:
    935         // No optimization was found.
    936         //  Try a match at each input position.
    937         for (;;) {
    938             MatchChunkAt(startPos, FALSE, fDeferredStatus);
    939             if (U_FAILURE(fDeferredStatus)) {
    940                 return FALSE;
    941             }
    942             if (fMatch) {
    943                 return TRUE;
    944             }
    945             if (startPos >= testLen) {
    946                 fHitEnd = TRUE;
    947                 return FALSE;
    948             }
    949             U16_FWD_1(inputBuf, startPos, fActiveLimit);
    950             // Note that it's perfectly OK for a pattern to have a zero-length
    951             //   match at the end of a string, so we must make sure that the loop
    952             //   runs with startPos == testLen the last time through.
    953             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    954                 return FALSE;
    955         }
    956         U_ASSERT(FALSE);
    957 
    958     case START_START:
    959         // Matches are only possible at the start of the input string
    960         //   (pattern begins with ^ or \A)
    961         if (startPos > fActiveStart) {
    962             fMatch = FALSE;
    963             return FALSE;
    964         }
    965         MatchChunkAt(startPos, FALSE, fDeferredStatus);
    966         if (U_FAILURE(fDeferredStatus)) {
    967             return FALSE;
    968         }
    969         return fMatch;
    970 
    971 
    972     case START_SET:
    973     {
    974         // Match may start on any char from a pre-computed set.
    975         U_ASSERT(fPattern->fMinMatchLen > 0);
    976         for (;;) {
    977             int32_t pos = startPos;
    978             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    979             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    980                 (c>=256 && fPattern->fInitialChars->contains(c))) {
    981                 MatchChunkAt(pos, FALSE, fDeferredStatus);
    982                 if (U_FAILURE(fDeferredStatus)) {
    983                     return FALSE;
    984                 }
    985                 if (fMatch) {
    986                     return TRUE;
    987                 }
    988             }
    989             if (pos >= testLen) {
    990                 fMatch = FALSE;
    991                 fHitEnd = TRUE;
    992                 return FALSE;
    993             }
    994             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    995                 return FALSE;
    996         }
    997     }
    998         U_ASSERT(FALSE);
    999 
   1000     case START_STRING:
   1001     case START_CHAR:
   1002     {
   1003         // Match starts on exactly one char.
   1004         U_ASSERT(fPattern->fMinMatchLen > 0);
   1005         UChar32 theChar = fPattern->fInitialChar;
   1006         for (;;) {
   1007             int32_t pos = startPos;
   1008             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
   1009             if (c == theChar) {
   1010                 MatchChunkAt(pos, FALSE, fDeferredStatus);
   1011                 if (U_FAILURE(fDeferredStatus)) {
   1012                     return FALSE;
   1013                 }
   1014                 if (fMatch) {
   1015                     return TRUE;
   1016                 }
   1017             }
   1018             if (pos >= testLen) {
   1019                 fMatch = FALSE;
   1020                 fHitEnd = TRUE;
   1021                 return FALSE;
   1022             }
   1023             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1024                 return FALSE;
   1025         }
   1026     }
   1027         U_ASSERT(FALSE);
   1028 
   1029     case START_LINE:
   1030     {
   1031         UChar32  c;
   1032         if (startPos == fAnchorStart) {
   1033             MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1034             if (U_FAILURE(fDeferredStatus)) {
   1035                 return FALSE;
   1036             }
   1037             if (fMatch) {
   1038                 return TRUE;
   1039             }
   1040             U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1041         }
   1042 
   1043         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
   1044             for (;;) {
   1045                 c = inputBuf[startPos-1];
   1046                 if (c == 0x0a) {
   1047                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1048                     if (U_FAILURE(fDeferredStatus)) {
   1049                         return FALSE;
   1050                     }
   1051                     if (fMatch) {
   1052                         return TRUE;
   1053                     }
   1054                 }
   1055                 if (startPos >= testLen) {
   1056                     fMatch = FALSE;
   1057                     fHitEnd = TRUE;
   1058                     return FALSE;
   1059                 }
   1060                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1061                 // Note that it's perfectly OK for a pattern to have a zero-length
   1062                 //   match at the end of a string, so we must make sure that the loop
   1063                 //   runs with startPos == testLen the last time through.
   1064 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1065                     return FALSE;
   1066             }
   1067         } else {
   1068             for (;;) {
   1069                 c = inputBuf[startPos-1];
   1070                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   1071                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
   1072                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
   1073                         startPos++;
   1074                     }
   1075                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1076                     if (U_FAILURE(fDeferredStatus)) {
   1077                         return FALSE;
   1078                     }
   1079                     if (fMatch) {
   1080                         return TRUE;
   1081                     }
   1082                 }
   1083                 if (startPos >= testLen) {
   1084                     fMatch = FALSE;
   1085                     fHitEnd = TRUE;
   1086                     return FALSE;
   1087                 }
   1088                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1089                 // Note that it's perfectly OK for a pattern to have a zero-length
   1090                 //   match at the end of a string, so we must make sure that the loop
   1091                 //   runs with startPos == testLen the last time through.
   1092 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1093                     return FALSE;
   1094             }
   1095         }
   1096     }
   1097 
   1098     default:
   1099         U_ASSERT(FALSE);
   1100     }
   1101 
   1102     U_ASSERT(FALSE);
   1103     return FALSE;
   1104 }
   1105 
   1106 
   1107 
   1108 //--------------------------------------------------------------------------------
   1109 //
   1110 //  group()
   1111 //
   1112 //--------------------------------------------------------------------------------
   1113 UnicodeString RegexMatcher::group(UErrorCode &status) const {
   1114     return group(0, status);
   1115 }
   1116 
   1117 //  Return immutable shallow clone
   1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
   1119     return group(0, dest, group_len, status);
   1120 }
   1121 
   1122 //  Return immutable shallow clone
   1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
   1124     group_len = 0;
   1125     UBool bailOut = FALSE;
   1126     if (U_FAILURE(status)) {
   1127         return dest;
   1128     }
   1129     if (U_FAILURE(fDeferredStatus)) {
   1130         status = fDeferredStatus;
   1131         bailOut = TRUE;
   1132     }
   1133     if (fMatch == FALSE) {
   1134         status = U_REGEX_INVALID_STATE;
   1135         bailOut = TRUE;
   1136     }
   1137     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1138         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1139         bailOut = TRUE;
   1140     }
   1141 
   1142     if (bailOut) {
   1143         return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
   1144     }
   1145 
   1146     int64_t s, e;
   1147     if (groupNum == 0) {
   1148         s = fMatchStart;
   1149         e = fMatchEnd;
   1150     } else {
   1151         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1152         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1153         U_ASSERT(groupOffset >= 0);
   1154         s = fFrame->fExtra[groupOffset];
   1155         e = fFrame->fExtra[groupOffset+1];
   1156     }
   1157 
   1158     if (s < 0) {
   1159         // A capture group wasn't part of the match
   1160         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1161     }
   1162     U_ASSERT(s <= e);
   1163     group_len = e - s;
   1164 
   1165     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1166     if (dest)
   1167         UTEXT_SETNATIVEINDEX(dest, s);
   1168     return dest;
   1169 }
   1170 
   1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
   1172     UnicodeString result;
   1173     if (U_FAILURE(status)) {
   1174         return result;
   1175     }
   1176     UText resultText = UTEXT_INITIALIZER;
   1177     utext_openUnicodeString(&resultText, &result, &status);
   1178     group(groupNum, &resultText, status);
   1179     utext_close(&resultText);
   1180     return result;
   1181 }
   1182 
   1183 
   1184 //  Return deep (mutable) clone
   1185 //		Technology Preview (as an API), but note that the UnicodeString API is implemented
   1186 //		using this function.
   1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1188     UBool bailOut = FALSE;
   1189     if (U_FAILURE(status)) {
   1190         return dest;
   1191     }
   1192     if (U_FAILURE(fDeferredStatus)) {
   1193         status = fDeferredStatus;
   1194         bailOut = TRUE;
   1195     }
   1196 
   1197     if (fMatch == FALSE) {
   1198         status = U_REGEX_INVALID_STATE;
   1199         bailOut = TRUE;
   1200     }
   1201     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1202         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1203         bailOut = TRUE;
   1204     }
   1205 
   1206     if (bailOut) {
   1207         if (dest) {
   1208             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1209             return dest;
   1210         } else {
   1211             return utext_openUChars(NULL, NULL, 0, &status);
   1212         }
   1213     }
   1214 
   1215     int64_t s, e;
   1216     if (groupNum == 0) {
   1217         s = fMatchStart;
   1218         e = fMatchEnd;
   1219     } else {
   1220         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1221         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1222         U_ASSERT(groupOffset >= 0);
   1223         s = fFrame->fExtra[groupOffset];
   1224         e = fFrame->fExtra[groupOffset+1];
   1225     }
   1226 
   1227     if (s < 0) {
   1228         // A capture group wasn't part of the match
   1229         if (dest) {
   1230             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1231             return dest;
   1232         } else {
   1233             return utext_openUChars(NULL, NULL, 0, &status);
   1234         }
   1235     }
   1236     U_ASSERT(s <= e);
   1237 
   1238     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1239         U_ASSERT(e <= fInputLength);
   1240         if (dest) {
   1241             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1242         } else {
   1243             UText groupText = UTEXT_INITIALIZER;
   1244             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
   1245             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1246             utext_close(&groupText);
   1247         }
   1248     } else {
   1249         int32_t len16;
   1250         if (UTEXT_USES_U16(fInputText)) {
   1251             len16 = (int32_t)(e-s);
   1252         } else {
   1253             UErrorCode lengthStatus = U_ZERO_ERROR;
   1254             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1255         }
   1256         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1257         if (groupChars == NULL) {
   1258             status = U_MEMORY_ALLOCATION_ERROR;
   1259             return dest;
   1260         }
   1261         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1262 
   1263         if (dest) {
   1264             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
   1265         } else {
   1266             UText groupText = UTEXT_INITIALIZER;
   1267             utext_openUChars(&groupText, groupChars, len16, &status);
   1268             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1269             utext_close(&groupText);
   1270         }
   1271 
   1272         uprv_free(groupChars);
   1273     }
   1274     return dest;
   1275 }
   1276 
   1277 //--------------------------------------------------------------------------------
   1278 //
   1279 //  appendGroup() -- currently internal only, appends a group to a UText rather
   1280 //                   than replacing its contents
   1281 //
   1282 //--------------------------------------------------------------------------------
   1283 
   1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1285     if (U_FAILURE(status)) {
   1286         return 0;
   1287     }
   1288     if (U_FAILURE(fDeferredStatus)) {
   1289         status = fDeferredStatus;
   1290         return 0;
   1291     }
   1292     int64_t destLen = utext_nativeLength(dest);
   1293 
   1294     if (fMatch == FALSE) {
   1295         status = U_REGEX_INVALID_STATE;
   1296         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1297     }
   1298     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1299         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1300         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1301     }
   1302 
   1303     int64_t s, e;
   1304     if (groupNum == 0) {
   1305         s = fMatchStart;
   1306         e = fMatchEnd;
   1307     } else {
   1308         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1309         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1310         U_ASSERT(groupOffset >= 0);
   1311         s = fFrame->fExtra[groupOffset];
   1312         e = fFrame->fExtra[groupOffset+1];
   1313     }
   1314 
   1315     if (s < 0) {
   1316         // A capture group wasn't part of the match
   1317         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1318     }
   1319     U_ASSERT(s <= e);
   1320 
   1321     int64_t deltaLen;
   1322     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1323         U_ASSERT(e <= fInputLength);
   1324         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1325     } else {
   1326         int32_t len16;
   1327         if (UTEXT_USES_U16(fInputText)) {
   1328             len16 = (int32_t)(e-s);
   1329         } else {
   1330             UErrorCode lengthStatus = U_ZERO_ERROR;
   1331             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1332         }
   1333         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1334         if (groupChars == NULL) {
   1335             status = U_MEMORY_ALLOCATION_ERROR;
   1336             return 0;
   1337         }
   1338         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1339 
   1340         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
   1341         uprv_free(groupChars);
   1342     }
   1343     return deltaLen;
   1344 }
   1345 
   1346 
   1347 
   1348 //--------------------------------------------------------------------------------
   1349 //
   1350 //  groupCount()
   1351 //
   1352 //--------------------------------------------------------------------------------
   1353 int32_t RegexMatcher::groupCount() const {
   1354     return fPattern->fGroupMap->size();
   1355 }
   1356 
   1357 
   1358 
   1359 //--------------------------------------------------------------------------------
   1360 //
   1361 //  hasAnchoringBounds()
   1362 //
   1363 //--------------------------------------------------------------------------------
   1364 UBool RegexMatcher::hasAnchoringBounds() const {
   1365     return fAnchoringBounds;
   1366 }
   1367 
   1368 
   1369 //--------------------------------------------------------------------------------
   1370 //
   1371 //  hasTransparentBounds()
   1372 //
   1373 //--------------------------------------------------------------------------------
   1374 UBool RegexMatcher::hasTransparentBounds() const {
   1375     return fTransparentBounds;
   1376 }
   1377 
   1378 
   1379 
   1380 //--------------------------------------------------------------------------------
   1381 //
   1382 //  hitEnd()
   1383 //
   1384 //--------------------------------------------------------------------------------
   1385 UBool RegexMatcher::hitEnd() const {
   1386     return fHitEnd;
   1387 }
   1388 
   1389 
   1390 //--------------------------------------------------------------------------------
   1391 //
   1392 //  input()
   1393 //
   1394 //--------------------------------------------------------------------------------
   1395 const UnicodeString &RegexMatcher::input() const {
   1396     if (!fInput) {
   1397         UErrorCode status = U_ZERO_ERROR;
   1398         int32_t len16;
   1399         if (UTEXT_USES_U16(fInputText)) {
   1400             len16 = (int32_t)fInputLength;
   1401         } else {
   1402             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
   1403             status = U_ZERO_ERROR; // overflow, length status
   1404         }
   1405         UnicodeString *result = new UnicodeString(len16, 0, 0);
   1406 
   1407         UChar *inputChars = result->getBuffer(len16);
   1408         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
   1409         result->releaseBuffer(len16);
   1410 
   1411         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
   1412     }
   1413 
   1414     return *fInput;
   1415 }
   1416 
   1417 //--------------------------------------------------------------------------------
   1418 //
   1419 //  inputText()
   1420 //
   1421 //--------------------------------------------------------------------------------
   1422 UText *RegexMatcher::inputText() const {
   1423     return fInputText;
   1424 }
   1425 
   1426 
   1427 //--------------------------------------------------------------------------------
   1428 //
   1429 //  getInput() -- like inputText(), but makes a clone or copies into another UText
   1430 //
   1431 //--------------------------------------------------------------------------------
   1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
   1433     UBool bailOut = FALSE;
   1434     if (U_FAILURE(status)) {
   1435         return dest;
   1436     }
   1437     if (U_FAILURE(fDeferredStatus)) {
   1438         status = fDeferredStatus;
   1439         bailOut = TRUE;
   1440     }
   1441 
   1442     if (bailOut) {
   1443         if (dest) {
   1444             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1445             return dest;
   1446         } else {
   1447             return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1448         }
   1449     }
   1450 
   1451     if (dest) {
   1452         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1453             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
   1454         } else {
   1455             int32_t input16Len;
   1456             if (UTEXT_USES_U16(fInputText)) {
   1457                 input16Len = (int32_t)fInputLength;
   1458             } else {
   1459                 UErrorCode lengthStatus = U_ZERO_ERROR;
   1460                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
   1461             }
   1462             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
   1463             if (inputChars == NULL) {
   1464                 return dest;
   1465             }
   1466 
   1467             status = U_ZERO_ERROR;
   1468             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
   1469             status = U_ZERO_ERROR;
   1470             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
   1471 
   1472             uprv_free(inputChars);
   1473         }
   1474         return dest;
   1475     } else {
   1476         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1477     }
   1478 }
   1479 
   1480 
   1481 static UBool compat_SyncMutableUTextContents(UText *ut);
   1482 static UBool compat_SyncMutableUTextContents(UText *ut) {
   1483     UBool retVal = FALSE;
   1484 
   1485     //  In the following test, we're really only interested in whether the UText should switch
   1486     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
   1487     //  will still point to the correct data.
   1488     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
   1489         UnicodeString *us=(UnicodeString *)ut->context;
   1490 
   1491         // Update to the latest length.
   1492         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
   1493         int32_t newLength = us->length();
   1494 
   1495         // Update the chunk description.
   1496         // The buffer may have switched between stack- and heap-based.
   1497         ut->chunkContents    = us->getBuffer();
   1498         ut->chunkLength      = newLength;
   1499         ut->chunkNativeLimit = newLength;
   1500         ut->nativeIndexingLimit = newLength;
   1501         retVal = TRUE;
   1502     }
   1503 
   1504     return retVal;
   1505 }
   1506 
   1507 //--------------------------------------------------------------------------------
   1508 //
   1509 //  lookingAt()
   1510 //
   1511 //--------------------------------------------------------------------------------
   1512 UBool RegexMatcher::lookingAt(UErrorCode &status) {
   1513     if (U_FAILURE(status)) {
   1514         return FALSE;
   1515     }
   1516     if (U_FAILURE(fDeferredStatus)) {
   1517         status = fDeferredStatus;
   1518         return FALSE;
   1519     }
   1520 
   1521     if (fInputUniStrMaybeMutable) {
   1522         if (compat_SyncMutableUTextContents(fInputText)) {
   1523         fInputLength = utext_nativeLength(fInputText);
   1524         reset();
   1525         }
   1526     }
   1527     else {
   1528         resetPreserveRegion();
   1529     }
   1530     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1531         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
   1532     } else {
   1533         MatchAt(fActiveStart, FALSE, status);
   1534     }
   1535     return fMatch;
   1536 }
   1537 
   1538 
   1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
   1540     if (U_FAILURE(status)) {
   1541         return FALSE;
   1542     }
   1543     if (U_FAILURE(fDeferredStatus)) {
   1544         status = fDeferredStatus;
   1545         return FALSE;
   1546     }
   1547     reset();
   1548 
   1549     if (start < 0) {
   1550         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1551         return FALSE;
   1552     }
   1553 
   1554     if (fInputUniStrMaybeMutable) {
   1555         if (compat_SyncMutableUTextContents(fInputText)) {
   1556         fInputLength = utext_nativeLength(fInputText);
   1557         reset();
   1558         }
   1559     }
   1560 
   1561     int64_t nativeStart;
   1562     nativeStart = start;
   1563     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1564         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1565         return FALSE;
   1566     }
   1567 
   1568     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1569         MatchChunkAt((int32_t)nativeStart, FALSE, status);
   1570     } else {
   1571         MatchAt(nativeStart, FALSE, status);
   1572     }
   1573     return fMatch;
   1574 }
   1575 
   1576 
   1577 
   1578 //--------------------------------------------------------------------------------
   1579 //
   1580 //  matches()
   1581 //
   1582 //--------------------------------------------------------------------------------
   1583 UBool RegexMatcher::matches(UErrorCode &status) {
   1584     if (U_FAILURE(status)) {
   1585         return FALSE;
   1586     }
   1587     if (U_FAILURE(fDeferredStatus)) {
   1588         status = fDeferredStatus;
   1589         return FALSE;
   1590     }
   1591 
   1592     if (fInputUniStrMaybeMutable) {
   1593         if (compat_SyncMutableUTextContents(fInputText)) {
   1594         fInputLength = utext_nativeLength(fInputText);
   1595         reset();
   1596         }
   1597     }
   1598     else {
   1599         resetPreserveRegion();
   1600     }
   1601 
   1602     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1603         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
   1604     } else {
   1605         MatchAt(fActiveStart, TRUE, status);
   1606     }
   1607     return fMatch;
   1608 }
   1609 
   1610 
   1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
   1612     if (U_FAILURE(status)) {
   1613         return FALSE;
   1614     }
   1615     if (U_FAILURE(fDeferredStatus)) {
   1616         status = fDeferredStatus;
   1617         return FALSE;
   1618     }
   1619     reset();
   1620 
   1621     if (start < 0) {
   1622         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1623         return FALSE;
   1624     }
   1625 
   1626     if (fInputUniStrMaybeMutable) {
   1627         if (compat_SyncMutableUTextContents(fInputText)) {
   1628         fInputLength = utext_nativeLength(fInputText);
   1629         reset();
   1630         }
   1631     }
   1632 
   1633     int64_t nativeStart;
   1634     nativeStart = start;
   1635     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1636         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1637         return FALSE;
   1638     }
   1639 
   1640     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1641         MatchChunkAt((int32_t)nativeStart, TRUE, status);
   1642     } else {
   1643         MatchAt(nativeStart, TRUE, status);
   1644     }
   1645     return fMatch;
   1646 }
   1647 
   1648 
   1649 
   1650 //--------------------------------------------------------------------------------
   1651 //
   1652 //    pattern
   1653 //
   1654 //--------------------------------------------------------------------------------
   1655 const RegexPattern &RegexMatcher::pattern() const {
   1656     return *fPattern;
   1657 }
   1658 
   1659 
   1660 
   1661 //--------------------------------------------------------------------------------
   1662 //
   1663 //    region
   1664 //
   1665 //--------------------------------------------------------------------------------
   1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
   1667     if (U_FAILURE(status)) {
   1668         return *this;
   1669     }
   1670 
   1671     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
   1672         status = U_ILLEGAL_ARGUMENT_ERROR;
   1673     }
   1674 
   1675     int64_t nativeStart = regionStart;
   1676     int64_t nativeLimit = regionLimit;
   1677     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
   1678       status = U_ILLEGAL_ARGUMENT_ERROR;
   1679     }
   1680 
   1681     if (startIndex == -1)
   1682       this->reset();
   1683     else
   1684       resetPreserveRegion();
   1685 
   1686     fRegionStart = nativeStart;
   1687     fRegionLimit = nativeLimit;
   1688     fActiveStart = nativeStart;
   1689     fActiveLimit = nativeLimit;
   1690 
   1691     if (startIndex != -1) {
   1692       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
   1693           status = U_INDEX_OUTOFBOUNDS_ERROR;
   1694       }
   1695       fMatchEnd = startIndex;
   1696     }
   1697 
   1698     if (!fTransparentBounds) {
   1699         fLookStart = nativeStart;
   1700         fLookLimit = nativeLimit;
   1701     }
   1702     if (fAnchoringBounds) {
   1703         fAnchorStart = nativeStart;
   1704         fAnchorLimit = nativeLimit;
   1705     }
   1706     return *this;
   1707 }
   1708 
   1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
   1710   return region(start, limit, -1, status);
   1711 }
   1712 
   1713 //--------------------------------------------------------------------------------
   1714 //
   1715 //    regionEnd
   1716 //
   1717 //--------------------------------------------------------------------------------
   1718 int32_t RegexMatcher::regionEnd() const {
   1719     return (int32_t)fRegionLimit;
   1720 }
   1721 
   1722 int64_t RegexMatcher::regionEnd64() const {
   1723     return fRegionLimit;
   1724 }
   1725 
   1726 //--------------------------------------------------------------------------------
   1727 //
   1728 //    regionStart
   1729 //
   1730 //--------------------------------------------------------------------------------
   1731 int32_t RegexMatcher::regionStart() const {
   1732     return (int32_t)fRegionStart;
   1733 }
   1734 
   1735 int64_t RegexMatcher::regionStart64() const {
   1736     return fRegionStart;
   1737 }
   1738 
   1739 
   1740 //--------------------------------------------------------------------------------
   1741 //
   1742 //    replaceAll
   1743 //
   1744 //--------------------------------------------------------------------------------
   1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
   1746     UText replacementText = UTEXT_INITIALIZER;
   1747     UText resultText = UTEXT_INITIALIZER;
   1748     UnicodeString resultString;
   1749     if (U_FAILURE(status)) {
   1750         return resultString;
   1751     }
   1752 
   1753     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1754     utext_openUnicodeString(&resultText, &resultString, &status);
   1755 
   1756     replaceAll(&replacementText, &resultText, status);
   1757 
   1758     utext_close(&resultText);
   1759     utext_close(&replacementText);
   1760 
   1761     return resultString;
   1762 }
   1763 
   1764 
   1765 //
   1766 //    replaceAll, UText mode
   1767 //
   1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
   1769     if (U_FAILURE(status)) {
   1770         return dest;
   1771     }
   1772     if (U_FAILURE(fDeferredStatus)) {
   1773         status = fDeferredStatus;
   1774         return dest;
   1775     }
   1776 
   1777     if (dest == NULL) {
   1778         UnicodeString emptyString;
   1779         UText empty = UTEXT_INITIALIZER;
   1780 
   1781         utext_openUnicodeString(&empty, &emptyString, &status);
   1782         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1783         utext_close(&empty);
   1784     }
   1785 
   1786     if (U_SUCCESS(status)) {
   1787         reset();
   1788         while (find()) {
   1789             appendReplacement(dest, replacement, status);
   1790             if (U_FAILURE(status)) {
   1791                 break;
   1792             }
   1793         }
   1794         appendTail(dest, status);
   1795     }
   1796 
   1797     return dest;
   1798 }
   1799 
   1800 
   1801 //--------------------------------------------------------------------------------
   1802 //
   1803 //    replaceFirst
   1804 //
   1805 //--------------------------------------------------------------------------------
   1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
   1807     UText replacementText = UTEXT_INITIALIZER;
   1808     UText resultText = UTEXT_INITIALIZER;
   1809     UnicodeString resultString;
   1810 
   1811     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1812     utext_openUnicodeString(&resultText, &resultString, &status);
   1813 
   1814     replaceFirst(&replacementText, &resultText, status);
   1815 
   1816     utext_close(&resultText);
   1817     utext_close(&replacementText);
   1818 
   1819     return resultString;
   1820 }
   1821 
   1822 //
   1823 //    replaceFirst, UText mode
   1824 //
   1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
   1826     if (U_FAILURE(status)) {
   1827         return dest;
   1828     }
   1829     if (U_FAILURE(fDeferredStatus)) {
   1830         status = fDeferredStatus;
   1831         return dest;
   1832     }
   1833 
   1834     reset();
   1835     if (!find()) {
   1836         return getInput(dest, status);
   1837     }
   1838 
   1839     if (dest == NULL) {
   1840         UnicodeString emptyString;
   1841         UText empty = UTEXT_INITIALIZER;
   1842 
   1843         utext_openUnicodeString(&empty, &emptyString, &status);
   1844         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1845         utext_close(&empty);
   1846     }
   1847 
   1848     appendReplacement(dest, replacement, status);
   1849     appendTail(dest, status);
   1850 
   1851     return dest;
   1852 }
   1853 
   1854 
   1855 //--------------------------------------------------------------------------------
   1856 //
   1857 //     requireEnd
   1858 //
   1859 //--------------------------------------------------------------------------------
   1860 UBool RegexMatcher::requireEnd() const {
   1861     return fRequireEnd;
   1862 }
   1863 
   1864 
   1865 //--------------------------------------------------------------------------------
   1866 //
   1867 //     reset
   1868 //
   1869 //--------------------------------------------------------------------------------
   1870 RegexMatcher &RegexMatcher::reset() {
   1871     fRegionStart    = 0;
   1872     fRegionLimit    = fInputLength;
   1873     fActiveStart    = 0;
   1874     fActiveLimit    = fInputLength;
   1875     fAnchorStart    = 0;
   1876     fAnchorLimit    = fInputLength;
   1877     fLookStart      = 0;
   1878     fLookLimit      = fInputLength;
   1879     resetPreserveRegion();
   1880     return *this;
   1881 }
   1882 
   1883 
   1884 
   1885 void RegexMatcher::resetPreserveRegion() {
   1886     fMatchStart     = 0;
   1887     fMatchEnd       = 0;
   1888     fLastMatchEnd   = -1;
   1889     fAppendPosition = 0;
   1890     fMatch          = FALSE;
   1891     fHitEnd         = FALSE;
   1892     fRequireEnd     = FALSE;
   1893     fTime           = 0;
   1894     fTickCounter    = TIMER_INITIAL_VALUE;
   1895     //resetStack(); // more expensive than it looks...
   1896 }
   1897 
   1898 
   1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
   1900     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
   1901     if (fPattern->fNeedsAltInput) {
   1902         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1903     }
   1904     fInputLength = utext_nativeLength(fInputText);
   1905 
   1906     reset();
   1907     delete fInput;
   1908     fInput = NULL;
   1909 
   1910     //  Do the following for any UnicodeString.
   1911     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
   1912     fInputUniStrMaybeMutable = TRUE;
   1913 
   1914     if (fWordBreakItr != NULL) {
   1915 #if UCONFIG_NO_BREAK_ITERATION==0
   1916         UErrorCode status = U_ZERO_ERROR;
   1917         fWordBreakItr->setText(fInputText, status);
   1918 #endif
   1919     }
   1920     return *this;
   1921 }
   1922 
   1923 
   1924 RegexMatcher &RegexMatcher::reset(UText *input) {
   1925     if (fInputText != input) {
   1926         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
   1927         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1928         fInputLength = utext_nativeLength(fInputText);
   1929 
   1930         delete fInput;
   1931         fInput = NULL;
   1932 
   1933         if (fWordBreakItr != NULL) {
   1934 #if UCONFIG_NO_BREAK_ITERATION==0
   1935             UErrorCode status = U_ZERO_ERROR;
   1936             fWordBreakItr->setText(input, status);
   1937 #endif
   1938         }
   1939     }
   1940     reset();
   1941     fInputUniStrMaybeMutable = FALSE;
   1942 
   1943     return *this;
   1944 }
   1945 
   1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
   1947     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
   1948     return *this;
   1949 }*/
   1950 
   1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
   1952     if (U_FAILURE(status)) {
   1953         return *this;
   1954     }
   1955     reset();       // Reset also resets the region to be the entire string.
   1956 
   1957     if (position < 0 || position > fActiveLimit) {
   1958         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1959         return *this;
   1960     }
   1961     fMatchEnd = position;
   1962     return *this;
   1963 }
   1964 
   1965 
   1966 
   1967 
   1968 
   1969 //--------------------------------------------------------------------------------
   1970 //
   1971 //    setTrace
   1972 //
   1973 //--------------------------------------------------------------------------------
   1974 void RegexMatcher::setTrace(UBool state) {
   1975     fTraceDebug = state;
   1976 }
   1977 
   1978 
   1979 
   1980 //---------------------------------------------------------------------
   1981 //
   1982 //   split
   1983 //
   1984 //---------------------------------------------------------------------
   1985 int32_t  RegexMatcher::split(const UnicodeString &input,
   1986         UnicodeString    dest[],
   1987         int32_t          destCapacity,
   1988         UErrorCode      &status)
   1989 {
   1990     UText inputText = UTEXT_INITIALIZER;
   1991     utext_openConstUnicodeString(&inputText, &input, &status);
   1992     if (U_FAILURE(status)) {
   1993         return 0;
   1994     }
   1995 
   1996     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
   1997     if (destText == NULL) {
   1998         status = U_MEMORY_ALLOCATION_ERROR;
   1999         return 0;
   2000     }
   2001     int32_t i;
   2002     for (i = 0; i < destCapacity; i++) {
   2003         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
   2004     }
   2005 
   2006     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
   2007 
   2008     for (i = 0; i < destCapacity; i++) {
   2009         utext_close(destText[i]);
   2010     }
   2011 
   2012     uprv_free(destText);
   2013     utext_close(&inputText);
   2014     return fieldCount;
   2015 }
   2016 
   2017 //
   2018 //   split, UText mode
   2019 //
   2020 int32_t  RegexMatcher::split(UText *input,
   2021         UText           *dest[],
   2022         int32_t          destCapacity,
   2023         UErrorCode      &status)
   2024 {
   2025     //
   2026     // Check arguements for validity
   2027     //
   2028     if (U_FAILURE(status)) {
   2029         return 0;
   2030     };
   2031 
   2032     if (destCapacity < 1) {
   2033         status = U_ILLEGAL_ARGUMENT_ERROR;
   2034         return 0;
   2035     }
   2036 
   2037     //
   2038     // Reset for the input text
   2039     //
   2040     reset(input);
   2041     int64_t   nextOutputStringStart = 0;
   2042     if (fActiveLimit == 0) {
   2043         return 0;
   2044     }
   2045 
   2046     //
   2047     // Loop through the input text, searching for the delimiter pattern
   2048     //
   2049     int32_t i;
   2050     int32_t numCaptureGroups = fPattern->fGroupMap->size();
   2051     for (i=0; ; i++) {
   2052         if (i>=destCapacity-1) {
   2053             // There is one or zero output string left.
   2054             // Fill the last output string with whatever is left from the input, then exit the loop.
   2055             //  ( i will be == destCapacity if we filled the output array while processing
   2056             //    capture groups of the delimiter expression, in which case we will discard the
   2057             //    last capture group saved in favor of the unprocessed remainder of the
   2058             //    input string.)
   2059             i = destCapacity-1;
   2060             if (fActiveLimit > nextOutputStringStart) {
   2061                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2062                     if (dest[i]) {
   2063                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2064                                       input->chunkContents+nextOutputStringStart,
   2065                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2066                     } else {
   2067                         UText remainingText = UTEXT_INITIALIZER;
   2068                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2069                                          fActiveLimit-nextOutputStringStart, &status);
   2070                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2071                         utext_close(&remainingText);
   2072                     }
   2073                 } else {
   2074                     UErrorCode lengthStatus = U_ZERO_ERROR;
   2075                     int32_t remaining16Length =
   2076                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2077                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2078                     if (remainingChars == NULL) {
   2079                         status = U_MEMORY_ALLOCATION_ERROR;
   2080                         break;
   2081                     }
   2082 
   2083                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2084                     if (dest[i]) {
   2085                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2086                     } else {
   2087                         UText remainingText = UTEXT_INITIALIZER;
   2088                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2089                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2090                         utext_close(&remainingText);
   2091                     }
   2092 
   2093                     uprv_free(remainingChars);
   2094                 }
   2095             }
   2096             break;
   2097         }
   2098         if (find()) {
   2099             // We found another delimiter.  Move everything from where we started looking
   2100             //  up until the start of the delimiter into the next output string.
   2101             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2102                 if (dest[i]) {
   2103                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2104                                   input->chunkContents+nextOutputStringStart,
   2105                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
   2106                 } else {
   2107                     UText remainingText = UTEXT_INITIALIZER;
   2108                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2109                                       fMatchStart-nextOutputStringStart, &status);
   2110                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2111                     utext_close(&remainingText);
   2112                 }
   2113             } else {
   2114                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2115                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
   2116                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2117                 if (remainingChars == NULL) {
   2118                     status = U_MEMORY_ALLOCATION_ERROR;
   2119                     break;
   2120                 }
   2121                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
   2122                 if (dest[i]) {
   2123                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2124                 } else {
   2125                     UText remainingText = UTEXT_INITIALIZER;
   2126                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2127                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2128                     utext_close(&remainingText);
   2129                 }
   2130 
   2131                 uprv_free(remainingChars);
   2132             }
   2133             nextOutputStringStart = fMatchEnd;
   2134 
   2135             // If the delimiter pattern has capturing parentheses, the captured
   2136             //  text goes out into the next n destination strings.
   2137             int32_t groupNum;
   2138             UBool lastGroupWasNullUText = FALSE;
   2139             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   2140                 if (i==destCapacity-1) {
   2141                     break;
   2142                 }
   2143                 i++;
   2144                 lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
   2145                 dest[i] = group(groupNum, dest[i], status);
   2146             }
   2147 
   2148             if (nextOutputStringStart == fActiveLimit) {
   2149                 // The delimiter was at the end of the string.  We're done.
   2150                 break;
   2151             } else if (i == destCapacity-1) {
   2152                 // We're out of capture groups, and the rest of the string is more important
   2153                 if (lastGroupWasNullUText) {
   2154                     utext_close(dest[i]);
   2155                     dest[i] = NULL;
   2156                 }
   2157             }
   2158 
   2159         }
   2160         else
   2161         {
   2162             // We ran off the end of the input while looking for the next delimiter.
   2163             // All the remaining text goes into the current output string.
   2164             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2165                 if (dest[i]) {
   2166                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2167                                   input->chunkContents+nextOutputStringStart,
   2168                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2169                 } else {
   2170                     UText remainingText = UTEXT_INITIALIZER;
   2171                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2172                                      fActiveLimit-nextOutputStringStart, &status);
   2173                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2174                     utext_close(&remainingText);
   2175                 }
   2176             } else {
   2177                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2178                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2179                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2180                 if (remainingChars == NULL) {
   2181                     status = U_MEMORY_ALLOCATION_ERROR;
   2182                     break;
   2183                 }
   2184 
   2185                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2186                 if (dest[i]) {
   2187                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2188                 } else {
   2189                     UText remainingText = UTEXT_INITIALIZER;
   2190                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2191                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2192                     utext_close(&remainingText);
   2193                 }
   2194 
   2195                 uprv_free(remainingChars);
   2196             }
   2197             break;
   2198         }
   2199         if (U_FAILURE(status)) {
   2200             break;
   2201         }
   2202     }   // end of for loop
   2203     return i+1;
   2204 }
   2205 
   2206 
   2207 //--------------------------------------------------------------------------------
   2208 //
   2209 //     start
   2210 //
   2211 //--------------------------------------------------------------------------------
   2212 int32_t RegexMatcher::start(UErrorCode &status) const {
   2213     return start(0, status);
   2214 }
   2215 
   2216 int64_t RegexMatcher::start64(UErrorCode &status) const {
   2217     return start64(0, status);
   2218 }
   2219 
   2220 //--------------------------------------------------------------------------------
   2221 //
   2222 //     start(int32_t group, UErrorCode &status)
   2223 //
   2224 //--------------------------------------------------------------------------------
   2225 
   2226 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
   2227     if (U_FAILURE(status)) {
   2228         return -1;
   2229     }
   2230     if (U_FAILURE(fDeferredStatus)) {
   2231         status = fDeferredStatus;
   2232         return -1;
   2233     }
   2234     if (fMatch == FALSE) {
   2235         status = U_REGEX_INVALID_STATE;
   2236         return -1;
   2237     }
   2238     if (group < 0 || group > fPattern->fGroupMap->size()) {
   2239         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2240         return -1;
   2241     }
   2242     int64_t s;
   2243     if (group == 0) {
   2244         s = fMatchStart;
   2245     } else {
   2246         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   2247         U_ASSERT(groupOffset < fPattern->fFrameSize);
   2248         U_ASSERT(groupOffset >= 0);
   2249         s = fFrame->fExtra[groupOffset];
   2250     }
   2251 
   2252     return s;
   2253 }
   2254 
   2255 
   2256 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
   2257     return (int32_t)start64(group, status);
   2258 }
   2259 
   2260 //--------------------------------------------------------------------------------
   2261 //
   2262 //     useAnchoringBounds
   2263 //
   2264 //--------------------------------------------------------------------------------
   2265 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
   2266     fAnchoringBounds = b;
   2267     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
   2268     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
   2269     return *this;
   2270 }
   2271 
   2272 
   2273 //--------------------------------------------------------------------------------
   2274 //
   2275 //     useTransparentBounds
   2276 //
   2277 //--------------------------------------------------------------------------------
   2278 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
   2279     fTransparentBounds = b;
   2280     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
   2281     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
   2282     return *this;
   2283 }
   2284 
   2285 //--------------------------------------------------------------------------------
   2286 //
   2287 //     setTimeLimit
   2288 //
   2289 //--------------------------------------------------------------------------------
   2290 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
   2291     if (U_FAILURE(status)) {
   2292         return;
   2293     }
   2294     if (U_FAILURE(fDeferredStatus)) {
   2295         status = fDeferredStatus;
   2296         return;
   2297     }
   2298     if (limit < 0) {
   2299         status = U_ILLEGAL_ARGUMENT_ERROR;
   2300         return;
   2301     }
   2302     fTimeLimit = limit;
   2303 }
   2304 
   2305 
   2306 //--------------------------------------------------------------------------------
   2307 //
   2308 //     getTimeLimit
   2309 //
   2310 //--------------------------------------------------------------------------------
   2311 int32_t RegexMatcher::getTimeLimit() const {
   2312     return fTimeLimit;
   2313 }
   2314 
   2315 
   2316 //--------------------------------------------------------------------------------
   2317 //
   2318 //     setStackLimit
   2319 //
   2320 //--------------------------------------------------------------------------------
   2321 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
   2322     if (U_FAILURE(status)) {
   2323         return;
   2324     }
   2325     if (U_FAILURE(fDeferredStatus)) {
   2326         status = fDeferredStatus;
   2327         return;
   2328     }
   2329     if (limit < 0) {
   2330         status = U_ILLEGAL_ARGUMENT_ERROR;
   2331         return;
   2332     }
   2333 
   2334     // Reset the matcher.  This is needed here in case there is a current match
   2335     //    whose final stack frame (containing the match results, pointed to by fFrame)
   2336     //    would be lost by resizing to a smaller stack size.
   2337     reset();
   2338 
   2339     if (limit == 0) {
   2340         // Unlimited stack expansion
   2341         fStack->setMaxCapacity(0);
   2342     } else {
   2343         // Change the units of the limit  from bytes to ints, and bump the size up
   2344         //   to be big enough to hold at least one stack frame for the pattern,
   2345         //   if it isn't there already.
   2346         int32_t adjustedLimit = limit / sizeof(int32_t);
   2347         if (adjustedLimit < fPattern->fFrameSize) {
   2348             adjustedLimit = fPattern->fFrameSize;
   2349         }
   2350         fStack->setMaxCapacity(adjustedLimit);
   2351     }
   2352     fStackLimit = limit;
   2353 }
   2354 
   2355 
   2356 //--------------------------------------------------------------------------------
   2357 //
   2358 //     getStackLimit
   2359 //
   2360 //--------------------------------------------------------------------------------
   2361 int32_t RegexMatcher::getStackLimit() const {
   2362     return fStackLimit;
   2363 }
   2364 
   2365 
   2366 //--------------------------------------------------------------------------------
   2367 //
   2368 //     setMatchCallback
   2369 //
   2370 //--------------------------------------------------------------------------------
   2371 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
   2372                                     const void              *context,
   2373                                     UErrorCode              &status) {
   2374     if (U_FAILURE(status)) {
   2375         return;
   2376     }
   2377     fCallbackFn = callback;
   2378     fCallbackContext = context;
   2379 }
   2380 
   2381 
   2382 //--------------------------------------------------------------------------------
   2383 //
   2384 //     getMatchCallback
   2385 //
   2386 //--------------------------------------------------------------------------------
   2387 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
   2388                                   const void              *&context,
   2389                                   UErrorCode              &status) {
   2390     if (U_FAILURE(status)) {
   2391        return;
   2392     }
   2393     callback = fCallbackFn;
   2394     context  = fCallbackContext;
   2395 }
   2396 
   2397 
   2398 //--------------------------------------------------------------------------------
   2399 //
   2400 //     setMatchCallback
   2401 //
   2402 //--------------------------------------------------------------------------------
   2403 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
   2404                                                 const void                      *context,
   2405                                                 UErrorCode                      &status) {
   2406     if (U_FAILURE(status)) {
   2407         return;
   2408     }
   2409     fFindProgressCallbackFn = callback;
   2410     fFindProgressCallbackContext = context;
   2411 }
   2412 
   2413 
   2414 //--------------------------------------------------------------------------------
   2415 //
   2416 //     getMatchCallback
   2417 //
   2418 //--------------------------------------------------------------------------------
   2419 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
   2420                                                 const void                    *&context,
   2421                                                 UErrorCode                    &status) {
   2422     if (U_FAILURE(status)) {
   2423        return;
   2424     }
   2425     callback = fFindProgressCallbackFn;
   2426     context  = fFindProgressCallbackContext;
   2427 }
   2428 
   2429 
   2430 //================================================================================
   2431 //
   2432 //    Code following this point in this file is the internal
   2433 //    Match Engine Implementation.
   2434 //
   2435 //================================================================================
   2436 
   2437 
   2438 //--------------------------------------------------------------------------------
   2439 //
   2440 //   resetStack
   2441 //           Discard any previous contents of the state save stack, and initialize a
   2442 //           new stack frame to all -1.  The -1s are needed for capture group limits,
   2443 //           where they indicate that a group has not yet matched anything.
   2444 //--------------------------------------------------------------------------------
   2445 REStackFrame *RegexMatcher::resetStack() {
   2446     // Discard any previous contents of the state save stack, and initialize a
   2447     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
   2448     //  where they indicate that a group has not yet matched anything.
   2449     fStack->removeAllElements();
   2450 
   2451     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
   2452     int32_t i;
   2453     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
   2454         iFrame->fExtra[i] = -1;
   2455     }
   2456     return iFrame;
   2457 }
   2458 
   2459 
   2460 
   2461 //--------------------------------------------------------------------------------
   2462 //
   2463 //   isWordBoundary
   2464 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
   2465 //                     For us,
   2466 //                       If the current char is a combining mark,
   2467 //                          \b is FALSE.
   2468 //                       Else Scan backwards to the first non-combining char.
   2469 //                            We are at a boundary if the this char and the original chars are
   2470 //                               opposite in membership in \w set
   2471 //
   2472 //          parameters:   pos   - the current position in the input buffer
   2473 //
   2474 //              TODO:  double-check edge cases at region boundaries.
   2475 //
   2476 //--------------------------------------------------------------------------------
   2477 UBool RegexMatcher::isWordBoundary(int64_t pos) {
   2478     UBool isBoundary = FALSE;
   2479     UBool cIsWord    = FALSE;
   2480 
   2481     if (pos >= fLookLimit) {
   2482         fHitEnd = TRUE;
   2483     } else {
   2484         // Determine whether char c at current position is a member of the word set of chars.
   2485         // If we're off the end of the string, behave as though we're not at a word char.
   2486         UTEXT_SETNATIVEINDEX(fInputText, pos);
   2487         UChar32  c = UTEXT_CURRENT32(fInputText);
   2488         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2489             // Current char is a combining one.  Not a boundary.
   2490             return FALSE;
   2491         }
   2492         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2493     }
   2494 
   2495     // Back up until we come to a non-combining char, determine whether
   2496     //  that char is a word char.
   2497     UBool prevCIsWord = FALSE;
   2498     for (;;) {
   2499         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
   2500             break;
   2501         }
   2502         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
   2503         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2504               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2505             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2506             break;
   2507         }
   2508     }
   2509     isBoundary = cIsWord ^ prevCIsWord;
   2510     return isBoundary;
   2511 }
   2512 
   2513 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
   2514     UBool isBoundary = FALSE;
   2515     UBool cIsWord    = FALSE;
   2516 
   2517     const UChar *inputBuf = fInputText->chunkContents;
   2518 
   2519     if (pos >= fLookLimit) {
   2520         fHitEnd = TRUE;
   2521     } else {
   2522         // Determine whether char c at current position is a member of the word set of chars.
   2523         // If we're off the end of the string, behave as though we're not at a word char.
   2524         UChar32 c;
   2525         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
   2526         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2527             // Current char is a combining one.  Not a boundary.
   2528             return FALSE;
   2529         }
   2530         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2531     }
   2532 
   2533     // Back up until we come to a non-combining char, determine whether
   2534     //  that char is a word char.
   2535     UBool prevCIsWord = FALSE;
   2536     for (;;) {
   2537         if (pos <= fLookStart) {
   2538             break;
   2539         }
   2540         UChar32 prevChar;
   2541         U16_PREV(inputBuf, fLookStart, pos, prevChar);
   2542         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2543               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2544             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2545             break;
   2546         }
   2547     }
   2548     isBoundary = cIsWord ^ prevCIsWord;
   2549     return isBoundary;
   2550 }
   2551 
   2552 //--------------------------------------------------------------------------------
   2553 //
   2554 //   isUWordBoundary
   2555 //
   2556 //         Test for a word boundary using RBBI word break.
   2557 //
   2558 //          parameters:   pos   - the current position in the input buffer
   2559 //
   2560 //--------------------------------------------------------------------------------
   2561 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
   2562     UBool       returnVal = FALSE;
   2563 #if UCONFIG_NO_BREAK_ITERATION==0
   2564 
   2565     // If we haven't yet created a break iterator for this matcher, do it now.
   2566     if (fWordBreakItr == NULL) {
   2567         fWordBreakItr =
   2568             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
   2569         if (U_FAILURE(fDeferredStatus)) {
   2570             return FALSE;
   2571         }
   2572         fWordBreakItr->setText(fInputText, fDeferredStatus);
   2573     }
   2574 
   2575     if (pos >= fLookLimit) {
   2576         fHitEnd = TRUE;
   2577         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
   2578                             //    words are not boundaries.  All non-word chars stand by themselves,
   2579                             //    with word boundaries on both sides.
   2580     } else {
   2581         if (!UTEXT_USES_U16(fInputText)) {
   2582             // !!!: Would like a better way to do this!
   2583             UErrorCode status = U_ZERO_ERROR;
   2584             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
   2585         }
   2586         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
   2587     }
   2588 #endif
   2589     return   returnVal;
   2590 }
   2591 
   2592 //--------------------------------------------------------------------------------
   2593 //
   2594 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
   2595 //                     saves. Increment the "time" counter, and call the
   2596 //                     user callback function if there is one installed.
   2597 //
   2598 //                     If the match operation needs to be aborted, either for a time-out
   2599 //                     or because the user callback asked for it, just set an error status.
   2600 //                     The engine will pick that up and stop in its outer loop.
   2601 //
   2602 //--------------------------------------------------------------------------------
   2603 void RegexMatcher::IncrementTime(UErrorCode &status) {
   2604     fTickCounter = TIMER_INITIAL_VALUE;
   2605     fTime++;
   2606     if (fCallbackFn != NULL) {
   2607         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
   2608             status = U_REGEX_STOPPED_BY_CALLER;
   2609             return;
   2610         }
   2611     }
   2612     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
   2613         status = U_REGEX_TIME_OUT;
   2614     }
   2615 }
   2616 
   2617 //--------------------------------------------------------------------------------
   2618 //
   2619 //   ReportFindProgress     This function is called once for each advance in the target
   2620 //                          string from the find() function, and calls the user progress callback
   2621 //                          function if there is one installed.
   2622 //
   2623 //                          NOTE:
   2624 //
   2625 //                          If the match operation needs to be aborted because the user
   2626 //                          callback asked for it, just set an error status.
   2627 //                          The engine will pick that up and stop in its outer loop.
   2628 //
   2629 //--------------------------------------------------------------------------------
   2630 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
   2631     if (fFindProgressCallbackFn != NULL) {
   2632         if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
   2633             status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
   2634             return FALSE;
   2635         }
   2636     }
   2637     return TRUE;
   2638 }
   2639 
   2640 //--------------------------------------------------------------------------------
   2641 //
   2642 //   StateSave
   2643 //       Make a new stack frame, initialized as a copy of the current stack frame.
   2644 //       Set the pattern index in the original stack frame from the operand value
   2645 //       in the opcode.  Execution of the engine continues with the state in
   2646 //       the newly created stack frame
   2647 //
   2648 //       Note that reserveBlock() may grow the stack, resulting in the
   2649 //       whole thing being relocated in memory.
   2650 //
   2651 //    Parameters:
   2652 //       fp           The top frame pointer when called.  At return, a new
   2653 //                    fame will be present
   2654 //       savePatIdx   An index into the compiled pattern.  Goes into the original
   2655 //                    (not new) frame.  If execution ever back-tracks out of the
   2656 //                    new frame, this will be where we continue from in the pattern.
   2657 //    Return
   2658 //                    The new frame pointer.
   2659 //
   2660 //--------------------------------------------------------------------------------
   2661 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
   2662     // push storage for a new frame.
   2663     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
   2664     if (newFP == NULL) {
   2665         // Failure on attempted stack expansion.
   2666         //   Stack function set some other error code, change it to a more
   2667         //   specific one for regular expressions.
   2668         status = U_REGEX_STACK_OVERFLOW;
   2669         // We need to return a writable stack frame, so just return the
   2670         //    previous frame.  The match operation will stop quickly
   2671         //    because of the error status, after which the frame will never
   2672         //    be looked at again.
   2673         return fp;
   2674     }
   2675     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
   2676 
   2677     // New stack frame = copy of old top frame.
   2678     int64_t *source = (int64_t *)fp;
   2679     int64_t *dest   = newFP;
   2680     for (;;) {
   2681         *dest++ = *source++;
   2682         if (source == newFP) {
   2683             break;
   2684         }
   2685     }
   2686 
   2687     fTickCounter--;
   2688     if (fTickCounter <= 0) {
   2689        IncrementTime(status);    // Re-initializes fTickCounter
   2690     }
   2691     fp->fPatIdx = savePatIdx;
   2692     return (REStackFrame *)newFP;
   2693 }
   2694 
   2695 
   2696 //--------------------------------------------------------------------------------
   2697 //
   2698 //   MatchAt      This is the actual matching engine.
   2699 //
   2700 //                  startIdx:    begin matching a this index.
   2701 //                  toEnd:       if true, match must extend to end of the input region
   2702 //
   2703 //--------------------------------------------------------------------------------
   2704 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
   2705     UBool       isMatch  = FALSE;      // True if the we have a match.
   2706 
   2707     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
   2708 
   2709     int32_t     op;                    // Operation from the compiled pattern, split into
   2710     int32_t     opType;                //    the opcode
   2711     int32_t     opValue;               //    and the operand value.
   2712 
   2713     #ifdef REGEX_RUN_DEBUG
   2714     if (fTraceDebug)
   2715     {
   2716         printf("MatchAt(startIdx=%ld)\n", startIdx);
   2717         printf("Original Pattern: ");
   2718         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   2719         while (c != U_SENTINEL) {
   2720             if (c<32 || c>256) {
   2721                 c = '.';
   2722             }
   2723             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   2724 
   2725             c = UTEXT_NEXT32(fPattern->fPattern);
   2726         }
   2727         printf("\n");
   2728         printf("Input String: ");
   2729         c = utext_next32From(fInputText, 0);
   2730         while (c != U_SENTINEL) {
   2731             if (c<32 || c>256) {
   2732                 c = '.';
   2733             }
   2734             printf("%c", c);
   2735 
   2736             c = UTEXT_NEXT32(fInputText);
   2737         }
   2738         printf("\n");
   2739         printf("\n");
   2740     }
   2741     #endif
   2742 
   2743     if (U_FAILURE(status)) {
   2744         return;
   2745     }
   2746 
   2747     //  Cache frequently referenced items from the compiled pattern
   2748     //
   2749     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   2750 
   2751     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   2752     UVector             *sets          = fPattern->fSets;
   2753 
   2754     fFrameSize = fPattern->fFrameSize;
   2755     REStackFrame        *fp            = resetStack();
   2756 
   2757     fp->fPatIdx   = 0;
   2758     fp->fInputIdx = startIdx;
   2759 
   2760     // Zero out the pattern's static data
   2761     int32_t i;
   2762     for (i = 0; i<fPattern->fDataSize; i++) {
   2763         fData[i] = 0;
   2764     }
   2765 
   2766     //
   2767     //  Main loop for interpreting the compiled pattern.
   2768     //  One iteration of the loop per pattern operation performed.
   2769     //
   2770     for (;;) {
   2771 #if 0
   2772         if (_heapchk() != _HEAPOK) {
   2773             fprintf(stderr, "Heap Trouble\n");
   2774         }
   2775 #endif
   2776 
   2777         op      = (int32_t)pat[fp->fPatIdx];
   2778         opType  = URX_TYPE(op);
   2779         opValue = URX_VAL(op);
   2780         #ifdef REGEX_RUN_DEBUG
   2781         if (fTraceDebug) {
   2782             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2783             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
   2784                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   2785             fPattern->dumpOp(fp->fPatIdx);
   2786         }
   2787         #endif
   2788         fp->fPatIdx++;
   2789 
   2790         switch (opType) {
   2791 
   2792 
   2793         case URX_NOP:
   2794             break;
   2795 
   2796 
   2797         case URX_BACKTRACK:
   2798             // Force a backtrack.  In some circumstances, the pattern compiler
   2799             //   will notice that the pattern can't possibly match anything, and will
   2800             //   emit one of these at that point.
   2801             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2802             break;
   2803 
   2804 
   2805         case URX_ONECHAR:
   2806             if (fp->fInputIdx < fActiveLimit) {
   2807                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2808                 UChar32 c = UTEXT_NEXT32(fInputText);
   2809                 if (c == opValue) {
   2810                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2811                     break;
   2812                 }
   2813             } else {
   2814                 fHitEnd = TRUE;
   2815             }
   2816 
   2817             #ifdef REGEX_SMART_BACKTRACKING
   2818             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   2819                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   2820                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   2821                     UBool success = FALSE;
   2822                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
   2823                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
   2824                         if (c == opValue) {
   2825                             success = TRUE;
   2826                             break;
   2827                         } else if (c == U_SENTINEL) {
   2828                             break;
   2829                         }
   2830                         c = UTEXT_PREVIOUS32(fInputText);
   2831                     }
   2832                     if (success) {
   2833                         fHitEnd = FALSE;
   2834                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2835                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2836                         if (fp->fInputIdx > backSearchIndex) {
   2837                             fp = StateSave(fp, fp->fPatIdx, status);
   2838                         }
   2839                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   2840                         break;
   2841                     }
   2842                 }
   2843             }
   2844             #endif
   2845 
   2846             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2847             break;
   2848 
   2849 
   2850         case URX_STRING:
   2851             {
   2852                 // Test input against a literal string.
   2853                 // Strings require two slots in the compiled pattern, one for the
   2854                 //   offset to the string text, and one for the length.
   2855                 int32_t   stringStartIdx = opValue;
   2856                 int32_t   stringLen;
   2857 
   2858                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   2859                 fp->fPatIdx++;
   2860                 opType    = URX_TYPE(op);
   2861                 stringLen = URX_VAL(op);
   2862                 U_ASSERT(opType == URX_STRING_LEN);
   2863                 U_ASSERT(stringLen >= 2);
   2864 
   2865                 const UChar *patternChars = litText+stringStartIdx;
   2866                 const UChar *patternEnd = patternChars+stringLen;
   2867 
   2868                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2869                 UChar32 c;
   2870                 UBool success = TRUE;
   2871 
   2872                 while (patternChars < patternEnd && success) {
   2873                     c = UTEXT_NEXT32(fInputText);
   2874 
   2875                     if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
   2876                         if (U_IS_BMP(c)) {
   2877                             success = (*patternChars == c);
   2878                             patternChars += 1;
   2879                         } else if (patternChars+1 < patternEnd) {
   2880                             success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   2881                             patternChars += 2;
   2882                         }
   2883                     } else {
   2884                         success = FALSE;
   2885                         fHitEnd = TRUE;          //   TODO:  See ticket 6074
   2886                     }
   2887                 }
   2888 
   2889                 if (success) {
   2890                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2891                 } else {
   2892                     #ifdef REGEX_SMART_BACKTRACKING
   2893                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   2894                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   2895                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   2896                             // Reset to last start point
   2897                             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2898                             patternChars = litText+stringStartIdx;
   2899 
   2900                             // Search backwards for a possible start
   2901                             do {
   2902                                 c = UTEXT_PREVIOUS32(fInputText);
   2903                                 if (c == U_SENTINEL) {
   2904                                     break;
   2905                                 } else if ((U_IS_BMP(c) && *patternChars == c) ||
   2906                                     (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   2907                                     success = TRUE;
   2908                                     break;
   2909                                 }
   2910                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   2911 
   2912                             // And try again
   2913                             if (success) {
   2914                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2915                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2916                                 if (fp->fInputIdx > backSearchIndex) {
   2917                                     fp = StateSave(fp, fp->fPatIdx, status);
   2918                                 }
   2919                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   2920                                 break;
   2921                             }
   2922                         }
   2923                     }
   2924                     #endif
   2925                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2926                 }
   2927             }
   2928             break;
   2929 
   2930 
   2931         case URX_STATE_SAVE:
   2932             fp = StateSave(fp, opValue, status);
   2933             break;
   2934 
   2935 
   2936         case URX_END:
   2937             // The match loop will exit via this path on a successful match,
   2938             //   when we reach the end of the pattern.
   2939             if (toEnd && fp->fInputIdx != fActiveLimit) {
   2940                 // The pattern matched, but not to the end of input.  Try some more.
   2941                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2942                 break;
   2943             }
   2944             isMatch = TRUE;
   2945             goto  breakFromLoop;
   2946 
   2947         // Start and End Capture stack frame variables are laid out out like this:
   2948             //  fp->fExtra[opValue]  - The start of a completed capture group
   2949             //             opValue+1 - The end   of a completed capture group
   2950             //             opValue+2 - the start of a capture group whose end
   2951             //                          has not yet been reached (and might not ever be).
   2952         case URX_START_CAPTURE:
   2953             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2954             fp->fExtra[opValue+2] = fp->fInputIdx;
   2955             break;
   2956 
   2957 
   2958         case URX_END_CAPTURE:
   2959             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2960             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   2961             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   2962             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   2963             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   2964             break;
   2965 
   2966 
   2967         case URX_DOLLAR:                   //  $, test for End of line
   2968                                            //     or for position before new line at end of input
   2969             {
   2970                 if (fp->fInputIdx >= fAnchorLimit) {
   2971                     // We really are at the end of input.  Success.
   2972                     fHitEnd = TRUE;
   2973                     fRequireEnd = TRUE;
   2974                     break;
   2975                 }
   2976 
   2977                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2978 
   2979                 // If we are positioned just before a new-line that is located at the
   2980                 //   end of input, succeed.
   2981                 UChar32 c = UTEXT_NEXT32(fInputText);
   2982                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2983                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   2984                         // If not in the middle of a CR/LF sequence
   2985                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
   2986                             // At new-line at end of input. Success
   2987                             fHitEnd = TRUE;
   2988                             fRequireEnd = TRUE;
   2989 
   2990                             break;
   2991                         }
   2992                     }
   2993                 } else {
   2994                     UChar32 nextC = UTEXT_NEXT32(fInputText);
   2995                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2996                         fHitEnd = TRUE;
   2997                         fRequireEnd = TRUE;
   2998                         break;                         // At CR/LF at end of input.  Success
   2999                     }
   3000                 }
   3001 
   3002                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3003             }
   3004             break;
   3005 
   3006 
   3007          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   3008             if (fp->fInputIdx >= fAnchorLimit) {
   3009                 // Off the end of input.  Success.
   3010                 fHitEnd = TRUE;
   3011                 fRequireEnd = TRUE;
   3012                 break;
   3013             } else {
   3014                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3015                 UChar32 c = UTEXT_NEXT32(fInputText);
   3016                 // Either at the last character of input, or off the end.
   3017                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
   3018                     fHitEnd = TRUE;
   3019                     fRequireEnd = TRUE;
   3020                     break;
   3021                 }
   3022             }
   3023 
   3024             // Not at end of input.  Back-track out.
   3025             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3026             break;
   3027 
   3028 
   3029          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   3030              {
   3031                  if (fp->fInputIdx >= fAnchorLimit) {
   3032                      // We really are at the end of input.  Success.
   3033                      fHitEnd = TRUE;
   3034                      fRequireEnd = TRUE;
   3035                      break;
   3036                  }
   3037                  // If we are positioned just before a new-line, succeed.
   3038                  // It makes no difference where the new-line is within the input.
   3039                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3040                  UChar32 c = UTEXT_CURRENT32(fInputText);
   3041                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   3042                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   3043                      //  In multi-line mode, hitting a new-line just before the end of input does not
   3044                      //   set the hitEnd or requireEnd flags
   3045                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
   3046                         break;
   3047                      }
   3048                  }
   3049                  // not at a new line.  Fail.
   3050                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3051              }
   3052              break;
   3053 
   3054 
   3055          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   3056              {
   3057                  if (fp->fInputIdx >= fAnchorLimit) {
   3058                      // We really are at the end of input.  Success.
   3059                      fHitEnd = TRUE;
   3060                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   3061                      break;               //   adding a new-line would not lose the match.
   3062                  }
   3063                  // If we are not positioned just before a new-line, the test fails; backtrack out.
   3064                  // It makes no difference where the new-line is within the input.
   3065                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3066                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
   3067                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3068                  }
   3069              }
   3070              break;
   3071 
   3072 
   3073        case URX_CARET:                    //  ^, test for start of line
   3074             if (fp->fInputIdx != fAnchorStart) {
   3075                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3076             }
   3077             break;
   3078 
   3079 
   3080        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   3081            {
   3082                if (fp->fInputIdx == fAnchorStart) {
   3083                    // We are at the start input.  Success.
   3084                    break;
   3085                }
   3086                // Check whether character just before the current pos is a new-line
   3087                //   unless we are at the end of input
   3088                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3089                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3090                if ((fp->fInputIdx < fAnchorLimit) &&
   3091                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3092                    //  It's a new-line.  ^ is true.  Success.
   3093                    //  TODO:  what should be done with positions between a CR and LF?
   3094                    break;
   3095                }
   3096                // Not at the start of a line.  Fail.
   3097                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3098            }
   3099            break;
   3100 
   3101 
   3102        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   3103            {
   3104                U_ASSERT(fp->fInputIdx >= fAnchorStart);
   3105                if (fp->fInputIdx <= fAnchorStart) {
   3106                    // We are at the start input.  Success.
   3107                    break;
   3108                }
   3109                // Check whether character just before the current pos is a new-line
   3110                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   3111                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3112                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3113                if (c != 0x0a) {
   3114                    // Not at the start of a line.  Back-track out.
   3115                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3116                }
   3117            }
   3118            break;
   3119 
   3120         case URX_BACKSLASH_B:          // Test for word boundaries
   3121             {
   3122                 UBool success = isWordBoundary(fp->fInputIdx);
   3123                 success ^= (opValue != 0);     // flip sense for \B
   3124                 if (!success) {
   3125                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3126                 }
   3127             }
   3128             break;
   3129 
   3130 
   3131         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   3132             {
   3133                 UBool success = isUWordBoundary(fp->fInputIdx);
   3134                 success ^= (opValue != 0);     // flip sense for \B
   3135                 if (!success) {
   3136                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3137                 }
   3138             }
   3139             break;
   3140 
   3141 
   3142         case URX_BACKSLASH_D:            // Test for decimal digit
   3143             {
   3144                 if (fp->fInputIdx >= fActiveLimit) {
   3145                     fHitEnd = TRUE;
   3146                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3147                     break;
   3148                 }
   3149 
   3150                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3151 
   3152                 UChar32 c = UTEXT_NEXT32(fInputText);
   3153                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   3154                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   3155                 success ^= (opValue != 0);        // flip sense for \D
   3156                 if (success) {
   3157                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3158                 } else {
   3159                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3160                 }
   3161             }
   3162             break;
   3163 
   3164 
   3165         case URX_BACKSLASH_G:          // Test for position at end of previous match
   3166             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   3167                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3168             }
   3169             break;
   3170 
   3171 
   3172         case URX_BACKSLASH_X:
   3173             //  Match a Grapheme, as defined by Unicode TR 29.
   3174             //  Differs slightly from Perl, which consumes combining marks independently
   3175             //    of context.
   3176             {
   3177 
   3178                 // Fail if at end of input
   3179                 if (fp->fInputIdx >= fActiveLimit) {
   3180                     fHitEnd = TRUE;
   3181                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3182                     break;
   3183                 }
   3184 
   3185                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3186 
   3187                 // Examine (and consume) the current char.
   3188                 //   Dispatch into a little state machine, based on the char.
   3189                 UChar32  c;
   3190                 c = UTEXT_NEXT32(fInputText);
   3191                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3192                 UnicodeSet **sets = fPattern->fStaticSets;
   3193                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   3194                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   3195                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3196                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3197                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3198                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3199                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3200                 goto GC_Extend;
   3201 
   3202 
   3203 
   3204 GC_L:
   3205                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3206                 c = UTEXT_NEXT32(fInputText);
   3207                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3208                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3209                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3210                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3211                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3212                 UTEXT_PREVIOUS32(fInputText);
   3213                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3214                 goto GC_Extend;
   3215 
   3216 GC_V:
   3217                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3218                 c = UTEXT_NEXT32(fInputText);
   3219                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3220                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3221                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3222                 UTEXT_PREVIOUS32(fInputText);
   3223                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3224                 goto GC_Extend;
   3225 
   3226 GC_T:
   3227                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3228                 c = UTEXT_NEXT32(fInputText);
   3229                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3230                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3231                 UTEXT_PREVIOUS32(fInputText);
   3232                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3233                 goto GC_Extend;
   3234 
   3235 GC_Extend:
   3236                 // Combining characters are consumed here
   3237                 for (;;) {
   3238                     if (fp->fInputIdx >= fActiveLimit) {
   3239                         break;
   3240                     }
   3241                     c = UTEXT_CURRENT32(fInputText);
   3242                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   3243                         break;
   3244                     }
   3245                     UTEXT_NEXT32(fInputText);
   3246                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3247                 }
   3248                 goto GC_Done;
   3249 
   3250 GC_Control:
   3251                 // Most control chars stand alone (don't combine with combining chars),
   3252                 //   except for that CR/LF sequence is a single grapheme cluster.
   3253                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
   3254                     c = UTEXT_NEXT32(fInputText);
   3255                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3256                 }
   3257 
   3258 GC_Done:
   3259                 if (fp->fInputIdx >= fActiveLimit) {
   3260                     fHitEnd = TRUE;
   3261                 }
   3262                 break;
   3263             }
   3264 
   3265 
   3266 
   3267 
   3268         case URX_BACKSLASH_Z:          // Test for end of Input
   3269             if (fp->fInputIdx < fAnchorLimit) {
   3270                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3271             } else {
   3272                 fHitEnd = TRUE;
   3273                 fRequireEnd = TRUE;
   3274             }
   3275             break;
   3276 
   3277 
   3278 
   3279         case URX_STATIC_SETREF:
   3280             {
   3281                 // Test input character against one of the predefined sets
   3282                 //    (Word Characters, for example)
   3283                 // The high bit of the op value is a flag for the match polarity.
   3284                 //    0:   success if input char is in set.
   3285                 //    1:   success if input char is not in set.
   3286                 if (fp->fInputIdx >= fActiveLimit) {
   3287                     fHitEnd = TRUE;
   3288                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3289                     break;
   3290                 }
   3291 
   3292                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   3293                 opValue &= ~URX_NEG_SET;
   3294                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3295 
   3296                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3297                 UChar32 c = UTEXT_NEXT32(fInputText);
   3298                 if (c < 256) {
   3299                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3300                     if (s8->contains(c)) {
   3301                         success = !success;
   3302                     }
   3303                 } else {
   3304                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3305                     if (s->contains(c)) {
   3306                         success = !success;
   3307                     }
   3308                 }
   3309                 if (success) {
   3310                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3311                 } else {
   3312                     // the character wasn't in the set.
   3313                     #ifdef REGEX_SMART_BACKTRACKING
   3314                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3315                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3316                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3317                             // Try to find it, backwards
   3318                             UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3319                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
   3320                             do {
   3321                                 c = UTEXT_PREVIOUS32(fInputText);
   3322                                 if (c == U_SENTINEL) {
   3323                                     break;
   3324                                 } else if (c < 256) {
   3325                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3326                                     if (s8->contains(c)) {
   3327                                         success = !success;
   3328                                     }
   3329                                 } else {
   3330                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3331                                     if (s->contains(c)) {
   3332                                         success = !success;
   3333                                     }
   3334                                 }
   3335                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
   3336 
   3337                             if (success && c != U_SENTINEL) {
   3338                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3339                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3340                                 if (fp->fInputIdx > backSearchIndex) {
   3341                                     fp = StateSave(fp, fp->fPatIdx, status);
   3342                                 }
   3343                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3344                                 break;
   3345                             }
   3346                         }
   3347                     }
   3348                     #endif
   3349                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3350                 }
   3351             }
   3352             break;
   3353 
   3354 
   3355         case URX_STAT_SETREF_N:
   3356             {
   3357                 // Test input character for NOT being a member of  one of
   3358                 //    the predefined sets (Word Characters, for example)
   3359                 if (fp->fInputIdx >= fActiveLimit) {
   3360                     fHitEnd = TRUE;
   3361                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3362                     break;
   3363                 }
   3364 
   3365                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3366 
   3367                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3368 
   3369                 UChar32 c = UTEXT_NEXT32(fInputText);
   3370                 if (c < 256) {
   3371                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3372                     if (s8->contains(c) == FALSE) {
   3373                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3374                         break;
   3375                     }
   3376                 } else {
   3377                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3378                     if (s->contains(c) == FALSE) {
   3379                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3380                         break;
   3381                     }
   3382                 }
   3383                 // the character wasn't in the set.
   3384                 #ifdef REGEX_SMART_BACKTRACKING
   3385                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3386                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3387                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3388                         // Try to find it, backwards
   3389                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3390                         UBool success = FALSE;
   3391                         do {
   3392                             c = UTEXT_PREVIOUS32(fInputText);
   3393                             if (c == U_SENTINEL) {
   3394                                 break;
   3395                             } else if (c < 256) {
   3396                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3397                                 if (s8->contains(c) == FALSE) {
   3398                                     success = TRUE;
   3399                                     break;
   3400                                 }
   3401                             } else {
   3402                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3403                                 if (s->contains(c) == FALSE) {
   3404                                     success = TRUE;
   3405                                     break;
   3406                                 }
   3407                             }
   3408                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   3409 
   3410                         if (success) {
   3411                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3412                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3413                             if (fp->fInputIdx > backSearchIndex) {
   3414                                 fp = StateSave(fp, fp->fPatIdx, status);
   3415                             }
   3416                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3417                             break;
   3418                         }
   3419                     }
   3420                 }
   3421                 #endif
   3422                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3423             }
   3424             break;
   3425 
   3426 
   3427         case URX_SETREF:
   3428             if (fp->fInputIdx >= fActiveLimit) {
   3429                 fHitEnd = TRUE;
   3430                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3431                 break;
   3432             } else {
   3433                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3434 
   3435                 // There is input left.  Pick up one char and test it for set membership.
   3436                 UChar32 c = UTEXT_NEXT32(fInputText);
   3437                 U_ASSERT(opValue > 0 && opValue < sets->size());
   3438                 if (c<256) {
   3439                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3440                     if (s8->contains(c)) {
   3441                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3442                         break;
   3443                     }
   3444                 } else {
   3445                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3446                     if (s->contains(c)) {
   3447                         // The character is in the set.  A Match.
   3448                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3449                         break;
   3450                     }
   3451                 }
   3452 
   3453                 // the character wasn't in the set.
   3454                 #ifdef REGEX_SMART_BACKTRACKING
   3455                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3456                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3457                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3458                         // Try to find it, backwards
   3459                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3460                         UBool success = FALSE;
   3461                         do {
   3462                             c = UTEXT_PREVIOUS32(fInputText);
   3463                             if (c == U_SENTINEL) {
   3464                                 break;
   3465                             } else if (c < 256) {
   3466                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3467                                 if (s8->contains(c)) {
   3468                                     success = TRUE;
   3469                                     break;
   3470                                 }
   3471                             } else {
   3472                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3473                                 if (s->contains(c)) {
   3474                                     success = TRUE;
   3475                                     break;
   3476                                 }
   3477                             }
   3478                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   3479 
   3480                         if (success) {
   3481                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3482                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3483                             if (fp->fInputIdx > backSearchIndex) {
   3484                                 fp = StateSave(fp, fp->fPatIdx, status);
   3485                             }
   3486                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3487                             break;
   3488                         }
   3489                     }
   3490                 }
   3491                 #endif
   3492                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3493             }
   3494             break;
   3495 
   3496 
   3497         case URX_DOTANY:
   3498             {
   3499                 // . matches anything, but stops at end-of-line.
   3500                 if (fp->fInputIdx >= fActiveLimit) {
   3501                     // At end of input.  Match failed.  Backtrack out.
   3502                     fHitEnd = TRUE;
   3503                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3504                     break;
   3505                 }
   3506 
   3507                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3508 
   3509                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3510                 UChar32 c = UTEXT_NEXT32(fInputText);
   3511                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   3512                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3513                     // End of line in normal mode.   . does not match.
   3514                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3515                     break;
   3516                 }
   3517                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3518             }
   3519             break;
   3520 
   3521 
   3522         case URX_DOTANY_ALL:
   3523             {
   3524                 // ., in dot-matches-all (including new lines) mode
   3525                 if (fp->fInputIdx >= fActiveLimit) {
   3526                     // At end of input.  Match failed.  Backtrack out.
   3527                     fHitEnd = TRUE;
   3528                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3529                     break;
   3530                 }
   3531 
   3532                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3533 
   3534                 // There is input left.  Advance over one char, except if we are
   3535                 //   at a cr/lf, advance over both of them.
   3536                 UChar32 c;
   3537                 c = UTEXT_NEXT32(fInputText);
   3538                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3539                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   3540                     // In the case of a CR/LF, we need to advance over both.
   3541                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
   3542                     if (nextc == 0x0a) {
   3543                         UTEXT_NEXT32(fInputText);
   3544                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3545                     }
   3546                 }
   3547             }
   3548             break;
   3549 
   3550 
   3551         case URX_DOTANY_UNIX:
   3552             {
   3553                 // '.' operator, matches all, but stops at end-of-line.
   3554                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   3555                 if (fp->fInputIdx >= fActiveLimit) {
   3556                     // At end of input.  Match failed.  Backtrack out.
   3557                     fHitEnd = TRUE;
   3558                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3559                     break;
   3560                 }
   3561 
   3562                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3563 
   3564                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3565                 UChar32 c = UTEXT_NEXT32(fInputText);
   3566                 if (c == 0x0a) {
   3567                     // End of line in normal mode.   '.' does not match the \n
   3568                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3569                 } else {
   3570                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3571                 }
   3572             }
   3573             break;
   3574 
   3575 
   3576         case URX_JMP:
   3577             fp->fPatIdx = opValue;
   3578             break;
   3579 
   3580         case URX_FAIL:
   3581             isMatch = FALSE;
   3582             goto breakFromLoop;
   3583 
   3584         case URX_JMP_SAV:
   3585             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   3586             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   3587             fp->fPatIdx = opValue;                         // Then JMP.
   3588             break;
   3589 
   3590         case URX_JMP_SAV_X:
   3591             // This opcode is used with (x)+, when x can match a zero length string.
   3592             // Same as JMP_SAV, except conditional on the match having made forward progress.
   3593             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   3594             //   data address of the input position at the start of the loop.
   3595             {
   3596                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   3597                 int32_t  stoOp = (int32_t)pat[opValue-1];
   3598                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   3599                 int32_t  frameLoc = URX_VAL(stoOp);
   3600                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   3601                 int64_t prevInputIdx = fp->fExtra[frameLoc];
   3602                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   3603                 if (prevInputIdx < fp->fInputIdx) {
   3604                     // The match did make progress.  Repeat the loop.
   3605                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   3606                     fp->fPatIdx = opValue;
   3607                     fp->fExtra[frameLoc] = fp->fInputIdx;
   3608                 }
   3609                 // If the input position did not advance, we do nothing here,
   3610                 //   execution will fall out of the loop.
   3611             }
   3612             break;
   3613 
   3614         case URX_CTR_INIT:
   3615             {
   3616                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3617                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   3618 
   3619                 // Pick up the three extra operands that CTR_INIT has, and
   3620                 //    skip the pattern location counter past
   3621                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3622                 fp->fPatIdx += 3;
   3623                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3624                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3625                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3626                 U_ASSERT(minCount>=0);
   3627                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3628                 U_ASSERT(loopLoc>fp->fPatIdx);
   3629 
   3630                 if (minCount == 0) {
   3631                     fp = StateSave(fp, loopLoc+1, status);
   3632                 }
   3633                 if (maxCount == 0) {
   3634                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3635                 }
   3636             }
   3637             break;
   3638 
   3639         case URX_CTR_LOOP:
   3640             {
   3641                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3642                 int32_t initOp = (int32_t)pat[opValue];
   3643                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   3644                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3645                 int32_t minCount  = (int32_t)pat[opValue+2];
   3646                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3647                 // Increment the counter.  Note: we DIDN'T worry about counter
   3648                 //   overflow, since the data comes from UnicodeStrings, which
   3649                 //   stores its length in an int32_t. Do we have to think about
   3650                 //   this now that we're using UText? Probably not, since the length
   3651                 //   in UChar32s is still an int32_t.
   3652                 (*pCounter)++;
   3653                 U_ASSERT(*pCounter > 0);
   3654                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   3655                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   3656                     break;
   3657                 }
   3658                 if (*pCounter >= minCount) {
   3659                     fp = StateSave(fp, fp->fPatIdx, status);
   3660                 }
   3661                 fp->fPatIdx = opValue + 4;    // Loop back.
   3662             }
   3663             break;
   3664 
   3665         case URX_CTR_INIT_NG:
   3666             {
   3667                 // Initialize a non-greedy loop
   3668                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3669                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   3670 
   3671                 // Pick up the three extra operands that CTR_INIT has, and
   3672                 //    skip the pattern location counter past
   3673                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3674                 fp->fPatIdx += 3;
   3675                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3676                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3677                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3678                 U_ASSERT(minCount>=0);
   3679                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3680                 U_ASSERT(loopLoc>fp->fPatIdx);
   3681 
   3682                 if (minCount == 0) {
   3683                     if (maxCount != 0) {
   3684                         fp = StateSave(fp, fp->fPatIdx, status);
   3685                     }
   3686                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   3687                 }
   3688             }
   3689             break;
   3690 
   3691         case URX_CTR_LOOP_NG:
   3692             {
   3693                 // Non-greedy {min, max} loops
   3694                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3695                 int32_t initOp = (int32_t)pat[opValue];
   3696                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   3697                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3698                 int32_t minCount  = (int32_t)pat[opValue+2];
   3699                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3700                 // Increment the counter.  Note: we DIDN'T worry about counter
   3701                 //   overflow, since the data comes from UnicodeStrings, which
   3702                 //   stores its length in an int32_t. Do we have to think about
   3703                 //   this now that we're using UText? Probably not, since the length
   3704                 //   in UChar32s is still an int32_t.
   3705                 (*pCounter)++;
   3706                 U_ASSERT(*pCounter > 0);
   3707 
   3708                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   3709                     // The loop has matched the maximum permitted number of times.
   3710                     //   Break out of here with no action.  Matching will
   3711                     //   continue with the following pattern.
   3712                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   3713                     break;
   3714                 }
   3715 
   3716                 if (*pCounter < minCount) {
   3717                     // We haven't met the minimum number of matches yet.
   3718                     //   Loop back for another one.
   3719                     fp->fPatIdx = opValue + 4;    // Loop back.
   3720                 } else {
   3721                     // We do have the minimum number of matches.
   3722                     //   Fall into the following pattern, but first do
   3723                     //   a state save to the top of the loop, so that a failure
   3724                     //   in the following pattern will try another iteration of the loop.
   3725                     fp = StateSave(fp, opValue + 4, status);
   3726                 }
   3727             }
   3728             break;
   3729 
   3730         case URX_STO_SP:
   3731             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3732             fData[opValue] = fStack->size();
   3733             break;
   3734 
   3735         case URX_LD_SP:
   3736             {
   3737                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3738                 int32_t newStackSize = (int32_t)fData[opValue];
   3739                 U_ASSERT(newStackSize <= fStack->size());
   3740                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3741                 if (newFP == (int64_t *)fp) {
   3742                     break;
   3743                 }
   3744                 int32_t i;
   3745                 for (i=0; i<fFrameSize; i++) {
   3746                     newFP[i] = ((int64_t *)fp)[i];
   3747                 }
   3748                 fp = (REStackFrame *)newFP;
   3749                 fStack->setSize(newStackSize);
   3750             }
   3751             break;
   3752 
   3753         case URX_BACKREF:
   3754         case URX_BACKREF_I:
   3755             {
   3756                 U_ASSERT(opValue < fFrameSize);
   3757                 int64_t groupStartIdx = fp->fExtra[opValue];
   3758                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3759                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3760                 if (groupStartIdx < 0) {
   3761                     // This capture group has not participated in the match thus far,
   3762                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3763                 }
   3764 
   3765                 if (groupEndIdx == groupStartIdx) {
   3766                     //   The capture group match was of an empty string.
   3767                     //   Verified by testing:  Perl matches succeed in this case, so
   3768                     //   we do too.
   3769                     break;
   3770                 }
   3771 
   3772                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
   3773                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3774 
   3775                 UBool haveMatch = (opType == URX_BACKREF ?
   3776                     (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
   3777                     (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
   3778                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3779 
   3780                 if (fp->fInputIdx > fActiveLimit) {
   3781                     fHitEnd = TRUE;
   3782                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3783                 } else if (!haveMatch) {
   3784                     if (fp->fInputIdx == fActiveLimit) {
   3785                         fHitEnd = TRUE;
   3786                     }
   3787                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3788                 }
   3789             }
   3790             break;
   3791 
   3792         case URX_STO_INP_LOC:
   3793             {
   3794                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   3795                 fp->fExtra[opValue] = fp->fInputIdx;
   3796             }
   3797             break;
   3798 
   3799         case URX_JMPX:
   3800             {
   3801                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3802                 fp->fPatIdx += 1;
   3803                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   3804                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   3805                 int64_t savedInputIdx = fp->fExtra[dataLoc];
   3806                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   3807                 if (savedInputIdx < fp->fInputIdx) {
   3808                     fp->fPatIdx = opValue;                               // JMP
   3809                 } else {
   3810                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   3811                 }
   3812             }
   3813             break;
   3814 
   3815         case URX_LA_START:
   3816             {
   3817                 // Entering a lookahead block.
   3818                 // Save Stack Ptr, Input Pos.
   3819                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3820                 fData[opValue]   = fStack->size();
   3821                 fData[opValue+1] = fp->fInputIdx;
   3822                 fActiveStart     = fLookStart;          // Set the match region change for
   3823                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   3824             }
   3825             break;
   3826 
   3827         case URX_LA_END:
   3828             {
   3829                 // Leaving a look-ahead block.
   3830                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   3831                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3832                 int32_t stackSize = fStack->size();
   3833                 int32_t newStackSize =(int32_t)fData[opValue];
   3834                 U_ASSERT(stackSize >= newStackSize);
   3835                 if (stackSize > newStackSize) {
   3836                     // Copy the current top frame back to the new (cut back) top frame.
   3837                     //   This makes the capture groups from within the look-ahead
   3838                     //   expression available.
   3839                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3840                     int32_t i;
   3841                     for (i=0; i<fFrameSize; i++) {
   3842                         newFP[i] = ((int64_t *)fp)[i];
   3843                     }
   3844                     fp = (REStackFrame *)newFP;
   3845                     fStack->setSize(newStackSize);
   3846                 }
   3847                 fp->fInputIdx = fData[opValue+1];
   3848 
   3849                 // Restore the active region bounds in the input string; they may have
   3850                 //    been changed because of transparent bounds on a Region.
   3851                 fActiveStart = fRegionStart;
   3852                 fActiveLimit = fRegionLimit;
   3853             }
   3854             break;
   3855 
   3856         case URX_ONECHAR_I:
   3857             if (fp->fInputIdx < fActiveLimit) {
   3858                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3859 
   3860                 UChar32 c = UTEXT_NEXT32(fInputText);
   3861                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3862                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3863                     break;
   3864                 }
   3865             } else {
   3866                 fHitEnd = TRUE;
   3867             }
   3868 
   3869             #ifdef REGEX_SMART_BACKTRACKING
   3870             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3871                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3872                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3873                     UBool success = FALSE;
   3874                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
   3875                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
   3876                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3877                             success = TRUE;
   3878                             break;
   3879                         } else if (c == U_SENTINEL) {
   3880                             break;
   3881                         }
   3882                         c = UTEXT_PREVIOUS32(fInputText);
   3883                     }
   3884                     if (success) {
   3885                         fHitEnd = FALSE;
   3886                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3887                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3888                         if (fp->fInputIdx > backSearchIndex) {
   3889                             fp = StateSave(fp, fp->fPatIdx, status);
   3890                         }
   3891                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3892                         break;
   3893                     }
   3894                 }
   3895             }
   3896             #endif
   3897 
   3898             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3899             break;
   3900 
   3901         case URX_STRING_I:
   3902             {
   3903                 // Test input against a literal string.
   3904                 // Strings require two slots in the compiled pattern, one for the
   3905                 //   offset to the string text, and one for the length.
   3906                 const UCaseProps *csp = ucase_getSingleton();
   3907                 {
   3908                     int32_t stringStartIdx, stringLen;
   3909                     stringStartIdx = opValue;
   3910 
   3911                     op      = (int32_t)pat[fp->fPatIdx];
   3912                     fp->fPatIdx++;
   3913                     opType  = URX_TYPE(op);
   3914                     opValue = URX_VAL(op);
   3915                     U_ASSERT(opType == URX_STRING_LEN);
   3916                     stringLen = opValue;
   3917 
   3918                     const UChar *patternChars = litText+stringStartIdx;
   3919                     const UChar *patternEnd = patternChars+stringLen;
   3920 
   3921                     const UChar *foldChars = NULL;
   3922                     int32_t foldOffset, foldLength;
   3923                     UChar32 c;
   3924 
   3925                     foldOffset = foldLength = 0;
   3926                     UBool success = TRUE;
   3927 
   3928                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3929                     while (patternChars < patternEnd && success) {
   3930                         if(foldOffset < foldLength) {
   3931                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   3932                         } else {
   3933                             c = UTEXT_NEXT32(fInputText);
   3934                             if (c != U_SENTINEL) {
   3935                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   3936                                 if(foldLength >= 0) {
   3937                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   3938                                         foldOffset = 0;
   3939                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   3940                                     } else {
   3941                                         c = foldLength;
   3942                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
   3943                                     }
   3944                                 }
   3945                             }
   3946 
   3947                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3948                         }
   3949 
   3950                         success = FALSE;
   3951                         if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
   3952                             if (U_IS_BMP(c)) {
   3953                                 success = (*patternChars == c);
   3954                                 patternChars += 1;
   3955                             } else if (patternChars+1 < patternEnd) {
   3956                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   3957                                 patternChars += 2;
   3958                             }
   3959                         } else {
   3960                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
   3961                         }
   3962                     }
   3963 
   3964                     if (!success) {
   3965                         #ifdef REGEX_SMART_BACKTRACKING
   3966                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   3967                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3968                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3969                                 // Reset to last start point
   3970                                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3971                                 patternChars = litText+stringStartIdx;
   3972 
   3973                                 // Search backwards for a possible start
   3974                                 do {
   3975                                     c = UTEXT_PREVIOUS32(fInputText);
   3976                                     if (c == U_SENTINEL) {
   3977                                         break;
   3978                                     } else {
   3979                                         foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   3980                                         if(foldLength >= 0) {
   3981                                             if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   3982                                                 foldOffset = 0;
   3983                                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   3984                                             } else {
   3985                                                 c = foldLength;
   3986                                                 foldLength = foldOffset; // to avoid reading chars from the folding buffer
   3987                                             }
   3988                                         }
   3989 
   3990                                         if ((U_IS_BMP(c) && *patternChars == c) ||
   3991                                                (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   3992                                             success = TRUE;
   3993                                             break;
   3994                                         }
   3995                                     }
   3996                                 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   3997 
   3998                                 // And try again
   3999                                 if (success) {
   4000                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4001                                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4002                                     if (fp->fInputIdx > backSearchIndex) {
   4003                                         fp = StateSave(fp, fp->fPatIdx, status);
   4004                                     }
   4005                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4006                                     break;
   4007                                 }
   4008                             }
   4009                         }
   4010                         #endif
   4011                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4012                     }
   4013                 }
   4014             }
   4015             break;
   4016 
   4017         case URX_LB_START:
   4018             {
   4019                 // Entering a look-behind block.
   4020                 // Save Stack Ptr, Input Pos.
   4021                 //   TODO:  implement transparent bounds.  Ticket #6067
   4022                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4023                 fData[opValue]   = fStack->size();
   4024                 fData[opValue+1] = fp->fInputIdx;
   4025                 // Init the variable containing the start index for attempted matches.
   4026                 fData[opValue+2] = -1;
   4027                 // Save input string length, then reset to pin any matches to end at
   4028                 //   the current position.
   4029                 fData[opValue+3] = fActiveLimit;
   4030                 fActiveLimit     = fp->fInputIdx;
   4031             }
   4032             break;
   4033 
   4034 
   4035         case URX_LB_CONT:
   4036             {
   4037                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   4038                 //    at all possible input starting positions.
   4039 
   4040                 // Fetch the min and max possible match lengths.  They are the operands
   4041                 //   of this op in the pattern.
   4042                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   4043                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   4044                 U_ASSERT(minML <= maxML);
   4045                 U_ASSERT(minML >= 0);
   4046 
   4047                 // Fetch (from data) the last input index where a match was attempted.
   4048                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4049                 int64_t  *lbStartIdx = &fData[opValue+2];
   4050                 if (*lbStartIdx < 0) {
   4051                     // First time through loop.
   4052                     *lbStartIdx = fp->fInputIdx - minML;
   4053                 } else {
   4054                     // 2nd through nth time through the loop.
   4055                     // Back up start position for match by one.
   4056                     if (*lbStartIdx == 0) {
   4057                         (*lbStartIdx)--;
   4058                     } else {
   4059                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   4060                         UTEXT_PREVIOUS32(fInputText);
   4061                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4062                     }
   4063                 }
   4064 
   4065                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   4066                     // We have tried all potential match starting points without
   4067                     //  getting a match.  Backtrack out, and out of the
   4068                     //   Look Behind altogether.
   4069                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4070                     int64_t restoreInputLen = fData[opValue+3];
   4071                     U_ASSERT(restoreInputLen >= fActiveLimit);
   4072                     U_ASSERT(restoreInputLen <= fInputLength);
   4073                     fActiveLimit = restoreInputLen;
   4074                     break;
   4075                 }
   4076 
   4077                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   4078                 //      (successful match will fall off the end of the loop.)
   4079                 fp = StateSave(fp, fp->fPatIdx-3, status);
   4080                 fp->fInputIdx = *lbStartIdx;
   4081             }
   4082             break;
   4083 
   4084         case URX_LB_END:
   4085             // End of a look-behind block, after a successful match.
   4086             {
   4087                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4088                 if (fp->fInputIdx != fActiveLimit) {
   4089                     //  The look-behind expression matched, but the match did not
   4090                     //    extend all the way to the point that we are looking behind from.
   4091                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4092                     //     will retry the match starting at another position or fail
   4093                     //     the look-behind altogether, whichever is appropriate.
   4094                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4095                     break;
   4096                 }
   4097 
   4098                 // Look-behind match is good.  Restore the orignal input string length,
   4099                 //   which had been truncated to pin the end of the lookbehind match to the
   4100                 //   position being looked-behind.
   4101                 int64_t originalInputLen = fData[opValue+3];
   4102                 U_ASSERT(originalInputLen >= fActiveLimit);
   4103                 U_ASSERT(originalInputLen <= fInputLength);
   4104                 fActiveLimit = originalInputLen;
   4105             }
   4106             break;
   4107 
   4108 
   4109         case URX_LBN_CONT:
   4110             {
   4111                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   4112                 //    at all possible input starting positions.
   4113 
   4114                 // Fetch the extra parameters of this op.
   4115                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   4116                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   4117                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   4118                         continueLoc = URX_VAL(continueLoc);
   4119                 U_ASSERT(minML <= maxML);
   4120                 U_ASSERT(minML >= 0);
   4121                 U_ASSERT(continueLoc > fp->fPatIdx);
   4122 
   4123                 // Fetch (from data) the last input index where a match was attempted.
   4124                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4125                 int64_t  *lbStartIdx = &fData[opValue+2];
   4126                 if (*lbStartIdx < 0) {
   4127                     // First time through loop.
   4128                     *lbStartIdx = fp->fInputIdx - minML;
   4129                 } else {
   4130                     // 2nd through nth time through the loop.
   4131                     // Back up start position for match by one.
   4132                     if (*lbStartIdx == 0) {
   4133                         (*lbStartIdx)--;
   4134                     } else {
   4135                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   4136                         UTEXT_PREVIOUS32(fInputText);
   4137                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4138                     }
   4139                 }
   4140 
   4141                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   4142                     // We have tried all potential match starting points without
   4143                     //  getting a match, which means that the negative lookbehind as
   4144                     //  a whole has succeeded.  Jump forward to the continue location
   4145                     int64_t restoreInputLen = fData[opValue+3];
   4146                     U_ASSERT(restoreInputLen >= fActiveLimit);
   4147                     U_ASSERT(restoreInputLen <= fInputLength);
   4148                     fActiveLimit = restoreInputLen;
   4149                     fp->fPatIdx = continueLoc;
   4150                     break;
   4151                 }
   4152 
   4153                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   4154                 //      (successful match will cause a FAIL out of the loop altogether.)
   4155                 fp = StateSave(fp, fp->fPatIdx-4, status);
   4156                 fp->fInputIdx = *lbStartIdx;
   4157             }
   4158             break;
   4159 
   4160         case URX_LBN_END:
   4161             // End of a negative look-behind block, after a successful match.
   4162             {
   4163                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4164                 if (fp->fInputIdx != fActiveLimit) {
   4165                     //  The look-behind expression matched, but the match did not
   4166                     //    extend all the way to the point that we are looking behind from.
   4167                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4168                     //     will retry the match starting at another position or succeed
   4169                     //     the look-behind altogether, whichever is appropriate.
   4170                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4171                     break;
   4172                 }
   4173 
   4174                 // Look-behind expression matched, which means look-behind test as
   4175                 //   a whole Fails
   4176 
   4177                 //   Restore the orignal input string length, which had been truncated
   4178                 //   inorder to pin the end of the lookbehind match
   4179                 //   to the position being looked-behind.
   4180                 int64_t originalInputLen = fData[opValue+3];
   4181                 U_ASSERT(originalInputLen >= fActiveLimit);
   4182                 U_ASSERT(originalInputLen <= fInputLength);
   4183                 fActiveLimit = originalInputLen;
   4184 
   4185                 // Restore original stack position, discarding any state saved
   4186                 //   by the successful pattern match.
   4187                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4188                 int32_t newStackSize = (int32_t)fData[opValue];
   4189                 U_ASSERT(fStack->size() > newStackSize);
   4190                 fStack->setSize(newStackSize);
   4191 
   4192                 //  FAIL, which will take control back to someplace
   4193                 //  prior to entering the look-behind test.
   4194                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4195             }
   4196             break;
   4197 
   4198 
   4199         case URX_LOOP_SR_I:
   4200             // Loop Initialization for the optimized implementation of
   4201             //     [some character set]*
   4202             //   This op scans through all matching input.
   4203             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4204             {
   4205                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4206                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4207                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   4208 
   4209                 // Loop through input, until either the input is exhausted or
   4210                 //   we reach a character that is not a member of the set.
   4211                 int64_t ix = fp->fInputIdx;
   4212                 UTEXT_SETNATIVEINDEX(fInputText, ix);
   4213                 for (;;) {
   4214                     if (ix >= fActiveLimit) {
   4215                         fHitEnd = TRUE;
   4216                         break;
   4217                     }
   4218                     UChar32 c = UTEXT_NEXT32(fInputText);
   4219                     if (c<256) {
   4220                         if (s8->contains(c) == FALSE) {
   4221                             break;
   4222                         }
   4223                     } else {
   4224                         if (s->contains(c) == FALSE) {
   4225                             break;
   4226                         }
   4227                     }
   4228                     ix = UTEXT_GETNATIVEINDEX(fInputText);
   4229                 }
   4230 
   4231                 // If there were no matching characters, skip over the loop altogether.
   4232                 //   The loop doesn't run at all, a * op always succeeds.
   4233                 if (ix == fp->fInputIdx) {
   4234                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4235                     break;
   4236                 }
   4237 
   4238                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4239                 //   must follow.  It's operand is the stack location
   4240                 //   that holds the starting input index for the match of this [set]*
   4241                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4242                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4243                 int32_t stackLoc = URX_VAL(loopcOp);
   4244                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4245                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4246                 #ifdef REGEX_SMART_BACKTRACKING
   4247                 backSearchIndex = fp->fInputIdx;
   4248                 #endif
   4249                 fp->fInputIdx = ix;
   4250 
   4251                 // Save State to the URX_LOOP_C op that follows this one,
   4252                 //   so that match failures in the following code will return to there.
   4253                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4254                 fp = StateSave(fp, fp->fPatIdx, status);
   4255                 fp->fPatIdx++;
   4256             }
   4257             break;
   4258 
   4259 
   4260         case URX_LOOP_DOT_I:
   4261             // Loop Initialization for the optimized implementation of .*
   4262             //   This op scans through all remaining input.
   4263             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4264             {
   4265                 // Loop through input until the input is exhausted (we reach an end-of-line)
   4266                 // In DOTALL mode, we can just go straight to the end of the input.
   4267                 int64_t ix;
   4268                 if ((opValue & 1) == 1) {
   4269                     // Dot-matches-All mode.  Jump straight to the end of the string.
   4270                     ix = fActiveLimit;
   4271                     fHitEnd = TRUE;
   4272                 } else {
   4273                     // NOT DOT ALL mode.  Line endings do not match '.'
   4274                     // Scan forward until a line ending or end of input.
   4275                     ix = fp->fInputIdx;
   4276                     UTEXT_SETNATIVEINDEX(fInputText, ix);
   4277                     for (;;) {
   4278                         if (ix >= fActiveLimit) {
   4279                             fHitEnd = TRUE;
   4280                             break;
   4281                         }
   4282                         UChar32 c = UTEXT_NEXT32(fInputText);
   4283                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   4284                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   4285                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   4286                                     (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
   4287                                 //  char is a line ending.  Exit the scanning loop.
   4288                                 break;
   4289                             }
   4290                         }
   4291                         ix = UTEXT_GETNATIVEINDEX(fInputText);
   4292                     }
   4293                 }
   4294 
   4295                 // If there were no matching characters, skip over the loop altogether.
   4296                 //   The loop doesn't run at all, a * op always succeeds.
   4297                 if (ix == fp->fInputIdx) {
   4298                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4299                     break;
   4300                 }
   4301 
   4302                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4303                 //   must follow.  It's operand is the stack location
   4304                 //   that holds the starting input index for the match of this .*
   4305                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4306                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4307                 int32_t stackLoc = URX_VAL(loopcOp);
   4308                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4309                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4310                 #ifdef REGEX_SMART_BACKTRACKING
   4311                 backSearchIndex = fp->fInputIdx;
   4312                 #endif
   4313                 fp->fInputIdx = ix;
   4314 
   4315                 // Save State to the URX_LOOP_C op that follows this one,
   4316                 //   so that match failures in the following code will return to there.
   4317                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4318                 fp = StateSave(fp, fp->fPatIdx, status);
   4319                 fp->fPatIdx++;
   4320             }
   4321             break;
   4322 
   4323 
   4324         case URX_LOOP_C:
   4325             {
   4326                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   4327                 backSearchIndex = fp->fExtra[opValue];
   4328                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   4329                 if (backSearchIndex == fp->fInputIdx) {
   4330                     // We've backed up the input idx to the point that the loop started.
   4331                     // The loop is done.  Leave here without saving state.
   4332                     //  Subsequent failures won't come back here.
   4333                     break;
   4334                 }
   4335                 // Set up for the next iteration of the loop, with input index
   4336                 //   backed up by one from the last time through,
   4337                 //   and a state save to this instruction in case the following code fails again.
   4338                 //   (We're going backwards because this loop emulates stack unwinding, not
   4339                 //    the initial scan forward.)
   4340                 U_ASSERT(fp->fInputIdx > 0);
   4341                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4342                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
   4343                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4344 
   4345                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
   4346                 if (prevC == 0x0a &&
   4347                     fp->fInputIdx > backSearchIndex &&
   4348                     twoPrevC == 0x0d) {
   4349                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   4350                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   4351                         // .*, stepping back over CRLF pair.
   4352                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4353                     }
   4354                 }
   4355 
   4356 
   4357                 fp = StateSave(fp, fp->fPatIdx-1, status);
   4358             }
   4359             break;
   4360 
   4361 
   4362 
   4363         default:
   4364             // Trouble.  The compiled pattern contains an entry with an
   4365             //           unrecognized type tag.
   4366             U_ASSERT(FALSE);
   4367         }
   4368 
   4369         if (U_FAILURE(status)) {
   4370             isMatch = FALSE;
   4371             break;
   4372         }
   4373     }
   4374 
   4375 breakFromLoop:
   4376     fMatch = isMatch;
   4377     if (isMatch) {
   4378         fLastMatchEnd = fMatchEnd;
   4379         fMatchStart   = startIdx;
   4380         fMatchEnd     = fp->fInputIdx;
   4381         if (fTraceDebug) {
   4382             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
   4383         }
   4384     }
   4385     else
   4386     {
   4387         if (fTraceDebug) {
   4388             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   4389         }
   4390     }
   4391 
   4392     fFrame = fp;                // The active stack frame when the engine stopped.
   4393                                 //   Contains the capture group results that we need to
   4394                                 //    access later.
   4395     return;
   4396 }
   4397 
   4398 
   4399 //--------------------------------------------------------------------------------
   4400 //
   4401 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
   4402 //                  assumption that the entire string is available in the UText's
   4403 //                  chunk buffer. For now, that means we can use int32_t indexes,
   4404 //                  except for anything that needs to be saved (like group starts
   4405 //                  and ends).
   4406 //
   4407 //                  startIdx:    begin matching a this index.
   4408 //                  toEnd:       if true, match must extend to end of the input region
   4409 //
   4410 //--------------------------------------------------------------------------------
   4411 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
   4412     UBool       isMatch  = FALSE;      // True if the we have a match.
   4413 
   4414     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
   4415 
   4416     int32_t     op;                    // Operation from the compiled pattern, split into
   4417     int32_t     opType;                //    the opcode
   4418     int32_t     opValue;               //    and the operand value.
   4419 
   4420 #ifdef REGEX_RUN_DEBUG
   4421     if (fTraceDebug)
   4422     {
   4423         printf("MatchAt(startIdx=%ld)\n", startIdx);
   4424         printf("Original Pattern: ");
   4425         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   4426         while (c != U_SENTINEL) {
   4427             if (c<32 || c>256) {
   4428                 c = '.';
   4429             }
   4430             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   4431 
   4432             c = UTEXT_NEXT32(fPattern->fPattern);
   4433         }
   4434         printf("\n");
   4435         printf("Input String: ");
   4436         c = utext_next32From(fInputText, 0);
   4437         while (c != U_SENTINEL) {
   4438             if (c<32 || c>256) {
   4439                 c = '.';
   4440             }
   4441             printf("%c", c);
   4442 
   4443             c = UTEXT_NEXT32(fInputText);
   4444         }
   4445         printf("\n");
   4446         printf("\n");
   4447     }
   4448 #endif
   4449 
   4450     if (U_FAILURE(status)) {
   4451         return;
   4452     }
   4453 
   4454     //  Cache frequently referenced items from the compiled pattern
   4455     //
   4456     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   4457 
   4458     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   4459     UVector             *sets          = fPattern->fSets;
   4460 
   4461     const UChar         *inputBuf      = fInputText->chunkContents;
   4462 
   4463     fFrameSize = fPattern->fFrameSize;
   4464     REStackFrame        *fp            = resetStack();
   4465 
   4466     fp->fPatIdx   = 0;
   4467     fp->fInputIdx = startIdx;
   4468 
   4469     // Zero out the pattern's static data
   4470     int32_t i;
   4471     for (i = 0; i<fPattern->fDataSize; i++) {
   4472         fData[i] = 0;
   4473     }
   4474 
   4475     //
   4476     //  Main loop for interpreting the compiled pattern.
   4477     //  One iteration of the loop per pattern operation performed.
   4478     //
   4479     for (;;) {
   4480 #if 0
   4481         if (_heapchk() != _HEAPOK) {
   4482             fprintf(stderr, "Heap Trouble\n");
   4483         }
   4484 #endif
   4485 
   4486         op      = (int32_t)pat[fp->fPatIdx];
   4487         opType  = URX_TYPE(op);
   4488         opValue = URX_VAL(op);
   4489 #ifdef REGEX_RUN_DEBUG
   4490         if (fTraceDebug) {
   4491             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4492             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
   4493                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   4494             fPattern->dumpOp(fp->fPatIdx);
   4495         }
   4496 #endif
   4497         fp->fPatIdx++;
   4498 
   4499         switch (opType) {
   4500 
   4501 
   4502         case URX_NOP:
   4503             break;
   4504 
   4505 
   4506         case URX_BACKTRACK:
   4507             // Force a backtrack.  In some circumstances, the pattern compiler
   4508             //   will notice that the pattern can't possibly match anything, and will
   4509             //   emit one of these at that point.
   4510             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4511             break;
   4512 
   4513 
   4514         case URX_ONECHAR:
   4515             if (fp->fInputIdx < fActiveLimit) {
   4516                 UChar32 c;
   4517                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4518                 if (c == opValue) {
   4519                     break;
   4520                 }
   4521             } else {
   4522                 fHitEnd = TRUE;
   4523             }
   4524 
   4525             #ifdef REGEX_SMART_BACKTRACKING
   4526             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   4527                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4528                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4529                     int64_t reverseIndex = fp->fInputIdx;
   4530                     UChar32 c;
   4531                     do {
   4532                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   4533                         if (c == opValue) {
   4534                             break;
   4535                         }
   4536                     } while (reverseIndex > backSearchIndex);
   4537                     if (c == opValue) {
   4538                         fHitEnd = FALSE;
   4539                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4540                         fp->fInputIdx = reverseIndex;
   4541                         if (fp->fInputIdx > backSearchIndex) {
   4542                             fp = StateSave(fp, fp->fPatIdx, status);
   4543                         }
   4544                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4545                         break;
   4546                     }
   4547                 }
   4548             }
   4549             #endif
   4550 
   4551             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4552             break;
   4553 
   4554 
   4555         case URX_STRING:
   4556             {
   4557                 // Test input against a literal string.
   4558                 // Strings require two slots in the compiled pattern, one for the
   4559                 //   offset to the string text, and one for the length.
   4560                 int32_t   stringStartIdx = opValue;
   4561                 int32_t   stringLen;
   4562 
   4563                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   4564                 fp->fPatIdx++;
   4565                 opType    = URX_TYPE(op);
   4566                 stringLen = URX_VAL(op);
   4567                 U_ASSERT(opType == URX_STRING_LEN);
   4568                 U_ASSERT(stringLen >= 2);
   4569 
   4570                 if (fp->fInputIdx + stringLen > fActiveLimit) {
   4571                     // No match.  String is longer than the remaining input text.
   4572                     fHitEnd = TRUE;          //   TODO:  See ticket 6074
   4573                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4574                     break;
   4575                 }
   4576 
   4577                 const UChar * pInp = inputBuf + fp->fInputIdx;
   4578                 const UChar * pPat = litText+stringStartIdx;
   4579                 const UChar * pEnd = pInp + stringLen;
   4580                 UBool success = FALSE;
   4581                 for(;;) {
   4582                     if (*pInp == *pPat) {
   4583                         pInp++;
   4584                         pPat++;
   4585                         if (pInp == pEnd) {
   4586                             // Successful Match.
   4587                             success = TRUE;
   4588                             break;
   4589                         }
   4590                     } else {
   4591                         // Match failed.
   4592                         break;
   4593                     }
   4594                 }
   4595 
   4596                 if (success) {
   4597                     fp->fInputIdx += stringLen;
   4598                 } else {
   4599                     #ifdef REGEX_SMART_BACKTRACKING
   4600                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   4601                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4602                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4603                             // Reset to last start point
   4604                             int64_t reverseIndex = fp->fInputIdx;
   4605                             UChar32 c;
   4606                             pPat = litText+stringStartIdx;
   4607 
   4608                             // Search backwards for a possible start
   4609                             do {
   4610                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   4611                                 if ((U_IS_BMP(c) && *pPat == c) ||
   4612                                     (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
   4613                                     success = TRUE;
   4614                                     break;
   4615                                 }
   4616                             } while (reverseIndex > backSearchIndex);
   4617 
   4618                             // And try again
   4619                             if (success) {
   4620                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4621                                 fp->fInputIdx = reverseIndex;
   4622                                 if (fp->fInputIdx > backSearchIndex) {
   4623                                     fp = StateSave(fp, fp->fPatIdx, status);
   4624                                 }
   4625                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4626                                 break;
   4627                             }
   4628                         }
   4629                     }
   4630                     #endif
   4631                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4632                 }
   4633             }
   4634             break;
   4635 
   4636 
   4637         case URX_STATE_SAVE:
   4638             fp = StateSave(fp, opValue, status);
   4639             break;
   4640 
   4641 
   4642         case URX_END:
   4643             // The match loop will exit via this path on a successful match,
   4644             //   when we reach the end of the pattern.
   4645             if (toEnd && fp->fInputIdx != fActiveLimit) {
   4646                 // The pattern matched, but not to the end of input.  Try some more.
   4647                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4648                 break;
   4649             }
   4650             isMatch = TRUE;
   4651             goto  breakFromLoop;
   4652 
   4653             // Start and End Capture stack frame variables are laid out out like this:
   4654             //  fp->fExtra[opValue]  - The start of a completed capture group
   4655             //             opValue+1 - The end   of a completed capture group
   4656             //             opValue+2 - the start of a capture group whose end
   4657             //                          has not yet been reached (and might not ever be).
   4658         case URX_START_CAPTURE:
   4659             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4660             fp->fExtra[opValue+2] = fp->fInputIdx;
   4661             break;
   4662 
   4663 
   4664         case URX_END_CAPTURE:
   4665             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4666             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   4667             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   4668             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   4669             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   4670             break;
   4671 
   4672 
   4673         case URX_DOLLAR:                   //  $, test for End of line
   4674             //     or for position before new line at end of input
   4675             if (fp->fInputIdx < fAnchorLimit-2) {
   4676                 // We are no where near the end of input.  Fail.
   4677                 //   This is the common case.  Keep it first.
   4678                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4679                 break;
   4680             }
   4681             if (fp->fInputIdx >= fAnchorLimit) {
   4682                 // We really are at the end of input.  Success.
   4683                 fHitEnd = TRUE;
   4684                 fRequireEnd = TRUE;
   4685                 break;
   4686             }
   4687 
   4688             // If we are positioned just before a new-line that is located at the
   4689             //   end of input, succeed.
   4690             if (fp->fInputIdx == fAnchorLimit-1) {
   4691                 UChar32 c;
   4692                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
   4693 
   4694                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   4695                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4696                         // At new-line at end of input. Success
   4697                         fHitEnd = TRUE;
   4698                         fRequireEnd = TRUE;
   4699                         break;
   4700                     }
   4701                 }
   4702             } else if (fp->fInputIdx == fAnchorLimit-2 &&
   4703                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
   4704                     fHitEnd = TRUE;
   4705                     fRequireEnd = TRUE;
   4706                     break;                         // At CR/LF at end of input.  Success
   4707             }
   4708 
   4709             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4710 
   4711             break;
   4712 
   4713 
   4714         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   4715             if (fp->fInputIdx >= fAnchorLimit-1) {
   4716                 // Either at the last character of input, or off the end.
   4717                 if (fp->fInputIdx == fAnchorLimit-1) {
   4718                     // At last char of input.  Success if it's a new line.
   4719                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4720                         fHitEnd = TRUE;
   4721                         fRequireEnd = TRUE;
   4722                         break;
   4723                     }
   4724                 } else {
   4725                     // Off the end of input.  Success.
   4726                     fHitEnd = TRUE;
   4727                     fRequireEnd = TRUE;
   4728                     break;
   4729                 }
   4730             }
   4731 
   4732             // Not at end of input.  Back-track out.
   4733             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4734             break;
   4735 
   4736 
   4737         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   4738             {
   4739                 if (fp->fInputIdx >= fAnchorLimit) {
   4740                     // We really are at the end of input.  Success.
   4741                     fHitEnd = TRUE;
   4742                     fRequireEnd = TRUE;
   4743                     break;
   4744                 }
   4745                 // If we are positioned just before a new-line, succeed.
   4746                 // It makes no difference where the new-line is within the input.
   4747                 UChar32 c = inputBuf[fp->fInputIdx];
   4748                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   4749                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   4750                     //  In multi-line mode, hitting a new-line just before the end of input does not
   4751                     //   set the hitEnd or requireEnd flags
   4752                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4753                         break;
   4754                     }
   4755                 }
   4756                 // not at a new line.  Fail.
   4757                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4758             }
   4759             break;
   4760 
   4761 
   4762         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   4763             {
   4764                 if (fp->fInputIdx >= fAnchorLimit) {
   4765                     // We really are at the end of input.  Success.
   4766                     fHitEnd = TRUE;
   4767                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   4768                     break;               //   adding a new-line would not lose the match.
   4769                 }
   4770                 // If we are not positioned just before a new-line, the test fails; backtrack out.
   4771                 // It makes no difference where the new-line is within the input.
   4772                 if (inputBuf[fp->fInputIdx] != 0x0a) {
   4773                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4774                 }
   4775             }
   4776             break;
   4777 
   4778 
   4779         case URX_CARET:                    //  ^, test for start of line
   4780             if (fp->fInputIdx != fAnchorStart) {
   4781                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4782             }
   4783             break;
   4784 
   4785 
   4786         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   4787             {
   4788                 if (fp->fInputIdx == fAnchorStart) {
   4789                     // We are at the start input.  Success.
   4790                     break;
   4791                 }
   4792                 // Check whether character just before the current pos is a new-line
   4793                 //   unless we are at the end of input
   4794                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4795                 if ((fp->fInputIdx < fAnchorLimit) &&
   4796                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   4797                     //  It's a new-line.  ^ is true.  Success.
   4798                     //  TODO:  what should be done with positions between a CR and LF?
   4799                     break;
   4800                 }
   4801                 // Not at the start of a line.  Fail.
   4802                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4803             }
   4804             break;
   4805 
   4806 
   4807         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   4808             {
   4809                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
   4810                 if (fp->fInputIdx <= fAnchorStart) {
   4811                     // We are at the start input.  Success.
   4812                     break;
   4813                 }
   4814                 // Check whether character just before the current pos is a new-line
   4815                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   4816                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4817                 if (c != 0x0a) {
   4818                     // Not at the start of a line.  Back-track out.
   4819                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4820                 }
   4821             }
   4822             break;
   4823 
   4824         case URX_BACKSLASH_B:          // Test for word boundaries
   4825             {
   4826                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
   4827                 success ^= (opValue != 0);     // flip sense for \B
   4828                 if (!success) {
   4829                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4830                 }
   4831             }
   4832             break;
   4833 
   4834 
   4835         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   4836             {
   4837                 UBool success = isUWordBoundary(fp->fInputIdx);
   4838                 success ^= (opValue != 0);     // flip sense for \B
   4839                 if (!success) {
   4840                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4841                 }
   4842             }
   4843             break;
   4844 
   4845 
   4846         case URX_BACKSLASH_D:            // Test for decimal digit
   4847             {
   4848                 if (fp->fInputIdx >= fActiveLimit) {
   4849                     fHitEnd = TRUE;
   4850                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4851                     break;
   4852                 }
   4853 
   4854                 UChar32 c;
   4855                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4856                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   4857                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   4858                 success ^= (opValue != 0);        // flip sense for \D
   4859                 if (!success) {
   4860                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4861                 }
   4862             }
   4863             break;
   4864 
   4865 
   4866         case URX_BACKSLASH_G:          // Test for position at end of previous match
   4867             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   4868                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4869             }
   4870             break;
   4871 
   4872 
   4873         case URX_BACKSLASH_X:
   4874         //  Match a Grapheme, as defined by Unicode TR 29.
   4875         //  Differs slightly from Perl, which consumes combining marks independently
   4876         //    of context.
   4877         {
   4878 
   4879             // Fail if at end of input
   4880             if (fp->fInputIdx >= fActiveLimit) {
   4881                 fHitEnd = TRUE;
   4882                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4883                 break;
   4884             }
   4885 
   4886             // Examine (and consume) the current char.
   4887             //   Dispatch into a little state machine, based on the char.
   4888             UChar32  c;
   4889             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4890             UnicodeSet **sets = fPattern->fStaticSets;
   4891             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   4892             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   4893             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4894             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4895             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4896             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4897             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4898             goto GC_Extend;
   4899 
   4900 
   4901 
   4902 GC_L:
   4903             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4904             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4905             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4906             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4907             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4908             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4909             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4910             goto GC_Extend;
   4911 
   4912 GC_V:
   4913             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4914             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4915             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4916             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4917             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4918             goto GC_Extend;
   4919 
   4920 GC_T:
   4921             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4922             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4923             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4924             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4925             goto GC_Extend;
   4926 
   4927 GC_Extend:
   4928             // Combining characters are consumed here
   4929             for (;;) {
   4930                 if (fp->fInputIdx >= fActiveLimit) {
   4931                     break;
   4932                 }
   4933                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4934                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   4935                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   4936                     break;
   4937                 }
   4938             }
   4939             goto GC_Done;
   4940 
   4941 GC_Control:
   4942             // Most control chars stand alone (don't combine with combining chars),
   4943             //   except for that CR/LF sequence is a single grapheme cluster.
   4944             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
   4945                 fp->fInputIdx++;
   4946             }
   4947 
   4948 GC_Done:
   4949             if (fp->fInputIdx >= fActiveLimit) {
   4950                 fHitEnd = TRUE;
   4951             }
   4952             break;
   4953         }
   4954 
   4955 
   4956 
   4957 
   4958         case URX_BACKSLASH_Z:          // Test for end of Input
   4959             if (fp->fInputIdx < fAnchorLimit) {
   4960                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4961             } else {
   4962                 fHitEnd = TRUE;
   4963                 fRequireEnd = TRUE;
   4964             }
   4965             break;
   4966 
   4967 
   4968 
   4969         case URX_STATIC_SETREF:
   4970             {
   4971                 // Test input character against one of the predefined sets
   4972                 //    (Word Characters, for example)
   4973                 // The high bit of the op value is a flag for the match polarity.
   4974                 //    0:   success if input char is in set.
   4975                 //    1:   success if input char is not in set.
   4976                 if (fp->fInputIdx >= fActiveLimit) {
   4977                     fHitEnd = TRUE;
   4978                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4979                     break;
   4980                 }
   4981 
   4982                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   4983                 opValue &= ~URX_NEG_SET;
   4984                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4985 
   4986                 UChar32 c;
   4987                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4988                 if (c < 256) {
   4989                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4990                     if (s8->contains(c)) {
   4991                         success = !success;
   4992                     }
   4993                 } else {
   4994                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4995                     if (s->contains(c)) {
   4996                         success = !success;
   4997                     }
   4998                 }
   4999                 if (!success) {
   5000                     #ifdef REGEX_SMART_BACKTRACKING
   5001                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5002                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5003                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5004                             // Try to find it, backwards
   5005                             int64_t reverseIndex = fp->fInputIdx;
   5006                             U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   5007                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
   5008                             do {
   5009                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5010                                 if (c < 256) {
   5011                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   5012                                     if (s8->contains(c)) {
   5013                                         success = !success;
   5014                                     }
   5015                                 } else {
   5016                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5017                                     if (s->contains(c)) {
   5018                                         success = !success;
   5019                                     }
   5020                                 }
   5021                             } while (reverseIndex > backSearchIndex && !success);
   5022 
   5023                             if (success) {
   5024                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5025                                 fp->fInputIdx = reverseIndex;
   5026                                 if (fp->fInputIdx > backSearchIndex) {
   5027                                     fp = StateSave(fp, fp->fPatIdx, status);
   5028                                 }
   5029                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5030                                 break;
   5031                             }
   5032                         }
   5033                     }
   5034                     #endif
   5035                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5036                 }
   5037             }
   5038             break;
   5039 
   5040 
   5041         case URX_STAT_SETREF_N:
   5042             {
   5043                 // Test input character for NOT being a member of  one of
   5044                 //    the predefined sets (Word Characters, for example)
   5045                 if (fp->fInputIdx >= fActiveLimit) {
   5046                     fHitEnd = TRUE;
   5047                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5048                     break;
   5049                 }
   5050 
   5051                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   5052 
   5053                 UChar32  c;
   5054                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5055                 if (c < 256) {
   5056                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   5057                     if (s8->contains(c) == FALSE) {
   5058                         break;
   5059                     }
   5060                 } else {
   5061                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5062                     if (s->contains(c) == FALSE) {
   5063                         break;
   5064                     }
   5065                 }
   5066 
   5067                 #ifdef REGEX_SMART_BACKTRACKING
   5068                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5069                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5070                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5071                         // Try to find it, backwards
   5072                         int64_t reverseIndex = fp->fInputIdx;
   5073                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   5074                         UBool success = FALSE;
   5075                         do {
   5076                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5077                             if (c < 256) {
   5078                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   5079                                 if (s8->contains(c) == FALSE) {
   5080                                     success = TRUE;
   5081                                     break;
   5082                                 }
   5083                             } else {
   5084                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5085                                 if (s->contains(c) == FALSE) {
   5086                                     success = TRUE;
   5087                                     break;
   5088                                 }
   5089                             }
   5090                         } while (reverseIndex > backSearchIndex);
   5091 
   5092                         if (success) {
   5093                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5094                             fp->fInputIdx = reverseIndex;
   5095                             if (fp->fInputIdx > backSearchIndex) {
   5096                                 fp = StateSave(fp, fp->fPatIdx, status);
   5097                             }
   5098                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5099                             break;
   5100                         }
   5101                     }
   5102                 }
   5103                 #endif
   5104                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5105             }
   5106             break;
   5107 
   5108 
   5109         case URX_SETREF:
   5110             {
   5111                 if (fp->fInputIdx >= fActiveLimit) {
   5112                     fHitEnd = TRUE;
   5113                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5114                     break;
   5115                 }
   5116 
   5117                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5118 
   5119                 // There is input left.  Pick up one char and test it for set membership.
   5120                 UChar32  c;
   5121                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5122                 if (c<256) {
   5123                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5124                     if (s8->contains(c)) {
   5125                         // The character is in the set.  A Match.
   5126                         break;
   5127                     }
   5128                 } else {
   5129                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   5130                     if (s->contains(c)) {
   5131                         // The character is in the set.  A Match.
   5132                         break;
   5133                     }
   5134                 }
   5135 
   5136                 // the character wasn't in the set.
   5137                 #ifdef REGEX_SMART_BACKTRACKING
   5138                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5139                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5140                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5141                         // Try to find it, backwards
   5142                         int64_t reverseIndex = fp->fInputIdx;
   5143                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   5144                         UBool success = FALSE;
   5145                         do {
   5146                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5147                             if (c < 256) {
   5148                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5149                                 if (s8->contains(c)) {
   5150                                     success = TRUE;
   5151                                     break;
   5152                                 }
   5153                             } else {
   5154                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   5155                                 if (s->contains(c)) {
   5156                                     success = TRUE;
   5157                                     break;
   5158                                 }
   5159                             }
   5160                         } while (reverseIndex > backSearchIndex);
   5161 
   5162                         if (success) {
   5163                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5164                             fp->fInputIdx = reverseIndex;
   5165                             if (fp->fInputIdx > reverseIndex) {
   5166                                 fp = StateSave(fp, fp->fPatIdx, status);
   5167                             }
   5168                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5169                             break;
   5170                         }
   5171                     }
   5172                 }
   5173                 #endif
   5174                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5175             }
   5176             break;
   5177 
   5178 
   5179         case URX_DOTANY:
   5180             {
   5181                 // . matches anything, but stops at end-of-line.
   5182                 if (fp->fInputIdx >= fActiveLimit) {
   5183                     // At end of input.  Match failed.  Backtrack out.
   5184                     fHitEnd = TRUE;
   5185                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5186                     break;
   5187                 }
   5188 
   5189                 // There is input left.  Advance over one char, unless we've hit end-of-line
   5190                 UChar32  c;
   5191                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5192                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   5193                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   5194                     // End of line in normal mode.   . does not match.
   5195                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5196                     break;
   5197                 }
   5198             }
   5199             break;
   5200 
   5201 
   5202         case URX_DOTANY_ALL:
   5203             {
   5204                 // . in dot-matches-all (including new lines) mode
   5205                 if (fp->fInputIdx >= fActiveLimit) {
   5206                     // At end of input.  Match failed.  Backtrack out.
   5207                     fHitEnd = TRUE;
   5208                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5209                     break;
   5210                 }
   5211 
   5212                 // There is input left.  Advance over one char, except if we are
   5213                 //   at a cr/lf, advance over both of them.
   5214                 UChar32 c;
   5215                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5216                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   5217                     // In the case of a CR/LF, we need to advance over both.
   5218                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   5219                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
   5220                     }
   5221                 }
   5222             }
   5223             break;
   5224 
   5225 
   5226         case URX_DOTANY_UNIX:
   5227             {
   5228                 // '.' operator, matches all, but stops at end-of-line.
   5229                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   5230                 if (fp->fInputIdx >= fActiveLimit) {
   5231                     // At end of input.  Match failed.  Backtrack out.
   5232                     fHitEnd = TRUE;
   5233                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5234                     break;
   5235                 }
   5236 
   5237                 // There is input left.  Advance over one char, unless we've hit end-of-line
   5238                 UChar32 c;
   5239                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5240                 if (c == 0x0a) {
   5241                     // End of line in normal mode.   '.' does not match the \n
   5242                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5243                 }
   5244             }
   5245             break;
   5246 
   5247 
   5248         case URX_JMP:
   5249             fp->fPatIdx = opValue;
   5250             break;
   5251 
   5252         case URX_FAIL:
   5253             isMatch = FALSE;
   5254             goto breakFromLoop;
   5255 
   5256         case URX_JMP_SAV:
   5257             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   5258             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   5259             fp->fPatIdx = opValue;                         // Then JMP.
   5260             break;
   5261 
   5262         case URX_JMP_SAV_X:
   5263             // This opcode is used with (x)+, when x can match a zero length string.
   5264             // Same as JMP_SAV, except conditional on the match having made forward progress.
   5265             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   5266             //   data address of the input position at the start of the loop.
   5267             {
   5268                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   5269                 int32_t  stoOp = (int32_t)pat[opValue-1];
   5270                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   5271                 int32_t  frameLoc = URX_VAL(stoOp);
   5272                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   5273                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
   5274                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   5275                 if (prevInputIdx < fp->fInputIdx) {
   5276                     // The match did make progress.  Repeat the loop.
   5277                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   5278                     fp->fPatIdx = opValue;
   5279                     fp->fExtra[frameLoc] = fp->fInputIdx;
   5280                 }
   5281                 // If the input position did not advance, we do nothing here,
   5282                 //   execution will fall out of the loop.
   5283             }
   5284             break;
   5285 
   5286         case URX_CTR_INIT:
   5287             {
   5288                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5289                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   5290 
   5291                 // Pick up the three extra operands that CTR_INIT has, and
   5292                 //    skip the pattern location counter past
   5293                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5294                 fp->fPatIdx += 3;
   5295                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5296                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5297                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5298                 U_ASSERT(minCount>=0);
   5299                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5300                 U_ASSERT(loopLoc>fp->fPatIdx);
   5301 
   5302                 if (minCount == 0) {
   5303                     fp = StateSave(fp, loopLoc+1, status);
   5304                 }
   5305                 if (maxCount == 0) {
   5306                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5307                 }
   5308             }
   5309             break;
   5310 
   5311         case URX_CTR_LOOP:
   5312             {
   5313                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5314                 int32_t initOp = (int32_t)pat[opValue];
   5315                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   5316                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5317                 int32_t minCount  = (int32_t)pat[opValue+2];
   5318                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5319                 // Increment the counter.  Note: we DIDN'T worry about counter
   5320                 //   overflow, since the data comes from UnicodeStrings, which
   5321                 //   stores its length in an int32_t. Do we have to think about
   5322                 //   this now that we're using UText? Probably not, since the length
   5323                 //   in UChar32s is still an int32_t.
   5324                 (*pCounter)++;
   5325                 U_ASSERT(*pCounter > 0);
   5326                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   5327                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   5328                     break;
   5329                 }
   5330                 if (*pCounter >= minCount) {
   5331                     fp = StateSave(fp, fp->fPatIdx, status);
   5332                 }
   5333                 fp->fPatIdx = opValue + 4;    // Loop back.
   5334             }
   5335             break;
   5336 
   5337         case URX_CTR_INIT_NG:
   5338             {
   5339                 // Initialize a non-greedy loop
   5340                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5341                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   5342 
   5343                 // Pick up the three extra operands that CTR_INIT has, and
   5344                 //    skip the pattern location counter past
   5345                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5346                 fp->fPatIdx += 3;
   5347                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5348                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5349                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5350                 U_ASSERT(minCount>=0);
   5351                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5352                 U_ASSERT(loopLoc>fp->fPatIdx);
   5353 
   5354                 if (minCount == 0) {
   5355                     if (maxCount != 0) {
   5356                         fp = StateSave(fp, fp->fPatIdx, status);
   5357                     }
   5358                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   5359                 }
   5360             }
   5361             break;
   5362 
   5363         case URX_CTR_LOOP_NG:
   5364             {
   5365                 // Non-greedy {min, max} loops
   5366                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5367                 int32_t initOp = (int32_t)pat[opValue];
   5368                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   5369                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5370                 int32_t minCount  = (int32_t)pat[opValue+2];
   5371                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5372                 // Increment the counter.  Note: we DIDN'T worry about counter
   5373                 //   overflow, since the data comes from UnicodeStrings, which
   5374                 //   stores its length in an int32_t. Do we have to think about
   5375                 //   this now that we're using UText? Probably not, since the length
   5376                 //   in UChar32s is still an int32_t.
   5377                 (*pCounter)++;
   5378                 U_ASSERT(*pCounter > 0);
   5379 
   5380                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   5381                     // The loop has matched the maximum permitted number of times.
   5382                     //   Break out of here with no action.  Matching will
   5383                     //   continue with the following pattern.
   5384                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   5385                     break;
   5386                 }
   5387 
   5388                 if (*pCounter < minCount) {
   5389                     // We haven't met the minimum number of matches yet.
   5390                     //   Loop back for another one.
   5391                     fp->fPatIdx = opValue + 4;    // Loop back.
   5392                 } else {
   5393                     // We do have the minimum number of matches.
   5394                     //   Fall into the following pattern, but first do
   5395                     //   a state save to the top of the loop, so that a failure
   5396                     //   in the following pattern will try another iteration of the loop.
   5397                     fp = StateSave(fp, opValue + 4, status);
   5398                 }
   5399             }
   5400             break;
   5401 
   5402         case URX_STO_SP:
   5403             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5404             fData[opValue] = fStack->size();
   5405             break;
   5406 
   5407         case URX_LD_SP:
   5408             {
   5409                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5410                 int32_t newStackSize = (int32_t)fData[opValue];
   5411                 U_ASSERT(newStackSize <= fStack->size());
   5412                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5413                 if (newFP == (int64_t *)fp) {
   5414                     break;
   5415                 }
   5416                 int32_t i;
   5417                 for (i=0; i<fFrameSize; i++) {
   5418                     newFP[i] = ((int64_t *)fp)[i];
   5419                 }
   5420                 fp = (REStackFrame *)newFP;
   5421                 fStack->setSize(newStackSize);
   5422             }
   5423             break;
   5424 
   5425         case URX_BACKREF:
   5426         case URX_BACKREF_I:
   5427             {
   5428                 U_ASSERT(opValue < fFrameSize);
   5429                 int64_t groupStartIdx = fp->fExtra[opValue];
   5430                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5431                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5432                 int64_t len = groupEndIdx-groupStartIdx;
   5433                 if (groupStartIdx < 0) {
   5434                     // This capture group has not participated in the match thus far,
   5435                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5436                 }
   5437 
   5438                 if (len == 0) {
   5439                         //   The capture group match was of an empty string.
   5440                         //   Verified by testing:  Perl matches succeed in this case, so
   5441                         //   we do too.
   5442                         break;
   5443                     }
   5444 
   5445                 UBool  haveMatch = FALSE;
   5446                 if (fp->fInputIdx + len <= fActiveLimit) {
   5447                     if (opType == URX_BACKREF) {
   5448                         if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
   5449                             haveMatch = TRUE;
   5450                         }
   5451                     } else {
   5452                         if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
   5453                                   (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
   5454                             haveMatch = TRUE;
   5455                         }
   5456                     }
   5457                 } else {
   5458                     // TODO: probably need to do a partial string comparison, and only
   5459                     //       set HitEnd if the available input matched.  Ticket #6074
   5460                     fHitEnd = TRUE;
   5461                 }
   5462                 if (haveMatch) {
   5463                     fp->fInputIdx += len;     // Match.  Advance current input position.
   5464                 } else {
   5465                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5466                 }
   5467             }
   5468             break;
   5469 
   5470         case URX_STO_INP_LOC:
   5471             {
   5472                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   5473                 fp->fExtra[opValue] = fp->fInputIdx;
   5474             }
   5475             break;
   5476 
   5477         case URX_JMPX:
   5478             {
   5479                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5480                 fp->fPatIdx += 1;
   5481                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   5482                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   5483                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
   5484                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   5485                 if (savedInputIdx < fp->fInputIdx) {
   5486                     fp->fPatIdx = opValue;                               // JMP
   5487                 } else {
   5488                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   5489                 }
   5490             }
   5491             break;
   5492 
   5493         case URX_LA_START:
   5494             {
   5495                 // Entering a lookahead block.
   5496                 // Save Stack Ptr, Input Pos.
   5497                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5498                 fData[opValue]   = fStack->size();
   5499                 fData[opValue+1] = fp->fInputIdx;
   5500                 fActiveStart     = fLookStart;          // Set the match region change for
   5501                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   5502             }
   5503             break;
   5504 
   5505         case URX_LA_END:
   5506             {
   5507                 // Leaving a look-ahead block.
   5508                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   5509                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5510                 int32_t stackSize = fStack->size();
   5511                 int32_t newStackSize = (int32_t)fData[opValue];
   5512                 U_ASSERT(stackSize >= newStackSize);
   5513                 if (stackSize > newStackSize) {
   5514                     // Copy the current top frame back to the new (cut back) top frame.
   5515                     //   This makes the capture groups from within the look-ahead
   5516                     //   expression available.
   5517                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5518                     int32_t i;
   5519                     for (i=0; i<fFrameSize; i++) {
   5520                         newFP[i] = ((int64_t *)fp)[i];
   5521                     }
   5522                     fp = (REStackFrame *)newFP;
   5523                     fStack->setSize(newStackSize);
   5524                 }
   5525                 fp->fInputIdx = fData[opValue+1];
   5526 
   5527                 // Restore the active region bounds in the input string; they may have
   5528                 //    been changed because of transparent bounds on a Region.
   5529                 fActiveStart = fRegionStart;
   5530                 fActiveLimit = fRegionLimit;
   5531             }
   5532             break;
   5533 
   5534         case URX_ONECHAR_I:
   5535             if (fp->fInputIdx < fActiveLimit) {
   5536                 UChar32 c;
   5537                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5538                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5539                     break;
   5540                 }
   5541             } else {
   5542                 fHitEnd = TRUE;
   5543             }
   5544 
   5545             #ifdef REGEX_SMART_BACKTRACKING
   5546             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5547                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5548                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5549                     UBool success = FALSE;
   5550                     int64_t reverseIndex = fp->fInputIdx;
   5551                     UChar32 c;
   5552                     while (reverseIndex > backSearchIndex) {
   5553                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5554                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5555                             success = TRUE;
   5556                             break;
   5557                         } else if (c == U_SENTINEL) {
   5558                             break;
   5559                         }
   5560                     }
   5561                     if (success) {
   5562                         fHitEnd = FALSE;
   5563                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5564                         fp->fInputIdx = reverseIndex;
   5565                         if (fp->fInputIdx > backSearchIndex) {
   5566                             fp = StateSave(fp, fp->fPatIdx, status);
   5567                         }
   5568                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5569                         break;
   5570                     }
   5571                 }
   5572             }
   5573             #endif
   5574 
   5575             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5576             break;
   5577 
   5578         case URX_STRING_I:
   5579             {
   5580                 // Test input against a literal string.
   5581                 // Strings require two slots in the compiled pattern, one for the
   5582                 //   offset to the string text, and one for the length.
   5583                 const UCaseProps *csp = ucase_getSingleton();
   5584                 {
   5585                     int32_t stringStartIdx, stringLen;
   5586                     stringStartIdx = opValue;
   5587 
   5588                     op      = (int32_t)pat[fp->fPatIdx];
   5589                     fp->fPatIdx++;
   5590                     opType  = URX_TYPE(op);
   5591                     opValue = URX_VAL(op);
   5592                     U_ASSERT(opType == URX_STRING_LEN);
   5593                     stringLen = opValue;
   5594 
   5595                     const UChar *patternChars = litText+stringStartIdx;
   5596                     const UChar *patternEnd = patternChars+stringLen;
   5597 
   5598                     const UChar *foldChars = NULL;
   5599                     int32_t foldOffset, foldLength;
   5600                     UChar32 c;
   5601                     UBool c_is_valid = FALSE;
   5602 
   5603                     #ifdef REGEX_SMART_BACKTRACKING
   5604                     int32_t originalInputIdx = fp->fInputIdx;
   5605                     #endif
   5606                     UBool success = TRUE;
   5607 
   5608                     foldOffset = foldLength = 0;
   5609 
   5610                     while (patternChars < patternEnd && success) {
   5611                         if (fp->fInputIdx < fActiveLimit) {  // don't read past end of string
   5612                             if(foldOffset < foldLength) {
   5613                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5614                                 c_is_valid = TRUE;
   5615                             } else {
   5616                                 // test pre-condition of U16_NEXT: i < length
   5617                                 U_ASSERT(fp->fInputIdx < fActiveLimit);
   5618                                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5619                                 c_is_valid = TRUE;
   5620                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   5621                                 if(foldLength >= 0) {
   5622                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   5623                                         foldOffset = 0;
   5624                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5625                                     } else {
   5626                                         c = foldLength;
   5627                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
   5628                                     }
   5629                                 }
   5630                             }
   5631                         } else {
   5632                           c_is_valid = FALSE;
   5633                         }
   5634 
   5635                         if (fp->fInputIdx <= fActiveLimit && c_is_valid) {
   5636                             if (U_IS_BMP(c)) {
   5637                                 success = (*patternChars == c);
   5638                                 patternChars += 1;
   5639                             } else if (patternChars+1 < patternEnd) {
   5640                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   5641                                 patternChars += 2;
   5642                             }
   5643                         } else {
   5644                             success = FALSE;
   5645                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
   5646                         }
   5647                     }
   5648 
   5649                     if (!success) {
   5650                         #ifdef REGEX_SMART_BACKTRACKING
   5651                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   5652                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5653                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5654                                 // Reset to last start point
   5655                                 int64_t reverseIndex = originalInputIdx;
   5656                                 patternChars = litText+stringStartIdx;
   5657 
   5658                                 // Search backwards for a possible start
   5659                                 do {
   5660                                     U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5661                                     foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   5662                                     if(foldLength >= 0) {
   5663                                         if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   5664                                             foldOffset = 0;
   5665                                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5666                                         } else {
   5667                                             c = foldLength;
   5668                                             foldLength = foldOffset; // to avoid reading chars from the folding buffer
   5669                                         }
   5670                                     }
   5671 
   5672                                     if ((U_IS_BMP(c) && *patternChars == c) ||
   5673                                            (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   5674                                         success = TRUE;
   5675                                         break;
   5676                                     }
   5677                                 } while (reverseIndex > backSearchIndex);
   5678 
   5679                                 // And try again
   5680                                 if (success) {
   5681                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5682                                     fp->fInputIdx = reverseIndex;
   5683                                     if (fp->fInputIdx > backSearchIndex) {
   5684                                         fp = StateSave(fp, fp->fPatIdx, status);
   5685                                     }
   5686                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5687                                     break;
   5688                                 }
   5689                             }
   5690                         }
   5691                         #endif
   5692                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5693                     }
   5694                 }
   5695             }
   5696             break;
   5697 
   5698         case URX_LB_START:
   5699             {
   5700                 // Entering a look-behind block.
   5701                 // Save Stack Ptr, Input Pos.
   5702                 //   TODO:  implement transparent bounds.  Ticket #6067
   5703                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5704                 fData[opValue]   = fStack->size();
   5705                 fData[opValue+1] = fp->fInputIdx;
   5706                 // Init the variable containing the start index for attempted matches.
   5707                 fData[opValue+2] = -1;
   5708                 // Save input string length, then reset to pin any matches to end at
   5709                 //   the current position.
   5710                 fData[opValue+3] = fActiveLimit;
   5711                 fActiveLimit     = fp->fInputIdx;
   5712             }
   5713             break;
   5714 
   5715 
   5716         case URX_LB_CONT:
   5717             {
   5718                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   5719                 //    at all possible input starting positions.
   5720 
   5721                 // Fetch the min and max possible match lengths.  They are the operands
   5722                 //   of this op in the pattern.
   5723                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   5724                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   5725                 U_ASSERT(minML <= maxML);
   5726                 U_ASSERT(minML >= 0);
   5727 
   5728                 // Fetch (from data) the last input index where a match was attempted.
   5729                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5730                 int64_t  *lbStartIdx = &fData[opValue+2];
   5731                 if (*lbStartIdx < 0) {
   5732                     // First time through loop.
   5733                     *lbStartIdx = fp->fInputIdx - minML;
   5734                 } else {
   5735                     // 2nd through nth time through the loop.
   5736                     // Back up start position for match by one.
   5737                     if (*lbStartIdx == 0) {
   5738                         (*lbStartIdx)--;
   5739                     } else {
   5740                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5741                     }
   5742                 }
   5743 
   5744                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5745                     // We have tried all potential match starting points without
   5746                     //  getting a match.  Backtrack out, and out of the
   5747                     //   Look Behind altogether.
   5748                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5749                     int64_t restoreInputLen = fData[opValue+3];
   5750                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5751                     U_ASSERT(restoreInputLen <= fInputLength);
   5752                     fActiveLimit = restoreInputLen;
   5753                     break;
   5754                 }
   5755 
   5756                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5757                 //      (successful match will fall off the end of the loop.)
   5758                 fp = StateSave(fp, fp->fPatIdx-3, status);
   5759                 fp->fInputIdx =  *lbStartIdx;
   5760             }
   5761             break;
   5762 
   5763         case URX_LB_END:
   5764             // End of a look-behind block, after a successful match.
   5765             {
   5766                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5767                 if (fp->fInputIdx != fActiveLimit) {
   5768                     //  The look-behind expression matched, but the match did not
   5769                     //    extend all the way to the point that we are looking behind from.
   5770                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5771                     //     will retry the match starting at another position or fail
   5772                     //     the look-behind altogether, whichever is appropriate.
   5773                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5774                     break;
   5775                 }
   5776 
   5777                 // Look-behind match is good.  Restore the orignal input string length,
   5778                 //   which had been truncated to pin the end of the lookbehind match to the
   5779                 //   position being looked-behind.
   5780                 int64_t originalInputLen = fData[opValue+3];
   5781                 U_ASSERT(originalInputLen >= fActiveLimit);
   5782                 U_ASSERT(originalInputLen <= fInputLength);
   5783                 fActiveLimit = originalInputLen;
   5784             }
   5785             break;
   5786 
   5787 
   5788         case URX_LBN_CONT:
   5789             {
   5790                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   5791                 //    at all possible input starting positions.
   5792 
   5793                 // Fetch the extra parameters of this op.
   5794                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   5795                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   5796                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   5797                 continueLoc = URX_VAL(continueLoc);
   5798                 U_ASSERT(minML <= maxML);
   5799                 U_ASSERT(minML >= 0);
   5800                 U_ASSERT(continueLoc > fp->fPatIdx);
   5801 
   5802                 // Fetch (from data) the last input index where a match was attempted.
   5803                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5804                 int64_t  *lbStartIdx = &fData[opValue+2];
   5805                 if (*lbStartIdx < 0) {
   5806                     // First time through loop.
   5807                     *lbStartIdx = fp->fInputIdx - minML;
   5808                 } else {
   5809                     // 2nd through nth time through the loop.
   5810                     // Back up start position for match by one.
   5811                     if (*lbStartIdx == 0) {
   5812                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
   5813                     } else {
   5814                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5815                     }
   5816                 }
   5817 
   5818                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5819                     // We have tried all potential match starting points without
   5820                     //  getting a match, which means that the negative lookbehind as
   5821                     //  a whole has succeeded.  Jump forward to the continue location
   5822                     int64_t restoreInputLen = fData[opValue+3];
   5823                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5824                     U_ASSERT(restoreInputLen <= fInputLength);
   5825                     fActiveLimit = restoreInputLen;
   5826                     fp->fPatIdx = continueLoc;
   5827                     break;
   5828                 }
   5829 
   5830                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5831                 //      (successful match will cause a FAIL out of the loop altogether.)
   5832                 fp = StateSave(fp, fp->fPatIdx-4, status);
   5833                 fp->fInputIdx =  *lbStartIdx;
   5834             }
   5835             break;
   5836 
   5837         case URX_LBN_END:
   5838             // End of a negative look-behind block, after a successful match.
   5839             {
   5840                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5841                 if (fp->fInputIdx != fActiveLimit) {
   5842                     //  The look-behind expression matched, but the match did not
   5843                     //    extend all the way to the point that we are looking behind from.
   5844                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5845                     //     will retry the match starting at another position or succeed
   5846                     //     the look-behind altogether, whichever is appropriate.
   5847                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5848                     break;
   5849                 }
   5850 
   5851                 // Look-behind expression matched, which means look-behind test as
   5852                 //   a whole Fails
   5853 
   5854                 //   Restore the orignal input string length, which had been truncated
   5855                 //   inorder to pin the end of the lookbehind match
   5856                 //   to the position being looked-behind.
   5857                 int64_t originalInputLen = fData[opValue+3];
   5858                 U_ASSERT(originalInputLen >= fActiveLimit);
   5859                 U_ASSERT(originalInputLen <= fInputLength);
   5860                 fActiveLimit = originalInputLen;
   5861 
   5862                 // Restore original stack position, discarding any state saved
   5863                 //   by the successful pattern match.
   5864                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5865                 int32_t newStackSize = (int32_t)fData[opValue];
   5866                 U_ASSERT(fStack->size() > newStackSize);
   5867                 fStack->setSize(newStackSize);
   5868 
   5869                 //  FAIL, which will take control back to someplace
   5870                 //  prior to entering the look-behind test.
   5871                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5872             }
   5873             break;
   5874 
   5875 
   5876         case URX_LOOP_SR_I:
   5877             // Loop Initialization for the optimized implementation of
   5878             //     [some character set]*
   5879             //   This op scans through all matching input.
   5880             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5881             {
   5882                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5883                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5884                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   5885 
   5886                 // Loop through input, until either the input is exhausted or
   5887                 //   we reach a character that is not a member of the set.
   5888                 int32_t ix = (int32_t)fp->fInputIdx;
   5889                 for (;;) {
   5890                     if (ix >= fActiveLimit) {
   5891                         fHitEnd = TRUE;
   5892                         break;
   5893                     }
   5894                     UChar32   c;
   5895                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
   5896                     if (c<256) {
   5897                         if (s8->contains(c) == FALSE) {
   5898                             U16_BACK_1(inputBuf, 0, ix);
   5899                             break;
   5900                         }
   5901                     } else {
   5902                         if (s->contains(c) == FALSE) {
   5903                             U16_BACK_1(inputBuf, 0, ix);
   5904                             break;
   5905                         }
   5906                     }
   5907                 }
   5908 
   5909                 // If there were no matching characters, skip over the loop altogether.
   5910                 //   The loop doesn't run at all, a * op always succeeds.
   5911                 if (ix == fp->fInputIdx) {
   5912                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5913                     break;
   5914                 }
   5915 
   5916                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5917                 //   must follow.  It's operand is the stack location
   5918                 //   that holds the starting input index for the match of this [set]*
   5919                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5920                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5921                 int32_t stackLoc = URX_VAL(loopcOp);
   5922                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5923                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5924                 #ifdef REGEX_SMART_BACKTRACKING
   5925                 backSearchIndex = fp->fInputIdx;
   5926                 #endif
   5927                 fp->fInputIdx = ix;
   5928 
   5929                 // Save State to the URX_LOOP_C op that follows this one,
   5930                 //   so that match failures in the following code will return to there.
   5931                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5932                 fp = StateSave(fp, fp->fPatIdx, status);
   5933                 fp->fPatIdx++;
   5934             }
   5935             break;
   5936 
   5937 
   5938         case URX_LOOP_DOT_I:
   5939             // Loop Initialization for the optimized implementation of .*
   5940             //   This op scans through all remaining input.
   5941             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5942             {
   5943                 // Loop through input until the input is exhausted (we reach an end-of-line)
   5944                 // In DOTALL mode, we can just go straight to the end of the input.
   5945                 int32_t ix;
   5946                 if ((opValue & 1) == 1) {
   5947                     // Dot-matches-All mode.  Jump straight to the end of the string.
   5948                     ix = (int32_t)fActiveLimit;
   5949                     fHitEnd = TRUE;
   5950                 } else {
   5951                     // NOT DOT ALL mode.  Line endings do not match '.'
   5952                     // Scan forward until a line ending or end of input.
   5953                     ix = (int32_t)fp->fInputIdx;
   5954                     for (;;) {
   5955                         if (ix >= fActiveLimit) {
   5956                             fHitEnd = TRUE;
   5957                             break;
   5958                         }
   5959                         UChar32   c;
   5960                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
   5961                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   5962                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   5963                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   5964                                    ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
   5965                                 //  char is a line ending.  Put the input pos back to the
   5966                                 //    line ending char, and exit the scanning loop.
   5967                                 U16_BACK_1(inputBuf, 0, ix);
   5968                                 break;
   5969                             }
   5970                         }
   5971                     }
   5972                 }
   5973 
   5974                 // If there were no matching characters, skip over the loop altogether.
   5975                 //   The loop doesn't run at all, a * op always succeeds.
   5976                 if (ix == fp->fInputIdx) {
   5977                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5978                     break;
   5979                 }
   5980 
   5981                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5982                 //   must follow.  It's operand is the stack location
   5983                 //   that holds the starting input index for the match of this .*
   5984                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5985                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5986                 int32_t stackLoc = URX_VAL(loopcOp);
   5987                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5988                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5989                 #ifdef REGEX_SMART_BACKTRACKING
   5990                 backSearchIndex = fp->fInputIdx;
   5991                 #endif
   5992                 fp->fInputIdx = ix;
   5993 
   5994                 // Save State to the URX_LOOP_C op that follows this one,
   5995                 //   so that match failures in the following code will return to there.
   5996                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5997                 fp = StateSave(fp, fp->fPatIdx, status);
   5998                 fp->fPatIdx++;
   5999             }
   6000             break;
   6001 
   6002 
   6003         case URX_LOOP_C:
   6004             {
   6005                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   6006                 backSearchIndex = (int32_t)fp->fExtra[opValue];
   6007                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   6008                 if (backSearchIndex == fp->fInputIdx) {
   6009                     // We've backed up the input idx to the point that the loop started.
   6010                     // The loop is done.  Leave here without saving state.
   6011                     //  Subsequent failures won't come back here.
   6012                     break;
   6013                 }
   6014                 // Set up for the next iteration of the loop, with input index
   6015                 //   backed up by one from the last time through,
   6016                 //   and a state save to this instruction in case the following code fails again.
   6017                 //   (We're going backwards because this loop emulates stack unwinding, not
   6018                 //    the initial scan forward.)
   6019                 U_ASSERT(fp->fInputIdx > 0);
   6020                 UChar32 prevC;
   6021                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
   6022 
   6023                 if (prevC == 0x0a &&
   6024                     fp->fInputIdx > backSearchIndex &&
   6025                     inputBuf[fp->fInputIdx-1] == 0x0d) {
   6026                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   6027                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   6028                         // .*, stepping back over CRLF pair.
   6029                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   6030                     }
   6031                 }
   6032 
   6033 
   6034                 fp = StateSave(fp, fp->fPatIdx-1, status);
   6035             }
   6036             break;
   6037 
   6038 
   6039 
   6040         default:
   6041             // Trouble.  The compiled pattern contains an entry with an
   6042             //           unrecognized type tag.
   6043             U_ASSERT(FALSE);
   6044         }
   6045 
   6046         if (U_FAILURE(status)) {
   6047             isMatch = FALSE;
   6048             break;
   6049         }
   6050     }
   6051 
   6052 breakFromLoop:
   6053     fMatch = isMatch;
   6054     if (isMatch) {
   6055         fLastMatchEnd = fMatchEnd;
   6056         fMatchStart   = startIdx;
   6057         fMatchEnd     = fp->fInputIdx;
   6058         if (fTraceDebug) {
   6059             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
   6060         }
   6061     }
   6062     else
   6063     {
   6064         if (fTraceDebug) {
   6065             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   6066         }
   6067     }
   6068 
   6069     fFrame = fp;                // The active stack frame when the engine stopped.
   6070     //   Contains the capture group results that we need to
   6071     //    access later.
   6072 
   6073     return;
   6074 }
   6075 
   6076 
   6077 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
   6078 
   6079 U_NAMESPACE_END
   6080 
   6081 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   6082