Home | History | Annotate | Download | only in i18n
      1 /*
      2 **************************************************************************
      3 *   Copyright (C) 2002-2013 International Business Machines Corporation  *
      4 *   and others. All rights reserved.                                     *
      5 **************************************************************************
      6 */
      7 //
      8 //  file:  rematch.cpp
      9 //
     10 //         Contains the implementation of class RegexMatcher,
     11 //         which is one of the main API classes for the ICU regular expression package.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     16 
     17 #include "unicode/regex.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/rbbi.h"
     22 #include "unicode/utf.h"
     23 #include "unicode/utf16.h"
     24 #include "uassert.h"
     25 #include "cmemory.h"
     26 #include "uvector.h"
     27 #include "uvectr32.h"
     28 #include "uvectr64.h"
     29 #include "regeximp.h"
     30 #include "regexst.h"
     31 #include "regextxt.h"
     32 #include "ucase.h"
     33 
     34 // #include <malloc.h>        // Needed for heapcheck testing
     35 
     36 
     37 // Find progress callback
     38 // ----------------------
     39 // Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
     40 //
     41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
     42     (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
     43 
     44 
     45 // Smart Backtracking
     46 // ------------------
     47 // When a failure would go back to a LOOP_C instruction,
     48 // strings, characters, and setrefs scan backwards for a valid start
     49 // character themselves, pop the stack, and save state, emulating the
     50 // LOOP_C's effect but assured that the next character of input is a
     51 // possible matching character.
     52 //
     53 // Good idea in theory; unfortunately it only helps out a few specific
     54 // cases and slows the engine down a little in the rest.
     55 
     56 U_NAMESPACE_BEGIN
     57 
     58 // Default limit for the size of the back track stack, to avoid system
     59 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
     60 // This value puts ICU's limits higher than most other regexp implementations,
     61 //    which use recursion rather than the heap, and take more storage per
     62 //    backtrack point.
     63 //
     64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
     65 
     66 // Time limit counter constant.
     67 //   Time limits for expression evaluation are in terms of quanta of work by
     68 //   the engine, each of which is 10,000 state saves.
     69 //   This constant determines that state saves per tick number.
     70 static const int32_t TIMER_INITIAL_VALUE = 10000;
     71 
     72 //-----------------------------------------------------------------------------
     73 //
     74 //   Constructor and Destructor
     75 //
     76 //-----------------------------------------------------------------------------
     77 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
     78     fDeferredStatus = U_ZERO_ERROR;
     79     init(fDeferredStatus);
     80     if (U_FAILURE(fDeferredStatus)) {
     81         return;
     82     }
     83     if (pat==NULL) {
     84         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
     85         return;
     86     }
     87     fPattern = pat;
     88     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
     89 }
     90 
     91 
     92 
     93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
     94                            uint32_t flags, UErrorCode &status) {
     95     init(status);
     96     if (U_FAILURE(status)) {
     97         return;
     98     }
     99     UParseError    pe;
    100     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    101     fPattern           = fPatternOwned;
    102 
    103     UText inputText = UTEXT_INITIALIZER;
    104     utext_openConstUnicodeString(&inputText, &input, &status);
    105     init2(&inputText, status);
    106     utext_close(&inputText);
    107 
    108     fInputUniStrMaybeMutable = TRUE;
    109 }
    110 
    111 
    112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
    113                            uint32_t flags, UErrorCode &status) {
    114     init(status);
    115     if (U_FAILURE(status)) {
    116         return;
    117     }
    118     UParseError    pe;
    119     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    120     if (U_FAILURE(status)) {
    121         return;
    122     }
    123 
    124     fPattern           = fPatternOwned;
    125     init2(input, status);
    126 }
    127 
    128 
    129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
    130                            uint32_t flags, UErrorCode &status) {
    131     init(status);
    132     if (U_FAILURE(status)) {
    133         return;
    134     }
    135     UParseError    pe;
    136     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    137     if (U_FAILURE(status)) {
    138         return;
    139     }
    140     fPattern           = fPatternOwned;
    141     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    142 }
    143 
    144 RegexMatcher::RegexMatcher(UText *regexp,
    145                            uint32_t flags, UErrorCode &status) {
    146     init(status);
    147     if (U_FAILURE(status)) {
    148         return;
    149     }
    150     UParseError    pe;
    151     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    152         if (U_FAILURE(status)) {
    153         return;
    154     }
    155 
    156     fPattern           = fPatternOwned;
    157     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    158 }
    159 
    160 
    161 
    162 
    163 RegexMatcher::~RegexMatcher() {
    164     delete fStack;
    165     if (fData != fSmallData) {
    166         uprv_free(fData);
    167         fData = NULL;
    168     }
    169     if (fPatternOwned) {
    170         delete fPatternOwned;
    171         fPatternOwned = NULL;
    172         fPattern = NULL;
    173     }
    174 
    175     if (fInput) {
    176         delete fInput;
    177     }
    178     if (fInputText) {
    179         utext_close(fInputText);
    180     }
    181     if (fAltInputText) {
    182         utext_close(fAltInputText);
    183     }
    184 
    185     #if UCONFIG_NO_BREAK_ITERATION==0
    186     delete fWordBreakItr;
    187     #endif
    188 }
    189 
    190 //
    191 //   init()   common initialization for use by all constructors.
    192 //            Initialize all fields, get the object into a consistent state.
    193 //            This must be done even when the initial status shows an error,
    194 //            so that the object is initialized sufficiently well for the destructor
    195 //            to run safely.
    196 //
    197 void RegexMatcher::init(UErrorCode &status) {
    198     fPattern           = NULL;
    199     fPatternOwned      = NULL;
    200     fFrameSize         = 0;
    201     fRegionStart       = 0;
    202     fRegionLimit       = 0;
    203     fAnchorStart       = 0;
    204     fAnchorLimit       = 0;
    205     fLookStart         = 0;
    206     fLookLimit         = 0;
    207     fActiveStart       = 0;
    208     fActiveLimit       = 0;
    209     fTransparentBounds = FALSE;
    210     fAnchoringBounds   = TRUE;
    211     fMatch             = FALSE;
    212     fMatchStart        = 0;
    213     fMatchEnd          = 0;
    214     fLastMatchEnd      = -1;
    215     fAppendPosition    = 0;
    216     fHitEnd            = FALSE;
    217     fRequireEnd        = FALSE;
    218     fStack             = NULL;
    219     fFrame             = NULL;
    220     fTimeLimit         = 0;
    221     fTime              = 0;
    222     fTickCounter       = 0;
    223     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    224     fCallbackFn        = NULL;
    225     fCallbackContext   = NULL;
    226     fFindProgressCallbackFn      = NULL;
    227     fFindProgressCallbackContext = NULL;
    228     fTraceDebug        = FALSE;
    229     fDeferredStatus    = status;
    230     fData              = fSmallData;
    231     fWordBreakItr      = NULL;
    232 
    233     fStack             = NULL;
    234     fInputText         = NULL;
    235     fAltInputText      = NULL;
    236     fInput             = NULL;
    237     fInputLength       = 0;
    238     fInputUniStrMaybeMutable = FALSE;
    239 
    240     if (U_FAILURE(status)) {
    241         fDeferredStatus = status;
    242     }
    243 }
    244 
    245 //
    246 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
    247 //            This handles the common setup to be done after the Pattern is available.
    248 //
    249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
    250     if (U_FAILURE(status)) {
    251         fDeferredStatus = status;
    252         return;
    253     }
    254 
    255     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
    256         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
    257         if (fData == NULL) {
    258             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    259             return;
    260         }
    261     }
    262 
    263     fStack = new UVector64(status);
    264     if (fStack == NULL) {
    265         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    266         return;
    267     }
    268 
    269     reset(input);
    270     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
    271     if (U_FAILURE(status)) {
    272         fDeferredStatus = status;
    273         return;
    274     }
    275 }
    276 
    277 
    278 static const UChar BACKSLASH  = 0x5c;
    279 static const UChar DOLLARSIGN = 0x24;
    280 //--------------------------------------------------------------------------------
    281 //
    282 //    appendReplacement
    283 //
    284 //--------------------------------------------------------------------------------
    285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
    286                                               const UnicodeString &replacement,
    287                                               UErrorCode &status) {
    288     UText replacementText = UTEXT_INITIALIZER;
    289 
    290     utext_openConstUnicodeString(&replacementText, &replacement, &status);
    291     if (U_SUCCESS(status)) {
    292         UText resultText = UTEXT_INITIALIZER;
    293         utext_openUnicodeString(&resultText, &dest, &status);
    294 
    295         if (U_SUCCESS(status)) {
    296             appendReplacement(&resultText, &replacementText, status);
    297             utext_close(&resultText);
    298         }
    299         utext_close(&replacementText);
    300     }
    301 
    302     return *this;
    303 }
    304 
    305 //
    306 //    appendReplacement, UText mode
    307 //
    308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
    309                                               UText *replacement,
    310                                               UErrorCode &status) {
    311     if (U_FAILURE(status)) {
    312         return *this;
    313     }
    314     if (U_FAILURE(fDeferredStatus)) {
    315         status = fDeferredStatus;
    316         return *this;
    317     }
    318     if (fMatch == FALSE) {
    319         status = U_REGEX_INVALID_STATE;
    320         return *this;
    321     }
    322 
    323     // Copy input string from the end of previous match to start of current match
    324     int64_t  destLen = utext_nativeLength(dest);
    325     if (fMatchStart > fAppendPosition) {
    326         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    327             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    328                                      (int32_t)(fMatchStart-fAppendPosition), &status);
    329         } else {
    330             int32_t len16;
    331             if (UTEXT_USES_U16(fInputText)) {
    332                 len16 = (int32_t)(fMatchStart-fAppendPosition);
    333             } else {
    334                 UErrorCode lengthStatus = U_ZERO_ERROR;
    335                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
    336             }
    337             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
    338             if (inputChars == NULL) {
    339                 status = U_MEMORY_ALLOCATION_ERROR;
    340                 return *this;
    341             }
    342             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
    343             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    344             uprv_free(inputChars);
    345         }
    346     }
    347     fAppendPosition = fMatchEnd;
    348 
    349 
    350     // scan the replacement text, looking for substitutions ($n) and \escapes.
    351     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
    352     //         move entire ranges not containing substitutions.
    353     UTEXT_SETNATIVEINDEX(replacement, 0);
    354     UChar32 c = UTEXT_NEXT32(replacement);
    355     while (c != U_SENTINEL) {
    356         if (c == BACKSLASH) {
    357             // Backslash Escape.  Copy the following char out without further checks.
    358             //                    Note:  Surrogate pairs don't need any special handling
    359             //                           The second half wont be a '$' or a '\', and
    360             //                           will move to the dest normally on the next
    361             //                           loop iteration.
    362             c = UTEXT_CURRENT32(replacement);
    363             if (c == U_SENTINEL) {
    364                 break;
    365             }
    366 
    367             if (c==0x55/*U*/ || c==0x75/*u*/) {
    368                 // We have a \udddd or \Udddddddd escape sequence.
    369                 int32_t offset = 0;
    370                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
    371                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
    372                 if (escapedChar != (UChar32)0xFFFFFFFF) {
    373                     if (U_IS_BMP(escapedChar)) {
    374                         UChar c16 = (UChar)escapedChar;
    375                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    376                     } else {
    377                         UChar surrogate[2];
    378                         surrogate[0] = U16_LEAD(escapedChar);
    379                         surrogate[1] = U16_TRAIL(escapedChar);
    380                         if (U_SUCCESS(status)) {
    381                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    382                         }
    383                     }
    384                     // TODO:  Report errors for mal-formed \u escapes?
    385                     //        As this is, the original sequence is output, which may be OK.
    386                     if (context.lastOffset == offset) {
    387                         (void)UTEXT_PREVIOUS32(replacement);
    388                     } else if (context.lastOffset != offset-1) {
    389                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
    390                     }
    391                 }
    392             } else {
    393                 (void)UTEXT_NEXT32(replacement);
    394                 // Plain backslash escape.  Just put out the escaped character.
    395                 if (U_IS_BMP(c)) {
    396                     UChar c16 = (UChar)c;
    397                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    398                 } else {
    399                     UChar surrogate[2];
    400                     surrogate[0] = U16_LEAD(c);
    401                     surrogate[1] = U16_TRAIL(c);
    402                     if (U_SUCCESS(status)) {
    403                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    404                     }
    405                 }
    406             }
    407         } else if (c != DOLLARSIGN) {
    408             // Normal char, not a $.  Copy it out without further checks.
    409             if (U_IS_BMP(c)) {
    410                 UChar c16 = (UChar)c;
    411                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    412             } else {
    413                 UChar surrogate[2];
    414                 surrogate[0] = U16_LEAD(c);
    415                 surrogate[1] = U16_TRAIL(c);
    416                 if (U_SUCCESS(status)) {
    417                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    418                 }
    419             }
    420         } else {
    421             // We've got a $.  Pick up a capture group number if one follows.
    422             // Consume at most the number of digits necessary for the largest capture
    423             // number that is valid for this pattern.
    424 
    425             int32_t numDigits = 0;
    426             int32_t groupNum  = 0;
    427             UChar32 digitC;
    428             for (;;) {
    429                 digitC = UTEXT_CURRENT32(replacement);
    430                 if (digitC == U_SENTINEL) {
    431                     break;
    432                 }
    433                 if (u_isdigit(digitC) == FALSE) {
    434                     break;
    435                 }
    436                 (void)UTEXT_NEXT32(replacement);
    437                 groupNum=groupNum*10 + u_charDigitValue(digitC);
    438                 numDigits++;
    439                 if (numDigits >= fPattern->fMaxCaptureDigits) {
    440                     break;
    441                 }
    442             }
    443 
    444 
    445             if (numDigits == 0) {
    446                 // The $ didn't introduce a group number at all.
    447                 // Treat it as just part of the substitution text.
    448                 UChar c16 = DOLLARSIGN;
    449                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    450             } else {
    451                 // Finally, append the capture group data to the destination.
    452                 destLen += appendGroup(groupNum, dest, status);
    453                 if (U_FAILURE(status)) {
    454                     // Can fail if group number is out of range.
    455                     break;
    456                 }
    457             }
    458         }
    459 
    460         if (U_FAILURE(status)) {
    461             break;
    462         } else {
    463             c = UTEXT_NEXT32(replacement);
    464         }
    465     }
    466 
    467     return *this;
    468 }
    469 
    470 
    471 
    472 //--------------------------------------------------------------------------------
    473 //
    474 //    appendTail     Intended to be used in conjunction with appendReplacement()
    475 //                   To the destination string, append everything following
    476 //                   the last match position from the input string.
    477 //
    478 //                   Note:  Match ranges do not affect appendTail or appendReplacement
    479 //
    480 //--------------------------------------------------------------------------------
    481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    482     UErrorCode status = U_ZERO_ERROR;
    483     UText resultText = UTEXT_INITIALIZER;
    484     utext_openUnicodeString(&resultText, &dest, &status);
    485 
    486     if (U_SUCCESS(status)) {
    487         appendTail(&resultText, status);
    488         utext_close(&resultText);
    489     }
    490 
    491     return dest;
    492 }
    493 
    494 //
    495 //   appendTail, UText mode
    496 //
    497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
    498     if (U_FAILURE(status)) {
    499         return dest;
    500     }
    501     if (U_FAILURE(fDeferredStatus)) {
    502         status = fDeferredStatus;
    503         return dest;
    504     }
    505 
    506     if (fInputLength > fAppendPosition) {
    507         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    508             int64_t destLen = utext_nativeLength(dest);
    509             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    510                           (int32_t)(fInputLength-fAppendPosition), &status);
    511         } else {
    512             int32_t len16;
    513             if (UTEXT_USES_U16(fInputText)) {
    514                 len16 = (int32_t)(fInputLength-fAppendPosition);
    515             } else {
    516                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
    517                 status = U_ZERO_ERROR; // buffer overflow
    518             }
    519 
    520             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
    521             if (inputChars == NULL) {
    522                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    523             } else {
    524                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
    525                 int64_t destLen = utext_nativeLength(dest);
    526                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    527                 uprv_free(inputChars);
    528             }
    529         }
    530     }
    531     return dest;
    532 }
    533 
    534 
    535 
    536 //--------------------------------------------------------------------------------
    537 //
    538 //   end
    539 //
    540 //--------------------------------------------------------------------------------
    541 int32_t RegexMatcher::end(UErrorCode &err) const {
    542     return end(0, err);
    543 }
    544 
    545 int64_t RegexMatcher::end64(UErrorCode &err) const {
    546     return end64(0, err);
    547 }
    548 
    549 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
    550     if (U_FAILURE(err)) {
    551         return -1;
    552     }
    553     if (fMatch == FALSE) {
    554         err = U_REGEX_INVALID_STATE;
    555         return -1;
    556     }
    557     if (group < 0 || group > fPattern->fGroupMap->size()) {
    558         err = U_INDEX_OUTOFBOUNDS_ERROR;
    559         return -1;
    560     }
    561     int64_t e = -1;
    562     if (group == 0) {
    563         e = fMatchEnd;
    564     } else {
    565         // Get the position within the stack frame of the variables for
    566         //    this capture group.
    567         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
    568         U_ASSERT(groupOffset < fPattern->fFrameSize);
    569         U_ASSERT(groupOffset >= 0);
    570         e = fFrame->fExtra[groupOffset + 1];
    571     }
    572 
    573         return e;
    574 }
    575 
    576 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    577     return (int32_t)end64(group, err);
    578 }
    579 
    580 
    581 //--------------------------------------------------------------------------------
    582 //
    583 //   find()
    584 //
    585 //--------------------------------------------------------------------------------
    586 UBool RegexMatcher::find() {
    587     // Start at the position of the last match end.  (Will be zero if the
    588     //   matcher has been reset.)
    589     //
    590     if (U_FAILURE(fDeferredStatus)) {
    591         return FALSE;
    592     }
    593 
    594     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    595         return findUsingChunk();
    596     }
    597 
    598     int64_t startPos = fMatchEnd;
    599     if (startPos==0) {
    600         startPos = fActiveStart;
    601     }
    602 
    603     if (fMatch) {
    604         // Save the position of any previous successful match.
    605         fLastMatchEnd = fMatchEnd;
    606 
    607         if (fMatchStart == fMatchEnd) {
    608             // Previous match had zero length.  Move start position up one position
    609             //  to avoid sending find() into a loop on zero-length matches.
    610             if (startPos >= fActiveLimit) {
    611                 fMatch = FALSE;
    612                 fHitEnd = TRUE;
    613                 return FALSE;
    614             }
    615             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    616             (void)UTEXT_NEXT32(fInputText);
    617             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    618         }
    619     } else {
    620         if (fLastMatchEnd >= 0) {
    621             // A previous find() failed to match.  Don't try again.
    622             //   (without this test, a pattern with a zero-length match
    623             //    could match again at the end of an input string.)
    624             fHitEnd = TRUE;
    625             return FALSE;
    626         }
    627     }
    628 
    629 
    630     // Compute the position in the input string beyond which a match can not begin, because
    631     //   the minimum length match would extend past the end of the input.
    632     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    633     //          Be aware of possible overflows if making changes here.
    634     int64_t testStartLimit;
    635     if (UTEXT_USES_U16(fInputText)) {
    636         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
    637         if (startPos > testStartLimit) {
    638             fMatch = FALSE;
    639             fHitEnd = TRUE;
    640             return FALSE;
    641         }
    642     } else {
    643         // For now, let the matcher discover that it can't match on its own
    644         // We don't know how long the match len is in native characters
    645         testStartLimit = fActiveLimit;
    646     }
    647 
    648     UChar32  c;
    649     U_ASSERT(startPos >= 0);
    650 
    651     switch (fPattern->fStartType) {
    652     case START_NO_INFO:
    653         // No optimization was found.
    654         //  Try a match at each input position.
    655         for (;;) {
    656             MatchAt(startPos, FALSE, fDeferredStatus);
    657             if (U_FAILURE(fDeferredStatus)) {
    658                 return FALSE;
    659             }
    660             if (fMatch) {
    661                 return TRUE;
    662             }
    663             if (startPos >= testStartLimit) {
    664                 fHitEnd = TRUE;
    665                 return FALSE;
    666             }
    667             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    668             (void)UTEXT_NEXT32(fInputText);
    669             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    670             // Note that it's perfectly OK for a pattern to have a zero-length
    671             //   match at the end of a string, so we must make sure that the loop
    672             //   runs with startPos == testStartLimit the last time through.
    673             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    674                 return FALSE;
    675         }
    676         U_ASSERT(FALSE);
    677 
    678     case START_START:
    679         // Matches are only possible at the start of the input string
    680         //   (pattern begins with ^ or \A)
    681         if (startPos > fActiveStart) {
    682             fMatch = FALSE;
    683             return FALSE;
    684         }
    685         MatchAt(startPos, FALSE, fDeferredStatus);
    686         if (U_FAILURE(fDeferredStatus)) {
    687             return FALSE;
    688         }
    689         return fMatch;
    690 
    691 
    692     case START_SET:
    693         {
    694             // Match may start on any char from a pre-computed set.
    695             U_ASSERT(fPattern->fMinMatchLen > 0);
    696             int64_t pos;
    697             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    698             for (;;) {
    699                 c = UTEXT_NEXT32(fInputText);
    700                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    701                 // c will be -1 (U_SENTINEL) at end of text, in which case we
    702                 // skip this next block (so we don't have a negative array index)
    703                 // and handle end of text in the following block.
    704                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    705                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
    706                     MatchAt(startPos, FALSE, fDeferredStatus);
    707                     if (U_FAILURE(fDeferredStatus)) {
    708                         return FALSE;
    709                     }
    710                     if (fMatch) {
    711                         return TRUE;
    712                     }
    713                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    714                 }
    715                 if (startPos >= testStartLimit) {
    716                     fMatch = FALSE;
    717                     fHitEnd = TRUE;
    718                     return FALSE;
    719                 }
    720                 startPos = pos;
    721 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    722                     return FALSE;
    723             }
    724         }
    725         U_ASSERT(FALSE);
    726 
    727     case START_STRING:
    728     case START_CHAR:
    729         {
    730             // Match starts on exactly one char.
    731             U_ASSERT(fPattern->fMinMatchLen > 0);
    732             UChar32 theChar = fPattern->fInitialChar;
    733             int64_t pos;
    734             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    735             for (;;) {
    736                 c = UTEXT_NEXT32(fInputText);
    737                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    738                 if (c == theChar) {
    739                     MatchAt(startPos, FALSE, fDeferredStatus);
    740                     if (U_FAILURE(fDeferredStatus)) {
    741                         return FALSE;
    742                     }
    743                     if (fMatch) {
    744                         return TRUE;
    745                     }
    746                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    747                 }
    748                 if (startPos >= testStartLimit) {
    749                     fMatch = FALSE;
    750                     fHitEnd = TRUE;
    751                     return FALSE;
    752                 }
    753                 startPos = pos;
    754 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    755                     return FALSE;
    756            }
    757         }
    758         U_ASSERT(FALSE);
    759 
    760     case START_LINE:
    761         {
    762             UChar32  c;
    763             if (startPos == fAnchorStart) {
    764                 MatchAt(startPos, FALSE, fDeferredStatus);
    765                 if (U_FAILURE(fDeferredStatus)) {
    766                     return FALSE;
    767                 }
    768                 if (fMatch) {
    769                     return TRUE;
    770                 }
    771                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    772                 c = UTEXT_NEXT32(fInputText);
    773                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    774             } else {
    775                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    776                 c = UTEXT_PREVIOUS32(fInputText);
    777                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    778             }
    779 
    780             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
    781                 for (;;) {
    782                     if (c == 0x0a) {
    783                             MatchAt(startPos, FALSE, fDeferredStatus);
    784                             if (U_FAILURE(fDeferredStatus)) {
    785                                 return FALSE;
    786                             }
    787                             if (fMatch) {
    788                                 return TRUE;
    789                             }
    790                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    791                     }
    792                     if (startPos >= testStartLimit) {
    793                         fMatch = FALSE;
    794                         fHitEnd = TRUE;
    795                         return FALSE;
    796                     }
    797                     c = UTEXT_NEXT32(fInputText);
    798                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    799                     // Note that it's perfectly OK for a pattern to have a zero-length
    800                     //   match at the end of a string, so we must make sure that the loop
    801                     //   runs with startPos == testStartLimit the last time through.
    802 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    803                         return FALSE;
    804                 }
    805             } else {
    806                 for (;;) {
    807                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
    808                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
    809                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
    810                                 (void)UTEXT_NEXT32(fInputText);
    811                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    812                             }
    813                             MatchAt(startPos, FALSE, fDeferredStatus);
    814                             if (U_FAILURE(fDeferredStatus)) {
    815                                 return FALSE;
    816                             }
    817                             if (fMatch) {
    818                                 return TRUE;
    819                             }
    820                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    821                     }
    822                     if (startPos >= testStartLimit) {
    823                         fMatch = FALSE;
    824                         fHitEnd = TRUE;
    825                         return FALSE;
    826                     }
    827                     c = UTEXT_NEXT32(fInputText);
    828                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    829                     // Note that it's perfectly OK for a pattern to have a zero-length
    830                     //   match at the end of a string, so we must make sure that the loop
    831                     //   runs with startPos == testStartLimit the last time through.
    832 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    833                         return FALSE;
    834                 }
    835             }
    836         }
    837 
    838     default:
    839         U_ASSERT(FALSE);
    840     }
    841 
    842     U_ASSERT(FALSE);
    843     return FALSE;
    844 }
    845 
    846 
    847 
    848 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
    849     if (U_FAILURE(status)) {
    850         return FALSE;
    851     }
    852     if (U_FAILURE(fDeferredStatus)) {
    853         status = fDeferredStatus;
    854         return FALSE;
    855     }
    856     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
    857                                           //        This will reset the region to be the full input length.
    858     if (start < 0) {
    859         status = U_INDEX_OUTOFBOUNDS_ERROR;
    860         return FALSE;
    861     }
    862 
    863     int64_t nativeStart = start;
    864     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
    865         status = U_INDEX_OUTOFBOUNDS_ERROR;
    866         return FALSE;
    867     }
    868     fMatchEnd = nativeStart;
    869     return find();
    870 }
    871 
    872 
    873 //--------------------------------------------------------------------------------
    874 //
    875 //   findUsingChunk() -- like find(), but with the advance knowledge that the
    876 //                       entire string is available in the UText's chunk buffer.
    877 //
    878 //--------------------------------------------------------------------------------
    879 UBool RegexMatcher::findUsingChunk() {
    880     // Start at the position of the last match end.  (Will be zero if the
    881     //   matcher has been reset.
    882     //
    883 
    884     int32_t startPos = (int32_t)fMatchEnd;
    885     if (startPos==0) {
    886         startPos = (int32_t)fActiveStart;
    887     }
    888 
    889     const UChar *inputBuf = fInputText->chunkContents;
    890 
    891     if (fMatch) {
    892         // Save the position of any previous successful match.
    893         fLastMatchEnd = fMatchEnd;
    894 
    895         if (fMatchStart == fMatchEnd) {
    896             // Previous match had zero length.  Move start position up one position
    897             //  to avoid sending find() into a loop on zero-length matches.
    898             if (startPos >= fActiveLimit) {
    899                 fMatch = FALSE;
    900                 fHitEnd = TRUE;
    901                 return FALSE;
    902             }
    903             U16_FWD_1(inputBuf, startPos, fInputLength);
    904         }
    905     } else {
    906         if (fLastMatchEnd >= 0) {
    907             // A previous find() failed to match.  Don't try again.
    908             //   (without this test, a pattern with a zero-length match
    909             //    could match again at the end of an input string.)
    910             fHitEnd = TRUE;
    911             return FALSE;
    912         }
    913     }
    914 
    915 
    916     // Compute the position in the input string beyond which a match can not begin, because
    917     //   the minimum length match would extend past the end of the input.
    918     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    919     //          Be aware of possible overflows if making changes here.
    920     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
    921     if (startPos > testLen) {
    922         fMatch = FALSE;
    923         fHitEnd = TRUE;
    924         return FALSE;
    925     }
    926 
    927     UChar32  c;
    928     U_ASSERT(startPos >= 0);
    929 
    930     switch (fPattern->fStartType) {
    931     case START_NO_INFO:
    932         // No optimization was found.
    933         //  Try a match at each input position.
    934         for (;;) {
    935             MatchChunkAt(startPos, FALSE, fDeferredStatus);
    936             if (U_FAILURE(fDeferredStatus)) {
    937                 return FALSE;
    938             }
    939             if (fMatch) {
    940                 return TRUE;
    941             }
    942             if (startPos >= testLen) {
    943                 fHitEnd = TRUE;
    944                 return FALSE;
    945             }
    946             U16_FWD_1(inputBuf, startPos, fActiveLimit);
    947             // Note that it's perfectly OK for a pattern to have a zero-length
    948             //   match at the end of a string, so we must make sure that the loop
    949             //   runs with startPos == testLen the last time through.
    950             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    951                 return FALSE;
    952         }
    953         U_ASSERT(FALSE);
    954 
    955     case START_START:
    956         // Matches are only possible at the start of the input string
    957         //   (pattern begins with ^ or \A)
    958         if (startPos > fActiveStart) {
    959             fMatch = FALSE;
    960             return FALSE;
    961         }
    962         MatchChunkAt(startPos, FALSE, fDeferredStatus);
    963         if (U_FAILURE(fDeferredStatus)) {
    964             return FALSE;
    965         }
    966         return fMatch;
    967 
    968 
    969     case START_SET:
    970     {
    971         // Match may start on any char from a pre-computed set.
    972         U_ASSERT(fPattern->fMinMatchLen > 0);
    973         for (;;) {
    974             int32_t pos = startPos;
    975             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    976             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    977                 (c>=256 && fPattern->fInitialChars->contains(c))) {
    978                 MatchChunkAt(pos, FALSE, fDeferredStatus);
    979                 if (U_FAILURE(fDeferredStatus)) {
    980                     return FALSE;
    981                 }
    982                 if (fMatch) {
    983                     return TRUE;
    984                 }
    985             }
    986             if (pos >= testLen) {
    987                 fMatch = FALSE;
    988                 fHitEnd = TRUE;
    989                 return FALSE;
    990             }
    991             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    992                 return FALSE;
    993         }
    994     }
    995         U_ASSERT(FALSE);
    996 
    997     case START_STRING:
    998     case START_CHAR:
    999     {
   1000         // Match starts on exactly one char.
   1001         U_ASSERT(fPattern->fMinMatchLen > 0);
   1002         UChar32 theChar = fPattern->fInitialChar;
   1003         for (;;) {
   1004             int32_t pos = startPos;
   1005             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
   1006             if (c == theChar) {
   1007                 MatchChunkAt(pos, FALSE, fDeferredStatus);
   1008                 if (U_FAILURE(fDeferredStatus)) {
   1009                     return FALSE;
   1010                 }
   1011                 if (fMatch) {
   1012                     return TRUE;
   1013                 }
   1014             }
   1015             if (pos >= testLen) {
   1016                 fMatch = FALSE;
   1017                 fHitEnd = TRUE;
   1018                 return FALSE;
   1019             }
   1020             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1021                 return FALSE;
   1022         }
   1023     }
   1024         U_ASSERT(FALSE);
   1025 
   1026     case START_LINE:
   1027     {
   1028         UChar32  c;
   1029         if (startPos == fAnchorStart) {
   1030             MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1031             if (U_FAILURE(fDeferredStatus)) {
   1032                 return FALSE;
   1033             }
   1034             if (fMatch) {
   1035                 return TRUE;
   1036             }
   1037             U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1038         }
   1039 
   1040         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
   1041             for (;;) {
   1042                 c = inputBuf[startPos-1];
   1043                 if (c == 0x0a) {
   1044                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1045                     if (U_FAILURE(fDeferredStatus)) {
   1046                         return FALSE;
   1047                     }
   1048                     if (fMatch) {
   1049                         return TRUE;
   1050                     }
   1051                 }
   1052                 if (startPos >= testLen) {
   1053                     fMatch = FALSE;
   1054                     fHitEnd = TRUE;
   1055                     return FALSE;
   1056                 }
   1057                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1058                 // Note that it's perfectly OK for a pattern to have a zero-length
   1059                 //   match at the end of a string, so we must make sure that the loop
   1060                 //   runs with startPos == testLen the last time through.
   1061 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1062                     return FALSE;
   1063             }
   1064         } else {
   1065             for (;;) {
   1066                 c = inputBuf[startPos-1];
   1067                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   1068                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
   1069                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
   1070                         startPos++;
   1071                     }
   1072                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1073                     if (U_FAILURE(fDeferredStatus)) {
   1074                         return FALSE;
   1075                     }
   1076                     if (fMatch) {
   1077                         return TRUE;
   1078                     }
   1079                 }
   1080                 if (startPos >= testLen) {
   1081                     fMatch = FALSE;
   1082                     fHitEnd = TRUE;
   1083                     return FALSE;
   1084                 }
   1085                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1086                 // Note that it's perfectly OK for a pattern to have a zero-length
   1087                 //   match at the end of a string, so we must make sure that the loop
   1088                 //   runs with startPos == testLen the last time through.
   1089 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1090                     return FALSE;
   1091             }
   1092         }
   1093     }
   1094 
   1095     default:
   1096         U_ASSERT(FALSE);
   1097     }
   1098 
   1099     U_ASSERT(FALSE);
   1100     return FALSE;
   1101 }
   1102 
   1103 
   1104 
   1105 //--------------------------------------------------------------------------------
   1106 //
   1107 //  group()
   1108 //
   1109 //--------------------------------------------------------------------------------
   1110 UnicodeString RegexMatcher::group(UErrorCode &status) const {
   1111     return group(0, status);
   1112 }
   1113 
   1114 //  Return immutable shallow clone
   1115 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
   1116     return group(0, dest, group_len, status);
   1117 }
   1118 
   1119 //  Return immutable shallow clone
   1120 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
   1121     group_len = 0;
   1122     if (U_FAILURE(status)) {
   1123         return dest;
   1124     }
   1125     if (U_FAILURE(fDeferredStatus)) {
   1126         status = fDeferredStatus;
   1127     } else if (fMatch == FALSE) {
   1128         status = U_REGEX_INVALID_STATE;
   1129     } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1130         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1131     }
   1132 
   1133     if (U_FAILURE(status)) {
   1134         return dest;
   1135     }
   1136 
   1137     int64_t s, e;
   1138     if (groupNum == 0) {
   1139         s = fMatchStart;
   1140         e = fMatchEnd;
   1141     } else {
   1142         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1143         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1144         U_ASSERT(groupOffset >= 0);
   1145         s = fFrame->fExtra[groupOffset];
   1146         e = fFrame->fExtra[groupOffset+1];
   1147     }
   1148 
   1149     if (s < 0) {
   1150         // A capture group wasn't part of the match
   1151         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1152     }
   1153     U_ASSERT(s <= e);
   1154     group_len = e - s;
   1155 
   1156     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1157     if (dest)
   1158         UTEXT_SETNATIVEINDEX(dest, s);
   1159     return dest;
   1160 }
   1161 
   1162 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
   1163     UnicodeString result;
   1164     if (U_FAILURE(status)) {
   1165         return result;
   1166     }
   1167     UText resultText = UTEXT_INITIALIZER;
   1168     utext_openUnicodeString(&resultText, &result, &status);
   1169     group(groupNum, &resultText, status);
   1170     utext_close(&resultText);
   1171     return result;
   1172 }
   1173 
   1174 
   1175 //  Return deep (mutable) clone
   1176 //		Technology Preview (as an API), but note that the UnicodeString API is implemented
   1177 //		using this function.
   1178 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1179     if (U_FAILURE(status)) {
   1180         return dest;
   1181     }
   1182 
   1183     if (U_FAILURE(fDeferredStatus)) {
   1184         status = fDeferredStatus;
   1185     } else if (fMatch == FALSE) {
   1186         status = U_REGEX_INVALID_STATE;
   1187     } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1188         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1189     }
   1190     if (U_FAILURE(status)) {
   1191         return dest;
   1192     }
   1193 
   1194     int64_t s, e;
   1195     if (groupNum == 0) {
   1196         s = fMatchStart;
   1197         e = fMatchEnd;
   1198     } else {
   1199         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1200         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1201         U_ASSERT(groupOffset >= 0);
   1202         s = fFrame->fExtra[groupOffset];
   1203         e = fFrame->fExtra[groupOffset+1];
   1204     }
   1205 
   1206     if (s < 0) {
   1207         // A capture group wasn't part of the match
   1208         if (dest) {
   1209             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1210             return dest;
   1211         } else {
   1212             return utext_openUChars(NULL, NULL, 0, &status);
   1213         }
   1214     }
   1215     U_ASSERT(s <= e);
   1216 
   1217     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1218         U_ASSERT(e <= fInputLength);
   1219         if (dest) {
   1220             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1221         } else {
   1222             UText groupText = UTEXT_INITIALIZER;
   1223             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
   1224             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1225             utext_close(&groupText);
   1226         }
   1227     } else {
   1228         int32_t len16;
   1229         if (UTEXT_USES_U16(fInputText)) {
   1230             len16 = (int32_t)(e-s);
   1231         } else {
   1232             UErrorCode lengthStatus = U_ZERO_ERROR;
   1233             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1234         }
   1235         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1236         if (groupChars == NULL) {
   1237             status = U_MEMORY_ALLOCATION_ERROR;
   1238             return dest;
   1239         }
   1240         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1241 
   1242         if (dest) {
   1243             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
   1244         } else {
   1245             UText groupText = UTEXT_INITIALIZER;
   1246             utext_openUChars(&groupText, groupChars, len16, &status);
   1247             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1248             utext_close(&groupText);
   1249         }
   1250 
   1251         uprv_free(groupChars);
   1252     }
   1253     return dest;
   1254 }
   1255 
   1256 //--------------------------------------------------------------------------------
   1257 //
   1258 //  appendGroup() -- currently internal only, appends a group to a UText rather
   1259 //                   than replacing its contents
   1260 //
   1261 //--------------------------------------------------------------------------------
   1262 
   1263 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1264     if (U_FAILURE(status)) {
   1265         return 0;
   1266     }
   1267     if (U_FAILURE(fDeferredStatus)) {
   1268         status = fDeferredStatus;
   1269         return 0;
   1270     }
   1271     int64_t destLen = utext_nativeLength(dest);
   1272 
   1273     if (fMatch == FALSE) {
   1274         status = U_REGEX_INVALID_STATE;
   1275         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1276     }
   1277     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1278         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1279         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1280     }
   1281 
   1282     int64_t s, e;
   1283     if (groupNum == 0) {
   1284         s = fMatchStart;
   1285         e = fMatchEnd;
   1286     } else {
   1287         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1288         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1289         U_ASSERT(groupOffset >= 0);
   1290         s = fFrame->fExtra[groupOffset];
   1291         e = fFrame->fExtra[groupOffset+1];
   1292     }
   1293 
   1294     if (s < 0) {
   1295         // A capture group wasn't part of the match
   1296         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1297     }
   1298     U_ASSERT(s <= e);
   1299 
   1300     int64_t deltaLen;
   1301     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1302         U_ASSERT(e <= fInputLength);
   1303         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1304     } else {
   1305         int32_t len16;
   1306         if (UTEXT_USES_U16(fInputText)) {
   1307             len16 = (int32_t)(e-s);
   1308         } else {
   1309             UErrorCode lengthStatus = U_ZERO_ERROR;
   1310             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1311         }
   1312         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1313         if (groupChars == NULL) {
   1314             status = U_MEMORY_ALLOCATION_ERROR;
   1315             return 0;
   1316         }
   1317         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1318 
   1319         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
   1320         uprv_free(groupChars);
   1321     }
   1322     return deltaLen;
   1323 }
   1324 
   1325 
   1326 
   1327 //--------------------------------------------------------------------------------
   1328 //
   1329 //  groupCount()
   1330 //
   1331 //--------------------------------------------------------------------------------
   1332 int32_t RegexMatcher::groupCount() const {
   1333     return fPattern->fGroupMap->size();
   1334 }
   1335 
   1336 
   1337 
   1338 //--------------------------------------------------------------------------------
   1339 //
   1340 //  hasAnchoringBounds()
   1341 //
   1342 //--------------------------------------------------------------------------------
   1343 UBool RegexMatcher::hasAnchoringBounds() const {
   1344     return fAnchoringBounds;
   1345 }
   1346 
   1347 
   1348 //--------------------------------------------------------------------------------
   1349 //
   1350 //  hasTransparentBounds()
   1351 //
   1352 //--------------------------------------------------------------------------------
   1353 UBool RegexMatcher::hasTransparentBounds() const {
   1354     return fTransparentBounds;
   1355 }
   1356 
   1357 
   1358 
   1359 //--------------------------------------------------------------------------------
   1360 //
   1361 //  hitEnd()
   1362 //
   1363 //--------------------------------------------------------------------------------
   1364 UBool RegexMatcher::hitEnd() const {
   1365     return fHitEnd;
   1366 }
   1367 
   1368 
   1369 //--------------------------------------------------------------------------------
   1370 //
   1371 //  input()
   1372 //
   1373 //--------------------------------------------------------------------------------
   1374 const UnicodeString &RegexMatcher::input() const {
   1375     if (!fInput) {
   1376         UErrorCode status = U_ZERO_ERROR;
   1377         int32_t len16;
   1378         if (UTEXT_USES_U16(fInputText)) {
   1379             len16 = (int32_t)fInputLength;
   1380         } else {
   1381             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
   1382             status = U_ZERO_ERROR; // overflow, length status
   1383         }
   1384         UnicodeString *result = new UnicodeString(len16, 0, 0);
   1385 
   1386         UChar *inputChars = result->getBuffer(len16);
   1387         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
   1388         result->releaseBuffer(len16);
   1389 
   1390         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
   1391     }
   1392 
   1393     return *fInput;
   1394 }
   1395 
   1396 //--------------------------------------------------------------------------------
   1397 //
   1398 //  inputText()
   1399 //
   1400 //--------------------------------------------------------------------------------
   1401 UText *RegexMatcher::inputText() const {
   1402     return fInputText;
   1403 }
   1404 
   1405 
   1406 //--------------------------------------------------------------------------------
   1407 //
   1408 //  getInput() -- like inputText(), but makes a clone or copies into another UText
   1409 //
   1410 //--------------------------------------------------------------------------------
   1411 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
   1412     if (U_FAILURE(status)) {
   1413         return dest;
   1414     }
   1415     if (U_FAILURE(fDeferredStatus)) {
   1416         status = fDeferredStatus;
   1417         return dest;
   1418     }
   1419 
   1420     if (dest) {
   1421         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1422             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
   1423         } else {
   1424             int32_t input16Len;
   1425             if (UTEXT_USES_U16(fInputText)) {
   1426                 input16Len = (int32_t)fInputLength;
   1427             } else {
   1428                 UErrorCode lengthStatus = U_ZERO_ERROR;
   1429                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
   1430             }
   1431             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
   1432             if (inputChars == NULL) {
   1433                 return dest;
   1434             }
   1435 
   1436             status = U_ZERO_ERROR;
   1437             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
   1438             status = U_ZERO_ERROR;
   1439             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
   1440 
   1441             uprv_free(inputChars);
   1442         }
   1443         return dest;
   1444     } else {
   1445         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1446     }
   1447 }
   1448 
   1449 
   1450 static UBool compat_SyncMutableUTextContents(UText *ut);
   1451 static UBool compat_SyncMutableUTextContents(UText *ut) {
   1452     UBool retVal = FALSE;
   1453 
   1454     //  In the following test, we're really only interested in whether the UText should switch
   1455     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
   1456     //  will still point to the correct data.
   1457     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
   1458         UnicodeString *us=(UnicodeString *)ut->context;
   1459 
   1460         // Update to the latest length.
   1461         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
   1462         int32_t newLength = us->length();
   1463 
   1464         // Update the chunk description.
   1465         // The buffer may have switched between stack- and heap-based.
   1466         ut->chunkContents    = us->getBuffer();
   1467         ut->chunkLength      = newLength;
   1468         ut->chunkNativeLimit = newLength;
   1469         ut->nativeIndexingLimit = newLength;
   1470         retVal = TRUE;
   1471     }
   1472 
   1473     return retVal;
   1474 }
   1475 
   1476 //--------------------------------------------------------------------------------
   1477 //
   1478 //  lookingAt()
   1479 //
   1480 //--------------------------------------------------------------------------------
   1481 UBool RegexMatcher::lookingAt(UErrorCode &status) {
   1482     if (U_FAILURE(status)) {
   1483         return FALSE;
   1484     }
   1485     if (U_FAILURE(fDeferredStatus)) {
   1486         status = fDeferredStatus;
   1487         return FALSE;
   1488     }
   1489 
   1490     if (fInputUniStrMaybeMutable) {
   1491         if (compat_SyncMutableUTextContents(fInputText)) {
   1492         fInputLength = utext_nativeLength(fInputText);
   1493         reset();
   1494         }
   1495     }
   1496     else {
   1497         resetPreserveRegion();
   1498     }
   1499     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1500         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
   1501     } else {
   1502         MatchAt(fActiveStart, FALSE, status);
   1503     }
   1504     return fMatch;
   1505 }
   1506 
   1507 
   1508 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
   1509     if (U_FAILURE(status)) {
   1510         return FALSE;
   1511     }
   1512     if (U_FAILURE(fDeferredStatus)) {
   1513         status = fDeferredStatus;
   1514         return FALSE;
   1515     }
   1516     reset();
   1517 
   1518     if (start < 0) {
   1519         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1520         return FALSE;
   1521     }
   1522 
   1523     if (fInputUniStrMaybeMutable) {
   1524         if (compat_SyncMutableUTextContents(fInputText)) {
   1525         fInputLength = utext_nativeLength(fInputText);
   1526         reset();
   1527         }
   1528     }
   1529 
   1530     int64_t nativeStart;
   1531     nativeStart = start;
   1532     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1533         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1534         return FALSE;
   1535     }
   1536 
   1537     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1538         MatchChunkAt((int32_t)nativeStart, FALSE, status);
   1539     } else {
   1540         MatchAt(nativeStart, FALSE, status);
   1541     }
   1542     return fMatch;
   1543 }
   1544 
   1545 
   1546 
   1547 //--------------------------------------------------------------------------------
   1548 //
   1549 //  matches()
   1550 //
   1551 //--------------------------------------------------------------------------------
   1552 UBool RegexMatcher::matches(UErrorCode &status) {
   1553     if (U_FAILURE(status)) {
   1554         return FALSE;
   1555     }
   1556     if (U_FAILURE(fDeferredStatus)) {
   1557         status = fDeferredStatus;
   1558         return FALSE;
   1559     }
   1560 
   1561     if (fInputUniStrMaybeMutable) {
   1562         if (compat_SyncMutableUTextContents(fInputText)) {
   1563         fInputLength = utext_nativeLength(fInputText);
   1564         reset();
   1565         }
   1566     }
   1567     else {
   1568         resetPreserveRegion();
   1569     }
   1570 
   1571     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1572         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
   1573     } else {
   1574         MatchAt(fActiveStart, TRUE, status);
   1575     }
   1576     return fMatch;
   1577 }
   1578 
   1579 
   1580 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
   1581     if (U_FAILURE(status)) {
   1582         return FALSE;
   1583     }
   1584     if (U_FAILURE(fDeferredStatus)) {
   1585         status = fDeferredStatus;
   1586         return FALSE;
   1587     }
   1588     reset();
   1589 
   1590     if (start < 0) {
   1591         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1592         return FALSE;
   1593     }
   1594 
   1595     if (fInputUniStrMaybeMutable) {
   1596         if (compat_SyncMutableUTextContents(fInputText)) {
   1597         fInputLength = utext_nativeLength(fInputText);
   1598         reset();
   1599         }
   1600     }
   1601 
   1602     int64_t nativeStart;
   1603     nativeStart = start;
   1604     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1605         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1606         return FALSE;
   1607     }
   1608 
   1609     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1610         MatchChunkAt((int32_t)nativeStart, TRUE, status);
   1611     } else {
   1612         MatchAt(nativeStart, TRUE, status);
   1613     }
   1614     return fMatch;
   1615 }
   1616 
   1617 
   1618 
   1619 //--------------------------------------------------------------------------------
   1620 //
   1621 //    pattern
   1622 //
   1623 //--------------------------------------------------------------------------------
   1624 const RegexPattern &RegexMatcher::pattern() const {
   1625     return *fPattern;
   1626 }
   1627 
   1628 
   1629 
   1630 //--------------------------------------------------------------------------------
   1631 //
   1632 //    region
   1633 //
   1634 //--------------------------------------------------------------------------------
   1635 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
   1636     if (U_FAILURE(status)) {
   1637         return *this;
   1638     }
   1639 
   1640     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
   1641         status = U_ILLEGAL_ARGUMENT_ERROR;
   1642     }
   1643 
   1644     int64_t nativeStart = regionStart;
   1645     int64_t nativeLimit = regionLimit;
   1646     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
   1647       status = U_ILLEGAL_ARGUMENT_ERROR;
   1648     }
   1649 
   1650     if (startIndex == -1)
   1651       this->reset();
   1652     else
   1653       resetPreserveRegion();
   1654 
   1655     fRegionStart = nativeStart;
   1656     fRegionLimit = nativeLimit;
   1657     fActiveStart = nativeStart;
   1658     fActiveLimit = nativeLimit;
   1659 
   1660     if (startIndex != -1) {
   1661       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
   1662           status = U_INDEX_OUTOFBOUNDS_ERROR;
   1663       }
   1664       fMatchEnd = startIndex;
   1665     }
   1666 
   1667     if (!fTransparentBounds) {
   1668         fLookStart = nativeStart;
   1669         fLookLimit = nativeLimit;
   1670     }
   1671     if (fAnchoringBounds) {
   1672         fAnchorStart = nativeStart;
   1673         fAnchorLimit = nativeLimit;
   1674     }
   1675     return *this;
   1676 }
   1677 
   1678 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
   1679   return region(start, limit, -1, status);
   1680 }
   1681 
   1682 //--------------------------------------------------------------------------------
   1683 //
   1684 //    regionEnd
   1685 //
   1686 //--------------------------------------------------------------------------------
   1687 int32_t RegexMatcher::regionEnd() const {
   1688     return (int32_t)fRegionLimit;
   1689 }
   1690 
   1691 int64_t RegexMatcher::regionEnd64() const {
   1692     return fRegionLimit;
   1693 }
   1694 
   1695 //--------------------------------------------------------------------------------
   1696 //
   1697 //    regionStart
   1698 //
   1699 //--------------------------------------------------------------------------------
   1700 int32_t RegexMatcher::regionStart() const {
   1701     return (int32_t)fRegionStart;
   1702 }
   1703 
   1704 int64_t RegexMatcher::regionStart64() const {
   1705     return fRegionStart;
   1706 }
   1707 
   1708 
   1709 //--------------------------------------------------------------------------------
   1710 //
   1711 //    replaceAll
   1712 //
   1713 //--------------------------------------------------------------------------------
   1714 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
   1715     UText replacementText = UTEXT_INITIALIZER;
   1716     UText resultText = UTEXT_INITIALIZER;
   1717     UnicodeString resultString;
   1718     if (U_FAILURE(status)) {
   1719         return resultString;
   1720     }
   1721 
   1722     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1723     utext_openUnicodeString(&resultText, &resultString, &status);
   1724 
   1725     replaceAll(&replacementText, &resultText, status);
   1726 
   1727     utext_close(&resultText);
   1728     utext_close(&replacementText);
   1729 
   1730     return resultString;
   1731 }
   1732 
   1733 
   1734 //
   1735 //    replaceAll, UText mode
   1736 //
   1737 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
   1738     if (U_FAILURE(status)) {
   1739         return dest;
   1740     }
   1741     if (U_FAILURE(fDeferredStatus)) {
   1742         status = fDeferredStatus;
   1743         return dest;
   1744     }
   1745 
   1746     if (dest == NULL) {
   1747         UnicodeString emptyString;
   1748         UText empty = UTEXT_INITIALIZER;
   1749 
   1750         utext_openUnicodeString(&empty, &emptyString, &status);
   1751         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1752         utext_close(&empty);
   1753     }
   1754 
   1755     if (U_SUCCESS(status)) {
   1756         reset();
   1757         while (find()) {
   1758             appendReplacement(dest, replacement, status);
   1759             if (U_FAILURE(status)) {
   1760                 break;
   1761             }
   1762         }
   1763         appendTail(dest, status);
   1764     }
   1765 
   1766     return dest;
   1767 }
   1768 
   1769 
   1770 //--------------------------------------------------------------------------------
   1771 //
   1772 //    replaceFirst
   1773 //
   1774 //--------------------------------------------------------------------------------
   1775 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
   1776     UText replacementText = UTEXT_INITIALIZER;
   1777     UText resultText = UTEXT_INITIALIZER;
   1778     UnicodeString resultString;
   1779 
   1780     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1781     utext_openUnicodeString(&resultText, &resultString, &status);
   1782 
   1783     replaceFirst(&replacementText, &resultText, status);
   1784 
   1785     utext_close(&resultText);
   1786     utext_close(&replacementText);
   1787 
   1788     return resultString;
   1789 }
   1790 
   1791 //
   1792 //    replaceFirst, UText mode
   1793 //
   1794 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
   1795     if (U_FAILURE(status)) {
   1796         return dest;
   1797     }
   1798     if (U_FAILURE(fDeferredStatus)) {
   1799         status = fDeferredStatus;
   1800         return dest;
   1801     }
   1802 
   1803     reset();
   1804     if (!find()) {
   1805         return getInput(dest, status);
   1806     }
   1807 
   1808     if (dest == NULL) {
   1809         UnicodeString emptyString;
   1810         UText empty = UTEXT_INITIALIZER;
   1811 
   1812         utext_openUnicodeString(&empty, &emptyString, &status);
   1813         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1814         utext_close(&empty);
   1815     }
   1816 
   1817     appendReplacement(dest, replacement, status);
   1818     appendTail(dest, status);
   1819 
   1820     return dest;
   1821 }
   1822 
   1823 
   1824 //--------------------------------------------------------------------------------
   1825 //
   1826 //     requireEnd
   1827 //
   1828 //--------------------------------------------------------------------------------
   1829 UBool RegexMatcher::requireEnd() const {
   1830     return fRequireEnd;
   1831 }
   1832 
   1833 
   1834 //--------------------------------------------------------------------------------
   1835 //
   1836 //     reset
   1837 //
   1838 //--------------------------------------------------------------------------------
   1839 RegexMatcher &RegexMatcher::reset() {
   1840     fRegionStart    = 0;
   1841     fRegionLimit    = fInputLength;
   1842     fActiveStart    = 0;
   1843     fActiveLimit    = fInputLength;
   1844     fAnchorStart    = 0;
   1845     fAnchorLimit    = fInputLength;
   1846     fLookStart      = 0;
   1847     fLookLimit      = fInputLength;
   1848     resetPreserveRegion();
   1849     return *this;
   1850 }
   1851 
   1852 
   1853 
   1854 void RegexMatcher::resetPreserveRegion() {
   1855     fMatchStart     = 0;
   1856     fMatchEnd       = 0;
   1857     fLastMatchEnd   = -1;
   1858     fAppendPosition = 0;
   1859     fMatch          = FALSE;
   1860     fHitEnd         = FALSE;
   1861     fRequireEnd     = FALSE;
   1862     fTime           = 0;
   1863     fTickCounter    = TIMER_INITIAL_VALUE;
   1864     //resetStack(); // more expensive than it looks...
   1865 }
   1866 
   1867 
   1868 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
   1869     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
   1870     if (fPattern->fNeedsAltInput) {
   1871         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1872     }
   1873     fInputLength = utext_nativeLength(fInputText);
   1874 
   1875     reset();
   1876     delete fInput;
   1877     fInput = NULL;
   1878 
   1879     //  Do the following for any UnicodeString.
   1880     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
   1881     fInputUniStrMaybeMutable = TRUE;
   1882 
   1883     if (fWordBreakItr != NULL) {
   1884 #if UCONFIG_NO_BREAK_ITERATION==0
   1885         UErrorCode status = U_ZERO_ERROR;
   1886         fWordBreakItr->setText(fInputText, status);
   1887 #endif
   1888     }
   1889     return *this;
   1890 }
   1891 
   1892 
   1893 RegexMatcher &RegexMatcher::reset(UText *input) {
   1894     if (fInputText != input) {
   1895         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
   1896         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1897         fInputLength = utext_nativeLength(fInputText);
   1898 
   1899         delete fInput;
   1900         fInput = NULL;
   1901 
   1902         if (fWordBreakItr != NULL) {
   1903 #if UCONFIG_NO_BREAK_ITERATION==0
   1904             UErrorCode status = U_ZERO_ERROR;
   1905             fWordBreakItr->setText(input, status);
   1906 #endif
   1907         }
   1908     }
   1909     reset();
   1910     fInputUniStrMaybeMutable = FALSE;
   1911 
   1912     return *this;
   1913 }
   1914 
   1915 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
   1916     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
   1917     return *this;
   1918 }*/
   1919 
   1920 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
   1921     if (U_FAILURE(status)) {
   1922         return *this;
   1923     }
   1924     reset();       // Reset also resets the region to be the entire string.
   1925 
   1926     if (position < 0 || position > fActiveLimit) {
   1927         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1928         return *this;
   1929     }
   1930     fMatchEnd = position;
   1931     return *this;
   1932 }
   1933 
   1934 
   1935 //--------------------------------------------------------------------------------
   1936 //
   1937 //    refresh
   1938 //
   1939 //--------------------------------------------------------------------------------
   1940 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
   1941     if (U_FAILURE(status)) {
   1942         return *this;
   1943     }
   1944     if (input == NULL) {
   1945         status = U_ILLEGAL_ARGUMENT_ERROR;
   1946         return *this;
   1947     }
   1948     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
   1949         status = U_ILLEGAL_ARGUMENT_ERROR;
   1950         return *this;
   1951     }
   1952     int64_t  pos = utext_getNativeIndex(fInputText);
   1953     //  Shallow read-only clone of the new UText into the existing input UText
   1954     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
   1955     if (U_FAILURE(status)) {
   1956         return *this;
   1957     }
   1958     utext_setNativeIndex(fInputText, pos);
   1959 
   1960     if (fAltInputText != NULL) {
   1961         pos = utext_getNativeIndex(fAltInputText);
   1962         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
   1963         if (U_FAILURE(status)) {
   1964             return *this;
   1965         }
   1966         utext_setNativeIndex(fAltInputText, pos);
   1967     }
   1968     return *this;
   1969 }
   1970 
   1971 
   1972 
   1973 //--------------------------------------------------------------------------------
   1974 //
   1975 //    setTrace
   1976 //
   1977 //--------------------------------------------------------------------------------
   1978 void RegexMatcher::setTrace(UBool state) {
   1979     fTraceDebug = state;
   1980 }
   1981 
   1982 
   1983 
   1984 //---------------------------------------------------------------------
   1985 //
   1986 //   split
   1987 //
   1988 //---------------------------------------------------------------------
   1989 int32_t  RegexMatcher::split(const UnicodeString &input,
   1990         UnicodeString    dest[],
   1991         int32_t          destCapacity,
   1992         UErrorCode      &status)
   1993 {
   1994     UText inputText = UTEXT_INITIALIZER;
   1995     utext_openConstUnicodeString(&inputText, &input, &status);
   1996     if (U_FAILURE(status)) {
   1997         return 0;
   1998     }
   1999 
   2000     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
   2001     if (destText == NULL) {
   2002         status = U_MEMORY_ALLOCATION_ERROR;
   2003         return 0;
   2004     }
   2005     int32_t i;
   2006     for (i = 0; i < destCapacity; i++) {
   2007         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
   2008     }
   2009 
   2010     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
   2011 
   2012     for (i = 0; i < destCapacity; i++) {
   2013         utext_close(destText[i]);
   2014     }
   2015 
   2016     uprv_free(destText);
   2017     utext_close(&inputText);
   2018     return fieldCount;
   2019 }
   2020 
   2021 //
   2022 //   split, UText mode
   2023 //
   2024 int32_t  RegexMatcher::split(UText *input,
   2025         UText           *dest[],
   2026         int32_t          destCapacity,
   2027         UErrorCode      &status)
   2028 {
   2029     //
   2030     // Check arguements for validity
   2031     //
   2032     if (U_FAILURE(status)) {
   2033         return 0;
   2034     };
   2035 
   2036     if (destCapacity < 1) {
   2037         status = U_ILLEGAL_ARGUMENT_ERROR;
   2038         return 0;
   2039     }
   2040 
   2041     //
   2042     // Reset for the input text
   2043     //
   2044     reset(input);
   2045     int64_t   nextOutputStringStart = 0;
   2046     if (fActiveLimit == 0) {
   2047         return 0;
   2048     }
   2049 
   2050     //
   2051     // Loop through the input text, searching for the delimiter pattern
   2052     //
   2053     int32_t i;
   2054     int32_t numCaptureGroups = fPattern->fGroupMap->size();
   2055     for (i=0; ; i++) {
   2056         if (i>=destCapacity-1) {
   2057             // There is one or zero output string left.
   2058             // Fill the last output string with whatever is left from the input, then exit the loop.
   2059             //  ( i will be == destCapacity if we filled the output array while processing
   2060             //    capture groups of the delimiter expression, in which case we will discard the
   2061             //    last capture group saved in favor of the unprocessed remainder of the
   2062             //    input string.)
   2063             i = destCapacity-1;
   2064             if (fActiveLimit > nextOutputStringStart) {
   2065                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2066                     if (dest[i]) {
   2067                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2068                                       input->chunkContents+nextOutputStringStart,
   2069                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2070                     } else {
   2071                         UText remainingText = UTEXT_INITIALIZER;
   2072                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2073                                          fActiveLimit-nextOutputStringStart, &status);
   2074                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2075                         utext_close(&remainingText);
   2076                     }
   2077                 } else {
   2078                     UErrorCode lengthStatus = U_ZERO_ERROR;
   2079                     int32_t remaining16Length =
   2080                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2081                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2082                     if (remainingChars == NULL) {
   2083                         status = U_MEMORY_ALLOCATION_ERROR;
   2084                         break;
   2085                     }
   2086 
   2087                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2088                     if (dest[i]) {
   2089                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2090                     } else {
   2091                         UText remainingText = UTEXT_INITIALIZER;
   2092                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2093                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2094                         utext_close(&remainingText);
   2095                     }
   2096 
   2097                     uprv_free(remainingChars);
   2098                 }
   2099             }
   2100             break;
   2101         }
   2102         if (find()) {
   2103             // We found another delimiter.  Move everything from where we started looking
   2104             //  up until the start of the delimiter into the next output string.
   2105             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2106                 if (dest[i]) {
   2107                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2108                                   input->chunkContents+nextOutputStringStart,
   2109                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
   2110                 } else {
   2111                     UText remainingText = UTEXT_INITIALIZER;
   2112                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2113                                       fMatchStart-nextOutputStringStart, &status);
   2114                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2115                     utext_close(&remainingText);
   2116                 }
   2117             } else {
   2118                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2119                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
   2120                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2121                 if (remainingChars == NULL) {
   2122                     status = U_MEMORY_ALLOCATION_ERROR;
   2123                     break;
   2124                 }
   2125                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
   2126                 if (dest[i]) {
   2127                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2128                 } else {
   2129                     UText remainingText = UTEXT_INITIALIZER;
   2130                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2131                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2132                     utext_close(&remainingText);
   2133                 }
   2134 
   2135                 uprv_free(remainingChars);
   2136             }
   2137             nextOutputStringStart = fMatchEnd;
   2138 
   2139             // If the delimiter pattern has capturing parentheses, the captured
   2140             //  text goes out into the next n destination strings.
   2141             int32_t groupNum;
   2142             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   2143                 if (i >= destCapacity-2) {
   2144                     // Never fill the last available output string with capture group text.
   2145                     // It will filled with the last field, the remainder of the
   2146                     //  unsplit input text.
   2147                     break;
   2148                 }
   2149                 i++;
   2150                 dest[i] = group(groupNum, dest[i], status);
   2151             }
   2152 
   2153             if (nextOutputStringStart == fActiveLimit) {
   2154                 // The delimiter was at the end of the string.  We're done, but first
   2155                 // we output one last empty string, for the empty field following
   2156                 //   the delimiter at the end of input.
   2157                 if (i+1 < destCapacity) {
   2158                     ++i;
   2159                     if (dest[i] == NULL) {
   2160                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
   2161                     } else {
   2162                         static UChar emptyString[] = {(UChar)0};
   2163                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
   2164                     }
   2165                 }
   2166                 break;
   2167 
   2168             }
   2169         }
   2170         else
   2171         {
   2172             // We ran off the end of the input while looking for the next delimiter.
   2173             // All the remaining text goes into the current output string.
   2174             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2175                 if (dest[i]) {
   2176                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2177                                   input->chunkContents+nextOutputStringStart,
   2178                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2179                 } else {
   2180                     UText remainingText = UTEXT_INITIALIZER;
   2181                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2182                                      fActiveLimit-nextOutputStringStart, &status);
   2183                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2184                     utext_close(&remainingText);
   2185                 }
   2186             } else {
   2187                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2188                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2189                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2190                 if (remainingChars == NULL) {
   2191                     status = U_MEMORY_ALLOCATION_ERROR;
   2192                     break;
   2193                 }
   2194 
   2195                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2196                 if (dest[i]) {
   2197                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2198                 } else {
   2199                     UText remainingText = UTEXT_INITIALIZER;
   2200                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2201                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2202                     utext_close(&remainingText);
   2203                 }
   2204 
   2205                 uprv_free(remainingChars);
   2206             }
   2207             break;
   2208         }
   2209         if (U_FAILURE(status)) {
   2210             break;
   2211         }
   2212     }   // end of for loop
   2213     return i+1;
   2214 }
   2215 
   2216 
   2217 //--------------------------------------------------------------------------------
   2218 //
   2219 //     start
   2220 //
   2221 //--------------------------------------------------------------------------------
   2222 int32_t RegexMatcher::start(UErrorCode &status) const {
   2223     return start(0, status);
   2224 }
   2225 
   2226 int64_t RegexMatcher::start64(UErrorCode &status) const {
   2227     return start64(0, status);
   2228 }
   2229 
   2230 //--------------------------------------------------------------------------------
   2231 //
   2232 //     start(int32_t group, UErrorCode &status)
   2233 //
   2234 //--------------------------------------------------------------------------------
   2235 
   2236 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
   2237     if (U_FAILURE(status)) {
   2238         return -1;
   2239     }
   2240     if (U_FAILURE(fDeferredStatus)) {
   2241         status = fDeferredStatus;
   2242         return -1;
   2243     }
   2244     if (fMatch == FALSE) {
   2245         status = U_REGEX_INVALID_STATE;
   2246         return -1;
   2247     }
   2248     if (group < 0 || group > fPattern->fGroupMap->size()) {
   2249         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2250         return -1;
   2251     }
   2252     int64_t s;
   2253     if (group == 0) {
   2254         s = fMatchStart;
   2255     } else {
   2256         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   2257         U_ASSERT(groupOffset < fPattern->fFrameSize);
   2258         U_ASSERT(groupOffset >= 0);
   2259         s = fFrame->fExtra[groupOffset];
   2260     }
   2261 
   2262     return s;
   2263 }
   2264 
   2265 
   2266 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
   2267     return (int32_t)start64(group, status);
   2268 }
   2269 
   2270 //--------------------------------------------------------------------------------
   2271 //
   2272 //     useAnchoringBounds
   2273 //
   2274 //--------------------------------------------------------------------------------
   2275 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
   2276     fAnchoringBounds = b;
   2277     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
   2278     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
   2279     return *this;
   2280 }
   2281 
   2282 
   2283 //--------------------------------------------------------------------------------
   2284 //
   2285 //     useTransparentBounds
   2286 //
   2287 //--------------------------------------------------------------------------------
   2288 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
   2289     fTransparentBounds = b;
   2290     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
   2291     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
   2292     return *this;
   2293 }
   2294 
   2295 //--------------------------------------------------------------------------------
   2296 //
   2297 //     setTimeLimit
   2298 //
   2299 //--------------------------------------------------------------------------------
   2300 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
   2301     if (U_FAILURE(status)) {
   2302         return;
   2303     }
   2304     if (U_FAILURE(fDeferredStatus)) {
   2305         status = fDeferredStatus;
   2306         return;
   2307     }
   2308     if (limit < 0) {
   2309         status = U_ILLEGAL_ARGUMENT_ERROR;
   2310         return;
   2311     }
   2312     fTimeLimit = limit;
   2313 }
   2314 
   2315 
   2316 //--------------------------------------------------------------------------------
   2317 //
   2318 //     getTimeLimit
   2319 //
   2320 //--------------------------------------------------------------------------------
   2321 int32_t RegexMatcher::getTimeLimit() const {
   2322     return fTimeLimit;
   2323 }
   2324 
   2325 
   2326 //--------------------------------------------------------------------------------
   2327 //
   2328 //     setStackLimit
   2329 //
   2330 //--------------------------------------------------------------------------------
   2331 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
   2332     if (U_FAILURE(status)) {
   2333         return;
   2334     }
   2335     if (U_FAILURE(fDeferredStatus)) {
   2336         status = fDeferredStatus;
   2337         return;
   2338     }
   2339     if (limit < 0) {
   2340         status = U_ILLEGAL_ARGUMENT_ERROR;
   2341         return;
   2342     }
   2343 
   2344     // Reset the matcher.  This is needed here in case there is a current match
   2345     //    whose final stack frame (containing the match results, pointed to by fFrame)
   2346     //    would be lost by resizing to a smaller stack size.
   2347     reset();
   2348 
   2349     if (limit == 0) {
   2350         // Unlimited stack expansion
   2351         fStack->setMaxCapacity(0);
   2352     } else {
   2353         // Change the units of the limit  from bytes to ints, and bump the size up
   2354         //   to be big enough to hold at least one stack frame for the pattern,
   2355         //   if it isn't there already.
   2356         int32_t adjustedLimit = limit / sizeof(int32_t);
   2357         if (adjustedLimit < fPattern->fFrameSize) {
   2358             adjustedLimit = fPattern->fFrameSize;
   2359         }
   2360         fStack->setMaxCapacity(adjustedLimit);
   2361     }
   2362     fStackLimit = limit;
   2363 }
   2364 
   2365 
   2366 //--------------------------------------------------------------------------------
   2367 //
   2368 //     getStackLimit
   2369 //
   2370 //--------------------------------------------------------------------------------
   2371 int32_t RegexMatcher::getStackLimit() const {
   2372     return fStackLimit;
   2373 }
   2374 
   2375 
   2376 //--------------------------------------------------------------------------------
   2377 //
   2378 //     setMatchCallback
   2379 //
   2380 //--------------------------------------------------------------------------------
   2381 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
   2382                                     const void              *context,
   2383                                     UErrorCode              &status) {
   2384     if (U_FAILURE(status)) {
   2385         return;
   2386     }
   2387     fCallbackFn = callback;
   2388     fCallbackContext = context;
   2389 }
   2390 
   2391 
   2392 //--------------------------------------------------------------------------------
   2393 //
   2394 //     getMatchCallback
   2395 //
   2396 //--------------------------------------------------------------------------------
   2397 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
   2398                                   const void              *&context,
   2399                                   UErrorCode              &status) {
   2400     if (U_FAILURE(status)) {
   2401        return;
   2402     }
   2403     callback = fCallbackFn;
   2404     context  = fCallbackContext;
   2405 }
   2406 
   2407 
   2408 //--------------------------------------------------------------------------------
   2409 //
   2410 //     setMatchCallback
   2411 //
   2412 //--------------------------------------------------------------------------------
   2413 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
   2414                                                 const void                      *context,
   2415                                                 UErrorCode                      &status) {
   2416     if (U_FAILURE(status)) {
   2417         return;
   2418     }
   2419     fFindProgressCallbackFn = callback;
   2420     fFindProgressCallbackContext = context;
   2421 }
   2422 
   2423 
   2424 //--------------------------------------------------------------------------------
   2425 //
   2426 //     getMatchCallback
   2427 //
   2428 //--------------------------------------------------------------------------------
   2429 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
   2430                                                 const void                    *&context,
   2431                                                 UErrorCode                    &status) {
   2432     if (U_FAILURE(status)) {
   2433        return;
   2434     }
   2435     callback = fFindProgressCallbackFn;
   2436     context  = fFindProgressCallbackContext;
   2437 }
   2438 
   2439 
   2440 //================================================================================
   2441 //
   2442 //    Code following this point in this file is the internal
   2443 //    Match Engine Implementation.
   2444 //
   2445 //================================================================================
   2446 
   2447 
   2448 //--------------------------------------------------------------------------------
   2449 //
   2450 //   resetStack
   2451 //           Discard any previous contents of the state save stack, and initialize a
   2452 //           new stack frame to all -1.  The -1s are needed for capture group limits,
   2453 //           where they indicate that a group has not yet matched anything.
   2454 //--------------------------------------------------------------------------------
   2455 REStackFrame *RegexMatcher::resetStack() {
   2456     // Discard any previous contents of the state save stack, and initialize a
   2457     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
   2458     //  where they indicate that a group has not yet matched anything.
   2459     fStack->removeAllElements();
   2460 
   2461     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
   2462     int32_t i;
   2463     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
   2464         iFrame->fExtra[i] = -1;
   2465     }
   2466     return iFrame;
   2467 }
   2468 
   2469 
   2470 
   2471 //--------------------------------------------------------------------------------
   2472 //
   2473 //   isWordBoundary
   2474 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
   2475 //                     For us,
   2476 //                       If the current char is a combining mark,
   2477 //                          \b is FALSE.
   2478 //                       Else Scan backwards to the first non-combining char.
   2479 //                            We are at a boundary if the this char and the original chars are
   2480 //                               opposite in membership in \w set
   2481 //
   2482 //          parameters:   pos   - the current position in the input buffer
   2483 //
   2484 //              TODO:  double-check edge cases at region boundaries.
   2485 //
   2486 //--------------------------------------------------------------------------------
   2487 UBool RegexMatcher::isWordBoundary(int64_t pos) {
   2488     UBool isBoundary = FALSE;
   2489     UBool cIsWord    = FALSE;
   2490 
   2491     if (pos >= fLookLimit) {
   2492         fHitEnd = TRUE;
   2493     } else {
   2494         // Determine whether char c at current position is a member of the word set of chars.
   2495         // If we're off the end of the string, behave as though we're not at a word char.
   2496         UTEXT_SETNATIVEINDEX(fInputText, pos);
   2497         UChar32  c = UTEXT_CURRENT32(fInputText);
   2498         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2499             // Current char is a combining one.  Not a boundary.
   2500             return FALSE;
   2501         }
   2502         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2503     }
   2504 
   2505     // Back up until we come to a non-combining char, determine whether
   2506     //  that char is a word char.
   2507     UBool prevCIsWord = FALSE;
   2508     for (;;) {
   2509         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
   2510             break;
   2511         }
   2512         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
   2513         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2514               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2515             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2516             break;
   2517         }
   2518     }
   2519     isBoundary = cIsWord ^ prevCIsWord;
   2520     return isBoundary;
   2521 }
   2522 
   2523 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
   2524     UBool isBoundary = FALSE;
   2525     UBool cIsWord    = FALSE;
   2526 
   2527     const UChar *inputBuf = fInputText->chunkContents;
   2528 
   2529     if (pos >= fLookLimit) {
   2530         fHitEnd = TRUE;
   2531     } else {
   2532         // Determine whether char c at current position is a member of the word set of chars.
   2533         // If we're off the end of the string, behave as though we're not at a word char.
   2534         UChar32 c;
   2535         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
   2536         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2537             // Current char is a combining one.  Not a boundary.
   2538             return FALSE;
   2539         }
   2540         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2541     }
   2542 
   2543     // Back up until we come to a non-combining char, determine whether
   2544     //  that char is a word char.
   2545     UBool prevCIsWord = FALSE;
   2546     for (;;) {
   2547         if (pos <= fLookStart) {
   2548             break;
   2549         }
   2550         UChar32 prevChar;
   2551         U16_PREV(inputBuf, fLookStart, pos, prevChar);
   2552         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2553               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2554             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2555             break;
   2556         }
   2557     }
   2558     isBoundary = cIsWord ^ prevCIsWord;
   2559     return isBoundary;
   2560 }
   2561 
   2562 //--------------------------------------------------------------------------------
   2563 //
   2564 //   isUWordBoundary
   2565 //
   2566 //         Test for a word boundary using RBBI word break.
   2567 //
   2568 //          parameters:   pos   - the current position in the input buffer
   2569 //
   2570 //--------------------------------------------------------------------------------
   2571 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
   2572     UBool       returnVal = FALSE;
   2573 #if UCONFIG_NO_BREAK_ITERATION==0
   2574 
   2575     // If we haven't yet created a break iterator for this matcher, do it now.
   2576     if (fWordBreakItr == NULL) {
   2577         fWordBreakItr =
   2578             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
   2579         if (U_FAILURE(fDeferredStatus)) {
   2580             return FALSE;
   2581         }
   2582         fWordBreakItr->setText(fInputText, fDeferredStatus);
   2583     }
   2584 
   2585     if (pos >= fLookLimit) {
   2586         fHitEnd = TRUE;
   2587         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
   2588                             //    words are not boundaries.  All non-word chars stand by themselves,
   2589                             //    with word boundaries on both sides.
   2590     } else {
   2591         if (!UTEXT_USES_U16(fInputText)) {
   2592             // !!!: Would like a better way to do this!
   2593             UErrorCode status = U_ZERO_ERROR;
   2594             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
   2595         }
   2596         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
   2597     }
   2598 #endif
   2599     return   returnVal;
   2600 }
   2601 
   2602 //--------------------------------------------------------------------------------
   2603 //
   2604 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
   2605 //                     saves. Increment the "time" counter, and call the
   2606 //                     user callback function if there is one installed.
   2607 //
   2608 //                     If the match operation needs to be aborted, either for a time-out
   2609 //                     or because the user callback asked for it, just set an error status.
   2610 //                     The engine will pick that up and stop in its outer loop.
   2611 //
   2612 //--------------------------------------------------------------------------------
   2613 void RegexMatcher::IncrementTime(UErrorCode &status) {
   2614     fTickCounter = TIMER_INITIAL_VALUE;
   2615     fTime++;
   2616     if (fCallbackFn != NULL) {
   2617         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
   2618             status = U_REGEX_STOPPED_BY_CALLER;
   2619             return;
   2620         }
   2621     }
   2622     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
   2623         status = U_REGEX_TIME_OUT;
   2624     }
   2625 }
   2626 
   2627 //--------------------------------------------------------------------------------
   2628 //
   2629 //   ReportFindProgress     This function is called once for each advance in the target
   2630 //                          string from the find() function, and calls the user progress callback
   2631 //                          function if there is one installed.
   2632 //
   2633 //                          NOTE:
   2634 //
   2635 //                          If the match operation needs to be aborted because the user
   2636 //                          callback asked for it, just set an error status.
   2637 //                          The engine will pick that up and stop in its outer loop.
   2638 //
   2639 //--------------------------------------------------------------------------------
   2640 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
   2641     if (fFindProgressCallbackFn != NULL) {
   2642         if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
   2643             status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
   2644             return FALSE;
   2645         }
   2646     }
   2647     return TRUE;
   2648 }
   2649 
   2650 //--------------------------------------------------------------------------------
   2651 //
   2652 //   StateSave
   2653 //       Make a new stack frame, initialized as a copy of the current stack frame.
   2654 //       Set the pattern index in the original stack frame from the operand value
   2655 //       in the opcode.  Execution of the engine continues with the state in
   2656 //       the newly created stack frame
   2657 //
   2658 //       Note that reserveBlock() may grow the stack, resulting in the
   2659 //       whole thing being relocated in memory.
   2660 //
   2661 //    Parameters:
   2662 //       fp           The top frame pointer when called.  At return, a new
   2663 //                    fame will be present
   2664 //       savePatIdx   An index into the compiled pattern.  Goes into the original
   2665 //                    (not new) frame.  If execution ever back-tracks out of the
   2666 //                    new frame, this will be where we continue from in the pattern.
   2667 //    Return
   2668 //                    The new frame pointer.
   2669 //
   2670 //--------------------------------------------------------------------------------
   2671 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
   2672     // push storage for a new frame.
   2673     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
   2674     if (newFP == NULL) {
   2675         // Failure on attempted stack expansion.
   2676         //   Stack function set some other error code, change it to a more
   2677         //   specific one for regular expressions.
   2678         status = U_REGEX_STACK_OVERFLOW;
   2679         // We need to return a writable stack frame, so just return the
   2680         //    previous frame.  The match operation will stop quickly
   2681         //    because of the error status, after which the frame will never
   2682         //    be looked at again.
   2683         return fp;
   2684     }
   2685     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
   2686 
   2687     // New stack frame = copy of old top frame.
   2688     int64_t *source = (int64_t *)fp;
   2689     int64_t *dest   = newFP;
   2690     for (;;) {
   2691         *dest++ = *source++;
   2692         if (source == newFP) {
   2693             break;
   2694         }
   2695     }
   2696 
   2697     fTickCounter--;
   2698     if (fTickCounter <= 0) {
   2699        IncrementTime(status);    // Re-initializes fTickCounter
   2700     }
   2701     fp->fPatIdx = savePatIdx;
   2702     return (REStackFrame *)newFP;
   2703 }
   2704 
   2705 
   2706 //--------------------------------------------------------------------------------
   2707 //
   2708 //   MatchAt      This is the actual matching engine.
   2709 //
   2710 //                  startIdx:    begin matching a this index.
   2711 //                  toEnd:       if true, match must extend to end of the input region
   2712 //
   2713 //--------------------------------------------------------------------------------
   2714 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
   2715     UBool       isMatch  = FALSE;      // True if the we have a match.
   2716 
   2717     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
   2718 
   2719     int32_t     op;                    // Operation from the compiled pattern, split into
   2720     int32_t     opType;                //    the opcode
   2721     int32_t     opValue;               //    and the operand value.
   2722 
   2723 #ifdef REGEX_RUN_DEBUG
   2724     if (fTraceDebug)
   2725     {
   2726         printf("MatchAt(startIdx=%ld)\n", startIdx);
   2727         printf("Original Pattern: ");
   2728         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   2729         while (c != U_SENTINEL) {
   2730             if (c<32 || c>256) {
   2731                 c = '.';
   2732             }
   2733             printf("%c", c);
   2734 
   2735             c = UTEXT_NEXT32(fPattern->fPattern);
   2736         }
   2737         printf("\n");
   2738         printf("Input String: ");
   2739         c = utext_next32From(fInputText, 0);
   2740         while (c != U_SENTINEL) {
   2741             if (c<32 || c>256) {
   2742                 c = '.';
   2743             }
   2744             printf("%c", c);
   2745 
   2746             c = UTEXT_NEXT32(fInputText);
   2747         }
   2748         printf("\n");
   2749         printf("\n");
   2750     }
   2751 #endif
   2752 
   2753     if (U_FAILURE(status)) {
   2754         return;
   2755     }
   2756 
   2757     //  Cache frequently referenced items from the compiled pattern
   2758     //
   2759     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   2760 
   2761     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   2762     UVector             *sets          = fPattern->fSets;
   2763 
   2764     fFrameSize = fPattern->fFrameSize;
   2765     REStackFrame        *fp            = resetStack();
   2766 
   2767     fp->fPatIdx   = 0;
   2768     fp->fInputIdx = startIdx;
   2769 
   2770     // Zero out the pattern's static data
   2771     int32_t i;
   2772     for (i = 0; i<fPattern->fDataSize; i++) {
   2773         fData[i] = 0;
   2774     }
   2775 
   2776     //
   2777     //  Main loop for interpreting the compiled pattern.
   2778     //  One iteration of the loop per pattern operation performed.
   2779     //
   2780     for (;;) {
   2781         op      = (int32_t)pat[fp->fPatIdx];
   2782         opType  = URX_TYPE(op);
   2783         opValue = URX_VAL(op);
   2784 #ifdef REGEX_RUN_DEBUG
   2785         if (fTraceDebug) {
   2786             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2787             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
   2788                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   2789             fPattern->dumpOp(fp->fPatIdx);
   2790         }
   2791 #endif
   2792         fp->fPatIdx++;
   2793 
   2794         switch (opType) {
   2795 
   2796 
   2797         case URX_NOP:
   2798             break;
   2799 
   2800 
   2801         case URX_BACKTRACK:
   2802             // Force a backtrack.  In some circumstances, the pattern compiler
   2803             //   will notice that the pattern can't possibly match anything, and will
   2804             //   emit one of these at that point.
   2805             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2806             break;
   2807 
   2808 
   2809         case URX_ONECHAR:
   2810             if (fp->fInputIdx < fActiveLimit) {
   2811                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2812                 UChar32 c = UTEXT_NEXT32(fInputText);
   2813                 if (c == opValue) {
   2814                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2815                     break;
   2816                 }
   2817             } else {
   2818                 fHitEnd = TRUE;
   2819             }
   2820             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2821             break;
   2822 
   2823 
   2824         case URX_STRING:
   2825             {
   2826                 // Test input against a literal string.
   2827                 // Strings require two slots in the compiled pattern, one for the
   2828                 //   offset to the string text, and one for the length.
   2829 
   2830                 int32_t   stringStartIdx = opValue;
   2831                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   2832                 fp->fPatIdx++;
   2833                 opType    = URX_TYPE(op);
   2834                 int32_t stringLen = URX_VAL(op);
   2835                 U_ASSERT(opType == URX_STRING_LEN);
   2836                 U_ASSERT(stringLen >= 2);
   2837 
   2838                 const UChar *patternString = litText+stringStartIdx;
   2839                 int32_t patternStringIndex = 0;
   2840                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2841                 UChar32 inputChar;
   2842                 UChar32 patternChar;
   2843                 UBool success = TRUE;
   2844                 while (patternStringIndex < stringLen) {
   2845                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
   2846                         success = FALSE;
   2847                         fHitEnd = TRUE;
   2848                         break;
   2849                     }
   2850                     inputChar = UTEXT_NEXT32(fInputText);
   2851                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
   2852                     if (patternChar != inputChar) {
   2853                         success = FALSE;
   2854                         break;
   2855                     }
   2856                 }
   2857 
   2858                 if (success) {
   2859                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2860                 } else {
   2861                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2862                 }
   2863             }
   2864             break;
   2865 
   2866 
   2867         case URX_STATE_SAVE:
   2868             fp = StateSave(fp, opValue, status);
   2869             break;
   2870 
   2871 
   2872         case URX_END:
   2873             // The match loop will exit via this path on a successful match,
   2874             //   when we reach the end of the pattern.
   2875             if (toEnd && fp->fInputIdx != fActiveLimit) {
   2876                 // The pattern matched, but not to the end of input.  Try some more.
   2877                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2878                 break;
   2879             }
   2880             isMatch = TRUE;
   2881             goto  breakFromLoop;
   2882 
   2883         // Start and End Capture stack frame variables are laid out out like this:
   2884             //  fp->fExtra[opValue]  - The start of a completed capture group
   2885             //             opValue+1 - The end   of a completed capture group
   2886             //             opValue+2 - the start of a capture group whose end
   2887             //                          has not yet been reached (and might not ever be).
   2888         case URX_START_CAPTURE:
   2889             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2890             fp->fExtra[opValue+2] = fp->fInputIdx;
   2891             break;
   2892 
   2893 
   2894         case URX_END_CAPTURE:
   2895             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2896             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   2897             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   2898             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   2899             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   2900             break;
   2901 
   2902 
   2903         case URX_DOLLAR:                   //  $, test for End of line
   2904                                            //     or for position before new line at end of input
   2905             {
   2906                 if (fp->fInputIdx >= fAnchorLimit) {
   2907                     // We really are at the end of input.  Success.
   2908                     fHitEnd = TRUE;
   2909                     fRequireEnd = TRUE;
   2910                     break;
   2911                 }
   2912 
   2913                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2914 
   2915                 // If we are positioned just before a new-line that is located at the
   2916                 //   end of input, succeed.
   2917                 UChar32 c = UTEXT_NEXT32(fInputText);
   2918                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2919                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   2920                         // If not in the middle of a CR/LF sequence
   2921                       if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
   2922                             // At new-line at end of input. Success
   2923                             fHitEnd = TRUE;
   2924                             fRequireEnd = TRUE;
   2925 
   2926                             break;
   2927                         }
   2928                     }
   2929                 } else {
   2930                     UChar32 nextC = UTEXT_NEXT32(fInputText);
   2931                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2932                         fHitEnd = TRUE;
   2933                         fRequireEnd = TRUE;
   2934                         break;                         // At CR/LF at end of input.  Success
   2935                     }
   2936                 }
   2937 
   2938                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2939             }
   2940             break;
   2941 
   2942 
   2943          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   2944             if (fp->fInputIdx >= fAnchorLimit) {
   2945                 // Off the end of input.  Success.
   2946                 fHitEnd = TRUE;
   2947                 fRequireEnd = TRUE;
   2948                 break;
   2949             } else {
   2950                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2951                 UChar32 c = UTEXT_NEXT32(fInputText);
   2952                 // Either at the last character of input, or off the end.
   2953                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
   2954                     fHitEnd = TRUE;
   2955                     fRequireEnd = TRUE;
   2956                     break;
   2957                 }
   2958             }
   2959 
   2960             // Not at end of input.  Back-track out.
   2961             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2962             break;
   2963 
   2964 
   2965          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   2966              {
   2967                  if (fp->fInputIdx >= fAnchorLimit) {
   2968                      // We really are at the end of input.  Success.
   2969                      fHitEnd = TRUE;
   2970                      fRequireEnd = TRUE;
   2971                      break;
   2972                  }
   2973                  // If we are positioned just before a new-line, succeed.
   2974                  // It makes no difference where the new-line is within the input.
   2975                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2976                  UChar32 c = UTEXT_CURRENT32(fInputText);
   2977                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   2978                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   2979                      //  In multi-line mode, hitting a new-line just before the end of input does not
   2980                      //   set the hitEnd or requireEnd flags
   2981                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
   2982                         break;
   2983                      }
   2984                  }
   2985                  // not at a new line.  Fail.
   2986                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2987              }
   2988              break;
   2989 
   2990 
   2991          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   2992              {
   2993                  if (fp->fInputIdx >= fAnchorLimit) {
   2994                      // We really are at the end of input.  Success.
   2995                      fHitEnd = TRUE;
   2996                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   2997                      break;               //   adding a new-line would not lose the match.
   2998                  }
   2999                  // If we are not positioned just before a new-line, the test fails; backtrack out.
   3000                  // It makes no difference where the new-line is within the input.
   3001                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3002                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
   3003                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3004                  }
   3005              }
   3006              break;
   3007 
   3008 
   3009        case URX_CARET:                    //  ^, test for start of line
   3010             if (fp->fInputIdx != fAnchorStart) {
   3011                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3012             }
   3013             break;
   3014 
   3015 
   3016        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   3017            {
   3018                if (fp->fInputIdx == fAnchorStart) {
   3019                    // We are at the start input.  Success.
   3020                    break;
   3021                }
   3022                // Check whether character just before the current pos is a new-line
   3023                //   unless we are at the end of input
   3024                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3025                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3026                if ((fp->fInputIdx < fAnchorLimit) &&
   3027                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3028                    //  It's a new-line.  ^ is true.  Success.
   3029                    //  TODO:  what should be done with positions between a CR and LF?
   3030                    break;
   3031                }
   3032                // Not at the start of a line.  Fail.
   3033                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3034            }
   3035            break;
   3036 
   3037 
   3038        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   3039            {
   3040                U_ASSERT(fp->fInputIdx >= fAnchorStart);
   3041                if (fp->fInputIdx <= fAnchorStart) {
   3042                    // We are at the start input.  Success.
   3043                    break;
   3044                }
   3045                // Check whether character just before the current pos is a new-line
   3046                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   3047                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3048                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3049                if (c != 0x0a) {
   3050                    // Not at the start of a line.  Back-track out.
   3051                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3052                }
   3053            }
   3054            break;
   3055 
   3056         case URX_BACKSLASH_B:          // Test for word boundaries
   3057             {
   3058                 UBool success = isWordBoundary(fp->fInputIdx);
   3059                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   3060                 if (!success) {
   3061                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3062                 }
   3063             }
   3064             break;
   3065 
   3066 
   3067         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   3068             {
   3069                 UBool success = isUWordBoundary(fp->fInputIdx);
   3070                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   3071                 if (!success) {
   3072                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3073                 }
   3074             }
   3075             break;
   3076 
   3077 
   3078         case URX_BACKSLASH_D:            // Test for decimal digit
   3079             {
   3080                 if (fp->fInputIdx >= fActiveLimit) {
   3081                     fHitEnd = TRUE;
   3082                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3083                     break;
   3084                 }
   3085 
   3086                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3087 
   3088                 UChar32 c = UTEXT_NEXT32(fInputText);
   3089                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   3090                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   3091                 success ^= (UBool)(opValue != 0);        // flip sense for \D
   3092                 if (success) {
   3093                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3094                 } else {
   3095                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3096                 }
   3097             }
   3098             break;
   3099 
   3100 
   3101         case URX_BACKSLASH_G:          // Test for position at end of previous match
   3102             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   3103                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3104             }
   3105             break;
   3106 
   3107 
   3108         case URX_BACKSLASH_X:
   3109             //  Match a Grapheme, as defined by Unicode TR 29.
   3110             //  Differs slightly from Perl, which consumes combining marks independently
   3111             //    of context.
   3112             {
   3113 
   3114                 // Fail if at end of input
   3115                 if (fp->fInputIdx >= fActiveLimit) {
   3116                     fHitEnd = TRUE;
   3117                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3118                     break;
   3119                 }
   3120 
   3121                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3122 
   3123                 // Examine (and consume) the current char.
   3124                 //   Dispatch into a little state machine, based on the char.
   3125                 UChar32  c;
   3126                 c = UTEXT_NEXT32(fInputText);
   3127                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3128                 UnicodeSet **sets = fPattern->fStaticSets;
   3129                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   3130                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   3131                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3132                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3133                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3134                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3135                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3136                 goto GC_Extend;
   3137 
   3138 
   3139 
   3140 GC_L:
   3141                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3142                 c = UTEXT_NEXT32(fInputText);
   3143                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3144                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3145                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3146                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3147                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3148                 (void)UTEXT_PREVIOUS32(fInputText);
   3149                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3150                 goto GC_Extend;
   3151 
   3152 GC_V:
   3153                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3154                 c = UTEXT_NEXT32(fInputText);
   3155                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3156                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3157                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3158                 (void)UTEXT_PREVIOUS32(fInputText);
   3159                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3160                 goto GC_Extend;
   3161 
   3162 GC_T:
   3163                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3164                 c = UTEXT_NEXT32(fInputText);
   3165                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3166                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3167                 (void)UTEXT_PREVIOUS32(fInputText);
   3168                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3169                 goto GC_Extend;
   3170 
   3171 GC_Extend:
   3172                 // Combining characters are consumed here
   3173                 for (;;) {
   3174                     if (fp->fInputIdx >= fActiveLimit) {
   3175                         break;
   3176                     }
   3177                     c = UTEXT_CURRENT32(fInputText);
   3178                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   3179                         break;
   3180                     }
   3181                     (void)UTEXT_NEXT32(fInputText);
   3182                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3183                 }
   3184                 goto GC_Done;
   3185 
   3186 GC_Control:
   3187                 // Most control chars stand alone (don't combine with combining chars),
   3188                 //   except for that CR/LF sequence is a single grapheme cluster.
   3189                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
   3190                     c = UTEXT_NEXT32(fInputText);
   3191                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3192                 }
   3193 
   3194 GC_Done:
   3195                 if (fp->fInputIdx >= fActiveLimit) {
   3196                     fHitEnd = TRUE;
   3197                 }
   3198                 break;
   3199             }
   3200 
   3201 
   3202 
   3203 
   3204         case URX_BACKSLASH_Z:          // Test for end of Input
   3205             if (fp->fInputIdx < fAnchorLimit) {
   3206                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3207             } else {
   3208                 fHitEnd = TRUE;
   3209                 fRequireEnd = TRUE;
   3210             }
   3211             break;
   3212 
   3213 
   3214 
   3215         case URX_STATIC_SETREF:
   3216             {
   3217                 // Test input character against one of the predefined sets
   3218                 //    (Word Characters, for example)
   3219                 // The high bit of the op value is a flag for the match polarity.
   3220                 //    0:   success if input char is in set.
   3221                 //    1:   success if input char is not in set.
   3222                 if (fp->fInputIdx >= fActiveLimit) {
   3223                     fHitEnd = TRUE;
   3224                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3225                     break;
   3226                 }
   3227 
   3228                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   3229                 opValue &= ~URX_NEG_SET;
   3230                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3231 
   3232                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3233                 UChar32 c = UTEXT_NEXT32(fInputText);
   3234                 if (c < 256) {
   3235                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3236                     if (s8->contains(c)) {
   3237                         success = !success;
   3238                     }
   3239                 } else {
   3240                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3241                     if (s->contains(c)) {
   3242                         success = !success;
   3243                     }
   3244                 }
   3245                 if (success) {
   3246                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3247                 } else {
   3248                     // the character wasn't in the set.
   3249                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3250                 }
   3251             }
   3252             break;
   3253 
   3254 
   3255         case URX_STAT_SETREF_N:
   3256             {
   3257                 // Test input character for NOT being a member of  one of
   3258                 //    the predefined sets (Word Characters, for example)
   3259                 if (fp->fInputIdx >= fActiveLimit) {
   3260                     fHitEnd = TRUE;
   3261                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3262                     break;
   3263                 }
   3264 
   3265                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3266 
   3267                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3268 
   3269                 UChar32 c = UTEXT_NEXT32(fInputText);
   3270                 if (c < 256) {
   3271                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3272                     if (s8->contains(c) == FALSE) {
   3273                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3274                         break;
   3275                     }
   3276                 } else {
   3277                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3278                     if (s->contains(c) == FALSE) {
   3279                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3280                         break;
   3281                     }
   3282                 }
   3283                 // the character wasn't in the set.
   3284                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3285             }
   3286             break;
   3287 
   3288 
   3289         case URX_SETREF:
   3290             if (fp->fInputIdx >= fActiveLimit) {
   3291                 fHitEnd = TRUE;
   3292                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3293                 break;
   3294             } else {
   3295                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3296 
   3297                 // There is input left.  Pick up one char and test it for set membership.
   3298                 UChar32 c = UTEXT_NEXT32(fInputText);
   3299                 U_ASSERT(opValue > 0 && opValue < sets->size());
   3300                 if (c<256) {
   3301                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3302                     if (s8->contains(c)) {
   3303                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3304                         break;
   3305                     }
   3306                 } else {
   3307                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3308                     if (s->contains(c)) {
   3309                         // The character is in the set.  A Match.
   3310                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3311                         break;
   3312                     }
   3313                 }
   3314 
   3315                 // the character wasn't in the set.
   3316                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3317             }
   3318             break;
   3319 
   3320 
   3321         case URX_DOTANY:
   3322             {
   3323                 // . matches anything, but stops at end-of-line.
   3324                 if (fp->fInputIdx >= fActiveLimit) {
   3325                     // At end of input.  Match failed.  Backtrack out.
   3326                     fHitEnd = TRUE;
   3327                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3328                     break;
   3329                 }
   3330 
   3331                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3332 
   3333                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3334                 UChar32 c = UTEXT_NEXT32(fInputText);
   3335                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   3336                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3337                     // End of line in normal mode.   . does not match.
   3338                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3339                     break;
   3340                 }
   3341                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3342             }
   3343             break;
   3344 
   3345 
   3346         case URX_DOTANY_ALL:
   3347             {
   3348                 // ., in dot-matches-all (including new lines) mode
   3349                 if (fp->fInputIdx >= fActiveLimit) {
   3350                     // At end of input.  Match failed.  Backtrack out.
   3351                     fHitEnd = TRUE;
   3352                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3353                     break;
   3354                 }
   3355 
   3356                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3357 
   3358                 // There is input left.  Advance over one char, except if we are
   3359                 //   at a cr/lf, advance over both of them.
   3360                 UChar32 c;
   3361                 c = UTEXT_NEXT32(fInputText);
   3362                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3363                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   3364                     // In the case of a CR/LF, we need to advance over both.
   3365                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
   3366                     if (nextc == 0x0a) {
   3367                         (void)UTEXT_NEXT32(fInputText);
   3368                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3369                     }
   3370                 }
   3371             }
   3372             break;
   3373 
   3374 
   3375         case URX_DOTANY_UNIX:
   3376             {
   3377                 // '.' operator, matches all, but stops at end-of-line.
   3378                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   3379                 if (fp->fInputIdx >= fActiveLimit) {
   3380                     // At end of input.  Match failed.  Backtrack out.
   3381                     fHitEnd = TRUE;
   3382                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3383                     break;
   3384                 }
   3385 
   3386                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3387 
   3388                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3389                 UChar32 c = UTEXT_NEXT32(fInputText);
   3390                 if (c == 0x0a) {
   3391                     // End of line in normal mode.   '.' does not match the \n
   3392                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3393                 } else {
   3394                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3395                 }
   3396             }
   3397             break;
   3398 
   3399 
   3400         case URX_JMP:
   3401             fp->fPatIdx = opValue;
   3402             break;
   3403 
   3404         case URX_FAIL:
   3405             isMatch = FALSE;
   3406             goto breakFromLoop;
   3407 
   3408         case URX_JMP_SAV:
   3409             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   3410             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   3411             fp->fPatIdx = opValue;                         // Then JMP.
   3412             break;
   3413 
   3414         case URX_JMP_SAV_X:
   3415             // This opcode is used with (x)+, when x can match a zero length string.
   3416             // Same as JMP_SAV, except conditional on the match having made forward progress.
   3417             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   3418             //   data address of the input position at the start of the loop.
   3419             {
   3420                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   3421                 int32_t  stoOp = (int32_t)pat[opValue-1];
   3422                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   3423                 int32_t  frameLoc = URX_VAL(stoOp);
   3424                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   3425                 int64_t prevInputIdx = fp->fExtra[frameLoc];
   3426                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   3427                 if (prevInputIdx < fp->fInputIdx) {
   3428                     // The match did make progress.  Repeat the loop.
   3429                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   3430                     fp->fPatIdx = opValue;
   3431                     fp->fExtra[frameLoc] = fp->fInputIdx;
   3432                 }
   3433                 // If the input position did not advance, we do nothing here,
   3434                 //   execution will fall out of the loop.
   3435             }
   3436             break;
   3437 
   3438         case URX_CTR_INIT:
   3439             {
   3440                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3441                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   3442 
   3443                 // Pick up the three extra operands that CTR_INIT has, and
   3444                 //    skip the pattern location counter past
   3445                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3446                 fp->fPatIdx += 3;
   3447                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3448                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3449                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3450                 U_ASSERT(minCount>=0);
   3451                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3452                 U_ASSERT(loopLoc>=fp->fPatIdx);
   3453 
   3454                 if (minCount == 0) {
   3455                     fp = StateSave(fp, loopLoc+1, status);
   3456                 }
   3457                 if (maxCount == -1) {
   3458                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
   3459                 } else if (maxCount == 0) {
   3460                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3461                 }
   3462             }
   3463             break;
   3464 
   3465         case URX_CTR_LOOP:
   3466             {
   3467                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3468                 int32_t initOp = (int32_t)pat[opValue];
   3469                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   3470                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3471                 int32_t minCount  = (int32_t)pat[opValue+2];
   3472                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3473                 (*pCounter)++;
   3474                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   3475                     U_ASSERT(*pCounter == maxCount);
   3476                     break;
   3477                 }
   3478                 if (*pCounter >= minCount) {
   3479                     if (maxCount == -1) {
   3480                         // Loop has no hard upper bound.
   3481                         // Check that it is progressing through the input, break if it is not.
   3482                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   3483                         if (fp->fInputIdx == *pLastInputIdx) {
   3484                             break;
   3485                         } else {
   3486                             *pLastInputIdx = fp->fInputIdx;
   3487                         }
   3488                     }
   3489                     fp = StateSave(fp, fp->fPatIdx, status);
   3490                 }
   3491                 fp->fPatIdx = opValue + 4;    // Loop back.
   3492             }
   3493             break;
   3494 
   3495         case URX_CTR_INIT_NG:
   3496             {
   3497                 // Initialize a non-greedy loop
   3498                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3499                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   3500 
   3501                 // Pick up the three extra operands that CTR_INIT_NG has, and
   3502                 //    skip the pattern location counter past
   3503                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3504                 fp->fPatIdx += 3;
   3505                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3506                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3507                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3508                 U_ASSERT(minCount>=0);
   3509                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3510                 U_ASSERT(loopLoc>fp->fPatIdx);
   3511                 if (maxCount == -1) {
   3512                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
   3513                 }
   3514 
   3515                 if (minCount == 0) {
   3516                     if (maxCount != 0) {
   3517                         fp = StateSave(fp, fp->fPatIdx, status);
   3518                     }
   3519                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   3520                 }
   3521             }
   3522             break;
   3523 
   3524         case URX_CTR_LOOP_NG:
   3525             {
   3526                 // Non-greedy {min, max} loops
   3527                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3528                 int32_t initOp = (int32_t)pat[opValue];
   3529                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   3530                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3531                 int32_t minCount  = (int32_t)pat[opValue+2];
   3532                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3533 
   3534                 (*pCounter)++;
   3535                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   3536                     // The loop has matched the maximum permitted number of times.
   3537                     //   Break out of here with no action.  Matching will
   3538                     //   continue with the following pattern.
   3539                     U_ASSERT(*pCounter == maxCount);
   3540                     break;
   3541                 }
   3542 
   3543                 if (*pCounter < minCount) {
   3544                     // We haven't met the minimum number of matches yet.
   3545                     //   Loop back for another one.
   3546                     fp->fPatIdx = opValue + 4;    // Loop back.
   3547                 } else {
   3548                     // We do have the minimum number of matches.
   3549 
   3550                     // If there is no upper bound on the loop iterations, check that the input index
   3551                     // is progressing, and stop the loop if it is not.
   3552                     if (maxCount == -1) {
   3553                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   3554                         if (fp->fInputIdx == *pLastInputIdx) {
   3555                             break;
   3556                         }
   3557                         *pLastInputIdx = fp->fInputIdx;
   3558                     }
   3559 
   3560                     // Loop Continuation: we will fall into the pattern following the loop
   3561                     //   (non-greedy, don't execute loop body first), but first do
   3562                     //   a state save to the top of the loop, so that a match failure
   3563                     //   in the following pattern will try another iteration of the loop.
   3564                     fp = StateSave(fp, opValue + 4, status);
   3565                 }
   3566             }
   3567             break;
   3568 
   3569         case URX_STO_SP:
   3570             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3571             fData[opValue] = fStack->size();
   3572             break;
   3573 
   3574         case URX_LD_SP:
   3575             {
   3576                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3577                 int32_t newStackSize = (int32_t)fData[opValue];
   3578                 U_ASSERT(newStackSize <= fStack->size());
   3579                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3580                 if (newFP == (int64_t *)fp) {
   3581                     break;
   3582                 }
   3583                 int32_t i;
   3584                 for (i=0; i<fFrameSize; i++) {
   3585                     newFP[i] = ((int64_t *)fp)[i];
   3586                 }
   3587                 fp = (REStackFrame *)newFP;
   3588                 fStack->setSize(newStackSize);
   3589             }
   3590             break;
   3591 
   3592         case URX_BACKREF:
   3593             {
   3594                 U_ASSERT(opValue < fFrameSize);
   3595                 int64_t groupStartIdx = fp->fExtra[opValue];
   3596                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3597                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3598                 if (groupStartIdx < 0) {
   3599                     // This capture group has not participated in the match thus far,
   3600                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3601                     break;
   3602                 }
   3603                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
   3604                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3605 
   3606                 //   Note: if the capture group match was of an empty string the backref
   3607                 //         match succeeds.  Verified by testing:  Perl matches succeed
   3608                 //         in this case, so we do too.
   3609 
   3610                 UBool success = TRUE;
   3611                 for (;;) {
   3612                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
   3613                         success = TRUE;
   3614                         break;
   3615                     }
   3616                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
   3617                         success = FALSE;
   3618                         fHitEnd = TRUE;
   3619                         break;
   3620                     }
   3621                     UChar32 captureGroupChar = utext_next32(fAltInputText);
   3622                     UChar32 inputChar = utext_next32(fInputText);
   3623                     if (inputChar != captureGroupChar) {
   3624                         success = FALSE;
   3625                         break;
   3626                     }
   3627                 }
   3628 
   3629                 if (success) {
   3630                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3631                 } else {
   3632                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3633                 }
   3634             }
   3635             break;
   3636 
   3637 
   3638 
   3639         case URX_BACKREF_I:
   3640             {
   3641                 U_ASSERT(opValue < fFrameSize);
   3642                 int64_t groupStartIdx = fp->fExtra[opValue];
   3643                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3644                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3645                 if (groupStartIdx < 0) {
   3646                     // This capture group has not participated in the match thus far,
   3647                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3648                     break;
   3649                 }
   3650                 utext_setNativeIndex(fAltInputText, groupStartIdx);
   3651                 utext_setNativeIndex(fInputText, fp->fInputIdx);
   3652                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
   3653                 CaseFoldingUTextIterator inputItr(*fInputText);
   3654 
   3655                 //   Note: if the capture group match was of an empty string the backref
   3656                 //         match succeeds.  Verified by testing:  Perl matches succeed
   3657                 //         in this case, so we do too.
   3658 
   3659                 UBool success = TRUE;
   3660                 for (;;) {
   3661                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
   3662                         success = TRUE;
   3663                         break;
   3664                     }
   3665                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
   3666                         success = FALSE;
   3667                         fHitEnd = TRUE;
   3668                         break;
   3669                     }
   3670                     UChar32 captureGroupChar = captureGroupItr.next();
   3671                     UChar32 inputChar = inputItr.next();
   3672                     if (inputChar != captureGroupChar) {
   3673                         success = FALSE;
   3674                         break;
   3675                     }
   3676                 }
   3677 
   3678                 if (success && inputItr.inExpansion()) {
   3679                     // We otained a match by consuming part of a string obtained from
   3680                     // case-folding a single code point of the input text.
   3681                     // This does not count as an overall match.
   3682                     success = FALSE;
   3683                 }
   3684 
   3685                 if (success) {
   3686                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3687                 } else {
   3688                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3689                 }
   3690 
   3691             }
   3692             break;
   3693 
   3694         case URX_STO_INP_LOC:
   3695             {
   3696                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   3697                 fp->fExtra[opValue] = fp->fInputIdx;
   3698             }
   3699             break;
   3700 
   3701         case URX_JMPX:
   3702             {
   3703                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3704                 fp->fPatIdx += 1;
   3705                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   3706                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   3707                 int64_t savedInputIdx = fp->fExtra[dataLoc];
   3708                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   3709                 if (savedInputIdx < fp->fInputIdx) {
   3710                     fp->fPatIdx = opValue;                               // JMP
   3711                 } else {
   3712                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   3713                 }
   3714             }
   3715             break;
   3716 
   3717         case URX_LA_START:
   3718             {
   3719                 // Entering a lookahead block.
   3720                 // Save Stack Ptr, Input Pos.
   3721                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3722                 fData[opValue]   = fStack->size();
   3723                 fData[opValue+1] = fp->fInputIdx;
   3724                 fActiveStart     = fLookStart;          // Set the match region change for
   3725                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   3726             }
   3727             break;
   3728 
   3729         case URX_LA_END:
   3730             {
   3731                 // Leaving a look-ahead block.
   3732                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   3733                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3734                 int32_t stackSize = fStack->size();
   3735                 int32_t newStackSize =(int32_t)fData[opValue];
   3736                 U_ASSERT(stackSize >= newStackSize);
   3737                 if (stackSize > newStackSize) {
   3738                     // Copy the current top frame back to the new (cut back) top frame.
   3739                     //   This makes the capture groups from within the look-ahead
   3740                     //   expression available.
   3741                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3742                     int32_t i;
   3743                     for (i=0; i<fFrameSize; i++) {
   3744                         newFP[i] = ((int64_t *)fp)[i];
   3745                     }
   3746                     fp = (REStackFrame *)newFP;
   3747                     fStack->setSize(newStackSize);
   3748                 }
   3749                 fp->fInputIdx = fData[opValue+1];
   3750 
   3751                 // Restore the active region bounds in the input string; they may have
   3752                 //    been changed because of transparent bounds on a Region.
   3753                 fActiveStart = fRegionStart;
   3754                 fActiveLimit = fRegionLimit;
   3755             }
   3756             break;
   3757 
   3758         case URX_ONECHAR_I:
   3759             // Case insensitive one char.  The char from the pattern is already case folded.
   3760             // Input text is not, but case folding the input can not reduce two or more code
   3761             // points to one.
   3762             if (fp->fInputIdx < fActiveLimit) {
   3763                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3764 
   3765                 UChar32 c = UTEXT_NEXT32(fInputText);
   3766                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3767                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3768                     break;
   3769                 }
   3770             } else {
   3771                 fHitEnd = TRUE;
   3772             }
   3773 
   3774             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3775             break;
   3776 
   3777         case URX_STRING_I:
   3778             {
   3779                 // Case-insensitive test input against a literal string.
   3780                 // Strings require two slots in the compiled pattern, one for the
   3781                 //   offset to the string text, and one for the length.
   3782                 //   The compiled string has already been case folded.
   3783                 {
   3784                     const UChar *patternString = litText + opValue;
   3785                     int32_t      patternStringIdx  = 0;
   3786 
   3787                     op      = (int32_t)pat[fp->fPatIdx];
   3788                     fp->fPatIdx++;
   3789                     opType  = URX_TYPE(op);
   3790                     opValue = URX_VAL(op);
   3791                     U_ASSERT(opType == URX_STRING_LEN);
   3792                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
   3793 
   3794 
   3795                     UChar32   cPattern;
   3796                     UChar32   cText;
   3797                     UBool     success = TRUE;
   3798 
   3799                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3800                     CaseFoldingUTextIterator inputIterator(*fInputText);
   3801                     while (patternStringIdx < patternStringLen) {
   3802                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
   3803                             success = FALSE;
   3804                             fHitEnd = TRUE;
   3805                             break;
   3806                         }
   3807                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
   3808                         cText = inputIterator.next();
   3809                         if (cText != cPattern) {
   3810                             success = FALSE;
   3811                             break;
   3812                         }
   3813                     }
   3814                     if (inputIterator.inExpansion()) {
   3815                         success = FALSE;
   3816                     }
   3817 
   3818                     if (success) {
   3819                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3820                     } else {
   3821                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3822                     }
   3823                 }
   3824             }
   3825             break;
   3826 
   3827         case URX_LB_START:
   3828             {
   3829                 // Entering a look-behind block.
   3830                 // Save Stack Ptr, Input Pos.
   3831                 //   TODO:  implement transparent bounds.  Ticket #6067
   3832                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3833                 fData[opValue]   = fStack->size();
   3834                 fData[opValue+1] = fp->fInputIdx;
   3835                 // Init the variable containing the start index for attempted matches.
   3836                 fData[opValue+2] = -1;
   3837                 // Save input string length, then reset to pin any matches to end at
   3838                 //   the current position.
   3839                 fData[opValue+3] = fActiveLimit;
   3840                 fActiveLimit     = fp->fInputIdx;
   3841             }
   3842             break;
   3843 
   3844 
   3845         case URX_LB_CONT:
   3846             {
   3847                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   3848                 //    at all possible input starting positions.
   3849 
   3850                 // Fetch the min and max possible match lengths.  They are the operands
   3851                 //   of this op in the pattern.
   3852                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   3853                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   3854                 U_ASSERT(minML <= maxML);
   3855                 U_ASSERT(minML >= 0);
   3856 
   3857                 // Fetch (from data) the last input index where a match was attempted.
   3858                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3859                 int64_t  *lbStartIdx = &fData[opValue+2];
   3860                 if (*lbStartIdx < 0) {
   3861                     // First time through loop.
   3862                     *lbStartIdx = fp->fInputIdx - minML;
   3863                 } else {
   3864                     // 2nd through nth time through the loop.
   3865                     // Back up start position for match by one.
   3866                     if (*lbStartIdx == 0) {
   3867                         (*lbStartIdx)--;
   3868                     } else {
   3869                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   3870                         (void)UTEXT_PREVIOUS32(fInputText);
   3871                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3872                     }
   3873                 }
   3874 
   3875                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   3876                     // We have tried all potential match starting points without
   3877                     //  getting a match.  Backtrack out, and out of the
   3878                     //   Look Behind altogether.
   3879                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3880                     int64_t restoreInputLen = fData[opValue+3];
   3881                     U_ASSERT(restoreInputLen >= fActiveLimit);
   3882                     U_ASSERT(restoreInputLen <= fInputLength);
   3883                     fActiveLimit = restoreInputLen;
   3884                     break;
   3885                 }
   3886 
   3887                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   3888                 //      (successful match will fall off the end of the loop.)
   3889                 fp = StateSave(fp, fp->fPatIdx-3, status);
   3890                 fp->fInputIdx = *lbStartIdx;
   3891             }
   3892             break;
   3893 
   3894         case URX_LB_END:
   3895             // End of a look-behind block, after a successful match.
   3896             {
   3897                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3898                 if (fp->fInputIdx != fActiveLimit) {
   3899                     //  The look-behind expression matched, but the match did not
   3900                     //    extend all the way to the point that we are looking behind from.
   3901                     //  FAIL out of here, which will take us back to the LB_CONT, which
   3902                     //     will retry the match starting at another position or fail
   3903                     //     the look-behind altogether, whichever is appropriate.
   3904                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3905                     break;
   3906                 }
   3907 
   3908                 // Look-behind match is good.  Restore the orignal input string length,
   3909                 //   which had been truncated to pin the end of the lookbehind match to the
   3910                 //   position being looked-behind.
   3911                 int64_t originalInputLen = fData[opValue+3];
   3912                 U_ASSERT(originalInputLen >= fActiveLimit);
   3913                 U_ASSERT(originalInputLen <= fInputLength);
   3914                 fActiveLimit = originalInputLen;
   3915             }
   3916             break;
   3917 
   3918 
   3919         case URX_LBN_CONT:
   3920             {
   3921                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   3922                 //    at all possible input starting positions.
   3923 
   3924                 // Fetch the extra parameters of this op.
   3925                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   3926                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   3927                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   3928                         continueLoc = URX_VAL(continueLoc);
   3929                 U_ASSERT(minML <= maxML);
   3930                 U_ASSERT(minML >= 0);
   3931                 U_ASSERT(continueLoc > fp->fPatIdx);
   3932 
   3933                 // Fetch (from data) the last input index where a match was attempted.
   3934                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3935                 int64_t  *lbStartIdx = &fData[opValue+2];
   3936                 if (*lbStartIdx < 0) {
   3937                     // First time through loop.
   3938                     *lbStartIdx = fp->fInputIdx - minML;
   3939                 } else {
   3940                     // 2nd through nth time through the loop.
   3941                     // Back up start position for match by one.
   3942                     if (*lbStartIdx == 0) {
   3943                         (*lbStartIdx)--;
   3944                     } else {
   3945                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   3946                         (void)UTEXT_PREVIOUS32(fInputText);
   3947                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3948                     }
   3949                 }
   3950 
   3951                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   3952                     // We have tried all potential match starting points without
   3953                     //  getting a match, which means that the negative lookbehind as
   3954                     //  a whole has succeeded.  Jump forward to the continue location
   3955                     int64_t restoreInputLen = fData[opValue+3];
   3956                     U_ASSERT(restoreInputLen >= fActiveLimit);
   3957                     U_ASSERT(restoreInputLen <= fInputLength);
   3958                     fActiveLimit = restoreInputLen;
   3959                     fp->fPatIdx = continueLoc;
   3960                     break;
   3961                 }
   3962 
   3963                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   3964                 //      (successful match will cause a FAIL out of the loop altogether.)
   3965                 fp = StateSave(fp, fp->fPatIdx-4, status);
   3966                 fp->fInputIdx = *lbStartIdx;
   3967             }
   3968             break;
   3969 
   3970         case URX_LBN_END:
   3971             // End of a negative look-behind block, after a successful match.
   3972             {
   3973                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3974                 if (fp->fInputIdx != fActiveLimit) {
   3975                     //  The look-behind expression matched, but the match did not
   3976                     //    extend all the way to the point that we are looking behind from.
   3977                     //  FAIL out of here, which will take us back to the LB_CONT, which
   3978                     //     will retry the match starting at another position or succeed
   3979                     //     the look-behind altogether, whichever is appropriate.
   3980                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3981                     break;
   3982                 }
   3983 
   3984                 // Look-behind expression matched, which means look-behind test as
   3985                 //   a whole Fails
   3986 
   3987                 //   Restore the orignal input string length, which had been truncated
   3988                 //   inorder to pin the end of the lookbehind match
   3989                 //   to the position being looked-behind.
   3990                 int64_t originalInputLen = fData[opValue+3];
   3991                 U_ASSERT(originalInputLen >= fActiveLimit);
   3992                 U_ASSERT(originalInputLen <= fInputLength);
   3993                 fActiveLimit = originalInputLen;
   3994 
   3995                 // Restore original stack position, discarding any state saved
   3996                 //   by the successful pattern match.
   3997                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3998                 int32_t newStackSize = (int32_t)fData[opValue];
   3999                 U_ASSERT(fStack->size() > newStackSize);
   4000                 fStack->setSize(newStackSize);
   4001 
   4002                 //  FAIL, which will take control back to someplace
   4003                 //  prior to entering the look-behind test.
   4004                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4005             }
   4006             break;
   4007 
   4008 
   4009         case URX_LOOP_SR_I:
   4010             // Loop Initialization for the optimized implementation of
   4011             //     [some character set]*
   4012             //   This op scans through all matching input.
   4013             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4014             {
   4015                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4016                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4017                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   4018 
   4019                 // Loop through input, until either the input is exhausted or
   4020                 //   we reach a character that is not a member of the set.
   4021                 int64_t ix = fp->fInputIdx;
   4022                 UTEXT_SETNATIVEINDEX(fInputText, ix);
   4023                 for (;;) {
   4024                     if (ix >= fActiveLimit) {
   4025                         fHitEnd = TRUE;
   4026                         break;
   4027                     }
   4028                     UChar32 c = UTEXT_NEXT32(fInputText);
   4029                     if (c<256) {
   4030                         if (s8->contains(c) == FALSE) {
   4031                             break;
   4032                         }
   4033                     } else {
   4034                         if (s->contains(c) == FALSE) {
   4035                             break;
   4036                         }
   4037                     }
   4038                     ix = UTEXT_GETNATIVEINDEX(fInputText);
   4039                 }
   4040 
   4041                 // If there were no matching characters, skip over the loop altogether.
   4042                 //   The loop doesn't run at all, a * op always succeeds.
   4043                 if (ix == fp->fInputIdx) {
   4044                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4045                     break;
   4046                 }
   4047 
   4048                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4049                 //   must follow.  It's operand is the stack location
   4050                 //   that holds the starting input index for the match of this [set]*
   4051                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4052                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4053                 int32_t stackLoc = URX_VAL(loopcOp);
   4054                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4055                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4056                 fp->fInputIdx = ix;
   4057 
   4058                 // Save State to the URX_LOOP_C op that follows this one,
   4059                 //   so that match failures in the following code will return to there.
   4060                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4061                 fp = StateSave(fp, fp->fPatIdx, status);
   4062                 fp->fPatIdx++;
   4063             }
   4064             break;
   4065 
   4066 
   4067         case URX_LOOP_DOT_I:
   4068             // Loop Initialization for the optimized implementation of .*
   4069             //   This op scans through all remaining input.
   4070             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4071             {
   4072                 // Loop through input until the input is exhausted (we reach an end-of-line)
   4073                 // In DOTALL mode, we can just go straight to the end of the input.
   4074                 int64_t ix;
   4075                 if ((opValue & 1) == 1) {
   4076                     // Dot-matches-All mode.  Jump straight to the end of the string.
   4077                     ix = fActiveLimit;
   4078                     fHitEnd = TRUE;
   4079                 } else {
   4080                     // NOT DOT ALL mode.  Line endings do not match '.'
   4081                     // Scan forward until a line ending or end of input.
   4082                     ix = fp->fInputIdx;
   4083                     UTEXT_SETNATIVEINDEX(fInputText, ix);
   4084                     for (;;) {
   4085                         if (ix >= fActiveLimit) {
   4086                             fHitEnd = TRUE;
   4087                             break;
   4088                         }
   4089                         UChar32 c = UTEXT_NEXT32(fInputText);
   4090                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   4091                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   4092                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   4093                                     (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
   4094                                 //  char is a line ending.  Exit the scanning loop.
   4095                                 break;
   4096                             }
   4097                         }
   4098                         ix = UTEXT_GETNATIVEINDEX(fInputText);
   4099                     }
   4100                 }
   4101 
   4102                 // If there were no matching characters, skip over the loop altogether.
   4103                 //   The loop doesn't run at all, a * op always succeeds.
   4104                 if (ix == fp->fInputIdx) {
   4105                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4106                     break;
   4107                 }
   4108 
   4109                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4110                 //   must follow.  It's operand is the stack location
   4111                 //   that holds the starting input index for the match of this .*
   4112                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4113                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4114                 int32_t stackLoc = URX_VAL(loopcOp);
   4115                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4116                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4117                 fp->fInputIdx = ix;
   4118 
   4119                 // Save State to the URX_LOOP_C op that follows this one,
   4120                 //   so that match failures in the following code will return to there.
   4121                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4122                 fp = StateSave(fp, fp->fPatIdx, status);
   4123                 fp->fPatIdx++;
   4124             }
   4125             break;
   4126 
   4127 
   4128         case URX_LOOP_C:
   4129             {
   4130                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   4131                 backSearchIndex = fp->fExtra[opValue];
   4132                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   4133                 if (backSearchIndex == fp->fInputIdx) {
   4134                     // We've backed up the input idx to the point that the loop started.
   4135                     // The loop is done.  Leave here without saving state.
   4136                     //  Subsequent failures won't come back here.
   4137                     break;
   4138                 }
   4139                 // Set up for the next iteration of the loop, with input index
   4140                 //   backed up by one from the last time through,
   4141                 //   and a state save to this instruction in case the following code fails again.
   4142                 //   (We're going backwards because this loop emulates stack unwinding, not
   4143                 //    the initial scan forward.)
   4144                 U_ASSERT(fp->fInputIdx > 0);
   4145                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4146                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
   4147                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4148 
   4149                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
   4150                 if (prevC == 0x0a &&
   4151                     fp->fInputIdx > backSearchIndex &&
   4152                     twoPrevC == 0x0d) {
   4153                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   4154                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   4155                         // .*, stepping back over CRLF pair.
   4156                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4157                     }
   4158                 }
   4159 
   4160 
   4161                 fp = StateSave(fp, fp->fPatIdx-1, status);
   4162             }
   4163             break;
   4164 
   4165 
   4166 
   4167         default:
   4168             // Trouble.  The compiled pattern contains an entry with an
   4169             //           unrecognized type tag.
   4170             U_ASSERT(FALSE);
   4171         }
   4172 
   4173         if (U_FAILURE(status)) {
   4174             isMatch = FALSE;
   4175             break;
   4176         }
   4177     }
   4178 
   4179 breakFromLoop:
   4180     fMatch = isMatch;
   4181     if (isMatch) {
   4182         fLastMatchEnd = fMatchEnd;
   4183         fMatchStart   = startIdx;
   4184         fMatchEnd     = fp->fInputIdx;
   4185     }
   4186 
   4187 #ifdef REGEX_RUN_DEBUG
   4188     if (fTraceDebug) {
   4189         if (isMatch) {
   4190             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
   4191         } else {
   4192             printf("No match\n\n");
   4193         }
   4194     }
   4195 #endif
   4196 
   4197     fFrame = fp;                // The active stack frame when the engine stopped.
   4198                                 //   Contains the capture group results that we need to
   4199                                 //    access later.
   4200     return;
   4201 }
   4202 
   4203 
   4204 //--------------------------------------------------------------------------------
   4205 //
   4206 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
   4207 //                  assumption that the entire string is available in the UText's
   4208 //                  chunk buffer. For now, that means we can use int32_t indexes,
   4209 //                  except for anything that needs to be saved (like group starts
   4210 //                  and ends).
   4211 //
   4212 //                  startIdx:    begin matching a this index.
   4213 //                  toEnd:       if true, match must extend to end of the input region
   4214 //
   4215 //--------------------------------------------------------------------------------
   4216 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
   4217     UBool       isMatch  = FALSE;      // True if the we have a match.
   4218 
   4219     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
   4220 
   4221     int32_t     op;                    // Operation from the compiled pattern, split into
   4222     int32_t     opType;                //    the opcode
   4223     int32_t     opValue;               //    and the operand value.
   4224 
   4225 #ifdef REGEX_RUN_DEBUG
   4226     if (fTraceDebug) {
   4227         printf("MatchAt(startIdx=%d)\n", startIdx);
   4228         printf("Original Pattern: ");
   4229         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   4230         while (c != U_SENTINEL) {
   4231             if (c<32 || c>256) {
   4232                 c = '.';
   4233             }
   4234             printf("%c", c);
   4235 
   4236             c = UTEXT_NEXT32(fPattern->fPattern);
   4237         }
   4238         printf("\n");
   4239         printf("Input String: ");
   4240         c = utext_next32From(fInputText, 0);
   4241         while (c != U_SENTINEL) {
   4242             if (c<32 || c>256) {
   4243                 c = '.';
   4244             }
   4245             printf("%c", c);
   4246 
   4247             c = UTEXT_NEXT32(fInputText);
   4248         }
   4249         printf("\n");
   4250         printf("\n");
   4251     }
   4252 #endif
   4253 
   4254     if (U_FAILURE(status)) {
   4255         return;
   4256     }
   4257 
   4258     //  Cache frequently referenced items from the compiled pattern
   4259     //
   4260     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   4261 
   4262     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   4263     UVector             *sets          = fPattern->fSets;
   4264 
   4265     const UChar         *inputBuf      = fInputText->chunkContents;
   4266 
   4267     fFrameSize = fPattern->fFrameSize;
   4268     REStackFrame        *fp            = resetStack();
   4269 
   4270     fp->fPatIdx   = 0;
   4271     fp->fInputIdx = startIdx;
   4272 
   4273     // Zero out the pattern's static data
   4274     int32_t i;
   4275     for (i = 0; i<fPattern->fDataSize; i++) {
   4276         fData[i] = 0;
   4277     }
   4278 
   4279     //
   4280     //  Main loop for interpreting the compiled pattern.
   4281     //  One iteration of the loop per pattern operation performed.
   4282     //
   4283     for (;;) {
   4284         op      = (int32_t)pat[fp->fPatIdx];
   4285         opType  = URX_TYPE(op);
   4286         opValue = URX_VAL(op);
   4287 #ifdef REGEX_RUN_DEBUG
   4288         if (fTraceDebug) {
   4289             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4290             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
   4291                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   4292             fPattern->dumpOp(fp->fPatIdx);
   4293         }
   4294 #endif
   4295         fp->fPatIdx++;
   4296 
   4297         switch (opType) {
   4298 
   4299 
   4300         case URX_NOP:
   4301             break;
   4302 
   4303 
   4304         case URX_BACKTRACK:
   4305             // Force a backtrack.  In some circumstances, the pattern compiler
   4306             //   will notice that the pattern can't possibly match anything, and will
   4307             //   emit one of these at that point.
   4308             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4309             break;
   4310 
   4311 
   4312         case URX_ONECHAR:
   4313             if (fp->fInputIdx < fActiveLimit) {
   4314                 UChar32 c;
   4315                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4316                 if (c == opValue) {
   4317                     break;
   4318                 }
   4319             } else {
   4320                 fHitEnd = TRUE;
   4321             }
   4322             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4323             break;
   4324 
   4325 
   4326         case URX_STRING:
   4327             {
   4328                 // Test input against a literal string.
   4329                 // Strings require two slots in the compiled pattern, one for the
   4330                 //   offset to the string text, and one for the length.
   4331                 int32_t   stringStartIdx = opValue;
   4332                 int32_t   stringLen;
   4333 
   4334                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   4335                 fp->fPatIdx++;
   4336                 opType    = URX_TYPE(op);
   4337                 stringLen = URX_VAL(op);
   4338                 U_ASSERT(opType == URX_STRING_LEN);
   4339                 U_ASSERT(stringLen >= 2);
   4340 
   4341                 const UChar * pInp = inputBuf + fp->fInputIdx;
   4342                 const UChar * pInpLimit = inputBuf + fActiveLimit;
   4343                 const UChar * pPat = litText+stringStartIdx;
   4344                 const UChar * pEnd = pInp + stringLen;
   4345                 UBool success = TRUE;
   4346                 while (pInp < pEnd) {
   4347                     if (pInp >= pInpLimit) {
   4348                         fHitEnd = TRUE;
   4349                         success = FALSE;
   4350                         break;
   4351                     }
   4352                     if (*pInp++ != *pPat++) {
   4353                         success = FALSE;
   4354                         break;
   4355                     }
   4356                 }
   4357 
   4358                 if (success) {
   4359                     fp->fInputIdx += stringLen;
   4360                 } else {
   4361                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4362                 }
   4363             }
   4364             break;
   4365 
   4366 
   4367         case URX_STATE_SAVE:
   4368             fp = StateSave(fp, opValue, status);
   4369             break;
   4370 
   4371 
   4372         case URX_END:
   4373             // The match loop will exit via this path on a successful match,
   4374             //   when we reach the end of the pattern.
   4375             if (toEnd && fp->fInputIdx != fActiveLimit) {
   4376                 // The pattern matched, but not to the end of input.  Try some more.
   4377                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4378                 break;
   4379             }
   4380             isMatch = TRUE;
   4381             goto  breakFromLoop;
   4382 
   4383             // Start and End Capture stack frame variables are laid out out like this:
   4384             //  fp->fExtra[opValue]  - The start of a completed capture group
   4385             //             opValue+1 - The end   of a completed capture group
   4386             //             opValue+2 - the start of a capture group whose end
   4387             //                          has not yet been reached (and might not ever be).
   4388         case URX_START_CAPTURE:
   4389             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4390             fp->fExtra[opValue+2] = fp->fInputIdx;
   4391             break;
   4392 
   4393 
   4394         case URX_END_CAPTURE:
   4395             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4396             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   4397             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   4398             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   4399             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   4400             break;
   4401 
   4402 
   4403         case URX_DOLLAR:                   //  $, test for End of line
   4404             //     or for position before new line at end of input
   4405             if (fp->fInputIdx < fAnchorLimit-2) {
   4406                 // We are no where near the end of input.  Fail.
   4407                 //   This is the common case.  Keep it first.
   4408                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4409                 break;
   4410             }
   4411             if (fp->fInputIdx >= fAnchorLimit) {
   4412                 // We really are at the end of input.  Success.
   4413                 fHitEnd = TRUE;
   4414                 fRequireEnd = TRUE;
   4415                 break;
   4416             }
   4417 
   4418             // If we are positioned just before a new-line that is located at the
   4419             //   end of input, succeed.
   4420             if (fp->fInputIdx == fAnchorLimit-1) {
   4421                 UChar32 c;
   4422                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
   4423 
   4424                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   4425                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4426                         // At new-line at end of input. Success
   4427                         fHitEnd = TRUE;
   4428                         fRequireEnd = TRUE;
   4429                         break;
   4430                     }
   4431                 }
   4432             } else if (fp->fInputIdx == fAnchorLimit-2 &&
   4433                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
   4434                     fHitEnd = TRUE;
   4435                     fRequireEnd = TRUE;
   4436                     break;                         // At CR/LF at end of input.  Success
   4437             }
   4438 
   4439             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4440 
   4441             break;
   4442 
   4443 
   4444         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   4445             if (fp->fInputIdx >= fAnchorLimit-1) {
   4446                 // Either at the last character of input, or off the end.
   4447                 if (fp->fInputIdx == fAnchorLimit-1) {
   4448                     // At last char of input.  Success if it's a new line.
   4449                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4450                         fHitEnd = TRUE;
   4451                         fRequireEnd = TRUE;
   4452                         break;
   4453                     }
   4454                 } else {
   4455                     // Off the end of input.  Success.
   4456                     fHitEnd = TRUE;
   4457                     fRequireEnd = TRUE;
   4458                     break;
   4459                 }
   4460             }
   4461 
   4462             // Not at end of input.  Back-track out.
   4463             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4464             break;
   4465 
   4466 
   4467         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   4468             {
   4469                 if (fp->fInputIdx >= fAnchorLimit) {
   4470                     // We really are at the end of input.  Success.
   4471                     fHitEnd = TRUE;
   4472                     fRequireEnd = TRUE;
   4473                     break;
   4474                 }
   4475                 // If we are positioned just before a new-line, succeed.
   4476                 // It makes no difference where the new-line is within the input.
   4477                 UChar32 c = inputBuf[fp->fInputIdx];
   4478                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   4479                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   4480                     //  In multi-line mode, hitting a new-line just before the end of input does not
   4481                     //   set the hitEnd or requireEnd flags
   4482                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4483                         break;
   4484                     }
   4485                 }
   4486                 // not at a new line.  Fail.
   4487                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4488             }
   4489             break;
   4490 
   4491 
   4492         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   4493             {
   4494                 if (fp->fInputIdx >= fAnchorLimit) {
   4495                     // We really are at the end of input.  Success.
   4496                     fHitEnd = TRUE;
   4497                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   4498                     break;               //   adding a new-line would not lose the match.
   4499                 }
   4500                 // If we are not positioned just before a new-line, the test fails; backtrack out.
   4501                 // It makes no difference where the new-line is within the input.
   4502                 if (inputBuf[fp->fInputIdx] != 0x0a) {
   4503                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4504                 }
   4505             }
   4506             break;
   4507 
   4508 
   4509         case URX_CARET:                    //  ^, test for start of line
   4510             if (fp->fInputIdx != fAnchorStart) {
   4511                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4512             }
   4513             break;
   4514 
   4515 
   4516         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   4517             {
   4518                 if (fp->fInputIdx == fAnchorStart) {
   4519                     // We are at the start input.  Success.
   4520                     break;
   4521                 }
   4522                 // Check whether character just before the current pos is a new-line
   4523                 //   unless we are at the end of input
   4524                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4525                 if ((fp->fInputIdx < fAnchorLimit) &&
   4526                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   4527                     //  It's a new-line.  ^ is true.  Success.
   4528                     //  TODO:  what should be done with positions between a CR and LF?
   4529                     break;
   4530                 }
   4531                 // Not at the start of a line.  Fail.
   4532                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4533             }
   4534             break;
   4535 
   4536 
   4537         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   4538             {
   4539                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
   4540                 if (fp->fInputIdx <= fAnchorStart) {
   4541                     // We are at the start input.  Success.
   4542                     break;
   4543                 }
   4544                 // Check whether character just before the current pos is a new-line
   4545                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   4546                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4547                 if (c != 0x0a) {
   4548                     // Not at the start of a line.  Back-track out.
   4549                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4550                 }
   4551             }
   4552             break;
   4553 
   4554         case URX_BACKSLASH_B:          // Test for word boundaries
   4555             {
   4556                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
   4557                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   4558                 if (!success) {
   4559                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4560                 }
   4561             }
   4562             break;
   4563 
   4564 
   4565         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   4566             {
   4567                 UBool success = isUWordBoundary(fp->fInputIdx);
   4568                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   4569                 if (!success) {
   4570                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4571                 }
   4572             }
   4573             break;
   4574 
   4575 
   4576         case URX_BACKSLASH_D:            // Test for decimal digit
   4577             {
   4578                 if (fp->fInputIdx >= fActiveLimit) {
   4579                     fHitEnd = TRUE;
   4580                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4581                     break;
   4582                 }
   4583 
   4584                 UChar32 c;
   4585                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4586                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   4587                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   4588                 success ^= (UBool)(opValue != 0);        // flip sense for \D
   4589                 if (!success) {
   4590                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4591                 }
   4592             }
   4593             break;
   4594 
   4595 
   4596         case URX_BACKSLASH_G:          // Test for position at end of previous match
   4597             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   4598                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4599             }
   4600             break;
   4601 
   4602 
   4603         case URX_BACKSLASH_X:
   4604         //  Match a Grapheme, as defined by Unicode TR 29.
   4605         //  Differs slightly from Perl, which consumes combining marks independently
   4606         //    of context.
   4607         {
   4608 
   4609             // Fail if at end of input
   4610             if (fp->fInputIdx >= fActiveLimit) {
   4611                 fHitEnd = TRUE;
   4612                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4613                 break;
   4614             }
   4615 
   4616             // Examine (and consume) the current char.
   4617             //   Dispatch into a little state machine, based on the char.
   4618             UChar32  c;
   4619             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4620             UnicodeSet **sets = fPattern->fStaticSets;
   4621             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   4622             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   4623             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4624             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4625             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4626             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4627             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4628             goto GC_Extend;
   4629 
   4630 
   4631 
   4632 GC_L:
   4633             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4634             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4635             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4636             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4637             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4638             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4639             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4640             goto GC_Extend;
   4641 
   4642 GC_V:
   4643             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4644             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4645             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4646             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4647             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4648             goto GC_Extend;
   4649 
   4650 GC_T:
   4651             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4652             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4653             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4654             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4655             goto GC_Extend;
   4656 
   4657 GC_Extend:
   4658             // Combining characters are consumed here
   4659             for (;;) {
   4660                 if (fp->fInputIdx >= fActiveLimit) {
   4661                     break;
   4662                 }
   4663                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4664                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   4665                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   4666                     break;
   4667                 }
   4668             }
   4669             goto GC_Done;
   4670 
   4671 GC_Control:
   4672             // Most control chars stand alone (don't combine with combining chars),
   4673             //   except for that CR/LF sequence is a single grapheme cluster.
   4674             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
   4675                 fp->fInputIdx++;
   4676             }
   4677 
   4678 GC_Done:
   4679             if (fp->fInputIdx >= fActiveLimit) {
   4680                 fHitEnd = TRUE;
   4681             }
   4682             break;
   4683         }
   4684 
   4685 
   4686 
   4687 
   4688         case URX_BACKSLASH_Z:          // Test for end of Input
   4689             if (fp->fInputIdx < fAnchorLimit) {
   4690                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4691             } else {
   4692                 fHitEnd = TRUE;
   4693                 fRequireEnd = TRUE;
   4694             }
   4695             break;
   4696 
   4697 
   4698 
   4699         case URX_STATIC_SETREF:
   4700             {
   4701                 // Test input character against one of the predefined sets
   4702                 //    (Word Characters, for example)
   4703                 // The high bit of the op value is a flag for the match polarity.
   4704                 //    0:   success if input char is in set.
   4705                 //    1:   success if input char is not in set.
   4706                 if (fp->fInputIdx >= fActiveLimit) {
   4707                     fHitEnd = TRUE;
   4708                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4709                     break;
   4710                 }
   4711 
   4712                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   4713                 opValue &= ~URX_NEG_SET;
   4714                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4715 
   4716                 UChar32 c;
   4717                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4718                 if (c < 256) {
   4719                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4720                     if (s8->contains(c)) {
   4721                         success = !success;
   4722                     }
   4723                 } else {
   4724                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4725                     if (s->contains(c)) {
   4726                         success = !success;
   4727                     }
   4728                 }
   4729                 if (!success) {
   4730                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4731                 }
   4732             }
   4733             break;
   4734 
   4735 
   4736         case URX_STAT_SETREF_N:
   4737             {
   4738                 // Test input character for NOT being a member of  one of
   4739                 //    the predefined sets (Word Characters, for example)
   4740                 if (fp->fInputIdx >= fActiveLimit) {
   4741                     fHitEnd = TRUE;
   4742                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4743                     break;
   4744                 }
   4745 
   4746                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4747 
   4748                 UChar32  c;
   4749                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4750                 if (c < 256) {
   4751                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4752                     if (s8->contains(c) == FALSE) {
   4753                         break;
   4754                     }
   4755                 } else {
   4756                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4757                     if (s->contains(c) == FALSE) {
   4758                         break;
   4759                     }
   4760                 }
   4761                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4762             }
   4763             break;
   4764 
   4765 
   4766         case URX_SETREF:
   4767             {
   4768                 if (fp->fInputIdx >= fActiveLimit) {
   4769                     fHitEnd = TRUE;
   4770                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4771                     break;
   4772                 }
   4773 
   4774                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4775 
   4776                 // There is input left.  Pick up one char and test it for set membership.
   4777                 UChar32  c;
   4778                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4779                 if (c<256) {
   4780                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4781                     if (s8->contains(c)) {
   4782                         // The character is in the set.  A Match.
   4783                         break;
   4784                     }
   4785                 } else {
   4786                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   4787                     if (s->contains(c)) {
   4788                         // The character is in the set.  A Match.
   4789                         break;
   4790                     }
   4791                 }
   4792 
   4793                 // the character wasn't in the set.
   4794                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4795             }
   4796             break;
   4797 
   4798 
   4799         case URX_DOTANY:
   4800             {
   4801                 // . matches anything, but stops at end-of-line.
   4802                 if (fp->fInputIdx >= fActiveLimit) {
   4803                     // At end of input.  Match failed.  Backtrack out.
   4804                     fHitEnd = TRUE;
   4805                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4806                     break;
   4807                 }
   4808 
   4809                 // There is input left.  Advance over one char, unless we've hit end-of-line
   4810                 UChar32  c;
   4811                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4812                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   4813                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   4814                     // End of line in normal mode.   . does not match.
   4815                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4816                     break;
   4817                 }
   4818             }
   4819             break;
   4820 
   4821 
   4822         case URX_DOTANY_ALL:
   4823             {
   4824                 // . in dot-matches-all (including new lines) mode
   4825                 if (fp->fInputIdx >= fActiveLimit) {
   4826                     // At end of input.  Match failed.  Backtrack out.
   4827                     fHitEnd = TRUE;
   4828                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4829                     break;
   4830                 }
   4831 
   4832                 // There is input left.  Advance over one char, except if we are
   4833                 //   at a cr/lf, advance over both of them.
   4834                 UChar32 c;
   4835                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4836                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   4837                     // In the case of a CR/LF, we need to advance over both.
   4838                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4839                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
   4840                     }
   4841                 }
   4842             }
   4843             break;
   4844 
   4845 
   4846         case URX_DOTANY_UNIX:
   4847             {
   4848                 // '.' operator, matches all, but stops at end-of-line.
   4849                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   4850                 if (fp->fInputIdx >= fActiveLimit) {
   4851                     // At end of input.  Match failed.  Backtrack out.
   4852                     fHitEnd = TRUE;
   4853                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4854                     break;
   4855                 }
   4856 
   4857                 // There is input left.  Advance over one char, unless we've hit end-of-line
   4858                 UChar32 c;
   4859                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4860                 if (c == 0x0a) {
   4861                     // End of line in normal mode.   '.' does not match the \n
   4862                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4863                 }
   4864             }
   4865             break;
   4866 
   4867 
   4868         case URX_JMP:
   4869             fp->fPatIdx = opValue;
   4870             break;
   4871 
   4872         case URX_FAIL:
   4873             isMatch = FALSE;
   4874             goto breakFromLoop;
   4875 
   4876         case URX_JMP_SAV:
   4877             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   4878             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   4879             fp->fPatIdx = opValue;                         // Then JMP.
   4880             break;
   4881 
   4882         case URX_JMP_SAV_X:
   4883             // This opcode is used with (x)+, when x can match a zero length string.
   4884             // Same as JMP_SAV, except conditional on the match having made forward progress.
   4885             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   4886             //   data address of the input position at the start of the loop.
   4887             {
   4888                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   4889                 int32_t  stoOp = (int32_t)pat[opValue-1];
   4890                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   4891                 int32_t  frameLoc = URX_VAL(stoOp);
   4892                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   4893                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
   4894                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   4895                 if (prevInputIdx < fp->fInputIdx) {
   4896                     // The match did make progress.  Repeat the loop.
   4897                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   4898                     fp->fPatIdx = opValue;
   4899                     fp->fExtra[frameLoc] = fp->fInputIdx;
   4900                 }
   4901                 // If the input position did not advance, we do nothing here,
   4902                 //   execution will fall out of the loop.
   4903             }
   4904             break;
   4905 
   4906         case URX_CTR_INIT:
   4907             {
   4908                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   4909                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   4910 
   4911                 // Pick up the three extra operands that CTR_INIT has, and
   4912                 //    skip the pattern location counter past
   4913                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   4914                 fp->fPatIdx += 3;
   4915                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   4916                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   4917                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   4918                 U_ASSERT(minCount>=0);
   4919                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   4920                 U_ASSERT(loopLoc>=fp->fPatIdx);
   4921 
   4922                 if (minCount == 0) {
   4923                     fp = StateSave(fp, loopLoc+1, status);
   4924                 }
   4925                 if (maxCount == -1) {
   4926                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
   4927                 } else if (maxCount == 0) {
   4928                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4929                 }
   4930             }
   4931             break;
   4932 
   4933         case URX_CTR_LOOP:
   4934             {
   4935                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   4936                 int32_t initOp = (int32_t)pat[opValue];
   4937                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   4938                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   4939                 int32_t minCount  = (int32_t)pat[opValue+2];
   4940                 int32_t maxCount  = (int32_t)pat[opValue+3];
   4941                 (*pCounter)++;
   4942                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   4943                     U_ASSERT(*pCounter == maxCount);
   4944                     break;
   4945                 }
   4946                 if (*pCounter >= minCount) {
   4947                     if (maxCount == -1) {
   4948                         // Loop has no hard upper bound.
   4949                         // Check that it is progressing through the input, break if it is not.
   4950                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   4951                         if (fp->fInputIdx == *pLastInputIdx) {
   4952                             break;
   4953                         } else {
   4954                             *pLastInputIdx = fp->fInputIdx;
   4955                         }
   4956                     }
   4957                     fp = StateSave(fp, fp->fPatIdx, status);
   4958                 }
   4959                 fp->fPatIdx = opValue + 4;    // Loop back.
   4960             }
   4961             break;
   4962 
   4963         case URX_CTR_INIT_NG:
   4964             {
   4965                 // Initialize a non-greedy loop
   4966                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   4967                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   4968 
   4969                 // Pick up the three extra operands that CTR_INIT_NG has, and
   4970                 //    skip the pattern location counter past
   4971                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   4972                 fp->fPatIdx += 3;
   4973                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   4974                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   4975                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   4976                 U_ASSERT(minCount>=0);
   4977                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   4978                 U_ASSERT(loopLoc>fp->fPatIdx);
   4979                 if (maxCount == -1) {
   4980                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
   4981                 }
   4982 
   4983                 if (minCount == 0) {
   4984                     if (maxCount != 0) {
   4985                         fp = StateSave(fp, fp->fPatIdx, status);
   4986                     }
   4987                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   4988                 }
   4989             }
   4990             break;
   4991 
   4992         case URX_CTR_LOOP_NG:
   4993             {
   4994                 // Non-greedy {min, max} loops
   4995                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   4996                 int32_t initOp = (int32_t)pat[opValue];
   4997                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   4998                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   4999                 int32_t minCount  = (int32_t)pat[opValue+2];
   5000                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5001 
   5002                 (*pCounter)++;
   5003                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   5004                     // The loop has matched the maximum permitted number of times.
   5005                     //   Break out of here with no action.  Matching will
   5006                     //   continue with the following pattern.
   5007                     U_ASSERT(*pCounter == maxCount);
   5008                     break;
   5009                 }
   5010 
   5011                 if (*pCounter < minCount) {
   5012                     // We haven't met the minimum number of matches yet.
   5013                     //   Loop back for another one.
   5014                     fp->fPatIdx = opValue + 4;    // Loop back.
   5015                 } else {
   5016                     // We do have the minimum number of matches.
   5017 
   5018                     // If there is no upper bound on the loop iterations, check that the input index
   5019                     // is progressing, and stop the loop if it is not.
   5020                     if (maxCount == -1) {
   5021                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   5022                         if (fp->fInputIdx == *pLastInputIdx) {
   5023                             break;
   5024                         }
   5025                         *pLastInputIdx = fp->fInputIdx;
   5026                     }
   5027 
   5028                     // Loop Continuation: we will fall into the pattern following the loop
   5029                     //   (non-greedy, don't execute loop body first), but first do
   5030                     //   a state save to the top of the loop, so that a match failure
   5031                     //   in the following pattern will try another iteration of the loop.
   5032                     fp = StateSave(fp, opValue + 4, status);
   5033                 }
   5034             }
   5035             break;
   5036 
   5037         case URX_STO_SP:
   5038             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5039             fData[opValue] = fStack->size();
   5040             break;
   5041 
   5042         case URX_LD_SP:
   5043             {
   5044                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5045                 int32_t newStackSize = (int32_t)fData[opValue];
   5046                 U_ASSERT(newStackSize <= fStack->size());
   5047                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5048                 if (newFP == (int64_t *)fp) {
   5049                     break;
   5050                 }
   5051                 int32_t i;
   5052                 for (i=0; i<fFrameSize; i++) {
   5053                     newFP[i] = ((int64_t *)fp)[i];
   5054                 }
   5055                 fp = (REStackFrame *)newFP;
   5056                 fStack->setSize(newStackSize);
   5057             }
   5058             break;
   5059 
   5060         case URX_BACKREF:
   5061             {
   5062                 U_ASSERT(opValue < fFrameSize);
   5063                 int64_t groupStartIdx = fp->fExtra[opValue];
   5064                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5065                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5066                 int64_t inputIndex = fp->fInputIdx;
   5067                 if (groupStartIdx < 0) {
   5068                     // This capture group has not participated in the match thus far,
   5069                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5070                     break;
   5071                 }
   5072                 UBool success = TRUE;
   5073                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
   5074                     if (inputIndex >= fActiveLimit) {
   5075                         success = FALSE;
   5076                         fHitEnd = TRUE;
   5077                         break;
   5078                     }
   5079                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
   5080                         success = FALSE;
   5081                         break;
   5082                     }
   5083                 }
   5084                 if (success) {
   5085                     fp->fInputIdx = inputIndex;
   5086                 } else {
   5087                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5088                 }
   5089             }
   5090             break;
   5091 
   5092         case URX_BACKREF_I:
   5093             {
   5094                 U_ASSERT(opValue < fFrameSize);
   5095                 int64_t groupStartIdx = fp->fExtra[opValue];
   5096                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5097                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5098                 if (groupStartIdx < 0) {
   5099                     // This capture group has not participated in the match thus far,
   5100                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5101                     break;
   5102                 }
   5103                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
   5104                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
   5105 
   5106                 //   Note: if the capture group match was of an empty string the backref
   5107                 //         match succeeds.  Verified by testing:  Perl matches succeed
   5108                 //         in this case, so we do too.
   5109 
   5110                 UBool success = TRUE;
   5111                 for (;;) {
   5112                     UChar32 captureGroupChar = captureGroupItr.next();
   5113                     if (captureGroupChar == U_SENTINEL) {
   5114                         success = TRUE;
   5115                         break;
   5116                     }
   5117                     UChar32 inputChar = inputItr.next();
   5118                     if (inputChar == U_SENTINEL) {
   5119                         success = FALSE;
   5120                         fHitEnd = TRUE;
   5121                         break;
   5122                     }
   5123                     if (inputChar != captureGroupChar) {
   5124                         success = FALSE;
   5125                         break;
   5126                     }
   5127                 }
   5128 
   5129                 if (success && inputItr.inExpansion()) {
   5130                     // We otained a match by consuming part of a string obtained from
   5131                     // case-folding a single code point of the input text.
   5132                     // This does not count as an overall match.
   5133                     success = FALSE;
   5134                 }
   5135 
   5136                 if (success) {
   5137                     fp->fInputIdx = inputItr.getIndex();
   5138                 } else {
   5139                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5140                 }
   5141             }
   5142             break;
   5143 
   5144         case URX_STO_INP_LOC:
   5145             {
   5146                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   5147                 fp->fExtra[opValue] = fp->fInputIdx;
   5148             }
   5149             break;
   5150 
   5151         case URX_JMPX:
   5152             {
   5153                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5154                 fp->fPatIdx += 1;
   5155                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   5156                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   5157                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
   5158                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   5159                 if (savedInputIdx < fp->fInputIdx) {
   5160                     fp->fPatIdx = opValue;                               // JMP
   5161                 } else {
   5162                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   5163                 }
   5164             }
   5165             break;
   5166 
   5167         case URX_LA_START:
   5168             {
   5169                 // Entering a lookahead block.
   5170                 // Save Stack Ptr, Input Pos.
   5171                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5172                 fData[opValue]   = fStack->size();
   5173                 fData[opValue+1] = fp->fInputIdx;
   5174                 fActiveStart     = fLookStart;          // Set the match region change for
   5175                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   5176             }
   5177             break;
   5178 
   5179         case URX_LA_END:
   5180             {
   5181                 // Leaving a look-ahead block.
   5182                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   5183                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5184                 int32_t stackSize = fStack->size();
   5185                 int32_t newStackSize = (int32_t)fData[opValue];
   5186                 U_ASSERT(stackSize >= newStackSize);
   5187                 if (stackSize > newStackSize) {
   5188                     // Copy the current top frame back to the new (cut back) top frame.
   5189                     //   This makes the capture groups from within the look-ahead
   5190                     //   expression available.
   5191                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5192                     int32_t i;
   5193                     for (i=0; i<fFrameSize; i++) {
   5194                         newFP[i] = ((int64_t *)fp)[i];
   5195                     }
   5196                     fp = (REStackFrame *)newFP;
   5197                     fStack->setSize(newStackSize);
   5198                 }
   5199                 fp->fInputIdx = fData[opValue+1];
   5200 
   5201                 // Restore the active region bounds in the input string; they may have
   5202                 //    been changed because of transparent bounds on a Region.
   5203                 fActiveStart = fRegionStart;
   5204                 fActiveLimit = fRegionLimit;
   5205             }
   5206             break;
   5207 
   5208         case URX_ONECHAR_I:
   5209             if (fp->fInputIdx < fActiveLimit) {
   5210                 UChar32 c;
   5211                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5212                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5213                     break;
   5214                 }
   5215             } else {
   5216                 fHitEnd = TRUE;
   5217             }
   5218             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5219             break;
   5220 
   5221         case URX_STRING_I:
   5222             // Case-insensitive test input against a literal string.
   5223             // Strings require two slots in the compiled pattern, one for the
   5224             //   offset to the string text, and one for the length.
   5225             //   The compiled string has already been case folded.
   5226             {
   5227                 const UChar *patternString = litText + opValue;
   5228 
   5229                 op      = (int32_t)pat[fp->fPatIdx];
   5230                 fp->fPatIdx++;
   5231                 opType  = URX_TYPE(op);
   5232                 opValue = URX_VAL(op);
   5233                 U_ASSERT(opType == URX_STRING_LEN);
   5234                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
   5235 
   5236                 UChar32      cText;
   5237                 UChar32      cPattern;
   5238                 UBool        success = TRUE;
   5239                 int32_t      patternStringIdx  = 0;
   5240                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
   5241                 while (patternStringIdx < patternStringLen) {
   5242                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
   5243                     cText = inputIterator.next();
   5244                     if (cText != cPattern) {
   5245                         success = FALSE;
   5246                         if (cText == U_SENTINEL) {
   5247                             fHitEnd = TRUE;
   5248                         }
   5249                         break;
   5250                     }
   5251                 }
   5252                 if (inputIterator.inExpansion()) {
   5253                     success = FALSE;
   5254                 }
   5255 
   5256                 if (success) {
   5257                     fp->fInputIdx = inputIterator.getIndex();
   5258                 } else {
   5259                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5260                 }
   5261             }
   5262             break;
   5263 
   5264         case URX_LB_START:
   5265             {
   5266                 // Entering a look-behind block.
   5267                 // Save Stack Ptr, Input Pos.
   5268                 //   TODO:  implement transparent bounds.  Ticket #6067
   5269                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5270                 fData[opValue]   = fStack->size();
   5271                 fData[opValue+1] = fp->fInputIdx;
   5272                 // Init the variable containing the start index for attempted matches.
   5273                 fData[opValue+2] = -1;
   5274                 // Save input string length, then reset to pin any matches to end at
   5275                 //   the current position.
   5276                 fData[opValue+3] = fActiveLimit;
   5277                 fActiveLimit     = fp->fInputIdx;
   5278             }
   5279             break;
   5280 
   5281 
   5282         case URX_LB_CONT:
   5283             {
   5284                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   5285                 //    at all possible input starting positions.
   5286 
   5287                 // Fetch the min and max possible match lengths.  They are the operands
   5288                 //   of this op in the pattern.
   5289                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   5290                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   5291                 U_ASSERT(minML <= maxML);
   5292                 U_ASSERT(minML >= 0);
   5293 
   5294                 // Fetch (from data) the last input index where a match was attempted.
   5295                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5296                 int64_t  *lbStartIdx = &fData[opValue+2];
   5297                 if (*lbStartIdx < 0) {
   5298                     // First time through loop.
   5299                     *lbStartIdx = fp->fInputIdx - minML;
   5300                 } else {
   5301                     // 2nd through nth time through the loop.
   5302                     // Back up start position for match by one.
   5303                     if (*lbStartIdx == 0) {
   5304                         (*lbStartIdx)--;
   5305                     } else {
   5306                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5307                     }
   5308                 }
   5309 
   5310                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5311                     // We have tried all potential match starting points without
   5312                     //  getting a match.  Backtrack out, and out of the
   5313                     //   Look Behind altogether.
   5314                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5315                     int64_t restoreInputLen = fData[opValue+3];
   5316                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5317                     U_ASSERT(restoreInputLen <= fInputLength);
   5318                     fActiveLimit = restoreInputLen;
   5319                     break;
   5320                 }
   5321 
   5322                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5323                 //      (successful match will fall off the end of the loop.)
   5324                 fp = StateSave(fp, fp->fPatIdx-3, status);
   5325                 fp->fInputIdx =  *lbStartIdx;
   5326             }
   5327             break;
   5328 
   5329         case URX_LB_END:
   5330             // End of a look-behind block, after a successful match.
   5331             {
   5332                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5333                 if (fp->fInputIdx != fActiveLimit) {
   5334                     //  The look-behind expression matched, but the match did not
   5335                     //    extend all the way to the point that we are looking behind from.
   5336                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5337                     //     will retry the match starting at another position or fail
   5338                     //     the look-behind altogether, whichever is appropriate.
   5339                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5340                     break;
   5341                 }
   5342 
   5343                 // Look-behind match is good.  Restore the orignal input string length,
   5344                 //   which had been truncated to pin the end of the lookbehind match to the
   5345                 //   position being looked-behind.
   5346                 int64_t originalInputLen = fData[opValue+3];
   5347                 U_ASSERT(originalInputLen >= fActiveLimit);
   5348                 U_ASSERT(originalInputLen <= fInputLength);
   5349                 fActiveLimit = originalInputLen;
   5350             }
   5351             break;
   5352 
   5353 
   5354         case URX_LBN_CONT:
   5355             {
   5356                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   5357                 //    at all possible input starting positions.
   5358 
   5359                 // Fetch the extra parameters of this op.
   5360                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   5361                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   5362                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   5363                 continueLoc = URX_VAL(continueLoc);
   5364                 U_ASSERT(minML <= maxML);
   5365                 U_ASSERT(minML >= 0);
   5366                 U_ASSERT(continueLoc > fp->fPatIdx);
   5367 
   5368                 // Fetch (from data) the last input index where a match was attempted.
   5369                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5370                 int64_t  *lbStartIdx = &fData[opValue+2];
   5371                 if (*lbStartIdx < 0) {
   5372                     // First time through loop.
   5373                     *lbStartIdx = fp->fInputIdx - minML;
   5374                 } else {
   5375                     // 2nd through nth time through the loop.
   5376                     // Back up start position for match by one.
   5377                     if (*lbStartIdx == 0) {
   5378                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
   5379                     } else {
   5380                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5381                     }
   5382                 }
   5383 
   5384                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5385                     // We have tried all potential match starting points without
   5386                     //  getting a match, which means that the negative lookbehind as
   5387                     //  a whole has succeeded.  Jump forward to the continue location
   5388                     int64_t restoreInputLen = fData[opValue+3];
   5389                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5390                     U_ASSERT(restoreInputLen <= fInputLength);
   5391                     fActiveLimit = restoreInputLen;
   5392                     fp->fPatIdx = continueLoc;
   5393                     break;
   5394                 }
   5395 
   5396                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5397                 //      (successful match will cause a FAIL out of the loop altogether.)
   5398                 fp = StateSave(fp, fp->fPatIdx-4, status);
   5399                 fp->fInputIdx =  *lbStartIdx;
   5400             }
   5401             break;
   5402 
   5403         case URX_LBN_END:
   5404             // End of a negative look-behind block, after a successful match.
   5405             {
   5406                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5407                 if (fp->fInputIdx != fActiveLimit) {
   5408                     //  The look-behind expression matched, but the match did not
   5409                     //    extend all the way to the point that we are looking behind from.
   5410                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5411                     //     will retry the match starting at another position or succeed
   5412                     //     the look-behind altogether, whichever is appropriate.
   5413                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5414                     break;
   5415                 }
   5416 
   5417                 // Look-behind expression matched, which means look-behind test as
   5418                 //   a whole Fails
   5419 
   5420                 //   Restore the orignal input string length, which had been truncated
   5421                 //   inorder to pin the end of the lookbehind match
   5422                 //   to the position being looked-behind.
   5423                 int64_t originalInputLen = fData[opValue+3];
   5424                 U_ASSERT(originalInputLen >= fActiveLimit);
   5425                 U_ASSERT(originalInputLen <= fInputLength);
   5426                 fActiveLimit = originalInputLen;
   5427 
   5428                 // Restore original stack position, discarding any state saved
   5429                 //   by the successful pattern match.
   5430                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5431                 int32_t newStackSize = (int32_t)fData[opValue];
   5432                 U_ASSERT(fStack->size() > newStackSize);
   5433                 fStack->setSize(newStackSize);
   5434 
   5435                 //  FAIL, which will take control back to someplace
   5436                 //  prior to entering the look-behind test.
   5437                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5438             }
   5439             break;
   5440 
   5441 
   5442         case URX_LOOP_SR_I:
   5443             // Loop Initialization for the optimized implementation of
   5444             //     [some character set]*
   5445             //   This op scans through all matching input.
   5446             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5447             {
   5448                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5449                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5450                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   5451 
   5452                 // Loop through input, until either the input is exhausted or
   5453                 //   we reach a character that is not a member of the set.
   5454                 int32_t ix = (int32_t)fp->fInputIdx;
   5455                 for (;;) {
   5456                     if (ix >= fActiveLimit) {
   5457                         fHitEnd = TRUE;
   5458                         break;
   5459                     }
   5460                     UChar32   c;
   5461                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
   5462                     if (c<256) {
   5463                         if (s8->contains(c) == FALSE) {
   5464                             U16_BACK_1(inputBuf, 0, ix);
   5465                             break;
   5466                         }
   5467                     } else {
   5468                         if (s->contains(c) == FALSE) {
   5469                             U16_BACK_1(inputBuf, 0, ix);
   5470                             break;
   5471                         }
   5472                     }
   5473                 }
   5474 
   5475                 // If there were no matching characters, skip over the loop altogether.
   5476                 //   The loop doesn't run at all, a * op always succeeds.
   5477                 if (ix == fp->fInputIdx) {
   5478                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5479                     break;
   5480                 }
   5481 
   5482                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5483                 //   must follow.  It's operand is the stack location
   5484                 //   that holds the starting input index for the match of this [set]*
   5485                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5486                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5487                 int32_t stackLoc = URX_VAL(loopcOp);
   5488                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5489                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5490                 fp->fInputIdx = ix;
   5491 
   5492                 // Save State to the URX_LOOP_C op that follows this one,
   5493                 //   so that match failures in the following code will return to there.
   5494                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5495                 fp = StateSave(fp, fp->fPatIdx, status);
   5496                 fp->fPatIdx++;
   5497             }
   5498             break;
   5499 
   5500 
   5501         case URX_LOOP_DOT_I:
   5502             // Loop Initialization for the optimized implementation of .*
   5503             //   This op scans through all remaining input.
   5504             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5505             {
   5506                 // Loop through input until the input is exhausted (we reach an end-of-line)
   5507                 // In DOTALL mode, we can just go straight to the end of the input.
   5508                 int32_t ix;
   5509                 if ((opValue & 1) == 1) {
   5510                     // Dot-matches-All mode.  Jump straight to the end of the string.
   5511                     ix = (int32_t)fActiveLimit;
   5512                     fHitEnd = TRUE;
   5513                 } else {
   5514                     // NOT DOT ALL mode.  Line endings do not match '.'
   5515                     // Scan forward until a line ending or end of input.
   5516                     ix = (int32_t)fp->fInputIdx;
   5517                     for (;;) {
   5518                         if (ix >= fActiveLimit) {
   5519                             fHitEnd = TRUE;
   5520                             break;
   5521                         }
   5522                         UChar32   c;
   5523                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
   5524                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   5525                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   5526                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   5527                                    ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
   5528                                 //  char is a line ending.  Put the input pos back to the
   5529                                 //    line ending char, and exit the scanning loop.
   5530                                 U16_BACK_1(inputBuf, 0, ix);
   5531                                 break;
   5532                             }
   5533                         }
   5534                     }
   5535                 }
   5536 
   5537                 // If there were no matching characters, skip over the loop altogether.
   5538                 //   The loop doesn't run at all, a * op always succeeds.
   5539                 if (ix == fp->fInputIdx) {
   5540                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5541                     break;
   5542                 }
   5543 
   5544                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5545                 //   must follow.  It's operand is the stack location
   5546                 //   that holds the starting input index for the match of this .*
   5547                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5548                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5549                 int32_t stackLoc = URX_VAL(loopcOp);
   5550                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5551                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5552                 fp->fInputIdx = ix;
   5553 
   5554                 // Save State to the URX_LOOP_C op that follows this one,
   5555                 //   so that match failures in the following code will return to there.
   5556                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5557                 fp = StateSave(fp, fp->fPatIdx, status);
   5558                 fp->fPatIdx++;
   5559             }
   5560             break;
   5561 
   5562 
   5563         case URX_LOOP_C:
   5564             {
   5565                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   5566                 backSearchIndex = (int32_t)fp->fExtra[opValue];
   5567                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   5568                 if (backSearchIndex == fp->fInputIdx) {
   5569                     // We've backed up the input idx to the point that the loop started.
   5570                     // The loop is done.  Leave here without saving state.
   5571                     //  Subsequent failures won't come back here.
   5572                     break;
   5573                 }
   5574                 // Set up for the next iteration of the loop, with input index
   5575                 //   backed up by one from the last time through,
   5576                 //   and a state save to this instruction in case the following code fails again.
   5577                 //   (We're going backwards because this loop emulates stack unwinding, not
   5578                 //    the initial scan forward.)
   5579                 U_ASSERT(fp->fInputIdx > 0);
   5580                 UChar32 prevC;
   5581                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
   5582 
   5583                 if (prevC == 0x0a &&
   5584                     fp->fInputIdx > backSearchIndex &&
   5585                     inputBuf[fp->fInputIdx-1] == 0x0d) {
   5586                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   5587                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   5588                         // .*, stepping back over CRLF pair.
   5589                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   5590                     }
   5591                 }
   5592 
   5593 
   5594                 fp = StateSave(fp, fp->fPatIdx-1, status);
   5595             }
   5596             break;
   5597 
   5598 
   5599 
   5600         default:
   5601             // Trouble.  The compiled pattern contains an entry with an
   5602             //           unrecognized type tag.
   5603             U_ASSERT(FALSE);
   5604         }
   5605 
   5606         if (U_FAILURE(status)) {
   5607             isMatch = FALSE;
   5608             break;
   5609         }
   5610     }
   5611 
   5612 breakFromLoop:
   5613     fMatch = isMatch;
   5614     if (isMatch) {
   5615         fLastMatchEnd = fMatchEnd;
   5616         fMatchStart   = startIdx;
   5617         fMatchEnd     = fp->fInputIdx;
   5618     }
   5619 
   5620 #ifdef REGEX_RUN_DEBUG
   5621     if (fTraceDebug) {
   5622         if (isMatch) {
   5623             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
   5624         } else {
   5625             printf("No match\n\n");
   5626         }
   5627     }
   5628 #endif
   5629 
   5630     fFrame = fp;                // The active stack frame when the engine stopped.
   5631                                 //   Contains the capture group results that we need to
   5632                                 //    access later.
   5633 
   5634     return;
   5635 }
   5636 
   5637 
   5638 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
   5639 
   5640 U_NAMESPACE_END
   5641 
   5642 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   5643