Home | History | Annotate | Download | only in i18n
      1 /*
      2 **************************************************************************
      3 *   Copyright (C) 2002-2013 International Business Machines Corporation  *
      4 *   and others. All rights reserved.                                     *
      5 **************************************************************************
      6 */
      7 //
      8 //  file:  rematch.cpp
      9 //
     10 //         Contains the implementation of class RegexMatcher,
     11 //         which is one of the main API classes for the ICU regular expression package.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     16 
     17 #include "unicode/regex.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/rbbi.h"
     22 #include "unicode/utf.h"
     23 #include "unicode/utf16.h"
     24 #include "uassert.h"
     25 #include "cmemory.h"
     26 #include "uvector.h"
     27 #include "uvectr32.h"
     28 #include "uvectr64.h"
     29 #include "regeximp.h"
     30 #include "regexst.h"
     31 #include "regextxt.h"
     32 #include "ucase.h"
     33 
     34 // #include <malloc.h>        // Needed for heapcheck testing
     35 
     36 
     37 // Find progress callback
     38 // ----------------------
     39 // Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
     40 //
     41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
     42     (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
     43 
     44 
     45 // Smart Backtracking
     46 // ------------------
     47 // When a failure would go back to a LOOP_C instruction,
     48 // strings, characters, and setrefs scan backwards for a valid start
     49 // character themselves, pop the stack, and save state, emulating the
     50 // LOOP_C's effect but assured that the next character of input is a
     51 // possible matching character.
     52 //
     53 // Good idea in theory; unfortunately it only helps out a few specific
     54 // cases and slows the engine down a little in the rest.
     55 
     56 U_NAMESPACE_BEGIN
     57 
     58 // Default limit for the size of the back track stack, to avoid system
     59 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
     60 // This value puts ICU's limits higher than most other regexp implementations,
     61 //    which use recursion rather than the heap, and take more storage per
     62 //    backtrack point.
     63 //
     64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
     65 
     66 // Time limit counter constant.
     67 //   Time limits for expression evaluation are in terms of quanta of work by
     68 //   the engine, each of which is 10,000 state saves.
     69 //   This constant determines that state saves per tick number.
     70 static const int32_t TIMER_INITIAL_VALUE = 10000;
     71 
     72 //-----------------------------------------------------------------------------
     73 //
     74 //   Constructor and Destructor
     75 //
     76 //-----------------------------------------------------------------------------
     77 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
     78     fDeferredStatus = U_ZERO_ERROR;
     79     init(fDeferredStatus);
     80     if (U_FAILURE(fDeferredStatus)) {
     81         return;
     82     }
     83     if (pat==NULL) {
     84         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
     85         return;
     86     }
     87     fPattern = pat;
     88     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
     89 }
     90 
     91 
     92 
     93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
     94                            uint32_t flags, UErrorCode &status) {
     95     init(status);
     96     if (U_FAILURE(status)) {
     97         return;
     98     }
     99     UParseError    pe;
    100     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    101     fPattern           = fPatternOwned;
    102 
    103     UText inputText = UTEXT_INITIALIZER;
    104     utext_openConstUnicodeString(&inputText, &input, &status);
    105     init2(&inputText, status);
    106     utext_close(&inputText);
    107 
    108     fInputUniStrMaybeMutable = TRUE;
    109 }
    110 
    111 
    112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
    113                            uint32_t flags, UErrorCode &status) {
    114     init(status);
    115     if (U_FAILURE(status)) {
    116         return;
    117     }
    118     UParseError    pe;
    119     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    120     if (U_FAILURE(status)) {
    121         return;
    122     }
    123 
    124     fPattern           = fPatternOwned;
    125     init2(input, status);
    126 }
    127 
    128 
    129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
    130                            uint32_t flags, UErrorCode &status) {
    131     init(status);
    132     if (U_FAILURE(status)) {
    133         return;
    134     }
    135     UParseError    pe;
    136     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    137     if (U_FAILURE(status)) {
    138         return;
    139     }
    140     fPattern           = fPatternOwned;
    141     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    142 }
    143 
    144 RegexMatcher::RegexMatcher(UText *regexp,
    145                            uint32_t flags, UErrorCode &status) {
    146     init(status);
    147     if (U_FAILURE(status)) {
    148         return;
    149     }
    150     UParseError    pe;
    151     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    152         if (U_FAILURE(status)) {
    153         return;
    154     }
    155 
    156     fPattern           = fPatternOwned;
    157     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    158 }
    159 
    160 
    161 
    162 
    163 RegexMatcher::~RegexMatcher() {
    164     delete fStack;
    165     if (fData != fSmallData) {
    166         uprv_free(fData);
    167         fData = NULL;
    168     }
    169     if (fPatternOwned) {
    170         delete fPatternOwned;
    171         fPatternOwned = NULL;
    172         fPattern = NULL;
    173     }
    174 
    175     if (fInput) {
    176         delete fInput;
    177     }
    178     if (fInputText) {
    179         utext_close(fInputText);
    180     }
    181     if (fAltInputText) {
    182         utext_close(fAltInputText);
    183     }
    184 
    185     #if UCONFIG_NO_BREAK_ITERATION==0
    186     delete fWordBreakItr;
    187     #endif
    188 }
    189 
    190 //
    191 //   init()   common initialization for use by all constructors.
    192 //            Initialize all fields, get the object into a consistent state.
    193 //            This must be done even when the initial status shows an error,
    194 //            so that the object is initialized sufficiently well for the destructor
    195 //            to run safely.
    196 //
    197 void RegexMatcher::init(UErrorCode &status) {
    198     fPattern           = NULL;
    199     fPatternOwned      = NULL;
    200     fFrameSize         = 0;
    201     fRegionStart       = 0;
    202     fRegionLimit       = 0;
    203     fAnchorStart       = 0;
    204     fAnchorLimit       = 0;
    205     fLookStart         = 0;
    206     fLookLimit         = 0;
    207     fActiveStart       = 0;
    208     fActiveLimit       = 0;
    209     fTransparentBounds = FALSE;
    210     fAnchoringBounds   = TRUE;
    211     fMatch             = FALSE;
    212     fMatchStart        = 0;
    213     fMatchEnd          = 0;
    214     fLastMatchEnd      = -1;
    215     fAppendPosition    = 0;
    216     fHitEnd            = FALSE;
    217     fRequireEnd        = FALSE;
    218     fStack             = NULL;
    219     fFrame             = NULL;
    220     fTimeLimit         = 0;
    221     fTime              = 0;
    222     fTickCounter       = 0;
    223     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    224     fCallbackFn        = NULL;
    225     fCallbackContext   = NULL;
    226     fFindProgressCallbackFn      = NULL;
    227     fFindProgressCallbackContext = NULL;
    228     fTraceDebug        = FALSE;
    229     fDeferredStatus    = status;
    230     fData              = fSmallData;
    231     fWordBreakItr      = NULL;
    232 
    233     fStack             = NULL;
    234     fInputText         = NULL;
    235     fAltInputText      = NULL;
    236     fInput             = NULL;
    237     fInputLength       = 0;
    238     fInputUniStrMaybeMutable = FALSE;
    239 
    240     if (U_FAILURE(status)) {
    241         fDeferredStatus = status;
    242     }
    243 }
    244 
    245 //
    246 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
    247 //            This handles the common setup to be done after the Pattern is available.
    248 //
    249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
    250     if (U_FAILURE(status)) {
    251         fDeferredStatus = status;
    252         return;
    253     }
    254 
    255     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
    256         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
    257         if (fData == NULL) {
    258             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    259             return;
    260         }
    261     }
    262 
    263     fStack = new UVector64(status);
    264     if (fStack == NULL) {
    265         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    266         return;
    267     }
    268 
    269     reset(input);
    270     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
    271     if (U_FAILURE(status)) {
    272         fDeferredStatus = status;
    273         return;
    274     }
    275 }
    276 
    277 
    278 static const UChar BACKSLASH  = 0x5c;
    279 static const UChar DOLLARSIGN = 0x24;
    280 //--------------------------------------------------------------------------------
    281 //
    282 //    appendReplacement
    283 //
    284 //--------------------------------------------------------------------------------
    285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
    286                                               const UnicodeString &replacement,
    287                                               UErrorCode &status) {
    288     UText replacementText = UTEXT_INITIALIZER;
    289 
    290     utext_openConstUnicodeString(&replacementText, &replacement, &status);
    291     if (U_SUCCESS(status)) {
    292         UText resultText = UTEXT_INITIALIZER;
    293         utext_openUnicodeString(&resultText, &dest, &status);
    294 
    295         if (U_SUCCESS(status)) {
    296             appendReplacement(&resultText, &replacementText, status);
    297             utext_close(&resultText);
    298         }
    299         utext_close(&replacementText);
    300     }
    301 
    302     return *this;
    303 }
    304 
    305 //
    306 //    appendReplacement, UText mode
    307 //
    308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
    309                                               UText *replacement,
    310                                               UErrorCode &status) {
    311     if (U_FAILURE(status)) {
    312         return *this;
    313     }
    314     if (U_FAILURE(fDeferredStatus)) {
    315         status = fDeferredStatus;
    316         return *this;
    317     }
    318     if (fMatch == FALSE) {
    319         status = U_REGEX_INVALID_STATE;
    320         return *this;
    321     }
    322 
    323     // Copy input string from the end of previous match to start of current match
    324     int64_t  destLen = utext_nativeLength(dest);
    325     if (fMatchStart > fAppendPosition) {
    326         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    327             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    328                                      (int32_t)(fMatchStart-fAppendPosition), &status);
    329         } else {
    330             int32_t len16;
    331             if (UTEXT_USES_U16(fInputText)) {
    332                 len16 = (int32_t)(fMatchStart-fAppendPosition);
    333             } else {
    334                 UErrorCode lengthStatus = U_ZERO_ERROR;
    335                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
    336             }
    337             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
    338             if (inputChars == NULL) {
    339                 status = U_MEMORY_ALLOCATION_ERROR;
    340                 return *this;
    341             }
    342             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
    343             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    344             uprv_free(inputChars);
    345         }
    346     }
    347     fAppendPosition = fMatchEnd;
    348 
    349 
    350     // scan the replacement text, looking for substitutions ($n) and \escapes.
    351     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
    352     //         move entire ranges not containing substitutions.
    353     UTEXT_SETNATIVEINDEX(replacement, 0);
    354     UChar32 c = UTEXT_NEXT32(replacement);
    355     while (c != U_SENTINEL) {
    356         if (c == BACKSLASH) {
    357             // Backslash Escape.  Copy the following char out without further checks.
    358             //                    Note:  Surrogate pairs don't need any special handling
    359             //                           The second half wont be a '$' or a '\', and
    360             //                           will move to the dest normally on the next
    361             //                           loop iteration.
    362             c = UTEXT_CURRENT32(replacement);
    363             if (c == U_SENTINEL) {
    364                 break;
    365             }
    366 
    367             if (c==0x55/*U*/ || c==0x75/*u*/) {
    368                 // We have a \udddd or \Udddddddd escape sequence.
    369                 int32_t offset = 0;
    370                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
    371                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
    372                 if (escapedChar != (UChar32)0xFFFFFFFF) {
    373                     if (U_IS_BMP(escapedChar)) {
    374                         UChar c16 = (UChar)escapedChar;
    375                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    376                     } else {
    377                         UChar surrogate[2];
    378                         surrogate[0] = U16_LEAD(escapedChar);
    379                         surrogate[1] = U16_TRAIL(escapedChar);
    380                         if (U_SUCCESS(status)) {
    381                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    382                         }
    383                     }
    384                     // TODO:  Report errors for mal-formed \u escapes?
    385                     //        As this is, the original sequence is output, which may be OK.
    386                     if (context.lastOffset == offset) {
    387                         (void)UTEXT_PREVIOUS32(replacement);
    388                     } else if (context.lastOffset != offset-1) {
    389                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
    390                     }
    391                 }
    392             } else {
    393                 (void)UTEXT_NEXT32(replacement);
    394                 // Plain backslash escape.  Just put out the escaped character.
    395                 if (U_IS_BMP(c)) {
    396                     UChar c16 = (UChar)c;
    397                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    398                 } else {
    399                     UChar surrogate[2];
    400                     surrogate[0] = U16_LEAD(c);
    401                     surrogate[1] = U16_TRAIL(c);
    402                     if (U_SUCCESS(status)) {
    403                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    404                     }
    405                 }
    406             }
    407         } else if (c != DOLLARSIGN) {
    408             // Normal char, not a $.  Copy it out without further checks.
    409             if (U_IS_BMP(c)) {
    410                 UChar c16 = (UChar)c;
    411                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    412             } else {
    413                 UChar surrogate[2];
    414                 surrogate[0] = U16_LEAD(c);
    415                 surrogate[1] = U16_TRAIL(c);
    416                 if (U_SUCCESS(status)) {
    417                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    418                 }
    419             }
    420         } else {
    421             // We've got a $.  Pick up a capture group number if one follows.
    422             // Consume at most the number of digits necessary for the largest capture
    423             // number that is valid for this pattern.
    424 
    425             int32_t numDigits = 0;
    426             int32_t groupNum  = 0;
    427             UChar32 digitC;
    428             for (;;) {
    429                 digitC = UTEXT_CURRENT32(replacement);
    430                 if (digitC == U_SENTINEL) {
    431                     break;
    432                 }
    433                 if (u_isdigit(digitC) == FALSE) {
    434                     break;
    435                 }
    436                 (void)UTEXT_NEXT32(replacement);
    437                 groupNum=groupNum*10 + u_charDigitValue(digitC);
    438                 numDigits++;
    439                 if (numDigits >= fPattern->fMaxCaptureDigits) {
    440                     break;
    441                 }
    442             }
    443 
    444 
    445             if (numDigits == 0) {
    446                 // The $ didn't introduce a group number at all.
    447                 // Treat it as just part of the substitution text.
    448                 UChar c16 = DOLLARSIGN;
    449                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    450             } else {
    451                 // Finally, append the capture group data to the destination.
    452                 destLen += appendGroup(groupNum, dest, status);
    453                 if (U_FAILURE(status)) {
    454                     // Can fail if group number is out of range.
    455                     break;
    456                 }
    457             }
    458         }
    459 
    460         if (U_FAILURE(status)) {
    461             break;
    462         } else {
    463             c = UTEXT_NEXT32(replacement);
    464         }
    465     }
    466 
    467     return *this;
    468 }
    469 
    470 
    471 
    472 //--------------------------------------------------------------------------------
    473 //
    474 //    appendTail     Intended to be used in conjunction with appendReplacement()
    475 //                   To the destination string, append everything following
    476 //                   the last match position from the input string.
    477 //
    478 //                   Note:  Match ranges do not affect appendTail or appendReplacement
    479 //
    480 //--------------------------------------------------------------------------------
    481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    482     UErrorCode status = U_ZERO_ERROR;
    483     UText resultText = UTEXT_INITIALIZER;
    484     utext_openUnicodeString(&resultText, &dest, &status);
    485 
    486     if (U_SUCCESS(status)) {
    487         appendTail(&resultText, status);
    488         utext_close(&resultText);
    489     }
    490 
    491     return dest;
    492 }
    493 
    494 //
    495 //   appendTail, UText mode
    496 //
    497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
    498     UBool bailOut = FALSE;
    499     if (U_FAILURE(status)) {
    500         bailOut = TRUE;
    501     }
    502     if (U_FAILURE(fDeferredStatus)) {
    503         status = fDeferredStatus;
    504         bailOut = TRUE;
    505     }
    506 
    507     if (bailOut) {
    508         //  dest must not be NULL
    509         if (dest) {
    510             utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
    511             return dest;
    512         }
    513     }
    514 
    515     if (fInputLength > fAppendPosition) {
    516         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    517             int64_t destLen = utext_nativeLength(dest);
    518             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    519                           (int32_t)(fInputLength-fAppendPosition), &status);
    520         } else {
    521             int32_t len16;
    522             if (UTEXT_USES_U16(fInputText)) {
    523                 len16 = (int32_t)(fInputLength-fAppendPosition);
    524             } else {
    525                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
    526                 status = U_ZERO_ERROR; // buffer overflow
    527             }
    528 
    529             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
    530             if (inputChars == NULL) {
    531                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    532             } else {
    533                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
    534                 int64_t destLen = utext_nativeLength(dest);
    535                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    536                 uprv_free(inputChars);
    537             }
    538         }
    539     }
    540     return dest;
    541 }
    542 
    543 
    544 
    545 //--------------------------------------------------------------------------------
    546 //
    547 //   end
    548 //
    549 //--------------------------------------------------------------------------------
    550 int32_t RegexMatcher::end(UErrorCode &err) const {
    551     return end(0, err);
    552 }
    553 
    554 int64_t RegexMatcher::end64(UErrorCode &err) const {
    555     return end64(0, err);
    556 }
    557 
    558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
    559     if (U_FAILURE(err)) {
    560         return -1;
    561     }
    562     if (fMatch == FALSE) {
    563         err = U_REGEX_INVALID_STATE;
    564         return -1;
    565     }
    566     if (group < 0 || group > fPattern->fGroupMap->size()) {
    567         err = U_INDEX_OUTOFBOUNDS_ERROR;
    568         return -1;
    569     }
    570     int64_t e = -1;
    571     if (group == 0) {
    572         e = fMatchEnd;
    573     } else {
    574         // Get the position within the stack frame of the variables for
    575         //    this capture group.
    576         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
    577         U_ASSERT(groupOffset < fPattern->fFrameSize);
    578         U_ASSERT(groupOffset >= 0);
    579         e = fFrame->fExtra[groupOffset + 1];
    580     }
    581 
    582         return e;
    583 }
    584 
    585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    586     return (int32_t)end64(group, err);
    587 }
    588 
    589 
    590 //--------------------------------------------------------------------------------
    591 //
    592 //   find()
    593 //
    594 //--------------------------------------------------------------------------------
    595 UBool RegexMatcher::find() {
    596     // Start at the position of the last match end.  (Will be zero if the
    597     //   matcher has been reset.)
    598     //
    599     if (U_FAILURE(fDeferredStatus)) {
    600         return FALSE;
    601     }
    602 
    603     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    604         return findUsingChunk();
    605     }
    606 
    607     int64_t startPos = fMatchEnd;
    608     if (startPos==0) {
    609         startPos = fActiveStart;
    610     }
    611 
    612     if (fMatch) {
    613         // Save the position of any previous successful match.
    614         fLastMatchEnd = fMatchEnd;
    615 
    616         if (fMatchStart == fMatchEnd) {
    617             // Previous match had zero length.  Move start position up one position
    618             //  to avoid sending find() into a loop on zero-length matches.
    619             if (startPos >= fActiveLimit) {
    620                 fMatch = FALSE;
    621                 fHitEnd = TRUE;
    622                 return FALSE;
    623             }
    624             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    625             (void)UTEXT_NEXT32(fInputText);
    626             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    627         }
    628     } else {
    629         if (fLastMatchEnd >= 0) {
    630             // A previous find() failed to match.  Don't try again.
    631             //   (without this test, a pattern with a zero-length match
    632             //    could match again at the end of an input string.)
    633             fHitEnd = TRUE;
    634             return FALSE;
    635         }
    636     }
    637 
    638 
    639     // Compute the position in the input string beyond which a match can not begin, because
    640     //   the minimum length match would extend past the end of the input.
    641     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    642     //          Be aware of possible overflows if making changes here.
    643     int64_t testStartLimit;
    644     if (UTEXT_USES_U16(fInputText)) {
    645         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
    646         if (startPos > testStartLimit) {
    647             fMatch = FALSE;
    648             fHitEnd = TRUE;
    649             return FALSE;
    650         }
    651     } else {
    652         // For now, let the matcher discover that it can't match on its own
    653         // We don't know how long the match len is in native characters
    654         testStartLimit = fActiveLimit;
    655     }
    656 
    657     UChar32  c;
    658     U_ASSERT(startPos >= 0);
    659 
    660     switch (fPattern->fStartType) {
    661     case START_NO_INFO:
    662         // No optimization was found.
    663         //  Try a match at each input position.
    664         for (;;) {
    665             MatchAt(startPos, FALSE, fDeferredStatus);
    666             if (U_FAILURE(fDeferredStatus)) {
    667                 return FALSE;
    668             }
    669             if (fMatch) {
    670                 return TRUE;
    671             }
    672             if (startPos >= testStartLimit) {
    673                 fHitEnd = TRUE;
    674                 return FALSE;
    675             }
    676             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    677             (void)UTEXT_NEXT32(fInputText);
    678             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    679             // Note that it's perfectly OK for a pattern to have a zero-length
    680             //   match at the end of a string, so we must make sure that the loop
    681             //   runs with startPos == testStartLimit the last time through.
    682             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    683                 return FALSE;
    684         }
    685         U_ASSERT(FALSE);
    686 
    687     case START_START:
    688         // Matches are only possible at the start of the input string
    689         //   (pattern begins with ^ or \A)
    690         if (startPos > fActiveStart) {
    691             fMatch = FALSE;
    692             return FALSE;
    693         }
    694         MatchAt(startPos, FALSE, fDeferredStatus);
    695         if (U_FAILURE(fDeferredStatus)) {
    696             return FALSE;
    697         }
    698         return fMatch;
    699 
    700 
    701     case START_SET:
    702         {
    703             // Match may start on any char from a pre-computed set.
    704             U_ASSERT(fPattern->fMinMatchLen > 0);
    705             int64_t pos;
    706             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    707             for (;;) {
    708                 c = UTEXT_NEXT32(fInputText);
    709                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    710                 // c will be -1 (U_SENTINEL) at end of text, in which case we
    711                 // skip this next block (so we don't have a negative array index)
    712                 // and handle end of text in the following block.
    713                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    714                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
    715                     MatchAt(startPos, FALSE, fDeferredStatus);
    716                     if (U_FAILURE(fDeferredStatus)) {
    717                         return FALSE;
    718                     }
    719                     if (fMatch) {
    720                         return TRUE;
    721                     }
    722                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    723                 }
    724                 if (startPos >= testStartLimit) {
    725                     fMatch = FALSE;
    726                     fHitEnd = TRUE;
    727                     return FALSE;
    728                 }
    729                 startPos = pos;
    730 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    731                     return FALSE;
    732             }
    733         }
    734         U_ASSERT(FALSE);
    735 
    736     case START_STRING:
    737     case START_CHAR:
    738         {
    739             // Match starts on exactly one char.
    740             U_ASSERT(fPattern->fMinMatchLen > 0);
    741             UChar32 theChar = fPattern->fInitialChar;
    742             int64_t pos;
    743             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    744             for (;;) {
    745                 c = UTEXT_NEXT32(fInputText);
    746                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    747                 if (c == theChar) {
    748                     MatchAt(startPos, FALSE, fDeferredStatus);
    749                     if (U_FAILURE(fDeferredStatus)) {
    750                         return FALSE;
    751                     }
    752                     if (fMatch) {
    753                         return TRUE;
    754                     }
    755                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    756                 }
    757                 if (startPos >= testStartLimit) {
    758                     fMatch = FALSE;
    759                     fHitEnd = TRUE;
    760                     return FALSE;
    761                 }
    762                 startPos = pos;
    763 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    764                     return FALSE;
    765            }
    766         }
    767         U_ASSERT(FALSE);
    768 
    769     case START_LINE:
    770         {
    771             UChar32  c;
    772             if (startPos == fAnchorStart) {
    773                 MatchAt(startPos, FALSE, fDeferredStatus);
    774                 if (U_FAILURE(fDeferredStatus)) {
    775                     return FALSE;
    776                 }
    777                 if (fMatch) {
    778                     return TRUE;
    779                 }
    780                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    781                 c = UTEXT_NEXT32(fInputText);
    782                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    783             } else {
    784                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    785                 c = UTEXT_PREVIOUS32(fInputText);
    786                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    787             }
    788 
    789             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
    790                 for (;;) {
    791                     if (c == 0x0a) {
    792                             MatchAt(startPos, FALSE, fDeferredStatus);
    793                             if (U_FAILURE(fDeferredStatus)) {
    794                                 return FALSE;
    795                             }
    796                             if (fMatch) {
    797                                 return TRUE;
    798                             }
    799                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    800                     }
    801                     if (startPos >= testStartLimit) {
    802                         fMatch = FALSE;
    803                         fHitEnd = TRUE;
    804                         return FALSE;
    805                     }
    806                     c = UTEXT_NEXT32(fInputText);
    807                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    808                     // Note that it's perfectly OK for a pattern to have a zero-length
    809                     //   match at the end of a string, so we must make sure that the loop
    810                     //   runs with startPos == testStartLimit the last time through.
    811 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    812                         return FALSE;
    813                 }
    814             } else {
    815                 for (;;) {
    816                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
    817                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
    818                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
    819                                 (void)UTEXT_NEXT32(fInputText);
    820                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    821                             }
    822                             MatchAt(startPos, FALSE, fDeferredStatus);
    823                             if (U_FAILURE(fDeferredStatus)) {
    824                                 return FALSE;
    825                             }
    826                             if (fMatch) {
    827                                 return TRUE;
    828                             }
    829                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    830                     }
    831                     if (startPos >= testStartLimit) {
    832                         fMatch = FALSE;
    833                         fHitEnd = TRUE;
    834                         return FALSE;
    835                     }
    836                     c = UTEXT_NEXT32(fInputText);
    837                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    838                     // Note that it's perfectly OK for a pattern to have a zero-length
    839                     //   match at the end of a string, so we must make sure that the loop
    840                     //   runs with startPos == testStartLimit the last time through.
    841 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    842                         return FALSE;
    843                 }
    844             }
    845         }
    846 
    847     default:
    848         U_ASSERT(FALSE);
    849     }
    850 
    851     U_ASSERT(FALSE);
    852     return FALSE;
    853 }
    854 
    855 
    856 
    857 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
    858     if (U_FAILURE(status)) {
    859         return FALSE;
    860     }
    861     if (U_FAILURE(fDeferredStatus)) {
    862         status = fDeferredStatus;
    863         return FALSE;
    864     }
    865     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
    866                                           //        This will reset the region to be the full input length.
    867     if (start < 0) {
    868         status = U_INDEX_OUTOFBOUNDS_ERROR;
    869         return FALSE;
    870     }
    871 
    872     int64_t nativeStart = start;
    873     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
    874         status = U_INDEX_OUTOFBOUNDS_ERROR;
    875         return FALSE;
    876     }
    877     fMatchEnd = nativeStart;
    878     return find();
    879 }
    880 
    881 
    882 //--------------------------------------------------------------------------------
    883 //
    884 //   findUsingChunk() -- like find(), but with the advance knowledge that the
    885 //                       entire string is available in the UText's chunk buffer.
    886 //
    887 //--------------------------------------------------------------------------------
    888 UBool RegexMatcher::findUsingChunk() {
    889     // Start at the position of the last match end.  (Will be zero if the
    890     //   matcher has been reset.
    891     //
    892 
    893     int32_t startPos = (int32_t)fMatchEnd;
    894     if (startPos==0) {
    895         startPos = (int32_t)fActiveStart;
    896     }
    897 
    898     const UChar *inputBuf = fInputText->chunkContents;
    899 
    900     if (fMatch) {
    901         // Save the position of any previous successful match.
    902         fLastMatchEnd = fMatchEnd;
    903 
    904         if (fMatchStart == fMatchEnd) {
    905             // Previous match had zero length.  Move start position up one position
    906             //  to avoid sending find() into a loop on zero-length matches.
    907             if (startPos >= fActiveLimit) {
    908                 fMatch = FALSE;
    909                 fHitEnd = TRUE;
    910                 return FALSE;
    911             }
    912             U16_FWD_1(inputBuf, startPos, fInputLength);
    913         }
    914     } else {
    915         if (fLastMatchEnd >= 0) {
    916             // A previous find() failed to match.  Don't try again.
    917             //   (without this test, a pattern with a zero-length match
    918             //    could match again at the end of an input string.)
    919             fHitEnd = TRUE;
    920             return FALSE;
    921         }
    922     }
    923 
    924 
    925     // Compute the position in the input string beyond which a match can not begin, because
    926     //   the minimum length match would extend past the end of the input.
    927     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    928     //          Be aware of possible overflows if making changes here.
    929     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
    930     if (startPos > testLen) {
    931         fMatch = FALSE;
    932         fHitEnd = TRUE;
    933         return FALSE;
    934     }
    935 
    936     UChar32  c;
    937     U_ASSERT(startPos >= 0);
    938 
    939     switch (fPattern->fStartType) {
    940     case START_NO_INFO:
    941         // No optimization was found.
    942         //  Try a match at each input position.
    943         for (;;) {
    944             MatchChunkAt(startPos, FALSE, fDeferredStatus);
    945             if (U_FAILURE(fDeferredStatus)) {
    946                 return FALSE;
    947             }
    948             if (fMatch) {
    949                 return TRUE;
    950             }
    951             if (startPos >= testLen) {
    952                 fHitEnd = TRUE;
    953                 return FALSE;
    954             }
    955             U16_FWD_1(inputBuf, startPos, fActiveLimit);
    956             // Note that it's perfectly OK for a pattern to have a zero-length
    957             //   match at the end of a string, so we must make sure that the loop
    958             //   runs with startPos == testLen the last time through.
    959             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    960                 return FALSE;
    961         }
    962         U_ASSERT(FALSE);
    963 
    964     case START_START:
    965         // Matches are only possible at the start of the input string
    966         //   (pattern begins with ^ or \A)
    967         if (startPos > fActiveStart) {
    968             fMatch = FALSE;
    969             return FALSE;
    970         }
    971         MatchChunkAt(startPos, FALSE, fDeferredStatus);
    972         if (U_FAILURE(fDeferredStatus)) {
    973             return FALSE;
    974         }
    975         return fMatch;
    976 
    977 
    978     case START_SET:
    979     {
    980         // Match may start on any char from a pre-computed set.
    981         U_ASSERT(fPattern->fMinMatchLen > 0);
    982         for (;;) {
    983             int32_t pos = startPos;
    984             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    985             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    986                 (c>=256 && fPattern->fInitialChars->contains(c))) {
    987                 MatchChunkAt(pos, FALSE, fDeferredStatus);
    988                 if (U_FAILURE(fDeferredStatus)) {
    989                     return FALSE;
    990                 }
    991                 if (fMatch) {
    992                     return TRUE;
    993                 }
    994             }
    995             if (pos >= testLen) {
    996                 fMatch = FALSE;
    997                 fHitEnd = TRUE;
    998                 return FALSE;
    999             }
   1000             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1001                 return FALSE;
   1002         }
   1003     }
   1004         U_ASSERT(FALSE);
   1005 
   1006     case START_STRING:
   1007     case START_CHAR:
   1008     {
   1009         // Match starts on exactly one char.
   1010         U_ASSERT(fPattern->fMinMatchLen > 0);
   1011         UChar32 theChar = fPattern->fInitialChar;
   1012         for (;;) {
   1013             int32_t pos = startPos;
   1014             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
   1015             if (c == theChar) {
   1016                 MatchChunkAt(pos, FALSE, fDeferredStatus);
   1017                 if (U_FAILURE(fDeferredStatus)) {
   1018                     return FALSE;
   1019                 }
   1020                 if (fMatch) {
   1021                     return TRUE;
   1022                 }
   1023             }
   1024             if (pos >= testLen) {
   1025                 fMatch = FALSE;
   1026                 fHitEnd = TRUE;
   1027                 return FALSE;
   1028             }
   1029             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1030                 return FALSE;
   1031         }
   1032     }
   1033         U_ASSERT(FALSE);
   1034 
   1035     case START_LINE:
   1036     {
   1037         UChar32  c;
   1038         if (startPos == fAnchorStart) {
   1039             MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1040             if (U_FAILURE(fDeferredStatus)) {
   1041                 return FALSE;
   1042             }
   1043             if (fMatch) {
   1044                 return TRUE;
   1045             }
   1046             U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1047         }
   1048 
   1049         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
   1050             for (;;) {
   1051                 c = inputBuf[startPos-1];
   1052                 if (c == 0x0a) {
   1053                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1054                     if (U_FAILURE(fDeferredStatus)) {
   1055                         return FALSE;
   1056                     }
   1057                     if (fMatch) {
   1058                         return TRUE;
   1059                     }
   1060                 }
   1061                 if (startPos >= testLen) {
   1062                     fMatch = FALSE;
   1063                     fHitEnd = TRUE;
   1064                     return FALSE;
   1065                 }
   1066                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1067                 // Note that it's perfectly OK for a pattern to have a zero-length
   1068                 //   match at the end of a string, so we must make sure that the loop
   1069                 //   runs with startPos == testLen the last time through.
   1070 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1071                     return FALSE;
   1072             }
   1073         } else {
   1074             for (;;) {
   1075                 c = inputBuf[startPos-1];
   1076                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   1077                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
   1078                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
   1079                         startPos++;
   1080                     }
   1081                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1082                     if (U_FAILURE(fDeferredStatus)) {
   1083                         return FALSE;
   1084                     }
   1085                     if (fMatch) {
   1086                         return TRUE;
   1087                     }
   1088                 }
   1089                 if (startPos >= testLen) {
   1090                     fMatch = FALSE;
   1091                     fHitEnd = TRUE;
   1092                     return FALSE;
   1093                 }
   1094                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1095                 // Note that it's perfectly OK for a pattern to have a zero-length
   1096                 //   match at the end of a string, so we must make sure that the loop
   1097                 //   runs with startPos == testLen the last time through.
   1098 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1099                     return FALSE;
   1100             }
   1101         }
   1102     }
   1103 
   1104     default:
   1105         U_ASSERT(FALSE);
   1106     }
   1107 
   1108     U_ASSERT(FALSE);
   1109     return FALSE;
   1110 }
   1111 
   1112 
   1113 
   1114 //--------------------------------------------------------------------------------
   1115 //
   1116 //  group()
   1117 //
   1118 //--------------------------------------------------------------------------------
   1119 UnicodeString RegexMatcher::group(UErrorCode &status) const {
   1120     return group(0, status);
   1121 }
   1122 
   1123 //  Return immutable shallow clone
   1124 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
   1125     return group(0, dest, group_len, status);
   1126 }
   1127 
   1128 //  Return immutable shallow clone
   1129 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
   1130     group_len = 0;
   1131     UBool bailOut = FALSE;
   1132     if (U_FAILURE(status)) {
   1133         return dest;
   1134     }
   1135     if (U_FAILURE(fDeferredStatus)) {
   1136         status = fDeferredStatus;
   1137         bailOut = TRUE;
   1138     }
   1139     if (fMatch == FALSE) {
   1140         status = U_REGEX_INVALID_STATE;
   1141         bailOut = TRUE;
   1142     }
   1143     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1144         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1145         bailOut = TRUE;
   1146     }
   1147 
   1148     if (bailOut) {
   1149         return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
   1150     }
   1151 
   1152     int64_t s, e;
   1153     if (groupNum == 0) {
   1154         s = fMatchStart;
   1155         e = fMatchEnd;
   1156     } else {
   1157         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1158         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1159         U_ASSERT(groupOffset >= 0);
   1160         s = fFrame->fExtra[groupOffset];
   1161         e = fFrame->fExtra[groupOffset+1];
   1162     }
   1163 
   1164     if (s < 0) {
   1165         // A capture group wasn't part of the match
   1166         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1167     }
   1168     U_ASSERT(s <= e);
   1169     group_len = e - s;
   1170 
   1171     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1172     if (dest)
   1173         UTEXT_SETNATIVEINDEX(dest, s);
   1174     return dest;
   1175 }
   1176 
   1177 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
   1178     UnicodeString result;
   1179     if (U_FAILURE(status)) {
   1180         return result;
   1181     }
   1182     UText resultText = UTEXT_INITIALIZER;
   1183     utext_openUnicodeString(&resultText, &result, &status);
   1184     group(groupNum, &resultText, status);
   1185     utext_close(&resultText);
   1186     return result;
   1187 }
   1188 
   1189 
   1190 //  Return deep (mutable) clone
   1191 //		Technology Preview (as an API), but note that the UnicodeString API is implemented
   1192 //		using this function.
   1193 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1194     UBool bailOut = FALSE;
   1195     if (U_FAILURE(status)) {
   1196         return dest;
   1197     }
   1198     if (U_FAILURE(fDeferredStatus)) {
   1199         status = fDeferredStatus;
   1200         bailOut = TRUE;
   1201     }
   1202 
   1203     if (fMatch == FALSE) {
   1204         status = U_REGEX_INVALID_STATE;
   1205         bailOut = TRUE;
   1206     }
   1207     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1208         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1209         bailOut = TRUE;
   1210     }
   1211 
   1212     if (bailOut) {
   1213         if (dest) {
   1214             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1215             return dest;
   1216         } else {
   1217             return utext_openUChars(NULL, NULL, 0, &status);
   1218         }
   1219     }
   1220 
   1221     int64_t s, e;
   1222     if (groupNum == 0) {
   1223         s = fMatchStart;
   1224         e = fMatchEnd;
   1225     } else {
   1226         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1227         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1228         U_ASSERT(groupOffset >= 0);
   1229         s = fFrame->fExtra[groupOffset];
   1230         e = fFrame->fExtra[groupOffset+1];
   1231     }
   1232 
   1233     if (s < 0) {
   1234         // A capture group wasn't part of the match
   1235         if (dest) {
   1236             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1237             return dest;
   1238         } else {
   1239             return utext_openUChars(NULL, NULL, 0, &status);
   1240         }
   1241     }
   1242     U_ASSERT(s <= e);
   1243 
   1244     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1245         U_ASSERT(e <= fInputLength);
   1246         if (dest) {
   1247             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1248         } else {
   1249             UText groupText = UTEXT_INITIALIZER;
   1250             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
   1251             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1252             utext_close(&groupText);
   1253         }
   1254     } else {
   1255         int32_t len16;
   1256         if (UTEXT_USES_U16(fInputText)) {
   1257             len16 = (int32_t)(e-s);
   1258         } else {
   1259             UErrorCode lengthStatus = U_ZERO_ERROR;
   1260             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1261         }
   1262         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1263         if (groupChars == NULL) {
   1264             status = U_MEMORY_ALLOCATION_ERROR;
   1265             return dest;
   1266         }
   1267         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1268 
   1269         if (dest) {
   1270             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
   1271         } else {
   1272             UText groupText = UTEXT_INITIALIZER;
   1273             utext_openUChars(&groupText, groupChars, len16, &status);
   1274             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1275             utext_close(&groupText);
   1276         }
   1277 
   1278         uprv_free(groupChars);
   1279     }
   1280     return dest;
   1281 }
   1282 
   1283 //--------------------------------------------------------------------------------
   1284 //
   1285 //  appendGroup() -- currently internal only, appends a group to a UText rather
   1286 //                   than replacing its contents
   1287 //
   1288 //--------------------------------------------------------------------------------
   1289 
   1290 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1291     if (U_FAILURE(status)) {
   1292         return 0;
   1293     }
   1294     if (U_FAILURE(fDeferredStatus)) {
   1295         status = fDeferredStatus;
   1296         return 0;
   1297     }
   1298     int64_t destLen = utext_nativeLength(dest);
   1299 
   1300     if (fMatch == FALSE) {
   1301         status = U_REGEX_INVALID_STATE;
   1302         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1303     }
   1304     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1305         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1306         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1307     }
   1308 
   1309     int64_t s, e;
   1310     if (groupNum == 0) {
   1311         s = fMatchStart;
   1312         e = fMatchEnd;
   1313     } else {
   1314         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1315         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1316         U_ASSERT(groupOffset >= 0);
   1317         s = fFrame->fExtra[groupOffset];
   1318         e = fFrame->fExtra[groupOffset+1];
   1319     }
   1320 
   1321     if (s < 0) {
   1322         // A capture group wasn't part of the match
   1323         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1324     }
   1325     U_ASSERT(s <= e);
   1326 
   1327     int64_t deltaLen;
   1328     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1329         U_ASSERT(e <= fInputLength);
   1330         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1331     } else {
   1332         int32_t len16;
   1333         if (UTEXT_USES_U16(fInputText)) {
   1334             len16 = (int32_t)(e-s);
   1335         } else {
   1336             UErrorCode lengthStatus = U_ZERO_ERROR;
   1337             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1338         }
   1339         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1340         if (groupChars == NULL) {
   1341             status = U_MEMORY_ALLOCATION_ERROR;
   1342             return 0;
   1343         }
   1344         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1345 
   1346         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
   1347         uprv_free(groupChars);
   1348     }
   1349     return deltaLen;
   1350 }
   1351 
   1352 
   1353 
   1354 //--------------------------------------------------------------------------------
   1355 //
   1356 //  groupCount()
   1357 //
   1358 //--------------------------------------------------------------------------------
   1359 int32_t RegexMatcher::groupCount() const {
   1360     return fPattern->fGroupMap->size();
   1361 }
   1362 
   1363 
   1364 
   1365 //--------------------------------------------------------------------------------
   1366 //
   1367 //  hasAnchoringBounds()
   1368 //
   1369 //--------------------------------------------------------------------------------
   1370 UBool RegexMatcher::hasAnchoringBounds() const {
   1371     return fAnchoringBounds;
   1372 }
   1373 
   1374 
   1375 //--------------------------------------------------------------------------------
   1376 //
   1377 //  hasTransparentBounds()
   1378 //
   1379 //--------------------------------------------------------------------------------
   1380 UBool RegexMatcher::hasTransparentBounds() const {
   1381     return fTransparentBounds;
   1382 }
   1383 
   1384 
   1385 
   1386 //--------------------------------------------------------------------------------
   1387 //
   1388 //  hitEnd()
   1389 //
   1390 //--------------------------------------------------------------------------------
   1391 UBool RegexMatcher::hitEnd() const {
   1392     return fHitEnd;
   1393 }
   1394 
   1395 
   1396 //--------------------------------------------------------------------------------
   1397 //
   1398 //  input()
   1399 //
   1400 //--------------------------------------------------------------------------------
   1401 const UnicodeString &RegexMatcher::input() const {
   1402     if (!fInput) {
   1403         UErrorCode status = U_ZERO_ERROR;
   1404         int32_t len16;
   1405         if (UTEXT_USES_U16(fInputText)) {
   1406             len16 = (int32_t)fInputLength;
   1407         } else {
   1408             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
   1409             status = U_ZERO_ERROR; // overflow, length status
   1410         }
   1411         UnicodeString *result = new UnicodeString(len16, 0, 0);
   1412 
   1413         UChar *inputChars = result->getBuffer(len16);
   1414         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
   1415         result->releaseBuffer(len16);
   1416 
   1417         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
   1418     }
   1419 
   1420     return *fInput;
   1421 }
   1422 
   1423 //--------------------------------------------------------------------------------
   1424 //
   1425 //  inputText()
   1426 //
   1427 //--------------------------------------------------------------------------------
   1428 UText *RegexMatcher::inputText() const {
   1429     return fInputText;
   1430 }
   1431 
   1432 
   1433 //--------------------------------------------------------------------------------
   1434 //
   1435 //  getInput() -- like inputText(), but makes a clone or copies into another UText
   1436 //
   1437 //--------------------------------------------------------------------------------
   1438 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
   1439     UBool bailOut = FALSE;
   1440     if (U_FAILURE(status)) {
   1441         return dest;
   1442     }
   1443     if (U_FAILURE(fDeferredStatus)) {
   1444         status = fDeferredStatus;
   1445         bailOut = TRUE;
   1446     }
   1447 
   1448     if (bailOut) {
   1449         if (dest) {
   1450             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1451             return dest;
   1452         } else {
   1453             return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1454         }
   1455     }
   1456 
   1457     if (dest) {
   1458         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1459             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
   1460         } else {
   1461             int32_t input16Len;
   1462             if (UTEXT_USES_U16(fInputText)) {
   1463                 input16Len = (int32_t)fInputLength;
   1464             } else {
   1465                 UErrorCode lengthStatus = U_ZERO_ERROR;
   1466                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
   1467             }
   1468             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
   1469             if (inputChars == NULL) {
   1470                 return dest;
   1471             }
   1472 
   1473             status = U_ZERO_ERROR;
   1474             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
   1475             status = U_ZERO_ERROR;
   1476             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
   1477 
   1478             uprv_free(inputChars);
   1479         }
   1480         return dest;
   1481     } else {
   1482         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1483     }
   1484 }
   1485 
   1486 
   1487 static UBool compat_SyncMutableUTextContents(UText *ut);
   1488 static UBool compat_SyncMutableUTextContents(UText *ut) {
   1489     UBool retVal = FALSE;
   1490 
   1491     //  In the following test, we're really only interested in whether the UText should switch
   1492     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
   1493     //  will still point to the correct data.
   1494     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
   1495         UnicodeString *us=(UnicodeString *)ut->context;
   1496 
   1497         // Update to the latest length.
   1498         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
   1499         int32_t newLength = us->length();
   1500 
   1501         // Update the chunk description.
   1502         // The buffer may have switched between stack- and heap-based.
   1503         ut->chunkContents    = us->getBuffer();
   1504         ut->chunkLength      = newLength;
   1505         ut->chunkNativeLimit = newLength;
   1506         ut->nativeIndexingLimit = newLength;
   1507         retVal = TRUE;
   1508     }
   1509 
   1510     return retVal;
   1511 }
   1512 
   1513 //--------------------------------------------------------------------------------
   1514 //
   1515 //  lookingAt()
   1516 //
   1517 //--------------------------------------------------------------------------------
   1518 UBool RegexMatcher::lookingAt(UErrorCode &status) {
   1519     if (U_FAILURE(status)) {
   1520         return FALSE;
   1521     }
   1522     if (U_FAILURE(fDeferredStatus)) {
   1523         status = fDeferredStatus;
   1524         return FALSE;
   1525     }
   1526 
   1527     if (fInputUniStrMaybeMutable) {
   1528         if (compat_SyncMutableUTextContents(fInputText)) {
   1529         fInputLength = utext_nativeLength(fInputText);
   1530         reset();
   1531         }
   1532     }
   1533     else {
   1534         resetPreserveRegion();
   1535     }
   1536     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1537         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
   1538     } else {
   1539         MatchAt(fActiveStart, FALSE, status);
   1540     }
   1541     return fMatch;
   1542 }
   1543 
   1544 
   1545 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
   1546     if (U_FAILURE(status)) {
   1547         return FALSE;
   1548     }
   1549     if (U_FAILURE(fDeferredStatus)) {
   1550         status = fDeferredStatus;
   1551         return FALSE;
   1552     }
   1553     reset();
   1554 
   1555     if (start < 0) {
   1556         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1557         return FALSE;
   1558     }
   1559 
   1560     if (fInputUniStrMaybeMutable) {
   1561         if (compat_SyncMutableUTextContents(fInputText)) {
   1562         fInputLength = utext_nativeLength(fInputText);
   1563         reset();
   1564         }
   1565     }
   1566 
   1567     int64_t nativeStart;
   1568     nativeStart = start;
   1569     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1570         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1571         return FALSE;
   1572     }
   1573 
   1574     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1575         MatchChunkAt((int32_t)nativeStart, FALSE, status);
   1576     } else {
   1577         MatchAt(nativeStart, FALSE, status);
   1578     }
   1579     return fMatch;
   1580 }
   1581 
   1582 
   1583 
   1584 //--------------------------------------------------------------------------------
   1585 //
   1586 //  matches()
   1587 //
   1588 //--------------------------------------------------------------------------------
   1589 UBool RegexMatcher::matches(UErrorCode &status) {
   1590     if (U_FAILURE(status)) {
   1591         return FALSE;
   1592     }
   1593     if (U_FAILURE(fDeferredStatus)) {
   1594         status = fDeferredStatus;
   1595         return FALSE;
   1596     }
   1597 
   1598     if (fInputUniStrMaybeMutable) {
   1599         if (compat_SyncMutableUTextContents(fInputText)) {
   1600         fInputLength = utext_nativeLength(fInputText);
   1601         reset();
   1602         }
   1603     }
   1604     else {
   1605         resetPreserveRegion();
   1606     }
   1607 
   1608     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1609         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
   1610     } else {
   1611         MatchAt(fActiveStart, TRUE, status);
   1612     }
   1613     return fMatch;
   1614 }
   1615 
   1616 
   1617 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
   1618     if (U_FAILURE(status)) {
   1619         return FALSE;
   1620     }
   1621     if (U_FAILURE(fDeferredStatus)) {
   1622         status = fDeferredStatus;
   1623         return FALSE;
   1624     }
   1625     reset();
   1626 
   1627     if (start < 0) {
   1628         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1629         return FALSE;
   1630     }
   1631 
   1632     if (fInputUniStrMaybeMutable) {
   1633         if (compat_SyncMutableUTextContents(fInputText)) {
   1634         fInputLength = utext_nativeLength(fInputText);
   1635         reset();
   1636         }
   1637     }
   1638 
   1639     int64_t nativeStart;
   1640     nativeStart = start;
   1641     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1642         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1643         return FALSE;
   1644     }
   1645 
   1646     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1647         MatchChunkAt((int32_t)nativeStart, TRUE, status);
   1648     } else {
   1649         MatchAt(nativeStart, TRUE, status);
   1650     }
   1651     return fMatch;
   1652 }
   1653 
   1654 
   1655 
   1656 //--------------------------------------------------------------------------------
   1657 //
   1658 //    pattern
   1659 //
   1660 //--------------------------------------------------------------------------------
   1661 const RegexPattern &RegexMatcher::pattern() const {
   1662     return *fPattern;
   1663 }
   1664 
   1665 
   1666 
   1667 //--------------------------------------------------------------------------------
   1668 //
   1669 //    region
   1670 //
   1671 //--------------------------------------------------------------------------------
   1672 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
   1673     if (U_FAILURE(status)) {
   1674         return *this;
   1675     }
   1676 
   1677     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
   1678         status = U_ILLEGAL_ARGUMENT_ERROR;
   1679     }
   1680 
   1681     int64_t nativeStart = regionStart;
   1682     int64_t nativeLimit = regionLimit;
   1683     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
   1684       status = U_ILLEGAL_ARGUMENT_ERROR;
   1685     }
   1686 
   1687     if (startIndex == -1)
   1688       this->reset();
   1689     else
   1690       resetPreserveRegion();
   1691 
   1692     fRegionStart = nativeStart;
   1693     fRegionLimit = nativeLimit;
   1694     fActiveStart = nativeStart;
   1695     fActiveLimit = nativeLimit;
   1696 
   1697     if (startIndex != -1) {
   1698       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
   1699           status = U_INDEX_OUTOFBOUNDS_ERROR;
   1700       }
   1701       fMatchEnd = startIndex;
   1702     }
   1703 
   1704     if (!fTransparentBounds) {
   1705         fLookStart = nativeStart;
   1706         fLookLimit = nativeLimit;
   1707     }
   1708     if (fAnchoringBounds) {
   1709         fAnchorStart = nativeStart;
   1710         fAnchorLimit = nativeLimit;
   1711     }
   1712     return *this;
   1713 }
   1714 
   1715 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
   1716   return region(start, limit, -1, status);
   1717 }
   1718 
   1719 //--------------------------------------------------------------------------------
   1720 //
   1721 //    regionEnd
   1722 //
   1723 //--------------------------------------------------------------------------------
   1724 int32_t RegexMatcher::regionEnd() const {
   1725     return (int32_t)fRegionLimit;
   1726 }
   1727 
   1728 int64_t RegexMatcher::regionEnd64() const {
   1729     return fRegionLimit;
   1730 }
   1731 
   1732 //--------------------------------------------------------------------------------
   1733 //
   1734 //    regionStart
   1735 //
   1736 //--------------------------------------------------------------------------------
   1737 int32_t RegexMatcher::regionStart() const {
   1738     return (int32_t)fRegionStart;
   1739 }
   1740 
   1741 int64_t RegexMatcher::regionStart64() const {
   1742     return fRegionStart;
   1743 }
   1744 
   1745 
   1746 //--------------------------------------------------------------------------------
   1747 //
   1748 //    replaceAll
   1749 //
   1750 //--------------------------------------------------------------------------------
   1751 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
   1752     UText replacementText = UTEXT_INITIALIZER;
   1753     UText resultText = UTEXT_INITIALIZER;
   1754     UnicodeString resultString;
   1755     if (U_FAILURE(status)) {
   1756         return resultString;
   1757     }
   1758 
   1759     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1760     utext_openUnicodeString(&resultText, &resultString, &status);
   1761 
   1762     replaceAll(&replacementText, &resultText, status);
   1763 
   1764     utext_close(&resultText);
   1765     utext_close(&replacementText);
   1766 
   1767     return resultString;
   1768 }
   1769 
   1770 
   1771 //
   1772 //    replaceAll, UText mode
   1773 //
   1774 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
   1775     if (U_FAILURE(status)) {
   1776         return dest;
   1777     }
   1778     if (U_FAILURE(fDeferredStatus)) {
   1779         status = fDeferredStatus;
   1780         return dest;
   1781     }
   1782 
   1783     if (dest == NULL) {
   1784         UnicodeString emptyString;
   1785         UText empty = UTEXT_INITIALIZER;
   1786 
   1787         utext_openUnicodeString(&empty, &emptyString, &status);
   1788         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1789         utext_close(&empty);
   1790     }
   1791 
   1792     if (U_SUCCESS(status)) {
   1793         reset();
   1794         while (find()) {
   1795             appendReplacement(dest, replacement, status);
   1796             if (U_FAILURE(status)) {
   1797                 break;
   1798             }
   1799         }
   1800         appendTail(dest, status);
   1801     }
   1802 
   1803     return dest;
   1804 }
   1805 
   1806 
   1807 //--------------------------------------------------------------------------------
   1808 //
   1809 //    replaceFirst
   1810 //
   1811 //--------------------------------------------------------------------------------
   1812 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
   1813     UText replacementText = UTEXT_INITIALIZER;
   1814     UText resultText = UTEXT_INITIALIZER;
   1815     UnicodeString resultString;
   1816 
   1817     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1818     utext_openUnicodeString(&resultText, &resultString, &status);
   1819 
   1820     replaceFirst(&replacementText, &resultText, status);
   1821 
   1822     utext_close(&resultText);
   1823     utext_close(&replacementText);
   1824 
   1825     return resultString;
   1826 }
   1827 
   1828 //
   1829 //    replaceFirst, UText mode
   1830 //
   1831 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
   1832     if (U_FAILURE(status)) {
   1833         return dest;
   1834     }
   1835     if (U_FAILURE(fDeferredStatus)) {
   1836         status = fDeferredStatus;
   1837         return dest;
   1838     }
   1839 
   1840     reset();
   1841     if (!find()) {
   1842         return getInput(dest, status);
   1843     }
   1844 
   1845     if (dest == NULL) {
   1846         UnicodeString emptyString;
   1847         UText empty = UTEXT_INITIALIZER;
   1848 
   1849         utext_openUnicodeString(&empty, &emptyString, &status);
   1850         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1851         utext_close(&empty);
   1852     }
   1853 
   1854     appendReplacement(dest, replacement, status);
   1855     appendTail(dest, status);
   1856 
   1857     return dest;
   1858 }
   1859 
   1860 
   1861 //--------------------------------------------------------------------------------
   1862 //
   1863 //     requireEnd
   1864 //
   1865 //--------------------------------------------------------------------------------
   1866 UBool RegexMatcher::requireEnd() const {
   1867     return fRequireEnd;
   1868 }
   1869 
   1870 
   1871 //--------------------------------------------------------------------------------
   1872 //
   1873 //     reset
   1874 //
   1875 //--------------------------------------------------------------------------------
   1876 RegexMatcher &RegexMatcher::reset() {
   1877     fRegionStart    = 0;
   1878     fRegionLimit    = fInputLength;
   1879     fActiveStart    = 0;
   1880     fActiveLimit    = fInputLength;
   1881     fAnchorStart    = 0;
   1882     fAnchorLimit    = fInputLength;
   1883     fLookStart      = 0;
   1884     fLookLimit      = fInputLength;
   1885     resetPreserveRegion();
   1886     return *this;
   1887 }
   1888 
   1889 
   1890 
   1891 void RegexMatcher::resetPreserveRegion() {
   1892     fMatchStart     = 0;
   1893     fMatchEnd       = 0;
   1894     fLastMatchEnd   = -1;
   1895     fAppendPosition = 0;
   1896     fMatch          = FALSE;
   1897     fHitEnd         = FALSE;
   1898     fRequireEnd     = FALSE;
   1899     fTime           = 0;
   1900     fTickCounter    = TIMER_INITIAL_VALUE;
   1901     //resetStack(); // more expensive than it looks...
   1902 }
   1903 
   1904 
   1905 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
   1906     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
   1907     if (fPattern->fNeedsAltInput) {
   1908         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1909     }
   1910     fInputLength = utext_nativeLength(fInputText);
   1911 
   1912     reset();
   1913     delete fInput;
   1914     fInput = NULL;
   1915 
   1916     //  Do the following for any UnicodeString.
   1917     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
   1918     fInputUniStrMaybeMutable = TRUE;
   1919 
   1920     if (fWordBreakItr != NULL) {
   1921 #if UCONFIG_NO_BREAK_ITERATION==0
   1922         UErrorCode status = U_ZERO_ERROR;
   1923         fWordBreakItr->setText(fInputText, status);
   1924 #endif
   1925     }
   1926     return *this;
   1927 }
   1928 
   1929 
   1930 RegexMatcher &RegexMatcher::reset(UText *input) {
   1931     if (fInputText != input) {
   1932         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
   1933         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1934         fInputLength = utext_nativeLength(fInputText);
   1935 
   1936         delete fInput;
   1937         fInput = NULL;
   1938 
   1939         if (fWordBreakItr != NULL) {
   1940 #if UCONFIG_NO_BREAK_ITERATION==0
   1941             UErrorCode status = U_ZERO_ERROR;
   1942             fWordBreakItr->setText(input, status);
   1943 #endif
   1944         }
   1945     }
   1946     reset();
   1947     fInputUniStrMaybeMutable = FALSE;
   1948 
   1949     return *this;
   1950 }
   1951 
   1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
   1953     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
   1954     return *this;
   1955 }*/
   1956 
   1957 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
   1958     if (U_FAILURE(status)) {
   1959         return *this;
   1960     }
   1961     reset();       // Reset also resets the region to be the entire string.
   1962 
   1963     if (position < 0 || position > fActiveLimit) {
   1964         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1965         return *this;
   1966     }
   1967     fMatchEnd = position;
   1968     return *this;
   1969 }
   1970 
   1971 
   1972 //--------------------------------------------------------------------------------
   1973 //
   1974 //    refresh
   1975 //
   1976 //--------------------------------------------------------------------------------
   1977 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
   1978     if (U_FAILURE(status)) {
   1979         return *this;
   1980     }
   1981     if (input == NULL) {
   1982         status = U_ILLEGAL_ARGUMENT_ERROR;
   1983         return *this;
   1984     }
   1985     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
   1986         status = U_ILLEGAL_ARGUMENT_ERROR;
   1987         return *this;
   1988     }
   1989     int64_t  pos = utext_getNativeIndex(fInputText);
   1990     //  Shallow read-only clone of the new UText into the existing input UText
   1991     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
   1992     if (U_FAILURE(status)) {
   1993         return *this;
   1994     }
   1995     utext_setNativeIndex(fInputText, pos);
   1996 
   1997     if (fAltInputText != NULL) {
   1998         pos = utext_getNativeIndex(fAltInputText);
   1999         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
   2000         if (U_FAILURE(status)) {
   2001             return *this;
   2002         }
   2003         utext_setNativeIndex(fAltInputText, pos);
   2004     }
   2005     return *this;
   2006 }
   2007 
   2008 
   2009 
   2010 //--------------------------------------------------------------------------------
   2011 //
   2012 //    setTrace
   2013 //
   2014 //--------------------------------------------------------------------------------
   2015 void RegexMatcher::setTrace(UBool state) {
   2016     fTraceDebug = state;
   2017 }
   2018 
   2019 
   2020 
   2021 //---------------------------------------------------------------------
   2022 //
   2023 //   split
   2024 //
   2025 //---------------------------------------------------------------------
   2026 int32_t  RegexMatcher::split(const UnicodeString &input,
   2027         UnicodeString    dest[],
   2028         int32_t          destCapacity,
   2029         UErrorCode      &status)
   2030 {
   2031     UText inputText = UTEXT_INITIALIZER;
   2032     utext_openConstUnicodeString(&inputText, &input, &status);
   2033     if (U_FAILURE(status)) {
   2034         return 0;
   2035     }
   2036 
   2037     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
   2038     if (destText == NULL) {
   2039         status = U_MEMORY_ALLOCATION_ERROR;
   2040         return 0;
   2041     }
   2042     int32_t i;
   2043     for (i = 0; i < destCapacity; i++) {
   2044         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
   2045     }
   2046 
   2047     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
   2048 
   2049     for (i = 0; i < destCapacity; i++) {
   2050         utext_close(destText[i]);
   2051     }
   2052 
   2053     uprv_free(destText);
   2054     utext_close(&inputText);
   2055     return fieldCount;
   2056 }
   2057 
   2058 //
   2059 //   split, UText mode
   2060 //
   2061 int32_t  RegexMatcher::split(UText *input,
   2062         UText           *dest[],
   2063         int32_t          destCapacity,
   2064         UErrorCode      &status)
   2065 {
   2066     //
   2067     // Check arguements for validity
   2068     //
   2069     if (U_FAILURE(status)) {
   2070         return 0;
   2071     };
   2072 
   2073     if (destCapacity < 1) {
   2074         status = U_ILLEGAL_ARGUMENT_ERROR;
   2075         return 0;
   2076     }
   2077 
   2078     //
   2079     // Reset for the input text
   2080     //
   2081     reset(input);
   2082     int64_t   nextOutputStringStart = 0;
   2083     if (fActiveLimit == 0) {
   2084         return 0;
   2085     }
   2086 
   2087     //
   2088     // Loop through the input text, searching for the delimiter pattern
   2089     //
   2090     int32_t i;
   2091     int32_t numCaptureGroups = fPattern->fGroupMap->size();
   2092     for (i=0; ; i++) {
   2093         if (i>=destCapacity-1) {
   2094             // There is one or zero output string left.
   2095             // Fill the last output string with whatever is left from the input, then exit the loop.
   2096             //  ( i will be == destCapacity if we filled the output array while processing
   2097             //    capture groups of the delimiter expression, in which case we will discard the
   2098             //    last capture group saved in favor of the unprocessed remainder of the
   2099             //    input string.)
   2100             i = destCapacity-1;
   2101             if (fActiveLimit > nextOutputStringStart) {
   2102                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2103                     if (dest[i]) {
   2104                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2105                                       input->chunkContents+nextOutputStringStart,
   2106                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2107                     } else {
   2108                         UText remainingText = UTEXT_INITIALIZER;
   2109                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2110                                          fActiveLimit-nextOutputStringStart, &status);
   2111                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2112                         utext_close(&remainingText);
   2113                     }
   2114                 } else {
   2115                     UErrorCode lengthStatus = U_ZERO_ERROR;
   2116                     int32_t remaining16Length =
   2117                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2118                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2119                     if (remainingChars == NULL) {
   2120                         status = U_MEMORY_ALLOCATION_ERROR;
   2121                         break;
   2122                     }
   2123 
   2124                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2125                     if (dest[i]) {
   2126                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2127                     } else {
   2128                         UText remainingText = UTEXT_INITIALIZER;
   2129                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2130                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2131                         utext_close(&remainingText);
   2132                     }
   2133 
   2134                     uprv_free(remainingChars);
   2135                 }
   2136             }
   2137             break;
   2138         }
   2139         if (find()) {
   2140             // We found another delimiter.  Move everything from where we started looking
   2141             //  up until the start of the delimiter into the next output string.
   2142             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2143                 if (dest[i]) {
   2144                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2145                                   input->chunkContents+nextOutputStringStart,
   2146                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
   2147                 } else {
   2148                     UText remainingText = UTEXT_INITIALIZER;
   2149                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2150                                       fMatchStart-nextOutputStringStart, &status);
   2151                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2152                     utext_close(&remainingText);
   2153                 }
   2154             } else {
   2155                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2156                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
   2157                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2158                 if (remainingChars == NULL) {
   2159                     status = U_MEMORY_ALLOCATION_ERROR;
   2160                     break;
   2161                 }
   2162                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
   2163                 if (dest[i]) {
   2164                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2165                 } else {
   2166                     UText remainingText = UTEXT_INITIALIZER;
   2167                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2168                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2169                     utext_close(&remainingText);
   2170                 }
   2171 
   2172                 uprv_free(remainingChars);
   2173             }
   2174             nextOutputStringStart = fMatchEnd;
   2175 
   2176             // If the delimiter pattern has capturing parentheses, the captured
   2177             //  text goes out into the next n destination strings.
   2178             int32_t groupNum;
   2179             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   2180                 if (i >= destCapacity-2) {
   2181                     // Never fill the last available output string with capture group text.
   2182                     // It will filled with the last field, the remainder of the
   2183                     //  unsplit input text.
   2184                     break;
   2185                 }
   2186                 i++;
   2187                 dest[i] = group(groupNum, dest[i], status);
   2188             }
   2189 
   2190             if (nextOutputStringStart == fActiveLimit) {
   2191                 // The delimiter was at the end of the string.  We're done, but first
   2192                 // we output one last empty string, for the empty field following
   2193                 //   the delimiter at the end of input.
   2194                 if (i+1 < destCapacity) {
   2195                     ++i;
   2196                     if (dest[i] == NULL) {
   2197                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
   2198                     } else {
   2199                         static UChar emptyString[] = {(UChar)0};
   2200                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
   2201                     }
   2202                 }
   2203                 break;
   2204 
   2205             }
   2206         }
   2207         else
   2208         {
   2209             // We ran off the end of the input while looking for the next delimiter.
   2210             // All the remaining text goes into the current output string.
   2211             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2212                 if (dest[i]) {
   2213                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2214                                   input->chunkContents+nextOutputStringStart,
   2215                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2216                 } else {
   2217                     UText remainingText = UTEXT_INITIALIZER;
   2218                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2219                                      fActiveLimit-nextOutputStringStart, &status);
   2220                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2221                     utext_close(&remainingText);
   2222                 }
   2223             } else {
   2224                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2225                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2226                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2227                 if (remainingChars == NULL) {
   2228                     status = U_MEMORY_ALLOCATION_ERROR;
   2229                     break;
   2230                 }
   2231 
   2232                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2233                 if (dest[i]) {
   2234                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2235                 } else {
   2236                     UText remainingText = UTEXT_INITIALIZER;
   2237                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2238                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2239                     utext_close(&remainingText);
   2240                 }
   2241 
   2242                 uprv_free(remainingChars);
   2243             }
   2244             break;
   2245         }
   2246         if (U_FAILURE(status)) {
   2247             break;
   2248         }
   2249     }   // end of for loop
   2250     return i+1;
   2251 }
   2252 
   2253 
   2254 //--------------------------------------------------------------------------------
   2255 //
   2256 //     start
   2257 //
   2258 //--------------------------------------------------------------------------------
   2259 int32_t RegexMatcher::start(UErrorCode &status) const {
   2260     return start(0, status);
   2261 }
   2262 
   2263 int64_t RegexMatcher::start64(UErrorCode &status) const {
   2264     return start64(0, status);
   2265 }
   2266 
   2267 //--------------------------------------------------------------------------------
   2268 //
   2269 //     start(int32_t group, UErrorCode &status)
   2270 //
   2271 //--------------------------------------------------------------------------------
   2272 
   2273 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
   2274     if (U_FAILURE(status)) {
   2275         return -1;
   2276     }
   2277     if (U_FAILURE(fDeferredStatus)) {
   2278         status = fDeferredStatus;
   2279         return -1;
   2280     }
   2281     if (fMatch == FALSE) {
   2282         status = U_REGEX_INVALID_STATE;
   2283         return -1;
   2284     }
   2285     if (group < 0 || group > fPattern->fGroupMap->size()) {
   2286         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2287         return -1;
   2288     }
   2289     int64_t s;
   2290     if (group == 0) {
   2291         s = fMatchStart;
   2292     } else {
   2293         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   2294         U_ASSERT(groupOffset < fPattern->fFrameSize);
   2295         U_ASSERT(groupOffset >= 0);
   2296         s = fFrame->fExtra[groupOffset];
   2297     }
   2298 
   2299     return s;
   2300 }
   2301 
   2302 
   2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
   2304     return (int32_t)start64(group, status);
   2305 }
   2306 
   2307 //--------------------------------------------------------------------------------
   2308 //
   2309 //     useAnchoringBounds
   2310 //
   2311 //--------------------------------------------------------------------------------
   2312 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
   2313     fAnchoringBounds = b;
   2314     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
   2315     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
   2316     return *this;
   2317 }
   2318 
   2319 
   2320 //--------------------------------------------------------------------------------
   2321 //
   2322 //     useTransparentBounds
   2323 //
   2324 //--------------------------------------------------------------------------------
   2325 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
   2326     fTransparentBounds = b;
   2327     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
   2328     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
   2329     return *this;
   2330 }
   2331 
   2332 //--------------------------------------------------------------------------------
   2333 //
   2334 //     setTimeLimit
   2335 //
   2336 //--------------------------------------------------------------------------------
   2337 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
   2338     if (U_FAILURE(status)) {
   2339         return;
   2340     }
   2341     if (U_FAILURE(fDeferredStatus)) {
   2342         status = fDeferredStatus;
   2343         return;
   2344     }
   2345     if (limit < 0) {
   2346         status = U_ILLEGAL_ARGUMENT_ERROR;
   2347         return;
   2348     }
   2349     fTimeLimit = limit;
   2350 }
   2351 
   2352 
   2353 //--------------------------------------------------------------------------------
   2354 //
   2355 //     getTimeLimit
   2356 //
   2357 //--------------------------------------------------------------------------------
   2358 int32_t RegexMatcher::getTimeLimit() const {
   2359     return fTimeLimit;
   2360 }
   2361 
   2362 
   2363 //--------------------------------------------------------------------------------
   2364 //
   2365 //     setStackLimit
   2366 //
   2367 //--------------------------------------------------------------------------------
   2368 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
   2369     if (U_FAILURE(status)) {
   2370         return;
   2371     }
   2372     if (U_FAILURE(fDeferredStatus)) {
   2373         status = fDeferredStatus;
   2374         return;
   2375     }
   2376     if (limit < 0) {
   2377         status = U_ILLEGAL_ARGUMENT_ERROR;
   2378         return;
   2379     }
   2380 
   2381     // Reset the matcher.  This is needed here in case there is a current match
   2382     //    whose final stack frame (containing the match results, pointed to by fFrame)
   2383     //    would be lost by resizing to a smaller stack size.
   2384     reset();
   2385 
   2386     if (limit == 0) {
   2387         // Unlimited stack expansion
   2388         fStack->setMaxCapacity(0);
   2389     } else {
   2390         // Change the units of the limit  from bytes to ints, and bump the size up
   2391         //   to be big enough to hold at least one stack frame for the pattern,
   2392         //   if it isn't there already.
   2393         int32_t adjustedLimit = limit / sizeof(int32_t);
   2394         if (adjustedLimit < fPattern->fFrameSize) {
   2395             adjustedLimit = fPattern->fFrameSize;
   2396         }
   2397         fStack->setMaxCapacity(adjustedLimit);
   2398     }
   2399     fStackLimit = limit;
   2400 }
   2401 
   2402 
   2403 //--------------------------------------------------------------------------------
   2404 //
   2405 //     getStackLimit
   2406 //
   2407 //--------------------------------------------------------------------------------
   2408 int32_t RegexMatcher::getStackLimit() const {
   2409     return fStackLimit;
   2410 }
   2411 
   2412 
   2413 //--------------------------------------------------------------------------------
   2414 //
   2415 //     setMatchCallback
   2416 //
   2417 //--------------------------------------------------------------------------------
   2418 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
   2419                                     const void              *context,
   2420                                     UErrorCode              &status) {
   2421     if (U_FAILURE(status)) {
   2422         return;
   2423     }
   2424     fCallbackFn = callback;
   2425     fCallbackContext = context;
   2426 }
   2427 
   2428 
   2429 //--------------------------------------------------------------------------------
   2430 //
   2431 //     getMatchCallback
   2432 //
   2433 //--------------------------------------------------------------------------------
   2434 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
   2435                                   const void              *&context,
   2436                                   UErrorCode              &status) {
   2437     if (U_FAILURE(status)) {
   2438        return;
   2439     }
   2440     callback = fCallbackFn;
   2441     context  = fCallbackContext;
   2442 }
   2443 
   2444 
   2445 //--------------------------------------------------------------------------------
   2446 //
   2447 //     setMatchCallback
   2448 //
   2449 //--------------------------------------------------------------------------------
   2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
   2451                                                 const void                      *context,
   2452                                                 UErrorCode                      &status) {
   2453     if (U_FAILURE(status)) {
   2454         return;
   2455     }
   2456     fFindProgressCallbackFn = callback;
   2457     fFindProgressCallbackContext = context;
   2458 }
   2459 
   2460 
   2461 //--------------------------------------------------------------------------------
   2462 //
   2463 //     getMatchCallback
   2464 //
   2465 //--------------------------------------------------------------------------------
   2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
   2467                                                 const void                    *&context,
   2468                                                 UErrorCode                    &status) {
   2469     if (U_FAILURE(status)) {
   2470        return;
   2471     }
   2472     callback = fFindProgressCallbackFn;
   2473     context  = fFindProgressCallbackContext;
   2474 }
   2475 
   2476 
   2477 //================================================================================
   2478 //
   2479 //    Code following this point in this file is the internal
   2480 //    Match Engine Implementation.
   2481 //
   2482 //================================================================================
   2483 
   2484 
   2485 //--------------------------------------------------------------------------------
   2486 //
   2487 //   resetStack
   2488 //           Discard any previous contents of the state save stack, and initialize a
   2489 //           new stack frame to all -1.  The -1s are needed for capture group limits,
   2490 //           where they indicate that a group has not yet matched anything.
   2491 //--------------------------------------------------------------------------------
   2492 REStackFrame *RegexMatcher::resetStack() {
   2493     // Discard any previous contents of the state save stack, and initialize a
   2494     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
   2495     //  where they indicate that a group has not yet matched anything.
   2496     fStack->removeAllElements();
   2497 
   2498     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
   2499     int32_t i;
   2500     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
   2501         iFrame->fExtra[i] = -1;
   2502     }
   2503     return iFrame;
   2504 }
   2505 
   2506 
   2507 
   2508 //--------------------------------------------------------------------------------
   2509 //
   2510 //   isWordBoundary
   2511 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
   2512 //                     For us,
   2513 //                       If the current char is a combining mark,
   2514 //                          \b is FALSE.
   2515 //                       Else Scan backwards to the first non-combining char.
   2516 //                            We are at a boundary if the this char and the original chars are
   2517 //                               opposite in membership in \w set
   2518 //
   2519 //          parameters:   pos   - the current position in the input buffer
   2520 //
   2521 //              TODO:  double-check edge cases at region boundaries.
   2522 //
   2523 //--------------------------------------------------------------------------------
   2524 UBool RegexMatcher::isWordBoundary(int64_t pos) {
   2525     UBool isBoundary = FALSE;
   2526     UBool cIsWord    = FALSE;
   2527 
   2528     if (pos >= fLookLimit) {
   2529         fHitEnd = TRUE;
   2530     } else {
   2531         // Determine whether char c at current position is a member of the word set of chars.
   2532         // If we're off the end of the string, behave as though we're not at a word char.
   2533         UTEXT_SETNATIVEINDEX(fInputText, pos);
   2534         UChar32  c = UTEXT_CURRENT32(fInputText);
   2535         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2536             // Current char is a combining one.  Not a boundary.
   2537             return FALSE;
   2538         }
   2539         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2540     }
   2541 
   2542     // Back up until we come to a non-combining char, determine whether
   2543     //  that char is a word char.
   2544     UBool prevCIsWord = FALSE;
   2545     for (;;) {
   2546         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
   2547             break;
   2548         }
   2549         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
   2550         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2551               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2552             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2553             break;
   2554         }
   2555     }
   2556     isBoundary = cIsWord ^ prevCIsWord;
   2557     return isBoundary;
   2558 }
   2559 
   2560 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
   2561     UBool isBoundary = FALSE;
   2562     UBool cIsWord    = FALSE;
   2563 
   2564     const UChar *inputBuf = fInputText->chunkContents;
   2565 
   2566     if (pos >= fLookLimit) {
   2567         fHitEnd = TRUE;
   2568     } else {
   2569         // Determine whether char c at current position is a member of the word set of chars.
   2570         // If we're off the end of the string, behave as though we're not at a word char.
   2571         UChar32 c;
   2572         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
   2573         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2574             // Current char is a combining one.  Not a boundary.
   2575             return FALSE;
   2576         }
   2577         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2578     }
   2579 
   2580     // Back up until we come to a non-combining char, determine whether
   2581     //  that char is a word char.
   2582     UBool prevCIsWord = FALSE;
   2583     for (;;) {
   2584         if (pos <= fLookStart) {
   2585             break;
   2586         }
   2587         UChar32 prevChar;
   2588         U16_PREV(inputBuf, fLookStart, pos, prevChar);
   2589         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2590               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2591             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2592             break;
   2593         }
   2594     }
   2595     isBoundary = cIsWord ^ prevCIsWord;
   2596     return isBoundary;
   2597 }
   2598 
   2599 //--------------------------------------------------------------------------------
   2600 //
   2601 //   isUWordBoundary
   2602 //
   2603 //         Test for a word boundary using RBBI word break.
   2604 //
   2605 //          parameters:   pos   - the current position in the input buffer
   2606 //
   2607 //--------------------------------------------------------------------------------
   2608 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
   2609     UBool       returnVal = FALSE;
   2610 #if UCONFIG_NO_BREAK_ITERATION==0
   2611 
   2612     // If we haven't yet created a break iterator for this matcher, do it now.
   2613     if (fWordBreakItr == NULL) {
   2614         fWordBreakItr =
   2615             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
   2616         if (U_FAILURE(fDeferredStatus)) {
   2617             return FALSE;
   2618         }
   2619         fWordBreakItr->setText(fInputText, fDeferredStatus);
   2620     }
   2621 
   2622     if (pos >= fLookLimit) {
   2623         fHitEnd = TRUE;
   2624         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
   2625                             //    words are not boundaries.  All non-word chars stand by themselves,
   2626                             //    with word boundaries on both sides.
   2627     } else {
   2628         if (!UTEXT_USES_U16(fInputText)) {
   2629             // !!!: Would like a better way to do this!
   2630             UErrorCode status = U_ZERO_ERROR;
   2631             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
   2632         }
   2633         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
   2634     }
   2635 #endif
   2636     return   returnVal;
   2637 }
   2638 
   2639 //--------------------------------------------------------------------------------
   2640 //
   2641 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
   2642 //                     saves. Increment the "time" counter, and call the
   2643 //                     user callback function if there is one installed.
   2644 //
   2645 //                     If the match operation needs to be aborted, either for a time-out
   2646 //                     or because the user callback asked for it, just set an error status.
   2647 //                     The engine will pick that up and stop in its outer loop.
   2648 //
   2649 //--------------------------------------------------------------------------------
   2650 void RegexMatcher::IncrementTime(UErrorCode &status) {
   2651     fTickCounter = TIMER_INITIAL_VALUE;
   2652     fTime++;
   2653     if (fCallbackFn != NULL) {
   2654         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
   2655             status = U_REGEX_STOPPED_BY_CALLER;
   2656             return;
   2657         }
   2658     }
   2659     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
   2660         status = U_REGEX_TIME_OUT;
   2661     }
   2662 }
   2663 
   2664 //--------------------------------------------------------------------------------
   2665 //
   2666 //   ReportFindProgress     This function is called once for each advance in the target
   2667 //                          string from the find() function, and calls the user progress callback
   2668 //                          function if there is one installed.
   2669 //
   2670 //                          NOTE:
   2671 //
   2672 //                          If the match operation needs to be aborted because the user
   2673 //                          callback asked for it, just set an error status.
   2674 //                          The engine will pick that up and stop in its outer loop.
   2675 //
   2676 //--------------------------------------------------------------------------------
   2677 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
   2678     if (fFindProgressCallbackFn != NULL) {
   2679         if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
   2680             status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
   2681             return FALSE;
   2682         }
   2683     }
   2684     return TRUE;
   2685 }
   2686 
   2687 //--------------------------------------------------------------------------------
   2688 //
   2689 //   StateSave
   2690 //       Make a new stack frame, initialized as a copy of the current stack frame.
   2691 //       Set the pattern index in the original stack frame from the operand value
   2692 //       in the opcode.  Execution of the engine continues with the state in
   2693 //       the newly created stack frame
   2694 //
   2695 //       Note that reserveBlock() may grow the stack, resulting in the
   2696 //       whole thing being relocated in memory.
   2697 //
   2698 //    Parameters:
   2699 //       fp           The top frame pointer when called.  At return, a new
   2700 //                    fame will be present
   2701 //       savePatIdx   An index into the compiled pattern.  Goes into the original
   2702 //                    (not new) frame.  If execution ever back-tracks out of the
   2703 //                    new frame, this will be where we continue from in the pattern.
   2704 //    Return
   2705 //                    The new frame pointer.
   2706 //
   2707 //--------------------------------------------------------------------------------
   2708 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
   2709     // push storage for a new frame.
   2710     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
   2711     if (newFP == NULL) {
   2712         // Failure on attempted stack expansion.
   2713         //   Stack function set some other error code, change it to a more
   2714         //   specific one for regular expressions.
   2715         status = U_REGEX_STACK_OVERFLOW;
   2716         // We need to return a writable stack frame, so just return the
   2717         //    previous frame.  The match operation will stop quickly
   2718         //    because of the error status, after which the frame will never
   2719         //    be looked at again.
   2720         return fp;
   2721     }
   2722     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
   2723 
   2724     // New stack frame = copy of old top frame.
   2725     int64_t *source = (int64_t *)fp;
   2726     int64_t *dest   = newFP;
   2727     for (;;) {
   2728         *dest++ = *source++;
   2729         if (source == newFP) {
   2730             break;
   2731         }
   2732     }
   2733 
   2734     fTickCounter--;
   2735     if (fTickCounter <= 0) {
   2736        IncrementTime(status);    // Re-initializes fTickCounter
   2737     }
   2738     fp->fPatIdx = savePatIdx;
   2739     return (REStackFrame *)newFP;
   2740 }
   2741 
   2742 
   2743 //--------------------------------------------------------------------------------
   2744 //
   2745 //   MatchAt      This is the actual matching engine.
   2746 //
   2747 //                  startIdx:    begin matching a this index.
   2748 //                  toEnd:       if true, match must extend to end of the input region
   2749 //
   2750 //--------------------------------------------------------------------------------
   2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
   2752     UBool       isMatch  = FALSE;      // True if the we have a match.
   2753 
   2754     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
   2755 
   2756     int32_t     op;                    // Operation from the compiled pattern, split into
   2757     int32_t     opType;                //    the opcode
   2758     int32_t     opValue;               //    and the operand value.
   2759 
   2760     #ifdef REGEX_RUN_DEBUG
   2761     if (fTraceDebug)
   2762     {
   2763         printf("MatchAt(startIdx=%ld)\n", startIdx);
   2764         printf("Original Pattern: ");
   2765         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   2766         while (c != U_SENTINEL) {
   2767             if (c<32 || c>256) {
   2768                 c = '.';
   2769             }
   2770             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   2771 
   2772             c = UTEXT_NEXT32(fPattern->fPattern);
   2773         }
   2774         printf("\n");
   2775         printf("Input String: ");
   2776         c = utext_next32From(fInputText, 0);
   2777         while (c != U_SENTINEL) {
   2778             if (c<32 || c>256) {
   2779                 c = '.';
   2780             }
   2781             printf("%c", c);
   2782 
   2783             c = UTEXT_NEXT32(fInputText);
   2784         }
   2785         printf("\n");
   2786         printf("\n");
   2787     }
   2788     #endif
   2789 
   2790     if (U_FAILURE(status)) {
   2791         return;
   2792     }
   2793 
   2794     //  Cache frequently referenced items from the compiled pattern
   2795     //
   2796     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   2797 
   2798     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   2799     UVector             *sets          = fPattern->fSets;
   2800 
   2801     fFrameSize = fPattern->fFrameSize;
   2802     REStackFrame        *fp            = resetStack();
   2803 
   2804     fp->fPatIdx   = 0;
   2805     fp->fInputIdx = startIdx;
   2806 
   2807     // Zero out the pattern's static data
   2808     int32_t i;
   2809     for (i = 0; i<fPattern->fDataSize; i++) {
   2810         fData[i] = 0;
   2811     }
   2812 
   2813     //
   2814     //  Main loop for interpreting the compiled pattern.
   2815     //  One iteration of the loop per pattern operation performed.
   2816     //
   2817     for (;;) {
   2818 #if 0
   2819         if (_heapchk() != _HEAPOK) {
   2820             fprintf(stderr, "Heap Trouble\n");
   2821         }
   2822 #endif
   2823 
   2824         op      = (int32_t)pat[fp->fPatIdx];
   2825         opType  = URX_TYPE(op);
   2826         opValue = URX_VAL(op);
   2827         #ifdef REGEX_RUN_DEBUG
   2828         if (fTraceDebug) {
   2829             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2830             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
   2831                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   2832             fPattern->dumpOp(fp->fPatIdx);
   2833         }
   2834         #endif
   2835         fp->fPatIdx++;
   2836 
   2837         switch (opType) {
   2838 
   2839 
   2840         case URX_NOP:
   2841             break;
   2842 
   2843 
   2844         case URX_BACKTRACK:
   2845             // Force a backtrack.  In some circumstances, the pattern compiler
   2846             //   will notice that the pattern can't possibly match anything, and will
   2847             //   emit one of these at that point.
   2848             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2849             break;
   2850 
   2851 
   2852         case URX_ONECHAR:
   2853             if (fp->fInputIdx < fActiveLimit) {
   2854                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2855                 UChar32 c = UTEXT_NEXT32(fInputText);
   2856                 if (c == opValue) {
   2857                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2858                     break;
   2859                 }
   2860             } else {
   2861                 fHitEnd = TRUE;
   2862             }
   2863             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2864             break;
   2865 
   2866 
   2867         case URX_STRING:
   2868             {
   2869                 // Test input against a literal string.
   2870                 // Strings require two slots in the compiled pattern, one for the
   2871                 //   offset to the string text, and one for the length.
   2872 
   2873                 int32_t   stringStartIdx = opValue;
   2874                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   2875                 fp->fPatIdx++;
   2876                 opType    = URX_TYPE(op);
   2877                 int32_t stringLen = URX_VAL(op);
   2878                 U_ASSERT(opType == URX_STRING_LEN);
   2879                 U_ASSERT(stringLen >= 2);
   2880 
   2881                 const UChar *patternString = litText+stringStartIdx;
   2882                 int32_t patternStringIndex = 0;
   2883                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2884                 UChar32 inputChar;
   2885                 UChar32 patternChar;
   2886                 UBool success = TRUE;
   2887                 while (patternStringIndex < stringLen) {
   2888                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
   2889                         success = FALSE;
   2890                         fHitEnd = TRUE;
   2891                         break;
   2892                     }
   2893                     inputChar = UTEXT_NEXT32(fInputText);
   2894                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
   2895                     if (patternChar != inputChar) {
   2896                         success = FALSE;
   2897                         break;
   2898                     }
   2899                 }
   2900 
   2901                 if (success) {
   2902                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2903                 } else {
   2904                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2905                 }
   2906             }
   2907             break;
   2908 
   2909 
   2910         case URX_STATE_SAVE:
   2911             fp = StateSave(fp, opValue, status);
   2912             break;
   2913 
   2914 
   2915         case URX_END:
   2916             // The match loop will exit via this path on a successful match,
   2917             //   when we reach the end of the pattern.
   2918             if (toEnd && fp->fInputIdx != fActiveLimit) {
   2919                 // The pattern matched, but not to the end of input.  Try some more.
   2920                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2921                 break;
   2922             }
   2923             isMatch = TRUE;
   2924             goto  breakFromLoop;
   2925 
   2926         // Start and End Capture stack frame variables are laid out out like this:
   2927             //  fp->fExtra[opValue]  - The start of a completed capture group
   2928             //             opValue+1 - The end   of a completed capture group
   2929             //             opValue+2 - the start of a capture group whose end
   2930             //                          has not yet been reached (and might not ever be).
   2931         case URX_START_CAPTURE:
   2932             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2933             fp->fExtra[opValue+2] = fp->fInputIdx;
   2934             break;
   2935 
   2936 
   2937         case URX_END_CAPTURE:
   2938             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2939             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   2940             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   2941             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   2942             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   2943             break;
   2944 
   2945 
   2946         case URX_DOLLAR:                   //  $, test for End of line
   2947                                            //     or for position before new line at end of input
   2948             {
   2949                 if (fp->fInputIdx >= fAnchorLimit) {
   2950                     // We really are at the end of input.  Success.
   2951                     fHitEnd = TRUE;
   2952                     fRequireEnd = TRUE;
   2953                     break;
   2954                 }
   2955 
   2956                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2957 
   2958                 // If we are positioned just before a new-line that is located at the
   2959                 //   end of input, succeed.
   2960                 UChar32 c = UTEXT_NEXT32(fInputText);
   2961                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2962                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   2963                         // If not in the middle of a CR/LF sequence
   2964                       if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
   2965                             // At new-line at end of input. Success
   2966                             fHitEnd = TRUE;
   2967                             fRequireEnd = TRUE;
   2968 
   2969                             break;
   2970                         }
   2971                     }
   2972                 } else {
   2973                     UChar32 nextC = UTEXT_NEXT32(fInputText);
   2974                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2975                         fHitEnd = TRUE;
   2976                         fRequireEnd = TRUE;
   2977                         break;                         // At CR/LF at end of input.  Success
   2978                     }
   2979                 }
   2980 
   2981                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2982             }
   2983             break;
   2984 
   2985 
   2986          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   2987             if (fp->fInputIdx >= fAnchorLimit) {
   2988                 // Off the end of input.  Success.
   2989                 fHitEnd = TRUE;
   2990                 fRequireEnd = TRUE;
   2991                 break;
   2992             } else {
   2993                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2994                 UChar32 c = UTEXT_NEXT32(fInputText);
   2995                 // Either at the last character of input, or off the end.
   2996                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
   2997                     fHitEnd = TRUE;
   2998                     fRequireEnd = TRUE;
   2999                     break;
   3000                 }
   3001             }
   3002 
   3003             // Not at end of input.  Back-track out.
   3004             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3005             break;
   3006 
   3007 
   3008          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   3009              {
   3010                  if (fp->fInputIdx >= fAnchorLimit) {
   3011                      // We really are at the end of input.  Success.
   3012                      fHitEnd = TRUE;
   3013                      fRequireEnd = TRUE;
   3014                      break;
   3015                  }
   3016                  // If we are positioned just before a new-line, succeed.
   3017                  // It makes no difference where the new-line is within the input.
   3018                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3019                  UChar32 c = UTEXT_CURRENT32(fInputText);
   3020                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   3021                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   3022                      //  In multi-line mode, hitting a new-line just before the end of input does not
   3023                      //   set the hitEnd or requireEnd flags
   3024                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
   3025                         break;
   3026                      }
   3027                  }
   3028                  // not at a new line.  Fail.
   3029                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3030              }
   3031              break;
   3032 
   3033 
   3034          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   3035              {
   3036                  if (fp->fInputIdx >= fAnchorLimit) {
   3037                      // We really are at the end of input.  Success.
   3038                      fHitEnd = TRUE;
   3039                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   3040                      break;               //   adding a new-line would not lose the match.
   3041                  }
   3042                  // If we are not positioned just before a new-line, the test fails; backtrack out.
   3043                  // It makes no difference where the new-line is within the input.
   3044                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3045                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
   3046                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3047                  }
   3048              }
   3049              break;
   3050 
   3051 
   3052        case URX_CARET:                    //  ^, test for start of line
   3053             if (fp->fInputIdx != fAnchorStart) {
   3054                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3055             }
   3056             break;
   3057 
   3058 
   3059        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   3060            {
   3061                if (fp->fInputIdx == fAnchorStart) {
   3062                    // We are at the start input.  Success.
   3063                    break;
   3064                }
   3065                // Check whether character just before the current pos is a new-line
   3066                //   unless we are at the end of input
   3067                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3068                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3069                if ((fp->fInputIdx < fAnchorLimit) &&
   3070                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3071                    //  It's a new-line.  ^ is true.  Success.
   3072                    //  TODO:  what should be done with positions between a CR and LF?
   3073                    break;
   3074                }
   3075                // Not at the start of a line.  Fail.
   3076                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3077            }
   3078            break;
   3079 
   3080 
   3081        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   3082            {
   3083                U_ASSERT(fp->fInputIdx >= fAnchorStart);
   3084                if (fp->fInputIdx <= fAnchorStart) {
   3085                    // We are at the start input.  Success.
   3086                    break;
   3087                }
   3088                // Check whether character just before the current pos is a new-line
   3089                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   3090                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3091                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3092                if (c != 0x0a) {
   3093                    // Not at the start of a line.  Back-track out.
   3094                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3095                }
   3096            }
   3097            break;
   3098 
   3099         case URX_BACKSLASH_B:          // Test for word boundaries
   3100             {
   3101                 UBool success = isWordBoundary(fp->fInputIdx);
   3102                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   3103                 if (!success) {
   3104                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3105                 }
   3106             }
   3107             break;
   3108 
   3109 
   3110         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   3111             {
   3112                 UBool success = isUWordBoundary(fp->fInputIdx);
   3113                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   3114                 if (!success) {
   3115                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3116                 }
   3117             }
   3118             break;
   3119 
   3120 
   3121         case URX_BACKSLASH_D:            // Test for decimal digit
   3122             {
   3123                 if (fp->fInputIdx >= fActiveLimit) {
   3124                     fHitEnd = TRUE;
   3125                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3126                     break;
   3127                 }
   3128 
   3129                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3130 
   3131                 UChar32 c = UTEXT_NEXT32(fInputText);
   3132                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   3133                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   3134                 success ^= (UBool)(opValue != 0);        // flip sense for \D
   3135                 if (success) {
   3136                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3137                 } else {
   3138                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3139                 }
   3140             }
   3141             break;
   3142 
   3143 
   3144         case URX_BACKSLASH_G:          // Test for position at end of previous match
   3145             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   3146                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3147             }
   3148             break;
   3149 
   3150 
   3151         case URX_BACKSLASH_X:
   3152             //  Match a Grapheme, as defined by Unicode TR 29.
   3153             //  Differs slightly from Perl, which consumes combining marks independently
   3154             //    of context.
   3155             {
   3156 
   3157                 // Fail if at end of input
   3158                 if (fp->fInputIdx >= fActiveLimit) {
   3159                     fHitEnd = TRUE;
   3160                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3161                     break;
   3162                 }
   3163 
   3164                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3165 
   3166                 // Examine (and consume) the current char.
   3167                 //   Dispatch into a little state machine, based on the char.
   3168                 UChar32  c;
   3169                 c = UTEXT_NEXT32(fInputText);
   3170                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3171                 UnicodeSet **sets = fPattern->fStaticSets;
   3172                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   3173                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   3174                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3175                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3176                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3177                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3178                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3179                 goto GC_Extend;
   3180 
   3181 
   3182 
   3183 GC_L:
   3184                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3185                 c = UTEXT_NEXT32(fInputText);
   3186                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3187                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3188                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3189                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3190                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3191                 (void)UTEXT_PREVIOUS32(fInputText);
   3192                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3193                 goto GC_Extend;
   3194 
   3195 GC_V:
   3196                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3197                 c = UTEXT_NEXT32(fInputText);
   3198                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3199                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3200                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3201                 (void)UTEXT_PREVIOUS32(fInputText);
   3202                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3203                 goto GC_Extend;
   3204 
   3205 GC_T:
   3206                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3207                 c = UTEXT_NEXT32(fInputText);
   3208                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3209                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3210                 (void)UTEXT_PREVIOUS32(fInputText);
   3211                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3212                 goto GC_Extend;
   3213 
   3214 GC_Extend:
   3215                 // Combining characters are consumed here
   3216                 for (;;) {
   3217                     if (fp->fInputIdx >= fActiveLimit) {
   3218                         break;
   3219                     }
   3220                     c = UTEXT_CURRENT32(fInputText);
   3221                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   3222                         break;
   3223                     }
   3224                     (void)UTEXT_NEXT32(fInputText);
   3225                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3226                 }
   3227                 goto GC_Done;
   3228 
   3229 GC_Control:
   3230                 // Most control chars stand alone (don't combine with combining chars),
   3231                 //   except for that CR/LF sequence is a single grapheme cluster.
   3232                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
   3233                     c = UTEXT_NEXT32(fInputText);
   3234                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3235                 }
   3236 
   3237 GC_Done:
   3238                 if (fp->fInputIdx >= fActiveLimit) {
   3239                     fHitEnd = TRUE;
   3240                 }
   3241                 break;
   3242             }
   3243 
   3244 
   3245 
   3246 
   3247         case URX_BACKSLASH_Z:          // Test for end of Input
   3248             if (fp->fInputIdx < fAnchorLimit) {
   3249                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3250             } else {
   3251                 fHitEnd = TRUE;
   3252                 fRequireEnd = TRUE;
   3253             }
   3254             break;
   3255 
   3256 
   3257 
   3258         case URX_STATIC_SETREF:
   3259             {
   3260                 // Test input character against one of the predefined sets
   3261                 //    (Word Characters, for example)
   3262                 // The high bit of the op value is a flag for the match polarity.
   3263                 //    0:   success if input char is in set.
   3264                 //    1:   success if input char is not in set.
   3265                 if (fp->fInputIdx >= fActiveLimit) {
   3266                     fHitEnd = TRUE;
   3267                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3268                     break;
   3269                 }
   3270 
   3271                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   3272                 opValue &= ~URX_NEG_SET;
   3273                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3274 
   3275                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3276                 UChar32 c = UTEXT_NEXT32(fInputText);
   3277                 if (c < 256) {
   3278                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3279                     if (s8->contains(c)) {
   3280                         success = !success;
   3281                     }
   3282                 } else {
   3283                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3284                     if (s->contains(c)) {
   3285                         success = !success;
   3286                     }
   3287                 }
   3288                 if (success) {
   3289                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3290                 } else {
   3291                     // the character wasn't in the set.
   3292                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3293                 }
   3294             }
   3295             break;
   3296 
   3297 
   3298         case URX_STAT_SETREF_N:
   3299             {
   3300                 // Test input character for NOT being a member of  one of
   3301                 //    the predefined sets (Word Characters, for example)
   3302                 if (fp->fInputIdx >= fActiveLimit) {
   3303                     fHitEnd = TRUE;
   3304                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3305                     break;
   3306                 }
   3307 
   3308                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3309 
   3310                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3311 
   3312                 UChar32 c = UTEXT_NEXT32(fInputText);
   3313                 if (c < 256) {
   3314                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3315                     if (s8->contains(c) == FALSE) {
   3316                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3317                         break;
   3318                     }
   3319                 } else {
   3320                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3321                     if (s->contains(c) == FALSE) {
   3322                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3323                         break;
   3324                     }
   3325                 }
   3326                 // the character wasn't in the set.
   3327                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3328             }
   3329             break;
   3330 
   3331 
   3332         case URX_SETREF:
   3333             if (fp->fInputIdx >= fActiveLimit) {
   3334                 fHitEnd = TRUE;
   3335                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3336                 break;
   3337             } else {
   3338                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3339 
   3340                 // There is input left.  Pick up one char and test it for set membership.
   3341                 UChar32 c = UTEXT_NEXT32(fInputText);
   3342                 U_ASSERT(opValue > 0 && opValue < sets->size());
   3343                 if (c<256) {
   3344                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3345                     if (s8->contains(c)) {
   3346                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3347                         break;
   3348                     }
   3349                 } else {
   3350                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3351                     if (s->contains(c)) {
   3352                         // The character is in the set.  A Match.
   3353                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3354                         break;
   3355                     }
   3356                 }
   3357 
   3358                 // the character wasn't in the set.
   3359                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3360             }
   3361             break;
   3362 
   3363 
   3364         case URX_DOTANY:
   3365             {
   3366                 // . matches anything, but stops at end-of-line.
   3367                 if (fp->fInputIdx >= fActiveLimit) {
   3368                     // At end of input.  Match failed.  Backtrack out.
   3369                     fHitEnd = TRUE;
   3370                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3371                     break;
   3372                 }
   3373 
   3374                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3375 
   3376                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3377                 UChar32 c = UTEXT_NEXT32(fInputText);
   3378                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   3379                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3380                     // End of line in normal mode.   . does not match.
   3381                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3382                     break;
   3383                 }
   3384                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3385             }
   3386             break;
   3387 
   3388 
   3389         case URX_DOTANY_ALL:
   3390             {
   3391                 // ., in dot-matches-all (including new lines) mode
   3392                 if (fp->fInputIdx >= fActiveLimit) {
   3393                     // At end of input.  Match failed.  Backtrack out.
   3394                     fHitEnd = TRUE;
   3395                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3396                     break;
   3397                 }
   3398 
   3399                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3400 
   3401                 // There is input left.  Advance over one char, except if we are
   3402                 //   at a cr/lf, advance over both of them.
   3403                 UChar32 c;
   3404                 c = UTEXT_NEXT32(fInputText);
   3405                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3406                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   3407                     // In the case of a CR/LF, we need to advance over both.
   3408                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
   3409                     if (nextc == 0x0a) {
   3410                         (void)UTEXT_NEXT32(fInputText);
   3411                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3412                     }
   3413                 }
   3414             }
   3415             break;
   3416 
   3417 
   3418         case URX_DOTANY_UNIX:
   3419             {
   3420                 // '.' operator, matches all, but stops at end-of-line.
   3421                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   3422                 if (fp->fInputIdx >= fActiveLimit) {
   3423                     // At end of input.  Match failed.  Backtrack out.
   3424                     fHitEnd = TRUE;
   3425                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3426                     break;
   3427                 }
   3428 
   3429                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3430 
   3431                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3432                 UChar32 c = UTEXT_NEXT32(fInputText);
   3433                 if (c == 0x0a) {
   3434                     // End of line in normal mode.   '.' does not match the \n
   3435                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3436                 } else {
   3437                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3438                 }
   3439             }
   3440             break;
   3441 
   3442 
   3443         case URX_JMP:
   3444             fp->fPatIdx = opValue;
   3445             break;
   3446 
   3447         case URX_FAIL:
   3448             isMatch = FALSE;
   3449             goto breakFromLoop;
   3450 
   3451         case URX_JMP_SAV:
   3452             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   3453             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   3454             fp->fPatIdx = opValue;                         // Then JMP.
   3455             break;
   3456 
   3457         case URX_JMP_SAV_X:
   3458             // This opcode is used with (x)+, when x can match a zero length string.
   3459             // Same as JMP_SAV, except conditional on the match having made forward progress.
   3460             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   3461             //   data address of the input position at the start of the loop.
   3462             {
   3463                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   3464                 int32_t  stoOp = (int32_t)pat[opValue-1];
   3465                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   3466                 int32_t  frameLoc = URX_VAL(stoOp);
   3467                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   3468                 int64_t prevInputIdx = fp->fExtra[frameLoc];
   3469                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   3470                 if (prevInputIdx < fp->fInputIdx) {
   3471                     // The match did make progress.  Repeat the loop.
   3472                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   3473                     fp->fPatIdx = opValue;
   3474                     fp->fExtra[frameLoc] = fp->fInputIdx;
   3475                 }
   3476                 // If the input position did not advance, we do nothing here,
   3477                 //   execution will fall out of the loop.
   3478             }
   3479             break;
   3480 
   3481         case URX_CTR_INIT:
   3482             {
   3483                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3484                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   3485 
   3486                 // Pick up the three extra operands that CTR_INIT has, and
   3487                 //    skip the pattern location counter past
   3488                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3489                 fp->fPatIdx += 3;
   3490                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3491                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3492                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3493                 U_ASSERT(minCount>=0);
   3494                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3495                 U_ASSERT(loopLoc>=fp->fPatIdx);
   3496 
   3497                 if (minCount == 0) {
   3498                     fp = StateSave(fp, loopLoc+1, status);
   3499                 }
   3500                 if (maxCount == -1) {
   3501                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
   3502                 } else if (maxCount == 0) {
   3503                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3504                 }
   3505             }
   3506             break;
   3507 
   3508         case URX_CTR_LOOP:
   3509             {
   3510                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3511                 int32_t initOp = (int32_t)pat[opValue];
   3512                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   3513                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3514                 int32_t minCount  = (int32_t)pat[opValue+2];
   3515                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3516                 (*pCounter)++;
   3517                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   3518                     U_ASSERT(*pCounter == maxCount);
   3519                     break;
   3520                 }
   3521                 if (*pCounter >= minCount) {
   3522                     if (maxCount == -1) {
   3523                         // Loop has no hard upper bound.
   3524                         // Check that it is progressing through the input, break if it is not.
   3525                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   3526                         if (fp->fInputIdx == *pLastInputIdx) {
   3527                             break;
   3528                         } else {
   3529                             *pLastInputIdx = fp->fInputIdx;
   3530                         }
   3531                     }
   3532                     fp = StateSave(fp, fp->fPatIdx, status);
   3533                 }
   3534                 fp->fPatIdx = opValue + 4;    // Loop back.
   3535             }
   3536             break;
   3537 
   3538         case URX_CTR_INIT_NG:
   3539             {
   3540                 // Initialize a non-greedy loop
   3541                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3542                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   3543 
   3544                 // Pick up the three extra operands that CTR_INIT_NG has, and
   3545                 //    skip the pattern location counter past
   3546                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3547                 fp->fPatIdx += 3;
   3548                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3549                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3550                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3551                 U_ASSERT(minCount>=0);
   3552                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3553                 U_ASSERT(loopLoc>fp->fPatIdx);
   3554                 if (maxCount == -1) {
   3555                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
   3556                 }
   3557 
   3558                 if (minCount == 0) {
   3559                     if (maxCount != 0) {
   3560                         fp = StateSave(fp, fp->fPatIdx, status);
   3561                     }
   3562                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   3563                 }
   3564             }
   3565             break;
   3566 
   3567         case URX_CTR_LOOP_NG:
   3568             {
   3569                 // Non-greedy {min, max} loops
   3570                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3571                 int32_t initOp = (int32_t)pat[opValue];
   3572                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   3573                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3574                 int32_t minCount  = (int32_t)pat[opValue+2];
   3575                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3576 
   3577                 (*pCounter)++;
   3578                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   3579                     // The loop has matched the maximum permitted number of times.
   3580                     //   Break out of here with no action.  Matching will
   3581                     //   continue with the following pattern.
   3582                     U_ASSERT(*pCounter == maxCount);
   3583                     break;
   3584                 }
   3585 
   3586                 if (*pCounter < minCount) {
   3587                     // We haven't met the minimum number of matches yet.
   3588                     //   Loop back for another one.
   3589                     fp->fPatIdx = opValue + 4;    // Loop back.
   3590                 } else {
   3591                     // We do have the minimum number of matches.
   3592 
   3593                     // If there is no upper bound on the loop iterations, check that the input index
   3594                     // is progressing, and stop the loop if it is not.
   3595                     if (maxCount == -1) {
   3596                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   3597                         if (fp->fInputIdx == *pLastInputIdx) {
   3598                             break;
   3599                         }
   3600                         *pLastInputIdx = fp->fInputIdx;
   3601                     }
   3602 
   3603                     // Loop Continuation: we will fall into the pattern following the loop
   3604                     //   (non-greedy, don't execute loop body first), but first do
   3605                     //   a state save to the top of the loop, so that a match failure
   3606                     //   in the following pattern will try another iteration of the loop.
   3607                     fp = StateSave(fp, opValue + 4, status);
   3608                 }
   3609             }
   3610             break;
   3611 
   3612         case URX_STO_SP:
   3613             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3614             fData[opValue] = fStack->size();
   3615             break;
   3616 
   3617         case URX_LD_SP:
   3618             {
   3619                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3620                 int32_t newStackSize = (int32_t)fData[opValue];
   3621                 U_ASSERT(newStackSize <= fStack->size());
   3622                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3623                 if (newFP == (int64_t *)fp) {
   3624                     break;
   3625                 }
   3626                 int32_t i;
   3627                 for (i=0; i<fFrameSize; i++) {
   3628                     newFP[i] = ((int64_t *)fp)[i];
   3629                 }
   3630                 fp = (REStackFrame *)newFP;
   3631                 fStack->setSize(newStackSize);
   3632             }
   3633             break;
   3634 
   3635         case URX_BACKREF:
   3636             {
   3637                 U_ASSERT(opValue < fFrameSize);
   3638                 int64_t groupStartIdx = fp->fExtra[opValue];
   3639                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3640                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3641                 if (groupStartIdx < 0) {
   3642                     // This capture group has not participated in the match thus far,
   3643                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3644                     break;
   3645                 }
   3646                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
   3647                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3648 
   3649                 //   Note: if the capture group match was of an empty string the backref
   3650                 //         match succeeds.  Verified by testing:  Perl matches succeed
   3651                 //         in this case, so we do too.
   3652 
   3653                 UBool success = TRUE;
   3654                 for (;;) {
   3655                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
   3656                         success = TRUE;
   3657                         break;
   3658                     }
   3659                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
   3660                         success = FALSE;
   3661                         fHitEnd = TRUE;
   3662                         break;
   3663                     }
   3664                     UChar32 captureGroupChar = utext_next32(fAltInputText);
   3665                     UChar32 inputChar = utext_next32(fInputText);
   3666                     if (inputChar != captureGroupChar) {
   3667                         success = FALSE;
   3668                         break;
   3669                     }
   3670                 }
   3671 
   3672                 if (success) {
   3673                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3674                 } else {
   3675                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3676                 }
   3677             }
   3678             break;
   3679 
   3680 
   3681 
   3682         case URX_BACKREF_I:
   3683             {
   3684                 U_ASSERT(opValue < fFrameSize);
   3685                 int64_t groupStartIdx = fp->fExtra[opValue];
   3686                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3687                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3688                 if (groupStartIdx < 0) {
   3689                     // This capture group has not participated in the match thus far,
   3690                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3691                     break;
   3692                 }
   3693                 utext_setNativeIndex(fAltInputText, groupStartIdx);
   3694                 utext_setNativeIndex(fInputText, fp->fInputIdx);
   3695                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
   3696                 CaseFoldingUTextIterator inputItr(*fInputText);
   3697 
   3698                 //   Note: if the capture group match was of an empty string the backref
   3699                 //         match succeeds.  Verified by testing:  Perl matches succeed
   3700                 //         in this case, so we do too.
   3701 
   3702                 UBool success = TRUE;
   3703                 for (;;) {
   3704                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
   3705                         success = TRUE;
   3706                         break;
   3707                     }
   3708                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
   3709                         success = FALSE;
   3710                         fHitEnd = TRUE;
   3711                         break;
   3712                     }
   3713                     UChar32 captureGroupChar = captureGroupItr.next();
   3714                     UChar32 inputChar = inputItr.next();
   3715                     if (inputChar != captureGroupChar) {
   3716                         success = FALSE;
   3717                         break;
   3718                     }
   3719                 }
   3720 
   3721                 if (success && inputItr.inExpansion()) {
   3722                     // We otained a match by consuming part of a string obtained from
   3723                     // case-folding a single code point of the input text.
   3724                     // This does not count as an overall match.
   3725                     success = FALSE;
   3726                 }
   3727 
   3728                 if (success) {
   3729                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3730                 } else {
   3731                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3732                 }
   3733 
   3734             }
   3735             break;
   3736 
   3737         case URX_STO_INP_LOC:
   3738             {
   3739                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   3740                 fp->fExtra[opValue] = fp->fInputIdx;
   3741             }
   3742             break;
   3743 
   3744         case URX_JMPX:
   3745             {
   3746                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3747                 fp->fPatIdx += 1;
   3748                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   3749                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   3750                 int64_t savedInputIdx = fp->fExtra[dataLoc];
   3751                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   3752                 if (savedInputIdx < fp->fInputIdx) {
   3753                     fp->fPatIdx = opValue;                               // JMP
   3754                 } else {
   3755                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   3756                 }
   3757             }
   3758             break;
   3759 
   3760         case URX_LA_START:
   3761             {
   3762                 // Entering a lookahead block.
   3763                 // Save Stack Ptr, Input Pos.
   3764                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3765                 fData[opValue]   = fStack->size();
   3766                 fData[opValue+1] = fp->fInputIdx;
   3767                 fActiveStart     = fLookStart;          // Set the match region change for
   3768                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   3769             }
   3770             break;
   3771 
   3772         case URX_LA_END:
   3773             {
   3774                 // Leaving a look-ahead block.
   3775                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   3776                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3777                 int32_t stackSize = fStack->size();
   3778                 int32_t newStackSize =(int32_t)fData[opValue];
   3779                 U_ASSERT(stackSize >= newStackSize);
   3780                 if (stackSize > newStackSize) {
   3781                     // Copy the current top frame back to the new (cut back) top frame.
   3782                     //   This makes the capture groups from within the look-ahead
   3783                     //   expression available.
   3784                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3785                     int32_t i;
   3786                     for (i=0; i<fFrameSize; i++) {
   3787                         newFP[i] = ((int64_t *)fp)[i];
   3788                     }
   3789                     fp = (REStackFrame *)newFP;
   3790                     fStack->setSize(newStackSize);
   3791                 }
   3792                 fp->fInputIdx = fData[opValue+1];
   3793 
   3794                 // Restore the active region bounds in the input string; they may have
   3795                 //    been changed because of transparent bounds on a Region.
   3796                 fActiveStart = fRegionStart;
   3797                 fActiveLimit = fRegionLimit;
   3798             }
   3799             break;
   3800 
   3801         case URX_ONECHAR_I:
   3802             // Case insensitive one char.  The char from the pattern is already case folded.
   3803             // Input text is not, but case folding the input can not reduce two or more code
   3804             // points to one.
   3805             if (fp->fInputIdx < fActiveLimit) {
   3806                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3807 
   3808                 UChar32 c = UTEXT_NEXT32(fInputText);
   3809                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3810                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3811                     break;
   3812                 }
   3813             } else {
   3814                 fHitEnd = TRUE;
   3815             }
   3816 
   3817             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3818             break;
   3819 
   3820         case URX_STRING_I:
   3821             {
   3822                 // Case-insensitive test input against a literal string.
   3823                 // Strings require two slots in the compiled pattern, one for the
   3824                 //   offset to the string text, and one for the length.
   3825                 //   The compiled string has already been case folded.
   3826                 {
   3827                     const UChar *patternString = litText + opValue;
   3828                     int32_t      patternStringIdx  = 0;
   3829 
   3830                     op      = (int32_t)pat[fp->fPatIdx];
   3831                     fp->fPatIdx++;
   3832                     opType  = URX_TYPE(op);
   3833                     opValue = URX_VAL(op);
   3834                     U_ASSERT(opType == URX_STRING_LEN);
   3835                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
   3836 
   3837 
   3838                     UChar32   cPattern;
   3839                     UChar32   cText;
   3840                     UBool     success = TRUE;
   3841 
   3842                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3843                     CaseFoldingUTextIterator inputIterator(*fInputText);
   3844                     while (patternStringIdx < patternStringLen) {
   3845                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
   3846                             success = FALSE;
   3847                             fHitEnd = TRUE;
   3848                             break;
   3849                         }
   3850                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
   3851                         cText = inputIterator.next();
   3852                         if (cText != cPattern) {
   3853                             success = FALSE;
   3854                             break;
   3855                         }
   3856                     }
   3857                     if (inputIterator.inExpansion()) {
   3858                         success = FALSE;
   3859                     }
   3860 
   3861                     if (success) {
   3862                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3863                     } else {
   3864                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3865                     }
   3866                 }
   3867             }
   3868             break;
   3869 
   3870         case URX_LB_START:
   3871             {
   3872                 // Entering a look-behind block.
   3873                 // Save Stack Ptr, Input Pos.
   3874                 //   TODO:  implement transparent bounds.  Ticket #6067
   3875                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3876                 fData[opValue]   = fStack->size();
   3877                 fData[opValue+1] = fp->fInputIdx;
   3878                 // Init the variable containing the start index for attempted matches.
   3879                 fData[opValue+2] = -1;
   3880                 // Save input string length, then reset to pin any matches to end at
   3881                 //   the current position.
   3882                 fData[opValue+3] = fActiveLimit;
   3883                 fActiveLimit     = fp->fInputIdx;
   3884             }
   3885             break;
   3886 
   3887 
   3888         case URX_LB_CONT:
   3889             {
   3890                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   3891                 //    at all possible input starting positions.
   3892 
   3893                 // Fetch the min and max possible match lengths.  They are the operands
   3894                 //   of this op in the pattern.
   3895                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   3896                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   3897                 U_ASSERT(minML <= maxML);
   3898                 U_ASSERT(minML >= 0);
   3899 
   3900                 // Fetch (from data) the last input index where a match was attempted.
   3901                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3902                 int64_t  *lbStartIdx = &fData[opValue+2];
   3903                 if (*lbStartIdx < 0) {
   3904                     // First time through loop.
   3905                     *lbStartIdx = fp->fInputIdx - minML;
   3906                 } else {
   3907                     // 2nd through nth time through the loop.
   3908                     // Back up start position for match by one.
   3909                     if (*lbStartIdx == 0) {
   3910                         (*lbStartIdx)--;
   3911                     } else {
   3912                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   3913                         (void)UTEXT_PREVIOUS32(fInputText);
   3914                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3915                     }
   3916                 }
   3917 
   3918                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   3919                     // We have tried all potential match starting points without
   3920                     //  getting a match.  Backtrack out, and out of the
   3921                     //   Look Behind altogether.
   3922                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3923                     int64_t restoreInputLen = fData[opValue+3];
   3924                     U_ASSERT(restoreInputLen >= fActiveLimit);
   3925                     U_ASSERT(restoreInputLen <= fInputLength);
   3926                     fActiveLimit = restoreInputLen;
   3927                     break;
   3928                 }
   3929 
   3930                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   3931                 //      (successful match will fall off the end of the loop.)
   3932                 fp = StateSave(fp, fp->fPatIdx-3, status);
   3933                 fp->fInputIdx = *lbStartIdx;
   3934             }
   3935             break;
   3936 
   3937         case URX_LB_END:
   3938             // End of a look-behind block, after a successful match.
   3939             {
   3940                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3941                 if (fp->fInputIdx != fActiveLimit) {
   3942                     //  The look-behind expression matched, but the match did not
   3943                     //    extend all the way to the point that we are looking behind from.
   3944                     //  FAIL out of here, which will take us back to the LB_CONT, which
   3945                     //     will retry the match starting at another position or fail
   3946                     //     the look-behind altogether, whichever is appropriate.
   3947                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3948                     break;
   3949                 }
   3950 
   3951                 // Look-behind match is good.  Restore the orignal input string length,
   3952                 //   which had been truncated to pin the end of the lookbehind match to the
   3953                 //   position being looked-behind.
   3954                 int64_t originalInputLen = fData[opValue+3];
   3955                 U_ASSERT(originalInputLen >= fActiveLimit);
   3956                 U_ASSERT(originalInputLen <= fInputLength);
   3957                 fActiveLimit = originalInputLen;
   3958             }
   3959             break;
   3960 
   3961 
   3962         case URX_LBN_CONT:
   3963             {
   3964                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   3965                 //    at all possible input starting positions.
   3966 
   3967                 // Fetch the extra parameters of this op.
   3968                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   3969                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   3970                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   3971                         continueLoc = URX_VAL(continueLoc);
   3972                 U_ASSERT(minML <= maxML);
   3973                 U_ASSERT(minML >= 0);
   3974                 U_ASSERT(continueLoc > fp->fPatIdx);
   3975 
   3976                 // Fetch (from data) the last input index where a match was attempted.
   3977                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3978                 int64_t  *lbStartIdx = &fData[opValue+2];
   3979                 if (*lbStartIdx < 0) {
   3980                     // First time through loop.
   3981                     *lbStartIdx = fp->fInputIdx - minML;
   3982                 } else {
   3983                     // 2nd through nth time through the loop.
   3984                     // Back up start position for match by one.
   3985                     if (*lbStartIdx == 0) {
   3986                         (*lbStartIdx)--;
   3987                     } else {
   3988                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   3989                         (void)UTEXT_PREVIOUS32(fInputText);
   3990                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3991                     }
   3992                 }
   3993 
   3994                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   3995                     // We have tried all potential match starting points without
   3996                     //  getting a match, which means that the negative lookbehind as
   3997                     //  a whole has succeeded.  Jump forward to the continue location
   3998                     int64_t restoreInputLen = fData[opValue+3];
   3999                     U_ASSERT(restoreInputLen >= fActiveLimit);
   4000                     U_ASSERT(restoreInputLen <= fInputLength);
   4001                     fActiveLimit = restoreInputLen;
   4002                     fp->fPatIdx = continueLoc;
   4003                     break;
   4004                 }
   4005 
   4006                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   4007                 //      (successful match will cause a FAIL out of the loop altogether.)
   4008                 fp = StateSave(fp, fp->fPatIdx-4, status);
   4009                 fp->fInputIdx = *lbStartIdx;
   4010             }
   4011             break;
   4012 
   4013         case URX_LBN_END:
   4014             // End of a negative look-behind block, after a successful match.
   4015             {
   4016                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4017                 if (fp->fInputIdx != fActiveLimit) {
   4018                     //  The look-behind expression matched, but the match did not
   4019                     //    extend all the way to the point that we are looking behind from.
   4020                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4021                     //     will retry the match starting at another position or succeed
   4022                     //     the look-behind altogether, whichever is appropriate.
   4023                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4024                     break;
   4025                 }
   4026 
   4027                 // Look-behind expression matched, which means look-behind test as
   4028                 //   a whole Fails
   4029 
   4030                 //   Restore the orignal input string length, which had been truncated
   4031                 //   inorder to pin the end of the lookbehind match
   4032                 //   to the position being looked-behind.
   4033                 int64_t originalInputLen = fData[opValue+3];
   4034                 U_ASSERT(originalInputLen >= fActiveLimit);
   4035                 U_ASSERT(originalInputLen <= fInputLength);
   4036                 fActiveLimit = originalInputLen;
   4037 
   4038                 // Restore original stack position, discarding any state saved
   4039                 //   by the successful pattern match.
   4040                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4041                 int32_t newStackSize = (int32_t)fData[opValue];
   4042                 U_ASSERT(fStack->size() > newStackSize);
   4043                 fStack->setSize(newStackSize);
   4044 
   4045                 //  FAIL, which will take control back to someplace
   4046                 //  prior to entering the look-behind test.
   4047                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4048             }
   4049             break;
   4050 
   4051 
   4052         case URX_LOOP_SR_I:
   4053             // Loop Initialization for the optimized implementation of
   4054             //     [some character set]*
   4055             //   This op scans through all matching input.
   4056             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4057             {
   4058                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4059                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4060                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   4061 
   4062                 // Loop through input, until either the input is exhausted or
   4063                 //   we reach a character that is not a member of the set.
   4064                 int64_t ix = fp->fInputIdx;
   4065                 UTEXT_SETNATIVEINDEX(fInputText, ix);
   4066                 for (;;) {
   4067                     if (ix >= fActiveLimit) {
   4068                         fHitEnd = TRUE;
   4069                         break;
   4070                     }
   4071                     UChar32 c = UTEXT_NEXT32(fInputText);
   4072                     if (c<256) {
   4073                         if (s8->contains(c) == FALSE) {
   4074                             break;
   4075                         }
   4076                     } else {
   4077                         if (s->contains(c) == FALSE) {
   4078                             break;
   4079                         }
   4080                     }
   4081                     ix = UTEXT_GETNATIVEINDEX(fInputText);
   4082                 }
   4083 
   4084                 // If there were no matching characters, skip over the loop altogether.
   4085                 //   The loop doesn't run at all, a * op always succeeds.
   4086                 if (ix == fp->fInputIdx) {
   4087                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4088                     break;
   4089                 }
   4090 
   4091                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4092                 //   must follow.  It's operand is the stack location
   4093                 //   that holds the starting input index for the match of this [set]*
   4094                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4095                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4096                 int32_t stackLoc = URX_VAL(loopcOp);
   4097                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4098                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4099                 fp->fInputIdx = ix;
   4100 
   4101                 // Save State to the URX_LOOP_C op that follows this one,
   4102                 //   so that match failures in the following code will return to there.
   4103                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4104                 fp = StateSave(fp, fp->fPatIdx, status);
   4105                 fp->fPatIdx++;
   4106             }
   4107             break;
   4108 
   4109 
   4110         case URX_LOOP_DOT_I:
   4111             // Loop Initialization for the optimized implementation of .*
   4112             //   This op scans through all remaining input.
   4113             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4114             {
   4115                 // Loop through input until the input is exhausted (we reach an end-of-line)
   4116                 // In DOTALL mode, we can just go straight to the end of the input.
   4117                 int64_t ix;
   4118                 if ((opValue & 1) == 1) {
   4119                     // Dot-matches-All mode.  Jump straight to the end of the string.
   4120                     ix = fActiveLimit;
   4121                     fHitEnd = TRUE;
   4122                 } else {
   4123                     // NOT DOT ALL mode.  Line endings do not match '.'
   4124                     // Scan forward until a line ending or end of input.
   4125                     ix = fp->fInputIdx;
   4126                     UTEXT_SETNATIVEINDEX(fInputText, ix);
   4127                     for (;;) {
   4128                         if (ix >= fActiveLimit) {
   4129                             fHitEnd = TRUE;
   4130                             break;
   4131                         }
   4132                         UChar32 c = UTEXT_NEXT32(fInputText);
   4133                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   4134                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   4135                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   4136                                     (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
   4137                                 //  char is a line ending.  Exit the scanning loop.
   4138                                 break;
   4139                             }
   4140                         }
   4141                         ix = UTEXT_GETNATIVEINDEX(fInputText);
   4142                     }
   4143                 }
   4144 
   4145                 // If there were no matching characters, skip over the loop altogether.
   4146                 //   The loop doesn't run at all, a * op always succeeds.
   4147                 if (ix == fp->fInputIdx) {
   4148                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4149                     break;
   4150                 }
   4151 
   4152                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4153                 //   must follow.  It's operand is the stack location
   4154                 //   that holds the starting input index for the match of this .*
   4155                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4156                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4157                 int32_t stackLoc = URX_VAL(loopcOp);
   4158                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4159                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4160                 fp->fInputIdx = ix;
   4161 
   4162                 // Save State to the URX_LOOP_C op that follows this one,
   4163                 //   so that match failures in the following code will return to there.
   4164                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4165                 fp = StateSave(fp, fp->fPatIdx, status);
   4166                 fp->fPatIdx++;
   4167             }
   4168             break;
   4169 
   4170 
   4171         case URX_LOOP_C:
   4172             {
   4173                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   4174                 backSearchIndex = fp->fExtra[opValue];
   4175                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   4176                 if (backSearchIndex == fp->fInputIdx) {
   4177                     // We've backed up the input idx to the point that the loop started.
   4178                     // The loop is done.  Leave here without saving state.
   4179                     //  Subsequent failures won't come back here.
   4180                     break;
   4181                 }
   4182                 // Set up for the next iteration of the loop, with input index
   4183                 //   backed up by one from the last time through,
   4184                 //   and a state save to this instruction in case the following code fails again.
   4185                 //   (We're going backwards because this loop emulates stack unwinding, not
   4186                 //    the initial scan forward.)
   4187                 U_ASSERT(fp->fInputIdx > 0);
   4188                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4189                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
   4190                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4191 
   4192                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
   4193                 if (prevC == 0x0a &&
   4194                     fp->fInputIdx > backSearchIndex &&
   4195                     twoPrevC == 0x0d) {
   4196                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   4197                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   4198                         // .*, stepping back over CRLF pair.
   4199                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4200                     }
   4201                 }
   4202 
   4203 
   4204                 fp = StateSave(fp, fp->fPatIdx-1, status);
   4205             }
   4206             break;
   4207 
   4208 
   4209 
   4210         default:
   4211             // Trouble.  The compiled pattern contains an entry with an
   4212             //           unrecognized type tag.
   4213             U_ASSERT(FALSE);
   4214         }
   4215 
   4216         if (U_FAILURE(status)) {
   4217             isMatch = FALSE;
   4218             break;
   4219         }
   4220     }
   4221 
   4222 breakFromLoop:
   4223     fMatch = isMatch;
   4224     if (isMatch) {
   4225         fLastMatchEnd = fMatchEnd;
   4226         fMatchStart   = startIdx;
   4227         fMatchEnd     = fp->fInputIdx;
   4228         if (fTraceDebug) {
   4229             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd));
   4230         }
   4231     }
   4232     else
   4233     {
   4234         if (fTraceDebug) {
   4235             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   4236         }
   4237     }
   4238 
   4239     fFrame = fp;                // The active stack frame when the engine stopped.
   4240                                 //   Contains the capture group results that we need to
   4241                                 //    access later.
   4242     return;
   4243 }
   4244 
   4245 
   4246 //--------------------------------------------------------------------------------
   4247 //
   4248 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
   4249 //                  assumption that the entire string is available in the UText's
   4250 //                  chunk buffer. For now, that means we can use int32_t indexes,
   4251 //                  except for anything that needs to be saved (like group starts
   4252 //                  and ends).
   4253 //
   4254 //                  startIdx:    begin matching a this index.
   4255 //                  toEnd:       if true, match must extend to end of the input region
   4256 //
   4257 //--------------------------------------------------------------------------------
   4258 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
   4259     UBool       isMatch  = FALSE;      // True if the we have a match.
   4260 
   4261     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
   4262 
   4263     int32_t     op;                    // Operation from the compiled pattern, split into
   4264     int32_t     opType;                //    the opcode
   4265     int32_t     opValue;               //    and the operand value.
   4266 
   4267 #ifdef REGEX_RUN_DEBUG
   4268     if (fTraceDebug)
   4269     {
   4270         printf("MatchAt(startIdx=%d)\n", startIdx);
   4271         printf("Original Pattern: ");
   4272         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   4273         while (c != U_SENTINEL) {
   4274             if (c<32 || c>256) {
   4275                 c = '.';
   4276             }
   4277             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   4278 
   4279             c = UTEXT_NEXT32(fPattern->fPattern);
   4280         }
   4281         printf("\n");
   4282         printf("Input String: ");
   4283         c = utext_next32From(fInputText, 0);
   4284         while (c != U_SENTINEL) {
   4285             if (c<32 || c>256) {
   4286                 c = '.';
   4287             }
   4288             printf("%c", c);
   4289 
   4290             c = UTEXT_NEXT32(fInputText);
   4291         }
   4292         printf("\n");
   4293         printf("\n");
   4294     }
   4295 #endif
   4296 
   4297     if (U_FAILURE(status)) {
   4298         return;
   4299     }
   4300 
   4301     //  Cache frequently referenced items from the compiled pattern
   4302     //
   4303     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   4304 
   4305     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   4306     UVector             *sets          = fPattern->fSets;
   4307 
   4308     const UChar         *inputBuf      = fInputText->chunkContents;
   4309 
   4310     fFrameSize = fPattern->fFrameSize;
   4311     REStackFrame        *fp            = resetStack();
   4312 
   4313     fp->fPatIdx   = 0;
   4314     fp->fInputIdx = startIdx;
   4315 
   4316     // Zero out the pattern's static data
   4317     int32_t i;
   4318     for (i = 0; i<fPattern->fDataSize; i++) {
   4319         fData[i] = 0;
   4320     }
   4321 
   4322     //
   4323     //  Main loop for interpreting the compiled pattern.
   4324     //  One iteration of the loop per pattern operation performed.
   4325     //
   4326     for (;;) {
   4327 #if 0
   4328         if (_heapchk() != _HEAPOK) {
   4329             fprintf(stderr, "Heap Trouble\n");
   4330         }
   4331 #endif
   4332 
   4333         op      = (int32_t)pat[fp->fPatIdx];
   4334         opType  = URX_TYPE(op);
   4335         opValue = URX_VAL(op);
   4336 #ifdef REGEX_RUN_DEBUG
   4337         if (fTraceDebug) {
   4338             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4339             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
   4340                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   4341             fPattern->dumpOp(fp->fPatIdx);
   4342         }
   4343 #endif
   4344         fp->fPatIdx++;
   4345 
   4346         switch (opType) {
   4347 
   4348 
   4349         case URX_NOP:
   4350             break;
   4351 
   4352 
   4353         case URX_BACKTRACK:
   4354             // Force a backtrack.  In some circumstances, the pattern compiler
   4355             //   will notice that the pattern can't possibly match anything, and will
   4356             //   emit one of these at that point.
   4357             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4358             break;
   4359 
   4360 
   4361         case URX_ONECHAR:
   4362             if (fp->fInputIdx < fActiveLimit) {
   4363                 UChar32 c;
   4364                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4365                 if (c == opValue) {
   4366                     break;
   4367                 }
   4368             } else {
   4369                 fHitEnd = TRUE;
   4370             }
   4371             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4372             break;
   4373 
   4374 
   4375         case URX_STRING:
   4376             {
   4377                 // Test input against a literal string.
   4378                 // Strings require two slots in the compiled pattern, one for the
   4379                 //   offset to the string text, and one for the length.
   4380                 int32_t   stringStartIdx = opValue;
   4381                 int32_t   stringLen;
   4382 
   4383                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   4384                 fp->fPatIdx++;
   4385                 opType    = URX_TYPE(op);
   4386                 stringLen = URX_VAL(op);
   4387                 U_ASSERT(opType == URX_STRING_LEN);
   4388                 U_ASSERT(stringLen >= 2);
   4389 
   4390                 const UChar * pInp = inputBuf + fp->fInputIdx;
   4391                 const UChar * pInpLimit = inputBuf + fActiveLimit;
   4392                 const UChar * pPat = litText+stringStartIdx;
   4393                 const UChar * pEnd = pInp + stringLen;
   4394                 UBool success = TRUE;
   4395                 while (pInp < pEnd) {
   4396                     if (pInp >= pInpLimit) {
   4397                         fHitEnd = TRUE;
   4398                         success = FALSE;
   4399                         break;
   4400                     }
   4401                     if (*pInp++ != *pPat++) {
   4402                         success = FALSE;
   4403                         break;
   4404                     }
   4405                 }
   4406 
   4407                 if (success) {
   4408                     fp->fInputIdx += stringLen;
   4409                 } else {
   4410                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4411                 }
   4412             }
   4413             break;
   4414 
   4415 
   4416         case URX_STATE_SAVE:
   4417             fp = StateSave(fp, opValue, status);
   4418             break;
   4419 
   4420 
   4421         case URX_END:
   4422             // The match loop will exit via this path on a successful match,
   4423             //   when we reach the end of the pattern.
   4424             if (toEnd && fp->fInputIdx != fActiveLimit) {
   4425                 // The pattern matched, but not to the end of input.  Try some more.
   4426                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4427                 break;
   4428             }
   4429             isMatch = TRUE;
   4430             goto  breakFromLoop;
   4431 
   4432             // Start and End Capture stack frame variables are laid out out like this:
   4433             //  fp->fExtra[opValue]  - The start of a completed capture group
   4434             //             opValue+1 - The end   of a completed capture group
   4435             //             opValue+2 - the start of a capture group whose end
   4436             //                          has not yet been reached (and might not ever be).
   4437         case URX_START_CAPTURE:
   4438             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4439             fp->fExtra[opValue+2] = fp->fInputIdx;
   4440             break;
   4441 
   4442 
   4443         case URX_END_CAPTURE:
   4444             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4445             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   4446             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   4447             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   4448             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   4449             break;
   4450 
   4451 
   4452         case URX_DOLLAR:                   //  $, test for End of line
   4453             //     or for position before new line at end of input
   4454             if (fp->fInputIdx < fAnchorLimit-2) {
   4455                 // We are no where near the end of input.  Fail.
   4456                 //   This is the common case.  Keep it first.
   4457                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4458                 break;
   4459             }
   4460             if (fp->fInputIdx >= fAnchorLimit) {
   4461                 // We really are at the end of input.  Success.
   4462                 fHitEnd = TRUE;
   4463                 fRequireEnd = TRUE;
   4464                 break;
   4465             }
   4466 
   4467             // If we are positioned just before a new-line that is located at the
   4468             //   end of input, succeed.
   4469             if (fp->fInputIdx == fAnchorLimit-1) {
   4470                 UChar32 c;
   4471                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
   4472 
   4473                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   4474                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4475                         // At new-line at end of input. Success
   4476                         fHitEnd = TRUE;
   4477                         fRequireEnd = TRUE;
   4478                         break;
   4479                     }
   4480                 }
   4481             } else if (fp->fInputIdx == fAnchorLimit-2 &&
   4482                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
   4483                     fHitEnd = TRUE;
   4484                     fRequireEnd = TRUE;
   4485                     break;                         // At CR/LF at end of input.  Success
   4486             }
   4487 
   4488             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4489 
   4490             break;
   4491 
   4492 
   4493         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   4494             if (fp->fInputIdx >= fAnchorLimit-1) {
   4495                 // Either at the last character of input, or off the end.
   4496                 if (fp->fInputIdx == fAnchorLimit-1) {
   4497                     // At last char of input.  Success if it's a new line.
   4498                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4499                         fHitEnd = TRUE;
   4500                         fRequireEnd = TRUE;
   4501                         break;
   4502                     }
   4503                 } else {
   4504                     // Off the end of input.  Success.
   4505                     fHitEnd = TRUE;
   4506                     fRequireEnd = TRUE;
   4507                     break;
   4508                 }
   4509             }
   4510 
   4511             // Not at end of input.  Back-track out.
   4512             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4513             break;
   4514 
   4515 
   4516         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   4517             {
   4518                 if (fp->fInputIdx >= fAnchorLimit) {
   4519                     // We really are at the end of input.  Success.
   4520                     fHitEnd = TRUE;
   4521                     fRequireEnd = TRUE;
   4522                     break;
   4523                 }
   4524                 // If we are positioned just before a new-line, succeed.
   4525                 // It makes no difference where the new-line is within the input.
   4526                 UChar32 c = inputBuf[fp->fInputIdx];
   4527                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   4528                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   4529                     //  In multi-line mode, hitting a new-line just before the end of input does not
   4530                     //   set the hitEnd or requireEnd flags
   4531                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4532                         break;
   4533                     }
   4534                 }
   4535                 // not at a new line.  Fail.
   4536                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4537             }
   4538             break;
   4539 
   4540 
   4541         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   4542             {
   4543                 if (fp->fInputIdx >= fAnchorLimit) {
   4544                     // We really are at the end of input.  Success.
   4545                     fHitEnd = TRUE;
   4546                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   4547                     break;               //   adding a new-line would not lose the match.
   4548                 }
   4549                 // If we are not positioned just before a new-line, the test fails; backtrack out.
   4550                 // It makes no difference where the new-line is within the input.
   4551                 if (inputBuf[fp->fInputIdx] != 0x0a) {
   4552                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4553                 }
   4554             }
   4555             break;
   4556 
   4557 
   4558         case URX_CARET:                    //  ^, test for start of line
   4559             if (fp->fInputIdx != fAnchorStart) {
   4560                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4561             }
   4562             break;
   4563 
   4564 
   4565         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   4566             {
   4567                 if (fp->fInputIdx == fAnchorStart) {
   4568                     // We are at the start input.  Success.
   4569                     break;
   4570                 }
   4571                 // Check whether character just before the current pos is a new-line
   4572                 //   unless we are at the end of input
   4573                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4574                 if ((fp->fInputIdx < fAnchorLimit) &&
   4575                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   4576                     //  It's a new-line.  ^ is true.  Success.
   4577                     //  TODO:  what should be done with positions between a CR and LF?
   4578                     break;
   4579                 }
   4580                 // Not at the start of a line.  Fail.
   4581                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4582             }
   4583             break;
   4584 
   4585 
   4586         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   4587             {
   4588                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
   4589                 if (fp->fInputIdx <= fAnchorStart) {
   4590                     // We are at the start input.  Success.
   4591                     break;
   4592                 }
   4593                 // Check whether character just before the current pos is a new-line
   4594                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   4595                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4596                 if (c != 0x0a) {
   4597                     // Not at the start of a line.  Back-track out.
   4598                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4599                 }
   4600             }
   4601             break;
   4602 
   4603         case URX_BACKSLASH_B:          // Test for word boundaries
   4604             {
   4605                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
   4606                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   4607                 if (!success) {
   4608                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4609                 }
   4610             }
   4611             break;
   4612 
   4613 
   4614         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   4615             {
   4616                 UBool success = isUWordBoundary(fp->fInputIdx);
   4617                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   4618                 if (!success) {
   4619                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4620                 }
   4621             }
   4622             break;
   4623 
   4624 
   4625         case URX_BACKSLASH_D:            // Test for decimal digit
   4626             {
   4627                 if (fp->fInputIdx >= fActiveLimit) {
   4628                     fHitEnd = TRUE;
   4629                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4630                     break;
   4631                 }
   4632 
   4633                 UChar32 c;
   4634                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4635                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   4636                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   4637                 success ^= (UBool)(opValue != 0);        // flip sense for \D
   4638                 if (!success) {
   4639                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4640                 }
   4641             }
   4642             break;
   4643 
   4644 
   4645         case URX_BACKSLASH_G:          // Test for position at end of previous match
   4646             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   4647                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4648             }
   4649             break;
   4650 
   4651 
   4652         case URX_BACKSLASH_X:
   4653         //  Match a Grapheme, as defined by Unicode TR 29.
   4654         //  Differs slightly from Perl, which consumes combining marks independently
   4655         //    of context.
   4656         {
   4657 
   4658             // Fail if at end of input
   4659             if (fp->fInputIdx >= fActiveLimit) {
   4660                 fHitEnd = TRUE;
   4661                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4662                 break;
   4663             }
   4664 
   4665             // Examine (and consume) the current char.
   4666             //   Dispatch into a little state machine, based on the char.
   4667             UChar32  c;
   4668             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4669             UnicodeSet **sets = fPattern->fStaticSets;
   4670             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   4671             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   4672             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4673             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4674             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4675             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4676             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4677             goto GC_Extend;
   4678 
   4679 
   4680 
   4681 GC_L:
   4682             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4683             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4684             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4685             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4686             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4687             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4688             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4689             goto GC_Extend;
   4690 
   4691 GC_V:
   4692             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4693             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4694             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4695             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4696             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4697             goto GC_Extend;
   4698 
   4699 GC_T:
   4700             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4701             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4702             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4703             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4704             goto GC_Extend;
   4705 
   4706 GC_Extend:
   4707             // Combining characters are consumed here
   4708             for (;;) {
   4709                 if (fp->fInputIdx >= fActiveLimit) {
   4710                     break;
   4711                 }
   4712                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4713                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   4714                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   4715                     break;
   4716                 }
   4717             }
   4718             goto GC_Done;
   4719 
   4720 GC_Control:
   4721             // Most control chars stand alone (don't combine with combining chars),
   4722             //   except for that CR/LF sequence is a single grapheme cluster.
   4723             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
   4724                 fp->fInputIdx++;
   4725             }
   4726 
   4727 GC_Done:
   4728             if (fp->fInputIdx >= fActiveLimit) {
   4729                 fHitEnd = TRUE;
   4730             }
   4731             break;
   4732         }
   4733 
   4734 
   4735 
   4736 
   4737         case URX_BACKSLASH_Z:          // Test for end of Input
   4738             if (fp->fInputIdx < fAnchorLimit) {
   4739                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4740             } else {
   4741                 fHitEnd = TRUE;
   4742                 fRequireEnd = TRUE;
   4743             }
   4744             break;
   4745 
   4746 
   4747 
   4748         case URX_STATIC_SETREF:
   4749             {
   4750                 // Test input character against one of the predefined sets
   4751                 //    (Word Characters, for example)
   4752                 // The high bit of the op value is a flag for the match polarity.
   4753                 //    0:   success if input char is in set.
   4754                 //    1:   success if input char is not in set.
   4755                 if (fp->fInputIdx >= fActiveLimit) {
   4756                     fHitEnd = TRUE;
   4757                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4758                     break;
   4759                 }
   4760 
   4761                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   4762                 opValue &= ~URX_NEG_SET;
   4763                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4764 
   4765                 UChar32 c;
   4766                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4767                 if (c < 256) {
   4768                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4769                     if (s8->contains(c)) {
   4770                         success = !success;
   4771                     }
   4772                 } else {
   4773                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4774                     if (s->contains(c)) {
   4775                         success = !success;
   4776                     }
   4777                 }
   4778                 if (!success) {
   4779                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4780                 }
   4781             }
   4782             break;
   4783 
   4784 
   4785         case URX_STAT_SETREF_N:
   4786             {
   4787                 // Test input character for NOT being a member of  one of
   4788                 //    the predefined sets (Word Characters, for example)
   4789                 if (fp->fInputIdx >= fActiveLimit) {
   4790                     fHitEnd = TRUE;
   4791                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4792                     break;
   4793                 }
   4794 
   4795                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4796 
   4797                 UChar32  c;
   4798                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4799                 if (c < 256) {
   4800                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4801                     if (s8->contains(c) == FALSE) {
   4802                         break;
   4803                     }
   4804                 } else {
   4805                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4806                     if (s->contains(c) == FALSE) {
   4807                         break;
   4808                     }
   4809                 }
   4810                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4811             }
   4812             break;
   4813 
   4814 
   4815         case URX_SETREF:
   4816             {
   4817                 if (fp->fInputIdx >= fActiveLimit) {
   4818                     fHitEnd = TRUE;
   4819                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4820                     break;
   4821                 }
   4822 
   4823                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4824 
   4825                 // There is input left.  Pick up one char and test it for set membership.
   4826                 UChar32  c;
   4827                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4828                 if (c<256) {
   4829                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4830                     if (s8->contains(c)) {
   4831                         // The character is in the set.  A Match.
   4832                         break;
   4833                     }
   4834                 } else {
   4835                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   4836                     if (s->contains(c)) {
   4837                         // The character is in the set.  A Match.
   4838                         break;
   4839                     }
   4840                 }
   4841 
   4842                 // the character wasn't in the set.
   4843                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4844             }
   4845             break;
   4846 
   4847 
   4848         case URX_DOTANY:
   4849             {
   4850                 // . matches anything, but stops at end-of-line.
   4851                 if (fp->fInputIdx >= fActiveLimit) {
   4852                     // At end of input.  Match failed.  Backtrack out.
   4853                     fHitEnd = TRUE;
   4854                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4855                     break;
   4856                 }
   4857 
   4858                 // There is input left.  Advance over one char, unless we've hit end-of-line
   4859                 UChar32  c;
   4860                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4861                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   4862                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   4863                     // End of line in normal mode.   . does not match.
   4864                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4865                     break;
   4866                 }
   4867             }
   4868             break;
   4869 
   4870 
   4871         case URX_DOTANY_ALL:
   4872             {
   4873                 // . in dot-matches-all (including new lines) mode
   4874                 if (fp->fInputIdx >= fActiveLimit) {
   4875                     // At end of input.  Match failed.  Backtrack out.
   4876                     fHitEnd = TRUE;
   4877                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4878                     break;
   4879                 }
   4880 
   4881                 // There is input left.  Advance over one char, except if we are
   4882                 //   at a cr/lf, advance over both of them.
   4883                 UChar32 c;
   4884                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4885                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   4886                     // In the case of a CR/LF, we need to advance over both.
   4887                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4888                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
   4889                     }
   4890                 }
   4891             }
   4892             break;
   4893 
   4894 
   4895         case URX_DOTANY_UNIX:
   4896             {
   4897                 // '.' operator, matches all, but stops at end-of-line.
   4898                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   4899                 if (fp->fInputIdx >= fActiveLimit) {
   4900                     // At end of input.  Match failed.  Backtrack out.
   4901                     fHitEnd = TRUE;
   4902                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4903                     break;
   4904                 }
   4905 
   4906                 // There is input left.  Advance over one char, unless we've hit end-of-line
   4907                 UChar32 c;
   4908                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4909                 if (c == 0x0a) {
   4910                     // End of line in normal mode.   '.' does not match the \n
   4911                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4912                 }
   4913             }
   4914             break;
   4915 
   4916 
   4917         case URX_JMP:
   4918             fp->fPatIdx = opValue;
   4919             break;
   4920 
   4921         case URX_FAIL:
   4922             isMatch = FALSE;
   4923             goto breakFromLoop;
   4924 
   4925         case URX_JMP_SAV:
   4926             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   4927             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   4928             fp->fPatIdx = opValue;                         // Then JMP.
   4929             break;
   4930 
   4931         case URX_JMP_SAV_X:
   4932             // This opcode is used with (x)+, when x can match a zero length string.
   4933             // Same as JMP_SAV, except conditional on the match having made forward progress.
   4934             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   4935             //   data address of the input position at the start of the loop.
   4936             {
   4937                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   4938                 int32_t  stoOp = (int32_t)pat[opValue-1];
   4939                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   4940                 int32_t  frameLoc = URX_VAL(stoOp);
   4941                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   4942                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
   4943                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   4944                 if (prevInputIdx < fp->fInputIdx) {
   4945                     // The match did make progress.  Repeat the loop.
   4946                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   4947                     fp->fPatIdx = opValue;
   4948                     fp->fExtra[frameLoc] = fp->fInputIdx;
   4949                 }
   4950                 // If the input position did not advance, we do nothing here,
   4951                 //   execution will fall out of the loop.
   4952             }
   4953             break;
   4954 
   4955         case URX_CTR_INIT:
   4956             {
   4957                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   4958                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   4959 
   4960                 // Pick up the three extra operands that CTR_INIT has, and
   4961                 //    skip the pattern location counter past
   4962                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   4963                 fp->fPatIdx += 3;
   4964                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   4965                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   4966                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   4967                 U_ASSERT(minCount>=0);
   4968                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   4969                 U_ASSERT(loopLoc>=fp->fPatIdx);
   4970 
   4971                 if (minCount == 0) {
   4972                     fp = StateSave(fp, loopLoc+1, status);
   4973                 }
   4974                 if (maxCount == -1) {
   4975                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
   4976                 } else if (maxCount == 0) {
   4977                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4978                 }
   4979             }
   4980             break;
   4981 
   4982         case URX_CTR_LOOP:
   4983             {
   4984                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   4985                 int32_t initOp = (int32_t)pat[opValue];
   4986                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   4987                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   4988                 int32_t minCount  = (int32_t)pat[opValue+2];
   4989                 int32_t maxCount  = (int32_t)pat[opValue+3];
   4990                 (*pCounter)++;
   4991                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   4992                     U_ASSERT(*pCounter == maxCount);
   4993                     break;
   4994                 }
   4995                 if (*pCounter >= minCount) {
   4996                     if (maxCount == -1) {
   4997                         // Loop has no hard upper bound.
   4998                         // Check that it is progressing through the input, break if it is not.
   4999                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   5000                         if (fp->fInputIdx == *pLastInputIdx) {
   5001                             break;
   5002                         } else {
   5003                             *pLastInputIdx = fp->fInputIdx;
   5004                         }
   5005                     }
   5006                     fp = StateSave(fp, fp->fPatIdx, status);
   5007                 }
   5008                 fp->fPatIdx = opValue + 4;    // Loop back.
   5009             }
   5010             break;
   5011 
   5012         case URX_CTR_INIT_NG:
   5013             {
   5014                 // Initialize a non-greedy loop
   5015                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5016                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   5017 
   5018                 // Pick up the three extra operands that CTR_INIT_NG has, and
   5019                 //    skip the pattern location counter past
   5020                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5021                 fp->fPatIdx += 3;
   5022                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5023                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5024                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5025                 U_ASSERT(minCount>=0);
   5026                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5027                 U_ASSERT(loopLoc>fp->fPatIdx);
   5028                 if (maxCount == -1) {
   5029                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
   5030                 }
   5031 
   5032                 if (minCount == 0) {
   5033                     if (maxCount != 0) {
   5034                         fp = StateSave(fp, fp->fPatIdx, status);
   5035                     }
   5036                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   5037                 }
   5038             }
   5039             break;
   5040 
   5041         case URX_CTR_LOOP_NG:
   5042             {
   5043                 // Non-greedy {min, max} loops
   5044                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5045                 int32_t initOp = (int32_t)pat[opValue];
   5046                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   5047                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5048                 int32_t minCount  = (int32_t)pat[opValue+2];
   5049                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5050 
   5051                 (*pCounter)++;
   5052                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   5053                     // The loop has matched the maximum permitted number of times.
   5054                     //   Break out of here with no action.  Matching will
   5055                     //   continue with the following pattern.
   5056                     U_ASSERT(*pCounter == maxCount);
   5057                     break;
   5058                 }
   5059 
   5060                 if (*pCounter < minCount) {
   5061                     // We haven't met the minimum number of matches yet.
   5062                     //   Loop back for another one.
   5063                     fp->fPatIdx = opValue + 4;    // Loop back.
   5064                 } else {
   5065                     // We do have the minimum number of matches.
   5066 
   5067                     // If there is no upper bound on the loop iterations, check that the input index
   5068                     // is progressing, and stop the loop if it is not.
   5069                     if (maxCount == -1) {
   5070                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   5071                         if (fp->fInputIdx == *pLastInputIdx) {
   5072                             break;
   5073                         }
   5074                         *pLastInputIdx = fp->fInputIdx;
   5075                     }
   5076 
   5077                     // Loop Continuation: we will fall into the pattern following the loop
   5078                     //   (non-greedy, don't execute loop body first), but first do
   5079                     //   a state save to the top of the loop, so that a match failure
   5080                     //   in the following pattern will try another iteration of the loop.
   5081                     fp = StateSave(fp, opValue + 4, status);
   5082                 }
   5083             }
   5084             break;
   5085 
   5086         case URX_STO_SP:
   5087             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5088             fData[opValue] = fStack->size();
   5089             break;
   5090 
   5091         case URX_LD_SP:
   5092             {
   5093                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5094                 int32_t newStackSize = (int32_t)fData[opValue];
   5095                 U_ASSERT(newStackSize <= fStack->size());
   5096                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5097                 if (newFP == (int64_t *)fp) {
   5098                     break;
   5099                 }
   5100                 int32_t i;
   5101                 for (i=0; i<fFrameSize; i++) {
   5102                     newFP[i] = ((int64_t *)fp)[i];
   5103                 }
   5104                 fp = (REStackFrame *)newFP;
   5105                 fStack->setSize(newStackSize);
   5106             }
   5107             break;
   5108 
   5109         case URX_BACKREF:
   5110             {
   5111                 U_ASSERT(opValue < fFrameSize);
   5112                 int64_t groupStartIdx = fp->fExtra[opValue];
   5113                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5114                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5115                 int64_t inputIndex = fp->fInputIdx;
   5116                 if (groupStartIdx < 0) {
   5117                     // This capture group has not participated in the match thus far,
   5118                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5119                     break;
   5120                 }
   5121                 UBool success = TRUE;
   5122                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
   5123                     if (inputIndex >= fActiveLimit) {
   5124                         success = FALSE;
   5125                         fHitEnd = TRUE;
   5126                         break;
   5127                     }
   5128                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
   5129                         success = FALSE;
   5130                         break;
   5131                     }
   5132                 }
   5133                 if (success) {
   5134                     fp->fInputIdx = inputIndex;
   5135                 } else {
   5136                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5137                 }
   5138             }
   5139             break;
   5140 
   5141         case URX_BACKREF_I:
   5142             {
   5143                 U_ASSERT(opValue < fFrameSize);
   5144                 int64_t groupStartIdx = fp->fExtra[opValue];
   5145                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5146                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5147                 if (groupStartIdx < 0) {
   5148                     // This capture group has not participated in the match thus far,
   5149                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5150                     break;
   5151                 }
   5152                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
   5153                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
   5154 
   5155                 //   Note: if the capture group match was of an empty string the backref
   5156                 //         match succeeds.  Verified by testing:  Perl matches succeed
   5157                 //         in this case, so we do too.
   5158 
   5159                 UBool success = TRUE;
   5160                 for (;;) {
   5161                     UChar32 captureGroupChar = captureGroupItr.next();
   5162                     if (captureGroupChar == U_SENTINEL) {
   5163                         success = TRUE;
   5164                         break;
   5165                     }
   5166                     UChar32 inputChar = inputItr.next();
   5167                     if (inputChar == U_SENTINEL) {
   5168                         success = FALSE;
   5169                         fHitEnd = TRUE;
   5170                         break;
   5171                     }
   5172                     if (inputChar != captureGroupChar) {
   5173                         success = FALSE;
   5174                         break;
   5175                     }
   5176                 }
   5177 
   5178                 if (success && inputItr.inExpansion()) {
   5179                     // We otained a match by consuming part of a string obtained from
   5180                     // case-folding a single code point of the input text.
   5181                     // This does not count as an overall match.
   5182                     success = FALSE;
   5183                 }
   5184 
   5185                 if (success) {
   5186                     fp->fInputIdx = inputItr.getIndex();
   5187                 } else {
   5188                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5189                 }
   5190             }
   5191             break;
   5192 
   5193         case URX_STO_INP_LOC:
   5194             {
   5195                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   5196                 fp->fExtra[opValue] = fp->fInputIdx;
   5197             }
   5198             break;
   5199 
   5200         case URX_JMPX:
   5201             {
   5202                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5203                 fp->fPatIdx += 1;
   5204                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   5205                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   5206                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
   5207                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   5208                 if (savedInputIdx < fp->fInputIdx) {
   5209                     fp->fPatIdx = opValue;                               // JMP
   5210                 } else {
   5211                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   5212                 }
   5213             }
   5214             break;
   5215 
   5216         case URX_LA_START:
   5217             {
   5218                 // Entering a lookahead block.
   5219                 // Save Stack Ptr, Input Pos.
   5220                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5221                 fData[opValue]   = fStack->size();
   5222                 fData[opValue+1] = fp->fInputIdx;
   5223                 fActiveStart     = fLookStart;          // Set the match region change for
   5224                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   5225             }
   5226             break;
   5227 
   5228         case URX_LA_END:
   5229             {
   5230                 // Leaving a look-ahead block.
   5231                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   5232                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5233                 int32_t stackSize = fStack->size();
   5234                 int32_t newStackSize = (int32_t)fData[opValue];
   5235                 U_ASSERT(stackSize >= newStackSize);
   5236                 if (stackSize > newStackSize) {
   5237                     // Copy the current top frame back to the new (cut back) top frame.
   5238                     //   This makes the capture groups from within the look-ahead
   5239                     //   expression available.
   5240                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5241                     int32_t i;
   5242                     for (i=0; i<fFrameSize; i++) {
   5243                         newFP[i] = ((int64_t *)fp)[i];
   5244                     }
   5245                     fp = (REStackFrame *)newFP;
   5246                     fStack->setSize(newStackSize);
   5247                 }
   5248                 fp->fInputIdx = fData[opValue+1];
   5249 
   5250                 // Restore the active region bounds in the input string; they may have
   5251                 //    been changed because of transparent bounds on a Region.
   5252                 fActiveStart = fRegionStart;
   5253                 fActiveLimit = fRegionLimit;
   5254             }
   5255             break;
   5256 
   5257         case URX_ONECHAR_I:
   5258             if (fp->fInputIdx < fActiveLimit) {
   5259                 UChar32 c;
   5260                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5261                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5262                     break;
   5263                 }
   5264             } else {
   5265                 fHitEnd = TRUE;
   5266             }
   5267             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5268             break;
   5269 
   5270         case URX_STRING_I:
   5271             // Case-insensitive test input against a literal string.
   5272             // Strings require two slots in the compiled pattern, one for the
   5273             //   offset to the string text, and one for the length.
   5274             //   The compiled string has already been case folded.
   5275             {
   5276                 const UChar *patternString = litText + opValue;
   5277 
   5278                 op      = (int32_t)pat[fp->fPatIdx];
   5279                 fp->fPatIdx++;
   5280                 opType  = URX_TYPE(op);
   5281                 opValue = URX_VAL(op);
   5282                 U_ASSERT(opType == URX_STRING_LEN);
   5283                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
   5284 
   5285                 UChar32      cText;
   5286                 UChar32      cPattern;
   5287                 UBool        success = TRUE;
   5288                 int32_t      patternStringIdx  = 0;
   5289                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
   5290                 while (patternStringIdx < patternStringLen) {
   5291                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
   5292                     cText = inputIterator.next();
   5293                     if (cText != cPattern) {
   5294                         success = FALSE;
   5295                         if (cText == U_SENTINEL) {
   5296                             fHitEnd = TRUE;
   5297                         }
   5298                         break;
   5299                     }
   5300                 }
   5301                 if (inputIterator.inExpansion()) {
   5302                     success = FALSE;
   5303                 }
   5304 
   5305                 if (success) {
   5306                     fp->fInputIdx = inputIterator.getIndex();
   5307                 } else {
   5308                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5309                 }
   5310             }
   5311             break;
   5312 
   5313         case URX_LB_START:
   5314             {
   5315                 // Entering a look-behind block.
   5316                 // Save Stack Ptr, Input Pos.
   5317                 //   TODO:  implement transparent bounds.  Ticket #6067
   5318                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5319                 fData[opValue]   = fStack->size();
   5320                 fData[opValue+1] = fp->fInputIdx;
   5321                 // Init the variable containing the start index for attempted matches.
   5322                 fData[opValue+2] = -1;
   5323                 // Save input string length, then reset to pin any matches to end at
   5324                 //   the current position.
   5325                 fData[opValue+3] = fActiveLimit;
   5326                 fActiveLimit     = fp->fInputIdx;
   5327             }
   5328             break;
   5329 
   5330 
   5331         case URX_LB_CONT:
   5332             {
   5333                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   5334                 //    at all possible input starting positions.
   5335 
   5336                 // Fetch the min and max possible match lengths.  They are the operands
   5337                 //   of this op in the pattern.
   5338                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   5339                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   5340                 U_ASSERT(minML <= maxML);
   5341                 U_ASSERT(minML >= 0);
   5342 
   5343                 // Fetch (from data) the last input index where a match was attempted.
   5344                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5345                 int64_t  *lbStartIdx = &fData[opValue+2];
   5346                 if (*lbStartIdx < 0) {
   5347                     // First time through loop.
   5348                     *lbStartIdx = fp->fInputIdx - minML;
   5349                 } else {
   5350                     // 2nd through nth time through the loop.
   5351                     // Back up start position for match by one.
   5352                     if (*lbStartIdx == 0) {
   5353                         (*lbStartIdx)--;
   5354                     } else {
   5355                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5356                     }
   5357                 }
   5358 
   5359                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5360                     // We have tried all potential match starting points without
   5361                     //  getting a match.  Backtrack out, and out of the
   5362                     //   Look Behind altogether.
   5363                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5364                     int64_t restoreInputLen = fData[opValue+3];
   5365                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5366                     U_ASSERT(restoreInputLen <= fInputLength);
   5367                     fActiveLimit = restoreInputLen;
   5368                     break;
   5369                 }
   5370 
   5371                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5372                 //      (successful match will fall off the end of the loop.)
   5373                 fp = StateSave(fp, fp->fPatIdx-3, status);
   5374                 fp->fInputIdx =  *lbStartIdx;
   5375             }
   5376             break;
   5377 
   5378         case URX_LB_END:
   5379             // End of a look-behind block, after a successful match.
   5380             {
   5381                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5382                 if (fp->fInputIdx != fActiveLimit) {
   5383                     //  The look-behind expression matched, but the match did not
   5384                     //    extend all the way to the point that we are looking behind from.
   5385                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5386                     //     will retry the match starting at another position or fail
   5387                     //     the look-behind altogether, whichever is appropriate.
   5388                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5389                     break;
   5390                 }
   5391 
   5392                 // Look-behind match is good.  Restore the orignal input string length,
   5393                 //   which had been truncated to pin the end of the lookbehind match to the
   5394                 //   position being looked-behind.
   5395                 int64_t originalInputLen = fData[opValue+3];
   5396                 U_ASSERT(originalInputLen >= fActiveLimit);
   5397                 U_ASSERT(originalInputLen <= fInputLength);
   5398                 fActiveLimit = originalInputLen;
   5399             }
   5400             break;
   5401 
   5402 
   5403         case URX_LBN_CONT:
   5404             {
   5405                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   5406                 //    at all possible input starting positions.
   5407 
   5408                 // Fetch the extra parameters of this op.
   5409                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   5410                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   5411                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   5412                 continueLoc = URX_VAL(continueLoc);
   5413                 U_ASSERT(minML <= maxML);
   5414                 U_ASSERT(minML >= 0);
   5415                 U_ASSERT(continueLoc > fp->fPatIdx);
   5416 
   5417                 // Fetch (from data) the last input index where a match was attempted.
   5418                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5419                 int64_t  *lbStartIdx = &fData[opValue+2];
   5420                 if (*lbStartIdx < 0) {
   5421                     // First time through loop.
   5422                     *lbStartIdx = fp->fInputIdx - minML;
   5423                 } else {
   5424                     // 2nd through nth time through the loop.
   5425                     // Back up start position for match by one.
   5426                     if (*lbStartIdx == 0) {
   5427                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
   5428                     } else {
   5429                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5430                     }
   5431                 }
   5432 
   5433                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5434                     // We have tried all potential match starting points without
   5435                     //  getting a match, which means that the negative lookbehind as
   5436                     //  a whole has succeeded.  Jump forward to the continue location
   5437                     int64_t restoreInputLen = fData[opValue+3];
   5438                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5439                     U_ASSERT(restoreInputLen <= fInputLength);
   5440                     fActiveLimit = restoreInputLen;
   5441                     fp->fPatIdx = continueLoc;
   5442                     break;
   5443                 }
   5444 
   5445                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5446                 //      (successful match will cause a FAIL out of the loop altogether.)
   5447                 fp = StateSave(fp, fp->fPatIdx-4, status);
   5448                 fp->fInputIdx =  *lbStartIdx;
   5449             }
   5450             break;
   5451 
   5452         case URX_LBN_END:
   5453             // End of a negative look-behind block, after a successful match.
   5454             {
   5455                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5456                 if (fp->fInputIdx != fActiveLimit) {
   5457                     //  The look-behind expression matched, but the match did not
   5458                     //    extend all the way to the point that we are looking behind from.
   5459                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5460                     //     will retry the match starting at another position or succeed
   5461                     //     the look-behind altogether, whichever is appropriate.
   5462                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5463                     break;
   5464                 }
   5465 
   5466                 // Look-behind expression matched, which means look-behind test as
   5467                 //   a whole Fails
   5468 
   5469                 //   Restore the orignal input string length, which had been truncated
   5470                 //   inorder to pin the end of the lookbehind match
   5471                 //   to the position being looked-behind.
   5472                 int64_t originalInputLen = fData[opValue+3];
   5473                 U_ASSERT(originalInputLen >= fActiveLimit);
   5474                 U_ASSERT(originalInputLen <= fInputLength);
   5475                 fActiveLimit = originalInputLen;
   5476 
   5477                 // Restore original stack position, discarding any state saved
   5478                 //   by the successful pattern match.
   5479                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5480                 int32_t newStackSize = (int32_t)fData[opValue];
   5481                 U_ASSERT(fStack->size() > newStackSize);
   5482                 fStack->setSize(newStackSize);
   5483 
   5484                 //  FAIL, which will take control back to someplace
   5485                 //  prior to entering the look-behind test.
   5486                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5487             }
   5488             break;
   5489 
   5490 
   5491         case URX_LOOP_SR_I:
   5492             // Loop Initialization for the optimized implementation of
   5493             //     [some character set]*
   5494             //   This op scans through all matching input.
   5495             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5496             {
   5497                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5498                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5499                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   5500 
   5501                 // Loop through input, until either the input is exhausted or
   5502                 //   we reach a character that is not a member of the set.
   5503                 int32_t ix = (int32_t)fp->fInputIdx;
   5504                 for (;;) {
   5505                     if (ix >= fActiveLimit) {
   5506                         fHitEnd = TRUE;
   5507                         break;
   5508                     }
   5509                     UChar32   c;
   5510                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
   5511                     if (c<256) {
   5512                         if (s8->contains(c) == FALSE) {
   5513                             U16_BACK_1(inputBuf, 0, ix);
   5514                             break;
   5515                         }
   5516                     } else {
   5517                         if (s->contains(c) == FALSE) {
   5518                             U16_BACK_1(inputBuf, 0, ix);
   5519                             break;
   5520                         }
   5521                     }
   5522                 }
   5523 
   5524                 // If there were no matching characters, skip over the loop altogether.
   5525                 //   The loop doesn't run at all, a * op always succeeds.
   5526                 if (ix == fp->fInputIdx) {
   5527                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5528                     break;
   5529                 }
   5530 
   5531                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5532                 //   must follow.  It's operand is the stack location
   5533                 //   that holds the starting input index for the match of this [set]*
   5534                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5535                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5536                 int32_t stackLoc = URX_VAL(loopcOp);
   5537                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5538                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5539                 fp->fInputIdx = ix;
   5540 
   5541                 // Save State to the URX_LOOP_C op that follows this one,
   5542                 //   so that match failures in the following code will return to there.
   5543                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5544                 fp = StateSave(fp, fp->fPatIdx, status);
   5545                 fp->fPatIdx++;
   5546             }
   5547             break;
   5548 
   5549 
   5550         case URX_LOOP_DOT_I:
   5551             // Loop Initialization for the optimized implementation of .*
   5552             //   This op scans through all remaining input.
   5553             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5554             {
   5555                 // Loop through input until the input is exhausted (we reach an end-of-line)
   5556                 // In DOTALL mode, we can just go straight to the end of the input.
   5557                 int32_t ix;
   5558                 if ((opValue & 1) == 1) {
   5559                     // Dot-matches-All mode.  Jump straight to the end of the string.
   5560                     ix = (int32_t)fActiveLimit;
   5561                     fHitEnd = TRUE;
   5562                 } else {
   5563                     // NOT DOT ALL mode.  Line endings do not match '.'
   5564                     // Scan forward until a line ending or end of input.
   5565                     ix = (int32_t)fp->fInputIdx;
   5566                     for (;;) {
   5567                         if (ix >= fActiveLimit) {
   5568                             fHitEnd = TRUE;
   5569                             break;
   5570                         }
   5571                         UChar32   c;
   5572                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
   5573                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   5574                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   5575                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   5576                                    ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
   5577                                 //  char is a line ending.  Put the input pos back to the
   5578                                 //    line ending char, and exit the scanning loop.
   5579                                 U16_BACK_1(inputBuf, 0, ix);
   5580                                 break;
   5581                             }
   5582                         }
   5583                     }
   5584                 }
   5585 
   5586                 // If there were no matching characters, skip over the loop altogether.
   5587                 //   The loop doesn't run at all, a * op always succeeds.
   5588                 if (ix == fp->fInputIdx) {
   5589                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5590                     break;
   5591                 }
   5592 
   5593                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5594                 //   must follow.  It's operand is the stack location
   5595                 //   that holds the starting input index for the match of this .*
   5596                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5597                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5598                 int32_t stackLoc = URX_VAL(loopcOp);
   5599                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5600                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5601                 fp->fInputIdx = ix;
   5602 
   5603                 // Save State to the URX_LOOP_C op that follows this one,
   5604                 //   so that match failures in the following code will return to there.
   5605                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5606                 fp = StateSave(fp, fp->fPatIdx, status);
   5607                 fp->fPatIdx++;
   5608             }
   5609             break;
   5610 
   5611 
   5612         case URX_LOOP_C:
   5613             {
   5614                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   5615                 backSearchIndex = (int32_t)fp->fExtra[opValue];
   5616                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   5617                 if (backSearchIndex == fp->fInputIdx) {
   5618                     // We've backed up the input idx to the point that the loop started.
   5619                     // The loop is done.  Leave here without saving state.
   5620                     //  Subsequent failures won't come back here.
   5621                     break;
   5622                 }
   5623                 // Set up for the next iteration of the loop, with input index
   5624                 //   backed up by one from the last time through,
   5625                 //   and a state save to this instruction in case the following code fails again.
   5626                 //   (We're going backwards because this loop emulates stack unwinding, not
   5627                 //    the initial scan forward.)
   5628                 U_ASSERT(fp->fInputIdx > 0);
   5629                 UChar32 prevC;
   5630                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
   5631 
   5632                 if (prevC == 0x0a &&
   5633                     fp->fInputIdx > backSearchIndex &&
   5634                     inputBuf[fp->fInputIdx-1] == 0x0d) {
   5635                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   5636                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   5637                         // .*, stepping back over CRLF pair.
   5638                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   5639                     }
   5640                 }
   5641 
   5642 
   5643                 fp = StateSave(fp, fp->fPatIdx-1, status);
   5644             }
   5645             break;
   5646 
   5647 
   5648 
   5649         default:
   5650             // Trouble.  The compiled pattern contains an entry with an
   5651             //           unrecognized type tag.
   5652             U_ASSERT(FALSE);
   5653         }
   5654 
   5655         if (U_FAILURE(status)) {
   5656             isMatch = FALSE;
   5657             break;
   5658         }
   5659     }
   5660 
   5661 breakFromLoop:
   5662     fMatch = isMatch;
   5663     if (isMatch) {
   5664         fLastMatchEnd = fMatchEnd;
   5665         fMatchStart   = startIdx;
   5666         fMatchEnd     = fp->fInputIdx;
   5667         if (fTraceDebug) {
   5668             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd));
   5669         }
   5670     }
   5671     else
   5672     {
   5673         if (fTraceDebug) {
   5674             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   5675         }
   5676     }
   5677 
   5678     fFrame = fp;                // The active stack frame when the engine stopped.
   5679     //   Contains the capture group results that we need to
   5680     //    access later.
   5681 
   5682     return;
   5683 }
   5684 
   5685 
   5686 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
   5687 
   5688 U_NAMESPACE_END
   5689 
   5690 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   5691