Home | History | Annotate | Download | only in i18n
      1 /*
      2 **************************************************************************
      3 *   Copyright (C) 2002-2012 International Business Machines Corporation  *
      4 *   and others. All rights reserved.                                     *
      5 **************************************************************************
      6 */
      7 //
      8 //  file:  rematch.cpp
      9 //
     10 //         Contains the implementation of class RegexMatcher,
     11 //         which is one of the main API classes for the ICU regular expression package.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     16 
     17 #include "unicode/regex.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/rbbi.h"
     22 #include "unicode/utf.h"
     23 #include "unicode/utf16.h"
     24 #include "uassert.h"
     25 #include "cmemory.h"
     26 #include "uvector.h"
     27 #include "uvectr32.h"
     28 #include "uvectr64.h"
     29 #include "regeximp.h"
     30 #include "regexst.h"
     31 #include "regextxt.h"
     32 #include "ucase.h"
     33 
     34 // #include <malloc.h>        // Needed for heapcheck testing
     35 
     36 
     37 // Find progress callback
     38 // ----------------------
     39 // Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
     40 //
     41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
     42     (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
     43 
     44 
     45 // Smart Backtracking
     46 // ------------------
     47 // When a failure would go back to a LOOP_C instruction,
     48 // strings, characters, and setrefs scan backwards for a valid start
     49 // character themselves, pop the stack, and save state, emulating the
     50 // LOOP_C's effect but assured that the next character of input is a
     51 // possible matching character.
     52 //
     53 // Good idea in theory; unfortunately it only helps out a few specific
     54 // cases and slows the engine down a little in the rest.
     55 
     56 U_NAMESPACE_BEGIN
     57 
     58 // Default limit for the size of the back track stack, to avoid system
     59 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
     60 // This value puts ICU's limits higher than most other regexp implementations,
     61 //    which use recursion rather than the heap, and take more storage per
     62 //    backtrack point.
     63 //
     64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
     65 
     66 // Time limit counter constant.
     67 //   Time limits for expression evaluation are in terms of quanta of work by
     68 //   the engine, each of which is 10,000 state saves.
     69 //   This constant determines that state saves per tick number.
     70 static const int32_t TIMER_INITIAL_VALUE = 10000;
     71 
     72 //-----------------------------------------------------------------------------
     73 //
     74 //   Constructor and Destructor
     75 //
     76 //-----------------------------------------------------------------------------
     77 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
     78     fDeferredStatus = U_ZERO_ERROR;
     79     init(fDeferredStatus);
     80     if (U_FAILURE(fDeferredStatus)) {
     81         return;
     82     }
     83     if (pat==NULL) {
     84         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
     85         return;
     86     }
     87     fPattern = pat;
     88     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
     89 }
     90 
     91 
     92 
     93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
     94                            uint32_t flags, UErrorCode &status) {
     95     init(status);
     96     if (U_FAILURE(status)) {
     97         return;
     98     }
     99     UParseError    pe;
    100     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    101     fPattern           = fPatternOwned;
    102 
    103     UText inputText = UTEXT_INITIALIZER;
    104     utext_openConstUnicodeString(&inputText, &input, &status);
    105     init2(&inputText, status);
    106     utext_close(&inputText);
    107 
    108     fInputUniStrMaybeMutable = TRUE;
    109 }
    110 
    111 
    112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
    113                            uint32_t flags, UErrorCode &status) {
    114     init(status);
    115     if (U_FAILURE(status)) {
    116         return;
    117     }
    118     UParseError    pe;
    119     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    120     if (U_FAILURE(status)) {
    121         return;
    122     }
    123 
    124     fPattern           = fPatternOwned;
    125     init2(input, status);
    126 }
    127 
    128 
    129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
    130                            uint32_t flags, UErrorCode &status) {
    131     init(status);
    132     if (U_FAILURE(status)) {
    133         return;
    134     }
    135     UParseError    pe;
    136     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    137     if (U_FAILURE(status)) {
    138         return;
    139     }
    140     fPattern           = fPatternOwned;
    141     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    142 }
    143 
    144 RegexMatcher::RegexMatcher(UText *regexp,
    145                            uint32_t flags, UErrorCode &status) {
    146     init(status);
    147     if (U_FAILURE(status)) {
    148         return;
    149     }
    150     UParseError    pe;
    151     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    152         if (U_FAILURE(status)) {
    153         return;
    154     }
    155 
    156     fPattern           = fPatternOwned;
    157     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    158 }
    159 
    160 
    161 
    162 
    163 RegexMatcher::~RegexMatcher() {
    164     delete fStack;
    165     if (fData != fSmallData) {
    166         uprv_free(fData);
    167         fData = NULL;
    168     }
    169     if (fPatternOwned) {
    170         delete fPatternOwned;
    171         fPatternOwned = NULL;
    172         fPattern = NULL;
    173     }
    174 
    175     if (fInput) {
    176         delete fInput;
    177     }
    178     if (fInputText) {
    179         utext_close(fInputText);
    180     }
    181     if (fAltInputText) {
    182         utext_close(fAltInputText);
    183     }
    184 
    185     #if UCONFIG_NO_BREAK_ITERATION==0
    186     delete fWordBreakItr;
    187     #endif
    188 }
    189 
    190 //
    191 //   init()   common initialization for use by all constructors.
    192 //            Initialize all fields, get the object into a consistent state.
    193 //            This must be done even when the initial status shows an error,
    194 //            so that the object is initialized sufficiently well for the destructor
    195 //            to run safely.
    196 //
    197 void RegexMatcher::init(UErrorCode &status) {
    198     fPattern           = NULL;
    199     fPatternOwned      = NULL;
    200     fFrameSize         = 0;
    201     fRegionStart       = 0;
    202     fRegionLimit       = 0;
    203     fAnchorStart       = 0;
    204     fAnchorLimit       = 0;
    205     fLookStart         = 0;
    206     fLookLimit         = 0;
    207     fActiveStart       = 0;
    208     fActiveLimit       = 0;
    209     fTransparentBounds = FALSE;
    210     fAnchoringBounds   = TRUE;
    211     fMatch             = FALSE;
    212     fMatchStart        = 0;
    213     fMatchEnd          = 0;
    214     fLastMatchEnd      = -1;
    215     fAppendPosition    = 0;
    216     fHitEnd            = FALSE;
    217     fRequireEnd        = FALSE;
    218     fStack             = NULL;
    219     fFrame             = NULL;
    220     fTimeLimit         = 0;
    221     fTime              = 0;
    222     fTickCounter       = 0;
    223     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    224     fCallbackFn        = NULL;
    225     fCallbackContext   = NULL;
    226     fFindProgressCallbackFn      = NULL;
    227     fFindProgressCallbackContext = NULL;
    228     fTraceDebug        = FALSE;
    229     fDeferredStatus    = status;
    230     fData              = fSmallData;
    231     fWordBreakItr      = NULL;
    232 
    233     fStack             = NULL;
    234     fInputText         = NULL;
    235     fAltInputText      = NULL;
    236     fInput             = NULL;
    237     fInputLength       = 0;
    238     fInputUniStrMaybeMutable = FALSE;
    239 
    240     if (U_FAILURE(status)) {
    241         fDeferredStatus = status;
    242     }
    243 }
    244 
    245 //
    246 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
    247 //            This handles the common setup to be done after the Pattern is available.
    248 //
    249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
    250     if (U_FAILURE(status)) {
    251         fDeferredStatus = status;
    252         return;
    253     }
    254 
    255     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
    256         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
    257         if (fData == NULL) {
    258             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    259             return;
    260         }
    261     }
    262 
    263     fStack = new UVector64(status);
    264     if (fStack == NULL) {
    265         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    266         return;
    267     }
    268 
    269     reset(input);
    270     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
    271     if (U_FAILURE(status)) {
    272         fDeferredStatus = status;
    273         return;
    274     }
    275 }
    276 
    277 
    278 static const UChar BACKSLASH  = 0x5c;
    279 static const UChar DOLLARSIGN = 0x24;
    280 //--------------------------------------------------------------------------------
    281 //
    282 //    appendReplacement
    283 //
    284 //--------------------------------------------------------------------------------
    285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
    286                                               const UnicodeString &replacement,
    287                                               UErrorCode &status) {
    288     UText replacementText = UTEXT_INITIALIZER;
    289 
    290     utext_openConstUnicodeString(&replacementText, &replacement, &status);
    291     if (U_SUCCESS(status)) {
    292         UText resultText = UTEXT_INITIALIZER;
    293         utext_openUnicodeString(&resultText, &dest, &status);
    294 
    295         if (U_SUCCESS(status)) {
    296             appendReplacement(&resultText, &replacementText, status);
    297             utext_close(&resultText);
    298         }
    299         utext_close(&replacementText);
    300     }
    301 
    302     return *this;
    303 }
    304 
    305 //
    306 //    appendReplacement, UText mode
    307 //
    308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
    309                                               UText *replacement,
    310                                               UErrorCode &status) {
    311     if (U_FAILURE(status)) {
    312         return *this;
    313     }
    314     if (U_FAILURE(fDeferredStatus)) {
    315         status = fDeferredStatus;
    316         return *this;
    317     }
    318     if (fMatch == FALSE) {
    319         status = U_REGEX_INVALID_STATE;
    320         return *this;
    321     }
    322 
    323     // Copy input string from the end of previous match to start of current match
    324     int64_t  destLen = utext_nativeLength(dest);
    325     if (fMatchStart > fAppendPosition) {
    326         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    327             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    328                                      (int32_t)(fMatchStart-fAppendPosition), &status);
    329         } else {
    330             int32_t len16;
    331             if (UTEXT_USES_U16(fInputText)) {
    332                 len16 = (int32_t)(fMatchStart-fAppendPosition);
    333             } else {
    334                 UErrorCode lengthStatus = U_ZERO_ERROR;
    335                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
    336             }
    337             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
    338             if (inputChars == NULL) {
    339                 status = U_MEMORY_ALLOCATION_ERROR;
    340                 return *this;
    341             }
    342             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
    343             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    344             uprv_free(inputChars);
    345         }
    346     }
    347     fAppendPosition = fMatchEnd;
    348 
    349 
    350     // scan the replacement text, looking for substitutions ($n) and \escapes.
    351     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
    352     //         move entire ranges not containing substitutions.
    353     UTEXT_SETNATIVEINDEX(replacement, 0);
    354     UChar32 c = UTEXT_NEXT32(replacement);
    355     while (c != U_SENTINEL) {
    356         if (c == BACKSLASH) {
    357             // Backslash Escape.  Copy the following char out without further checks.
    358             //                    Note:  Surrogate pairs don't need any special handling
    359             //                           The second half wont be a '$' or a '\', and
    360             //                           will move to the dest normally on the next
    361             //                           loop iteration.
    362             c = UTEXT_CURRENT32(replacement);
    363             if (c == U_SENTINEL) {
    364                 break;
    365             }
    366 
    367             if (c==0x55/*U*/ || c==0x75/*u*/) {
    368                 // We have a \udddd or \Udddddddd escape sequence.
    369                 int32_t offset = 0;
    370                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
    371                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
    372                 if (escapedChar != (UChar32)0xFFFFFFFF) {
    373                     if (U_IS_BMP(escapedChar)) {
    374                         UChar c16 = (UChar)escapedChar;
    375                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    376                     } else {
    377                         UChar surrogate[2];
    378                         surrogate[0] = U16_LEAD(escapedChar);
    379                         surrogate[1] = U16_TRAIL(escapedChar);
    380                         if (U_SUCCESS(status)) {
    381                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    382                         }
    383                     }
    384                     // TODO:  Report errors for mal-formed \u escapes?
    385                     //        As this is, the original sequence is output, which may be OK.
    386                     if (context.lastOffset == offset) {
    387                         (void)UTEXT_PREVIOUS32(replacement);
    388                     } else if (context.lastOffset != offset-1) {
    389                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
    390                     }
    391                 }
    392             } else {
    393                 (void)UTEXT_NEXT32(replacement);
    394                 // Plain backslash escape.  Just put out the escaped character.
    395                 if (U_IS_BMP(c)) {
    396                     UChar c16 = (UChar)c;
    397                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    398                 } else {
    399                     UChar surrogate[2];
    400                     surrogate[0] = U16_LEAD(c);
    401                     surrogate[1] = U16_TRAIL(c);
    402                     if (U_SUCCESS(status)) {
    403                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    404                     }
    405                 }
    406             }
    407         } else if (c != DOLLARSIGN) {
    408             // Normal char, not a $.  Copy it out without further checks.
    409             if (U_IS_BMP(c)) {
    410                 UChar c16 = (UChar)c;
    411                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    412             } else {
    413                 UChar surrogate[2];
    414                 surrogate[0] = U16_LEAD(c);
    415                 surrogate[1] = U16_TRAIL(c);
    416                 if (U_SUCCESS(status)) {
    417                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    418                 }
    419             }
    420         } else {
    421             // We've got a $.  Pick up a capture group number if one follows.
    422             // Consume at most the number of digits necessary for the largest capture
    423             // number that is valid for this pattern.
    424 
    425             int32_t numDigits = 0;
    426             int32_t groupNum  = 0;
    427             UChar32 digitC;
    428             for (;;) {
    429                 digitC = UTEXT_CURRENT32(replacement);
    430                 if (digitC == U_SENTINEL) {
    431                     break;
    432                 }
    433                 if (u_isdigit(digitC) == FALSE) {
    434                     break;
    435                 }
    436                 (void)UTEXT_NEXT32(replacement);
    437                 groupNum=groupNum*10 + u_charDigitValue(digitC);
    438                 numDigits++;
    439                 if (numDigits >= fPattern->fMaxCaptureDigits) {
    440                     break;
    441                 }
    442             }
    443 
    444 
    445             if (numDigits == 0) {
    446                 // The $ didn't introduce a group number at all.
    447                 // Treat it as just part of the substitution text.
    448                 UChar c16 = DOLLARSIGN;
    449                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    450             } else {
    451                 // Finally, append the capture group data to the destination.
    452                 destLen += appendGroup(groupNum, dest, status);
    453                 if (U_FAILURE(status)) {
    454                     // Can fail if group number is out of range.
    455                     break;
    456                 }
    457             }
    458         }
    459 
    460         if (U_FAILURE(status)) {
    461             break;
    462         } else {
    463             c = UTEXT_NEXT32(replacement);
    464         }
    465     }
    466 
    467     return *this;
    468 }
    469 
    470 
    471 
    472 //--------------------------------------------------------------------------------
    473 //
    474 //    appendTail     Intended to be used in conjunction with appendReplacement()
    475 //                   To the destination string, append everything following
    476 //                   the last match position from the input string.
    477 //
    478 //                   Note:  Match ranges do not affect appendTail or appendReplacement
    479 //
    480 //--------------------------------------------------------------------------------
    481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    482     UErrorCode status = U_ZERO_ERROR;
    483     UText resultText = UTEXT_INITIALIZER;
    484     utext_openUnicodeString(&resultText, &dest, &status);
    485 
    486     if (U_SUCCESS(status)) {
    487         appendTail(&resultText, status);
    488         utext_close(&resultText);
    489     }
    490 
    491     return dest;
    492 }
    493 
    494 //
    495 //   appendTail, UText mode
    496 //
    497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
    498     UBool bailOut = FALSE;
    499     if (U_FAILURE(status)) {
    500         bailOut = TRUE;
    501     }
    502     if (U_FAILURE(fDeferredStatus)) {
    503         status = fDeferredStatus;
    504         bailOut = TRUE;
    505     }
    506 
    507     if (bailOut) {
    508         //  dest must not be NULL
    509         if (dest) {
    510             utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
    511             return dest;
    512         }
    513     }
    514 
    515     if (fInputLength > fAppendPosition) {
    516         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    517             int64_t destLen = utext_nativeLength(dest);
    518             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    519                           (int32_t)(fInputLength-fAppendPosition), &status);
    520         } else {
    521             int32_t len16;
    522             if (UTEXT_USES_U16(fInputText)) {
    523                 len16 = (int32_t)(fInputLength-fAppendPosition);
    524             } else {
    525                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
    526                 status = U_ZERO_ERROR; // buffer overflow
    527             }
    528 
    529             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
    530             if (inputChars == NULL) {
    531                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    532             } else {
    533                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
    534                 int64_t destLen = utext_nativeLength(dest);
    535                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    536                 uprv_free(inputChars);
    537             }
    538         }
    539     }
    540     return dest;
    541 }
    542 
    543 
    544 
    545 //--------------------------------------------------------------------------------
    546 //
    547 //   end
    548 //
    549 //--------------------------------------------------------------------------------
    550 int32_t RegexMatcher::end(UErrorCode &err) const {
    551     return end(0, err);
    552 }
    553 
    554 int64_t RegexMatcher::end64(UErrorCode &err) const {
    555     return end64(0, err);
    556 }
    557 
    558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
    559     if (U_FAILURE(err)) {
    560         return -1;
    561     }
    562     if (fMatch == FALSE) {
    563         err = U_REGEX_INVALID_STATE;
    564         return -1;
    565     }
    566     if (group < 0 || group > fPattern->fGroupMap->size()) {
    567         err = U_INDEX_OUTOFBOUNDS_ERROR;
    568         return -1;
    569     }
    570     int64_t e = -1;
    571     if (group == 0) {
    572         e = fMatchEnd;
    573     } else {
    574         // Get the position within the stack frame of the variables for
    575         //    this capture group.
    576         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
    577         U_ASSERT(groupOffset < fPattern->fFrameSize);
    578         U_ASSERT(groupOffset >= 0);
    579         e = fFrame->fExtra[groupOffset + 1];
    580     }
    581 
    582         return e;
    583 }
    584 
    585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    586     return (int32_t)end64(group, err);
    587 }
    588 
    589 
    590 //--------------------------------------------------------------------------------
    591 //
    592 //   find()
    593 //
    594 //--------------------------------------------------------------------------------
    595 UBool RegexMatcher::find() {
    596     // Start at the position of the last match end.  (Will be zero if the
    597     //   matcher has been reset.)
    598     //
    599     if (U_FAILURE(fDeferredStatus)) {
    600         return FALSE;
    601     }
    602 
    603     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    604         return findUsingChunk();
    605     }
    606 
    607     int64_t startPos = fMatchEnd;
    608     if (startPos==0) {
    609         startPos = fActiveStart;
    610     }
    611 
    612     if (fMatch) {
    613         // Save the position of any previous successful match.
    614         fLastMatchEnd = fMatchEnd;
    615 
    616         if (fMatchStart == fMatchEnd) {
    617             // Previous match had zero length.  Move start position up one position
    618             //  to avoid sending find() into a loop on zero-length matches.
    619             if (startPos >= fActiveLimit) {
    620                 fMatch = FALSE;
    621                 fHitEnd = TRUE;
    622                 return FALSE;
    623             }
    624             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    625             (void)UTEXT_NEXT32(fInputText);
    626             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    627         }
    628     } else {
    629         if (fLastMatchEnd >= 0) {
    630             // A previous find() failed to match.  Don't try again.
    631             //   (without this test, a pattern with a zero-length match
    632             //    could match again at the end of an input string.)
    633             fHitEnd = TRUE;
    634             return FALSE;
    635         }
    636     }
    637 
    638 
    639     // Compute the position in the input string beyond which a match can not begin, because
    640     //   the minimum length match would extend past the end of the input.
    641     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    642     //          Be aware of possible overflows if making changes here.
    643     int64_t testStartLimit;
    644     if (UTEXT_USES_U16(fInputText)) {
    645         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
    646         if (startPos > testStartLimit) {
    647             fMatch = FALSE;
    648             fHitEnd = TRUE;
    649             return FALSE;
    650         }
    651     } else {
    652         // For now, let the matcher discover that it can't match on its own
    653         // We don't know how long the match len is in native characters
    654         testStartLimit = fActiveLimit;
    655     }
    656 
    657     UChar32  c;
    658     U_ASSERT(startPos >= 0);
    659 
    660     switch (fPattern->fStartType) {
    661     case START_NO_INFO:
    662         // No optimization was found.
    663         //  Try a match at each input position.
    664         for (;;) {
    665             MatchAt(startPos, FALSE, fDeferredStatus);
    666             if (U_FAILURE(fDeferredStatus)) {
    667                 return FALSE;
    668             }
    669             if (fMatch) {
    670                 return TRUE;
    671             }
    672             if (startPos >= testStartLimit) {
    673                 fHitEnd = TRUE;
    674                 return FALSE;
    675             }
    676             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    677             (void)UTEXT_NEXT32(fInputText);
    678             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    679             // Note that it's perfectly OK for a pattern to have a zero-length
    680             //   match at the end of a string, so we must make sure that the loop
    681             //   runs with startPos == testStartLimit the last time through.
    682             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    683                 return FALSE;
    684         }
    685         U_ASSERT(FALSE);
    686 
    687     case START_START:
    688         // Matches are only possible at the start of the input string
    689         //   (pattern begins with ^ or \A)
    690         if (startPos > fActiveStart) {
    691             fMatch = FALSE;
    692             return FALSE;
    693         }
    694         MatchAt(startPos, FALSE, fDeferredStatus);
    695         if (U_FAILURE(fDeferredStatus)) {
    696             return FALSE;
    697         }
    698         return fMatch;
    699 
    700 
    701     case START_SET:
    702         {
    703             // Match may start on any char from a pre-computed set.
    704             U_ASSERT(fPattern->fMinMatchLen > 0);
    705             int64_t pos;
    706             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    707             for (;;) {
    708                 c = UTEXT_NEXT32(fInputText);
    709                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    710                 // c will be -1 (U_SENTINEL) at end of text, in which case we
    711                 // skip this next block (so we don't have a negative array index)
    712                 // and handle end of text in the following block.
    713                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    714                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
    715                     MatchAt(startPos, FALSE, fDeferredStatus);
    716                     if (U_FAILURE(fDeferredStatus)) {
    717                         return FALSE;
    718                     }
    719                     if (fMatch) {
    720                         return TRUE;
    721                     }
    722                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    723                 }
    724                 if (startPos >= testStartLimit) {
    725                     fMatch = FALSE;
    726                     fHitEnd = TRUE;
    727                     return FALSE;
    728                 }
    729                 startPos = pos;
    730 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    731                     return FALSE;
    732             }
    733         }
    734         U_ASSERT(FALSE);
    735 
    736     case START_STRING:
    737     case START_CHAR:
    738         {
    739             // Match starts on exactly one char.
    740             U_ASSERT(fPattern->fMinMatchLen > 0);
    741             UChar32 theChar = fPattern->fInitialChar;
    742             int64_t pos;
    743             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    744             for (;;) {
    745                 c = UTEXT_NEXT32(fInputText);
    746                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    747                 if (c == theChar) {
    748                     MatchAt(startPos, FALSE, fDeferredStatus);
    749                     if (U_FAILURE(fDeferredStatus)) {
    750                         return FALSE;
    751                     }
    752                     if (fMatch) {
    753                         return TRUE;
    754                     }
    755                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    756                 }
    757                 if (startPos >= testStartLimit) {
    758                     fMatch = FALSE;
    759                     fHitEnd = TRUE;
    760                     return FALSE;
    761                 }
    762                 startPos = pos;
    763 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    764                     return FALSE;
    765            }
    766         }
    767         U_ASSERT(FALSE);
    768 
    769     case START_LINE:
    770         {
    771             UChar32  c;
    772             if (startPos == fAnchorStart) {
    773                 MatchAt(startPos, FALSE, fDeferredStatus);
    774                 if (U_FAILURE(fDeferredStatus)) {
    775                     return FALSE;
    776                 }
    777                 if (fMatch) {
    778                     return TRUE;
    779                 }
    780                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    781                 c = UTEXT_NEXT32(fInputText);
    782                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    783             } else {
    784                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    785                 c = UTEXT_PREVIOUS32(fInputText);
    786                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    787             }
    788 
    789             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
    790                 for (;;) {
    791                     if (c == 0x0a) {
    792                             MatchAt(startPos, FALSE, fDeferredStatus);
    793                             if (U_FAILURE(fDeferredStatus)) {
    794                                 return FALSE;
    795                             }
    796                             if (fMatch) {
    797                                 return TRUE;
    798                             }
    799                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    800                     }
    801                     if (startPos >= testStartLimit) {
    802                         fMatch = FALSE;
    803                         fHitEnd = TRUE;
    804                         return FALSE;
    805                     }
    806                     c = UTEXT_NEXT32(fInputText);
    807                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    808                     // Note that it's perfectly OK for a pattern to have a zero-length
    809                     //   match at the end of a string, so we must make sure that the loop
    810                     //   runs with startPos == testStartLimit the last time through.
    811 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    812                         return FALSE;
    813                 }
    814             } else {
    815                 for (;;) {
    816                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
    817                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
    818                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
    819                                 (void)UTEXT_NEXT32(fInputText);
    820                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    821                             }
    822                             MatchAt(startPos, FALSE, fDeferredStatus);
    823                             if (U_FAILURE(fDeferredStatus)) {
    824                                 return FALSE;
    825                             }
    826                             if (fMatch) {
    827                                 return TRUE;
    828                             }
    829                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    830                     }
    831                     if (startPos >= testStartLimit) {
    832                         fMatch = FALSE;
    833                         fHitEnd = TRUE;
    834                         return FALSE;
    835                     }
    836                     c = UTEXT_NEXT32(fInputText);
    837                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    838                     // Note that it's perfectly OK for a pattern to have a zero-length
    839                     //   match at the end of a string, so we must make sure that the loop
    840                     //   runs with startPos == testStartLimit the last time through.
    841 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    842                         return FALSE;
    843                 }
    844             }
    845         }
    846 
    847     default:
    848         U_ASSERT(FALSE);
    849     }
    850 
    851     U_ASSERT(FALSE);
    852     return FALSE;
    853 }
    854 
    855 
    856 
    857 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
    858     if (U_FAILURE(status)) {
    859         return FALSE;
    860     }
    861     if (U_FAILURE(fDeferredStatus)) {
    862         status = fDeferredStatus;
    863         return FALSE;
    864     }
    865     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
    866                                           //        This will reset the region to be the full input length.
    867     if (start < 0) {
    868         status = U_INDEX_OUTOFBOUNDS_ERROR;
    869         return FALSE;
    870     }
    871 
    872     int64_t nativeStart = start;
    873     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
    874         status = U_INDEX_OUTOFBOUNDS_ERROR;
    875         return FALSE;
    876     }
    877     fMatchEnd = nativeStart;
    878     return find();
    879 }
    880 
    881 
    882 //--------------------------------------------------------------------------------
    883 //
    884 //   findUsingChunk() -- like find(), but with the advance knowledge that the
    885 //                       entire string is available in the UText's chunk buffer.
    886 //
    887 //--------------------------------------------------------------------------------
    888 UBool RegexMatcher::findUsingChunk() {
    889     // Start at the position of the last match end.  (Will be zero if the
    890     //   matcher has been reset.
    891     //
    892 
    893     int32_t startPos = (int32_t)fMatchEnd;
    894     if (startPos==0) {
    895         startPos = (int32_t)fActiveStart;
    896     }
    897 
    898     const UChar *inputBuf = fInputText->chunkContents;
    899 
    900     if (fMatch) {
    901         // Save the position of any previous successful match.
    902         fLastMatchEnd = fMatchEnd;
    903 
    904         if (fMatchStart == fMatchEnd) {
    905             // Previous match had zero length.  Move start position up one position
    906             //  to avoid sending find() into a loop on zero-length matches.
    907             if (startPos >= fActiveLimit) {
    908                 fMatch = FALSE;
    909                 fHitEnd = TRUE;
    910                 return FALSE;
    911             }
    912             U16_FWD_1(inputBuf, startPos, fInputLength);
    913         }
    914     } else {
    915         if (fLastMatchEnd >= 0) {
    916             // A previous find() failed to match.  Don't try again.
    917             //   (without this test, a pattern with a zero-length match
    918             //    could match again at the end of an input string.)
    919             fHitEnd = TRUE;
    920             return FALSE;
    921         }
    922     }
    923 
    924 
    925     // Compute the position in the input string beyond which a match can not begin, because
    926     //   the minimum length match would extend past the end of the input.
    927     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    928     //          Be aware of possible overflows if making changes here.
    929     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
    930     if (startPos > testLen) {
    931         fMatch = FALSE;
    932         fHitEnd = TRUE;
    933         return FALSE;
    934     }
    935 
    936     UChar32  c;
    937     U_ASSERT(startPos >= 0);
    938 
    939     switch (fPattern->fStartType) {
    940     case START_NO_INFO:
    941         // No optimization was found.
    942         //  Try a match at each input position.
    943         for (;;) {
    944             MatchChunkAt(startPos, FALSE, fDeferredStatus);
    945             if (U_FAILURE(fDeferredStatus)) {
    946                 return FALSE;
    947             }
    948             if (fMatch) {
    949                 return TRUE;
    950             }
    951             if (startPos >= testLen) {
    952                 fHitEnd = TRUE;
    953                 return FALSE;
    954             }
    955             U16_FWD_1(inputBuf, startPos, fActiveLimit);
    956             // Note that it's perfectly OK for a pattern to have a zero-length
    957             //   match at the end of a string, so we must make sure that the loop
    958             //   runs with startPos == testLen the last time through.
    959             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    960                 return FALSE;
    961         }
    962         U_ASSERT(FALSE);
    963 
    964     case START_START:
    965         // Matches are only possible at the start of the input string
    966         //   (pattern begins with ^ or \A)
    967         if (startPos > fActiveStart) {
    968             fMatch = FALSE;
    969             return FALSE;
    970         }
    971         MatchChunkAt(startPos, FALSE, fDeferredStatus);
    972         if (U_FAILURE(fDeferredStatus)) {
    973             return FALSE;
    974         }
    975         return fMatch;
    976 
    977 
    978     case START_SET:
    979     {
    980         // Match may start on any char from a pre-computed set.
    981         U_ASSERT(fPattern->fMinMatchLen > 0);
    982         for (;;) {
    983             int32_t pos = startPos;
    984             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    985             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    986                 (c>=256 && fPattern->fInitialChars->contains(c))) {
    987                 MatchChunkAt(pos, FALSE, fDeferredStatus);
    988                 if (U_FAILURE(fDeferredStatus)) {
    989                     return FALSE;
    990                 }
    991                 if (fMatch) {
    992                     return TRUE;
    993                 }
    994             }
    995             if (pos >= testLen) {
    996                 fMatch = FALSE;
    997                 fHitEnd = TRUE;
    998                 return FALSE;
    999             }
   1000             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1001                 return FALSE;
   1002         }
   1003     }
   1004         U_ASSERT(FALSE);
   1005 
   1006     case START_STRING:
   1007     case START_CHAR:
   1008     {
   1009         // Match starts on exactly one char.
   1010         U_ASSERT(fPattern->fMinMatchLen > 0);
   1011         UChar32 theChar = fPattern->fInitialChar;
   1012         for (;;) {
   1013             int32_t pos = startPos;
   1014             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
   1015             if (c == theChar) {
   1016                 MatchChunkAt(pos, FALSE, fDeferredStatus);
   1017                 if (U_FAILURE(fDeferredStatus)) {
   1018                     return FALSE;
   1019                 }
   1020                 if (fMatch) {
   1021                     return TRUE;
   1022                 }
   1023             }
   1024             if (pos >= testLen) {
   1025                 fMatch = FALSE;
   1026                 fHitEnd = TRUE;
   1027                 return FALSE;
   1028             }
   1029             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1030                 return FALSE;
   1031         }
   1032     }
   1033         U_ASSERT(FALSE);
   1034 
   1035     case START_LINE:
   1036     {
   1037         UChar32  c;
   1038         if (startPos == fAnchorStart) {
   1039             MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1040             if (U_FAILURE(fDeferredStatus)) {
   1041                 return FALSE;
   1042             }
   1043             if (fMatch) {
   1044                 return TRUE;
   1045             }
   1046             U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1047         }
   1048 
   1049         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
   1050             for (;;) {
   1051                 c = inputBuf[startPos-1];
   1052                 if (c == 0x0a) {
   1053                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1054                     if (U_FAILURE(fDeferredStatus)) {
   1055                         return FALSE;
   1056                     }
   1057                     if (fMatch) {
   1058                         return TRUE;
   1059                     }
   1060                 }
   1061                 if (startPos >= testLen) {
   1062                     fMatch = FALSE;
   1063                     fHitEnd = TRUE;
   1064                     return FALSE;
   1065                 }
   1066                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1067                 // Note that it's perfectly OK for a pattern to have a zero-length
   1068                 //   match at the end of a string, so we must make sure that the loop
   1069                 //   runs with startPos == testLen the last time through.
   1070 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1071                     return FALSE;
   1072             }
   1073         } else {
   1074             for (;;) {
   1075                 c = inputBuf[startPos-1];
   1076                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   1077                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
   1078                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
   1079                         startPos++;
   1080                     }
   1081                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1082                     if (U_FAILURE(fDeferredStatus)) {
   1083                         return FALSE;
   1084                     }
   1085                     if (fMatch) {
   1086                         return TRUE;
   1087                     }
   1088                 }
   1089                 if (startPos >= testLen) {
   1090                     fMatch = FALSE;
   1091                     fHitEnd = TRUE;
   1092                     return FALSE;
   1093                 }
   1094                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1095                 // Note that it's perfectly OK for a pattern to have a zero-length
   1096                 //   match at the end of a string, so we must make sure that the loop
   1097                 //   runs with startPos == testLen the last time through.
   1098 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1099                     return FALSE;
   1100             }
   1101         }
   1102     }
   1103 
   1104     default:
   1105         U_ASSERT(FALSE);
   1106     }
   1107 
   1108     U_ASSERT(FALSE);
   1109     return FALSE;
   1110 }
   1111 
   1112 
   1113 
   1114 //--------------------------------------------------------------------------------
   1115 //
   1116 //  group()
   1117 //
   1118 //--------------------------------------------------------------------------------
   1119 UnicodeString RegexMatcher::group(UErrorCode &status) const {
   1120     return group(0, status);
   1121 }
   1122 
   1123 //  Return immutable shallow clone
   1124 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
   1125     return group(0, dest, group_len, status);
   1126 }
   1127 
   1128 //  Return immutable shallow clone
   1129 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
   1130     group_len = 0;
   1131     UBool bailOut = FALSE;
   1132     if (U_FAILURE(status)) {
   1133         return dest;
   1134     }
   1135     if (U_FAILURE(fDeferredStatus)) {
   1136         status = fDeferredStatus;
   1137         bailOut = TRUE;
   1138     }
   1139     if (fMatch == FALSE) {
   1140         status = U_REGEX_INVALID_STATE;
   1141         bailOut = TRUE;
   1142     }
   1143     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1144         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1145         bailOut = TRUE;
   1146     }
   1147 
   1148     if (bailOut) {
   1149         return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
   1150     }
   1151 
   1152     int64_t s, e;
   1153     if (groupNum == 0) {
   1154         s = fMatchStart;
   1155         e = fMatchEnd;
   1156     } else {
   1157         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1158         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1159         U_ASSERT(groupOffset >= 0);
   1160         s = fFrame->fExtra[groupOffset];
   1161         e = fFrame->fExtra[groupOffset+1];
   1162     }
   1163 
   1164     if (s < 0) {
   1165         // A capture group wasn't part of the match
   1166         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1167     }
   1168     U_ASSERT(s <= e);
   1169     group_len = e - s;
   1170 
   1171     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1172     if (dest)
   1173         UTEXT_SETNATIVEINDEX(dest, s);
   1174     return dest;
   1175 }
   1176 
   1177 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
   1178     UnicodeString result;
   1179     if (U_FAILURE(status)) {
   1180         return result;
   1181     }
   1182     UText resultText = UTEXT_INITIALIZER;
   1183     utext_openUnicodeString(&resultText, &result, &status);
   1184     group(groupNum, &resultText, status);
   1185     utext_close(&resultText);
   1186     return result;
   1187 }
   1188 
   1189 
   1190 //  Return deep (mutable) clone
   1191 //		Technology Preview (as an API), but note that the UnicodeString API is implemented
   1192 //		using this function.
   1193 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1194     UBool bailOut = FALSE;
   1195     if (U_FAILURE(status)) {
   1196         return dest;
   1197     }
   1198     if (U_FAILURE(fDeferredStatus)) {
   1199         status = fDeferredStatus;
   1200         bailOut = TRUE;
   1201     }
   1202 
   1203     if (fMatch == FALSE) {
   1204         status = U_REGEX_INVALID_STATE;
   1205         bailOut = TRUE;
   1206     }
   1207     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1208         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1209         bailOut = TRUE;
   1210     }
   1211 
   1212     if (bailOut) {
   1213         if (dest) {
   1214             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1215             return dest;
   1216         } else {
   1217             return utext_openUChars(NULL, NULL, 0, &status);
   1218         }
   1219     }
   1220 
   1221     int64_t s, e;
   1222     if (groupNum == 0) {
   1223         s = fMatchStart;
   1224         e = fMatchEnd;
   1225     } else {
   1226         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1227         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1228         U_ASSERT(groupOffset >= 0);
   1229         s = fFrame->fExtra[groupOffset];
   1230         e = fFrame->fExtra[groupOffset+1];
   1231     }
   1232 
   1233     if (s < 0) {
   1234         // A capture group wasn't part of the match
   1235         if (dest) {
   1236             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1237             return dest;
   1238         } else {
   1239             return utext_openUChars(NULL, NULL, 0, &status);
   1240         }
   1241     }
   1242     U_ASSERT(s <= e);
   1243 
   1244     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1245         U_ASSERT(e <= fInputLength);
   1246         if (dest) {
   1247             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1248         } else {
   1249             UText groupText = UTEXT_INITIALIZER;
   1250             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
   1251             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1252             utext_close(&groupText);
   1253         }
   1254     } else {
   1255         int32_t len16;
   1256         if (UTEXT_USES_U16(fInputText)) {
   1257             len16 = (int32_t)(e-s);
   1258         } else {
   1259             UErrorCode lengthStatus = U_ZERO_ERROR;
   1260             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1261         }
   1262         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1263         if (groupChars == NULL) {
   1264             status = U_MEMORY_ALLOCATION_ERROR;
   1265             return dest;
   1266         }
   1267         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1268 
   1269         if (dest) {
   1270             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
   1271         } else {
   1272             UText groupText = UTEXT_INITIALIZER;
   1273             utext_openUChars(&groupText, groupChars, len16, &status);
   1274             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1275             utext_close(&groupText);
   1276         }
   1277 
   1278         uprv_free(groupChars);
   1279     }
   1280     return dest;
   1281 }
   1282 
   1283 //--------------------------------------------------------------------------------
   1284 //
   1285 //  appendGroup() -- currently internal only, appends a group to a UText rather
   1286 //                   than replacing its contents
   1287 //
   1288 //--------------------------------------------------------------------------------
   1289 
   1290 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1291     if (U_FAILURE(status)) {
   1292         return 0;
   1293     }
   1294     if (U_FAILURE(fDeferredStatus)) {
   1295         status = fDeferredStatus;
   1296         return 0;
   1297     }
   1298     int64_t destLen = utext_nativeLength(dest);
   1299 
   1300     if (fMatch == FALSE) {
   1301         status = U_REGEX_INVALID_STATE;
   1302         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1303     }
   1304     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1305         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1306         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1307     }
   1308 
   1309     int64_t s, e;
   1310     if (groupNum == 0) {
   1311         s = fMatchStart;
   1312         e = fMatchEnd;
   1313     } else {
   1314         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1315         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1316         U_ASSERT(groupOffset >= 0);
   1317         s = fFrame->fExtra[groupOffset];
   1318         e = fFrame->fExtra[groupOffset+1];
   1319     }
   1320 
   1321     if (s < 0) {
   1322         // A capture group wasn't part of the match
   1323         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1324     }
   1325     U_ASSERT(s <= e);
   1326 
   1327     int64_t deltaLen;
   1328     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1329         U_ASSERT(e <= fInputLength);
   1330         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1331     } else {
   1332         int32_t len16;
   1333         if (UTEXT_USES_U16(fInputText)) {
   1334             len16 = (int32_t)(e-s);
   1335         } else {
   1336             UErrorCode lengthStatus = U_ZERO_ERROR;
   1337             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1338         }
   1339         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1340         if (groupChars == NULL) {
   1341             status = U_MEMORY_ALLOCATION_ERROR;
   1342             return 0;
   1343         }
   1344         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1345 
   1346         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
   1347         uprv_free(groupChars);
   1348     }
   1349     return deltaLen;
   1350 }
   1351 
   1352 
   1353 
   1354 //--------------------------------------------------------------------------------
   1355 //
   1356 //  groupCount()
   1357 //
   1358 //--------------------------------------------------------------------------------
   1359 int32_t RegexMatcher::groupCount() const {
   1360     return fPattern->fGroupMap->size();
   1361 }
   1362 
   1363 
   1364 
   1365 //--------------------------------------------------------------------------------
   1366 //
   1367 //  hasAnchoringBounds()
   1368 //
   1369 //--------------------------------------------------------------------------------
   1370 UBool RegexMatcher::hasAnchoringBounds() const {
   1371     return fAnchoringBounds;
   1372 }
   1373 
   1374 
   1375 //--------------------------------------------------------------------------------
   1376 //
   1377 //  hasTransparentBounds()
   1378 //
   1379 //--------------------------------------------------------------------------------
   1380 UBool RegexMatcher::hasTransparentBounds() const {
   1381     return fTransparentBounds;
   1382 }
   1383 
   1384 
   1385 
   1386 //--------------------------------------------------------------------------------
   1387 //
   1388 //  hitEnd()
   1389 //
   1390 //--------------------------------------------------------------------------------
   1391 UBool RegexMatcher::hitEnd() const {
   1392     return fHitEnd;
   1393 }
   1394 
   1395 
   1396 //--------------------------------------------------------------------------------
   1397 //
   1398 //  input()
   1399 //
   1400 //--------------------------------------------------------------------------------
   1401 const UnicodeString &RegexMatcher::input() const {
   1402     if (!fInput) {
   1403         UErrorCode status = U_ZERO_ERROR;
   1404         int32_t len16;
   1405         if (UTEXT_USES_U16(fInputText)) {
   1406             len16 = (int32_t)fInputLength;
   1407         } else {
   1408             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
   1409             status = U_ZERO_ERROR; // overflow, length status
   1410         }
   1411         UnicodeString *result = new UnicodeString(len16, 0, 0);
   1412 
   1413         UChar *inputChars = result->getBuffer(len16);
   1414         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
   1415         result->releaseBuffer(len16);
   1416 
   1417         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
   1418     }
   1419 
   1420     return *fInput;
   1421 }
   1422 
   1423 //--------------------------------------------------------------------------------
   1424 //
   1425 //  inputText()
   1426 //
   1427 //--------------------------------------------------------------------------------
   1428 UText *RegexMatcher::inputText() const {
   1429     return fInputText;
   1430 }
   1431 
   1432 
   1433 //--------------------------------------------------------------------------------
   1434 //
   1435 //  getInput() -- like inputText(), but makes a clone or copies into another UText
   1436 //
   1437 //--------------------------------------------------------------------------------
   1438 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
   1439     UBool bailOut = FALSE;
   1440     if (U_FAILURE(status)) {
   1441         return dest;
   1442     }
   1443     if (U_FAILURE(fDeferredStatus)) {
   1444         status = fDeferredStatus;
   1445         bailOut = TRUE;
   1446     }
   1447 
   1448     if (bailOut) {
   1449         if (dest) {
   1450             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1451             return dest;
   1452         } else {
   1453             return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1454         }
   1455     }
   1456 
   1457     if (dest) {
   1458         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1459             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
   1460         } else {
   1461             int32_t input16Len;
   1462             if (UTEXT_USES_U16(fInputText)) {
   1463                 input16Len = (int32_t)fInputLength;
   1464             } else {
   1465                 UErrorCode lengthStatus = U_ZERO_ERROR;
   1466                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
   1467             }
   1468             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
   1469             if (inputChars == NULL) {
   1470                 return dest;
   1471             }
   1472 
   1473             status = U_ZERO_ERROR;
   1474             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
   1475             status = U_ZERO_ERROR;
   1476             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
   1477 
   1478             uprv_free(inputChars);
   1479         }
   1480         return dest;
   1481     } else {
   1482         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1483     }
   1484 }
   1485 
   1486 
   1487 static UBool compat_SyncMutableUTextContents(UText *ut);
   1488 static UBool compat_SyncMutableUTextContents(UText *ut) {
   1489     UBool retVal = FALSE;
   1490 
   1491     //  In the following test, we're really only interested in whether the UText should switch
   1492     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
   1493     //  will still point to the correct data.
   1494     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
   1495         UnicodeString *us=(UnicodeString *)ut->context;
   1496 
   1497         // Update to the latest length.
   1498         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
   1499         int32_t newLength = us->length();
   1500 
   1501         // Update the chunk description.
   1502         // The buffer may have switched between stack- and heap-based.
   1503         ut->chunkContents    = us->getBuffer();
   1504         ut->chunkLength      = newLength;
   1505         ut->chunkNativeLimit = newLength;
   1506         ut->nativeIndexingLimit = newLength;
   1507         retVal = TRUE;
   1508     }
   1509 
   1510     return retVal;
   1511 }
   1512 
   1513 //--------------------------------------------------------------------------------
   1514 //
   1515 //  lookingAt()
   1516 //
   1517 //--------------------------------------------------------------------------------
   1518 UBool RegexMatcher::lookingAt(UErrorCode &status) {
   1519     if (U_FAILURE(status)) {
   1520         return FALSE;
   1521     }
   1522     if (U_FAILURE(fDeferredStatus)) {
   1523         status = fDeferredStatus;
   1524         return FALSE;
   1525     }
   1526 
   1527     if (fInputUniStrMaybeMutable) {
   1528         if (compat_SyncMutableUTextContents(fInputText)) {
   1529         fInputLength = utext_nativeLength(fInputText);
   1530         reset();
   1531         }
   1532     }
   1533     else {
   1534         resetPreserveRegion();
   1535     }
   1536     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1537         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
   1538     } else {
   1539         MatchAt(fActiveStart, FALSE, status);
   1540     }
   1541     return fMatch;
   1542 }
   1543 
   1544 
   1545 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
   1546     if (U_FAILURE(status)) {
   1547         return FALSE;
   1548     }
   1549     if (U_FAILURE(fDeferredStatus)) {
   1550         status = fDeferredStatus;
   1551         return FALSE;
   1552     }
   1553     reset();
   1554 
   1555     if (start < 0) {
   1556         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1557         return FALSE;
   1558     }
   1559 
   1560     if (fInputUniStrMaybeMutable) {
   1561         if (compat_SyncMutableUTextContents(fInputText)) {
   1562         fInputLength = utext_nativeLength(fInputText);
   1563         reset();
   1564         }
   1565     }
   1566 
   1567     int64_t nativeStart;
   1568     nativeStart = start;
   1569     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1570         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1571         return FALSE;
   1572     }
   1573 
   1574     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1575         MatchChunkAt((int32_t)nativeStart, FALSE, status);
   1576     } else {
   1577         MatchAt(nativeStart, FALSE, status);
   1578     }
   1579     return fMatch;
   1580 }
   1581 
   1582 
   1583 
   1584 //--------------------------------------------------------------------------------
   1585 //
   1586 //  matches()
   1587 //
   1588 //--------------------------------------------------------------------------------
   1589 UBool RegexMatcher::matches(UErrorCode &status) {
   1590     if (U_FAILURE(status)) {
   1591         return FALSE;
   1592     }
   1593     if (U_FAILURE(fDeferredStatus)) {
   1594         status = fDeferredStatus;
   1595         return FALSE;
   1596     }
   1597 
   1598     if (fInputUniStrMaybeMutable) {
   1599         if (compat_SyncMutableUTextContents(fInputText)) {
   1600         fInputLength = utext_nativeLength(fInputText);
   1601         reset();
   1602         }
   1603     }
   1604     else {
   1605         resetPreserveRegion();
   1606     }
   1607 
   1608     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1609         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
   1610     } else {
   1611         MatchAt(fActiveStart, TRUE, status);
   1612     }
   1613     return fMatch;
   1614 }
   1615 
   1616 
   1617 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
   1618     if (U_FAILURE(status)) {
   1619         return FALSE;
   1620     }
   1621     if (U_FAILURE(fDeferredStatus)) {
   1622         status = fDeferredStatus;
   1623         return FALSE;
   1624     }
   1625     reset();
   1626 
   1627     if (start < 0) {
   1628         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1629         return FALSE;
   1630     }
   1631 
   1632     if (fInputUniStrMaybeMutable) {
   1633         if (compat_SyncMutableUTextContents(fInputText)) {
   1634         fInputLength = utext_nativeLength(fInputText);
   1635         reset();
   1636         }
   1637     }
   1638 
   1639     int64_t nativeStart;
   1640     nativeStart = start;
   1641     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1642         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1643         return FALSE;
   1644     }
   1645 
   1646     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1647         MatchChunkAt((int32_t)nativeStart, TRUE, status);
   1648     } else {
   1649         MatchAt(nativeStart, TRUE, status);
   1650     }
   1651     return fMatch;
   1652 }
   1653 
   1654 
   1655 
   1656 //--------------------------------------------------------------------------------
   1657 //
   1658 //    pattern
   1659 //
   1660 //--------------------------------------------------------------------------------
   1661 const RegexPattern &RegexMatcher::pattern() const {
   1662     return *fPattern;
   1663 }
   1664 
   1665 
   1666 
   1667 //--------------------------------------------------------------------------------
   1668 //
   1669 //    region
   1670 //
   1671 //--------------------------------------------------------------------------------
   1672 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
   1673     if (U_FAILURE(status)) {
   1674         return *this;
   1675     }
   1676 
   1677     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
   1678         status = U_ILLEGAL_ARGUMENT_ERROR;
   1679     }
   1680 
   1681     int64_t nativeStart = regionStart;
   1682     int64_t nativeLimit = regionLimit;
   1683     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
   1684       status = U_ILLEGAL_ARGUMENT_ERROR;
   1685     }
   1686 
   1687     if (startIndex == -1)
   1688       this->reset();
   1689     else
   1690       resetPreserveRegion();
   1691 
   1692     fRegionStart = nativeStart;
   1693     fRegionLimit = nativeLimit;
   1694     fActiveStart = nativeStart;
   1695     fActiveLimit = nativeLimit;
   1696 
   1697     if (startIndex != -1) {
   1698       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
   1699           status = U_INDEX_OUTOFBOUNDS_ERROR;
   1700       }
   1701       fMatchEnd = startIndex;
   1702     }
   1703 
   1704     if (!fTransparentBounds) {
   1705         fLookStart = nativeStart;
   1706         fLookLimit = nativeLimit;
   1707     }
   1708     if (fAnchoringBounds) {
   1709         fAnchorStart = nativeStart;
   1710         fAnchorLimit = nativeLimit;
   1711     }
   1712     return *this;
   1713 }
   1714 
   1715 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
   1716   return region(start, limit, -1, status);
   1717 }
   1718 
   1719 //--------------------------------------------------------------------------------
   1720 //
   1721 //    regionEnd
   1722 //
   1723 //--------------------------------------------------------------------------------
   1724 int32_t RegexMatcher::regionEnd() const {
   1725     return (int32_t)fRegionLimit;
   1726 }
   1727 
   1728 int64_t RegexMatcher::regionEnd64() const {
   1729     return fRegionLimit;
   1730 }
   1731 
   1732 //--------------------------------------------------------------------------------
   1733 //
   1734 //    regionStart
   1735 //
   1736 //--------------------------------------------------------------------------------
   1737 int32_t RegexMatcher::regionStart() const {
   1738     return (int32_t)fRegionStart;
   1739 }
   1740 
   1741 int64_t RegexMatcher::regionStart64() const {
   1742     return fRegionStart;
   1743 }
   1744 
   1745 
   1746 //--------------------------------------------------------------------------------
   1747 //
   1748 //    replaceAll
   1749 //
   1750 //--------------------------------------------------------------------------------
   1751 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
   1752     UText replacementText = UTEXT_INITIALIZER;
   1753     UText resultText = UTEXT_INITIALIZER;
   1754     UnicodeString resultString;
   1755     if (U_FAILURE(status)) {
   1756         return resultString;
   1757     }
   1758 
   1759     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1760     utext_openUnicodeString(&resultText, &resultString, &status);
   1761 
   1762     replaceAll(&replacementText, &resultText, status);
   1763 
   1764     utext_close(&resultText);
   1765     utext_close(&replacementText);
   1766 
   1767     return resultString;
   1768 }
   1769 
   1770 
   1771 //
   1772 //    replaceAll, UText mode
   1773 //
   1774 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
   1775     if (U_FAILURE(status)) {
   1776         return dest;
   1777     }
   1778     if (U_FAILURE(fDeferredStatus)) {
   1779         status = fDeferredStatus;
   1780         return dest;
   1781     }
   1782 
   1783     if (dest == NULL) {
   1784         UnicodeString emptyString;
   1785         UText empty = UTEXT_INITIALIZER;
   1786 
   1787         utext_openUnicodeString(&empty, &emptyString, &status);
   1788         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1789         utext_close(&empty);
   1790     }
   1791 
   1792     if (U_SUCCESS(status)) {
   1793         reset();
   1794         while (find()) {
   1795             appendReplacement(dest, replacement, status);
   1796             if (U_FAILURE(status)) {
   1797                 break;
   1798             }
   1799         }
   1800         appendTail(dest, status);
   1801     }
   1802 
   1803     return dest;
   1804 }
   1805 
   1806 
   1807 //--------------------------------------------------------------------------------
   1808 //
   1809 //    replaceFirst
   1810 //
   1811 //--------------------------------------------------------------------------------
   1812 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
   1813     UText replacementText = UTEXT_INITIALIZER;
   1814     UText resultText = UTEXT_INITIALIZER;
   1815     UnicodeString resultString;
   1816 
   1817     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1818     utext_openUnicodeString(&resultText, &resultString, &status);
   1819 
   1820     replaceFirst(&replacementText, &resultText, status);
   1821 
   1822     utext_close(&resultText);
   1823     utext_close(&replacementText);
   1824 
   1825     return resultString;
   1826 }
   1827 
   1828 //
   1829 //    replaceFirst, UText mode
   1830 //
   1831 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
   1832     if (U_FAILURE(status)) {
   1833         return dest;
   1834     }
   1835     if (U_FAILURE(fDeferredStatus)) {
   1836         status = fDeferredStatus;
   1837         return dest;
   1838     }
   1839 
   1840     reset();
   1841     if (!find()) {
   1842         return getInput(dest, status);
   1843     }
   1844 
   1845     if (dest == NULL) {
   1846         UnicodeString emptyString;
   1847         UText empty = UTEXT_INITIALIZER;
   1848 
   1849         utext_openUnicodeString(&empty, &emptyString, &status);
   1850         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1851         utext_close(&empty);
   1852     }
   1853 
   1854     appendReplacement(dest, replacement, status);
   1855     appendTail(dest, status);
   1856 
   1857     return dest;
   1858 }
   1859 
   1860 
   1861 //--------------------------------------------------------------------------------
   1862 //
   1863 //     requireEnd
   1864 //
   1865 //--------------------------------------------------------------------------------
   1866 UBool RegexMatcher::requireEnd() const {
   1867     return fRequireEnd;
   1868 }
   1869 
   1870 
   1871 //--------------------------------------------------------------------------------
   1872 //
   1873 //     reset
   1874 //
   1875 //--------------------------------------------------------------------------------
   1876 RegexMatcher &RegexMatcher::reset() {
   1877     fRegionStart    = 0;
   1878     fRegionLimit    = fInputLength;
   1879     fActiveStart    = 0;
   1880     fActiveLimit    = fInputLength;
   1881     fAnchorStart    = 0;
   1882     fAnchorLimit    = fInputLength;
   1883     fLookStart      = 0;
   1884     fLookLimit      = fInputLength;
   1885     resetPreserveRegion();
   1886     return *this;
   1887 }
   1888 
   1889 
   1890 
   1891 void RegexMatcher::resetPreserveRegion() {
   1892     fMatchStart     = 0;
   1893     fMatchEnd       = 0;
   1894     fLastMatchEnd   = -1;
   1895     fAppendPosition = 0;
   1896     fMatch          = FALSE;
   1897     fHitEnd         = FALSE;
   1898     fRequireEnd     = FALSE;
   1899     fTime           = 0;
   1900     fTickCounter    = TIMER_INITIAL_VALUE;
   1901     //resetStack(); // more expensive than it looks...
   1902 }
   1903 
   1904 
   1905 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
   1906     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
   1907     if (fPattern->fNeedsAltInput) {
   1908         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1909     }
   1910     fInputLength = utext_nativeLength(fInputText);
   1911 
   1912     reset();
   1913     delete fInput;
   1914     fInput = NULL;
   1915 
   1916     //  Do the following for any UnicodeString.
   1917     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
   1918     fInputUniStrMaybeMutable = TRUE;
   1919 
   1920     if (fWordBreakItr != NULL) {
   1921 #if UCONFIG_NO_BREAK_ITERATION==0
   1922         UErrorCode status = U_ZERO_ERROR;
   1923         fWordBreakItr->setText(fInputText, status);
   1924 #endif
   1925     }
   1926     return *this;
   1927 }
   1928 
   1929 
   1930 RegexMatcher &RegexMatcher::reset(UText *input) {
   1931     if (fInputText != input) {
   1932         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
   1933         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1934         fInputLength = utext_nativeLength(fInputText);
   1935 
   1936         delete fInput;
   1937         fInput = NULL;
   1938 
   1939         if (fWordBreakItr != NULL) {
   1940 #if UCONFIG_NO_BREAK_ITERATION==0
   1941             UErrorCode status = U_ZERO_ERROR;
   1942             fWordBreakItr->setText(input, status);
   1943 #endif
   1944         }
   1945     }
   1946     reset();
   1947     fInputUniStrMaybeMutable = FALSE;
   1948 
   1949     return *this;
   1950 }
   1951 
   1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
   1953     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
   1954     return *this;
   1955 }*/
   1956 
   1957 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
   1958     if (U_FAILURE(status)) {
   1959         return *this;
   1960     }
   1961     reset();       // Reset also resets the region to be the entire string.
   1962 
   1963     if (position < 0 || position > fActiveLimit) {
   1964         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1965         return *this;
   1966     }
   1967     fMatchEnd = position;
   1968     return *this;
   1969 }
   1970 
   1971 
   1972 //--------------------------------------------------------------------------------
   1973 //
   1974 //    refresh
   1975 //
   1976 //--------------------------------------------------------------------------------
   1977 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
   1978     if (U_FAILURE(status)) {
   1979         return *this;
   1980     }
   1981     if (input == NULL) {
   1982         status = U_ILLEGAL_ARGUMENT_ERROR;
   1983         return *this;
   1984     }
   1985     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
   1986         status = U_ILLEGAL_ARGUMENT_ERROR;
   1987         return *this;
   1988     }
   1989     int64_t  pos = utext_getNativeIndex(fInputText);
   1990     //  Shallow read-only clone of the new UText into the existing input UText
   1991     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
   1992     if (U_FAILURE(status)) {
   1993         return *this;
   1994     }
   1995     utext_setNativeIndex(fInputText, pos);
   1996 
   1997     if (fAltInputText != NULL) {
   1998         pos = utext_getNativeIndex(fAltInputText);
   1999         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
   2000         if (U_FAILURE(status)) {
   2001             return *this;
   2002         }
   2003         utext_setNativeIndex(fAltInputText, pos);
   2004     }
   2005     return *this;
   2006 }
   2007 
   2008 
   2009 
   2010 //--------------------------------------------------------------------------------
   2011 //
   2012 //    setTrace
   2013 //
   2014 //--------------------------------------------------------------------------------
   2015 void RegexMatcher::setTrace(UBool state) {
   2016     fTraceDebug = state;
   2017 }
   2018 
   2019 
   2020 
   2021 //---------------------------------------------------------------------
   2022 //
   2023 //   split
   2024 //
   2025 //---------------------------------------------------------------------
   2026 int32_t  RegexMatcher::split(const UnicodeString &input,
   2027         UnicodeString    dest[],
   2028         int32_t          destCapacity,
   2029         UErrorCode      &status)
   2030 {
   2031     UText inputText = UTEXT_INITIALIZER;
   2032     utext_openConstUnicodeString(&inputText, &input, &status);
   2033     if (U_FAILURE(status)) {
   2034         return 0;
   2035     }
   2036 
   2037     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
   2038     if (destText == NULL) {
   2039         status = U_MEMORY_ALLOCATION_ERROR;
   2040         return 0;
   2041     }
   2042     int32_t i;
   2043     for (i = 0; i < destCapacity; i++) {
   2044         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
   2045     }
   2046 
   2047     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
   2048 
   2049     for (i = 0; i < destCapacity; i++) {
   2050         utext_close(destText[i]);
   2051     }
   2052 
   2053     uprv_free(destText);
   2054     utext_close(&inputText);
   2055     return fieldCount;
   2056 }
   2057 
   2058 //
   2059 //   split, UText mode
   2060 //
   2061 int32_t  RegexMatcher::split(UText *input,
   2062         UText           *dest[],
   2063         int32_t          destCapacity,
   2064         UErrorCode      &status)
   2065 {
   2066     //
   2067     // Check arguements for validity
   2068     //
   2069     if (U_FAILURE(status)) {
   2070         return 0;
   2071     };
   2072 
   2073     if (destCapacity < 1) {
   2074         status = U_ILLEGAL_ARGUMENT_ERROR;
   2075         return 0;
   2076     }
   2077 
   2078     //
   2079     // Reset for the input text
   2080     //
   2081     reset(input);
   2082     int64_t   nextOutputStringStart = 0;
   2083     if (fActiveLimit == 0) {
   2084         return 0;
   2085     }
   2086 
   2087     //
   2088     // Loop through the input text, searching for the delimiter pattern
   2089     //
   2090     int32_t i;
   2091     int32_t numCaptureGroups = fPattern->fGroupMap->size();
   2092     for (i=0; ; i++) {
   2093         if (i>=destCapacity-1) {
   2094             // There is one or zero output string left.
   2095             // Fill the last output string with whatever is left from the input, then exit the loop.
   2096             //  ( i will be == destCapacity if we filled the output array while processing
   2097             //    capture groups of the delimiter expression, in which case we will discard the
   2098             //    last capture group saved in favor of the unprocessed remainder of the
   2099             //    input string.)
   2100             i = destCapacity-1;
   2101             if (fActiveLimit > nextOutputStringStart) {
   2102                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2103                     if (dest[i]) {
   2104                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2105                                       input->chunkContents+nextOutputStringStart,
   2106                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2107                     } else {
   2108                         UText remainingText = UTEXT_INITIALIZER;
   2109                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2110                                          fActiveLimit-nextOutputStringStart, &status);
   2111                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2112                         utext_close(&remainingText);
   2113                     }
   2114                 } else {
   2115                     UErrorCode lengthStatus = U_ZERO_ERROR;
   2116                     int32_t remaining16Length =
   2117                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2118                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2119                     if (remainingChars == NULL) {
   2120                         status = U_MEMORY_ALLOCATION_ERROR;
   2121                         break;
   2122                     }
   2123 
   2124                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2125                     if (dest[i]) {
   2126                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2127                     } else {
   2128                         UText remainingText = UTEXT_INITIALIZER;
   2129                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2130                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2131                         utext_close(&remainingText);
   2132                     }
   2133 
   2134                     uprv_free(remainingChars);
   2135                 }
   2136             }
   2137             break;
   2138         }
   2139         if (find()) {
   2140             // We found another delimiter.  Move everything from where we started looking
   2141             //  up until the start of the delimiter into the next output string.
   2142             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2143                 if (dest[i]) {
   2144                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2145                                   input->chunkContents+nextOutputStringStart,
   2146                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
   2147                 } else {
   2148                     UText remainingText = UTEXT_INITIALIZER;
   2149                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2150                                       fMatchStart-nextOutputStringStart, &status);
   2151                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2152                     utext_close(&remainingText);
   2153                 }
   2154             } else {
   2155                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2156                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
   2157                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2158                 if (remainingChars == NULL) {
   2159                     status = U_MEMORY_ALLOCATION_ERROR;
   2160                     break;
   2161                 }
   2162                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
   2163                 if (dest[i]) {
   2164                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2165                 } else {
   2166                     UText remainingText = UTEXT_INITIALIZER;
   2167                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2168                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2169                     utext_close(&remainingText);
   2170                 }
   2171 
   2172                 uprv_free(remainingChars);
   2173             }
   2174             nextOutputStringStart = fMatchEnd;
   2175 
   2176             // If the delimiter pattern has capturing parentheses, the captured
   2177             //  text goes out into the next n destination strings.
   2178             int32_t groupNum;
   2179             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   2180                 if (i >= destCapacity-2) {
   2181                     // Never fill the last available output string with capture group text.
   2182                     // It will filled with the last field, the remainder of the
   2183                     //  unsplit input text.
   2184                     break;
   2185                 }
   2186                 i++;
   2187                 dest[i] = group(groupNum, dest[i], status);
   2188             }
   2189 
   2190             if (nextOutputStringStart == fActiveLimit) {
   2191                 // The delimiter was at the end of the string.  We're done, but first
   2192                 // we output one last empty string, for the empty field following
   2193                 //   the delimiter at the end of input.
   2194                 if (i+1 < destCapacity) {
   2195                     ++i;
   2196                     if (dest[i] == NULL) {
   2197                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
   2198                     } else {
   2199                         static UChar emptyString[] = {(UChar)0};
   2200                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
   2201                     }
   2202                 }
   2203                 break;
   2204 
   2205             }
   2206         }
   2207         else
   2208         {
   2209             // We ran off the end of the input while looking for the next delimiter.
   2210             // All the remaining text goes into the current output string.
   2211             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2212                 if (dest[i]) {
   2213                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2214                                   input->chunkContents+nextOutputStringStart,
   2215                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2216                 } else {
   2217                     UText remainingText = UTEXT_INITIALIZER;
   2218                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2219                                      fActiveLimit-nextOutputStringStart, &status);
   2220                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2221                     utext_close(&remainingText);
   2222                 }
   2223             } else {
   2224                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2225                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2226                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2227                 if (remainingChars == NULL) {
   2228                     status = U_MEMORY_ALLOCATION_ERROR;
   2229                     break;
   2230                 }
   2231 
   2232                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2233                 if (dest[i]) {
   2234                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2235                 } else {
   2236                     UText remainingText = UTEXT_INITIALIZER;
   2237                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2238                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2239                     utext_close(&remainingText);
   2240                 }
   2241 
   2242                 uprv_free(remainingChars);
   2243             }
   2244             break;
   2245         }
   2246         if (U_FAILURE(status)) {
   2247             break;
   2248         }
   2249     }   // end of for loop
   2250     return i+1;
   2251 }
   2252 
   2253 
   2254 //--------------------------------------------------------------------------------
   2255 //
   2256 //     start
   2257 //
   2258 //--------------------------------------------------------------------------------
   2259 int32_t RegexMatcher::start(UErrorCode &status) const {
   2260     return start(0, status);
   2261 }
   2262 
   2263 int64_t RegexMatcher::start64(UErrorCode &status) const {
   2264     return start64(0, status);
   2265 }
   2266 
   2267 //--------------------------------------------------------------------------------
   2268 //
   2269 //     start(int32_t group, UErrorCode &status)
   2270 //
   2271 //--------------------------------------------------------------------------------
   2272 
   2273 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
   2274     if (U_FAILURE(status)) {
   2275         return -1;
   2276     }
   2277     if (U_FAILURE(fDeferredStatus)) {
   2278         status = fDeferredStatus;
   2279         return -1;
   2280     }
   2281     if (fMatch == FALSE) {
   2282         status = U_REGEX_INVALID_STATE;
   2283         return -1;
   2284     }
   2285     if (group < 0 || group > fPattern->fGroupMap->size()) {
   2286         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2287         return -1;
   2288     }
   2289     int64_t s;
   2290     if (group == 0) {
   2291         s = fMatchStart;
   2292     } else {
   2293         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   2294         U_ASSERT(groupOffset < fPattern->fFrameSize);
   2295         U_ASSERT(groupOffset >= 0);
   2296         s = fFrame->fExtra[groupOffset];
   2297     }
   2298 
   2299     return s;
   2300 }
   2301 
   2302 
   2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
   2304     return (int32_t)start64(group, status);
   2305 }
   2306 
   2307 //--------------------------------------------------------------------------------
   2308 //
   2309 //     useAnchoringBounds
   2310 //
   2311 //--------------------------------------------------------------------------------
   2312 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
   2313     fAnchoringBounds = b;
   2314     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
   2315     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
   2316     return *this;
   2317 }
   2318 
   2319 
   2320 //--------------------------------------------------------------------------------
   2321 //
   2322 //     useTransparentBounds
   2323 //
   2324 //--------------------------------------------------------------------------------
   2325 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
   2326     fTransparentBounds = b;
   2327     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
   2328     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
   2329     return *this;
   2330 }
   2331 
   2332 //--------------------------------------------------------------------------------
   2333 //
   2334 //     setTimeLimit
   2335 //
   2336 //--------------------------------------------------------------------------------
   2337 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
   2338     if (U_FAILURE(status)) {
   2339         return;
   2340     }
   2341     if (U_FAILURE(fDeferredStatus)) {
   2342         status = fDeferredStatus;
   2343         return;
   2344     }
   2345     if (limit < 0) {
   2346         status = U_ILLEGAL_ARGUMENT_ERROR;
   2347         return;
   2348     }
   2349     fTimeLimit = limit;
   2350 }
   2351 
   2352 
   2353 //--------------------------------------------------------------------------------
   2354 //
   2355 //     getTimeLimit
   2356 //
   2357 //--------------------------------------------------------------------------------
   2358 int32_t RegexMatcher::getTimeLimit() const {
   2359     return fTimeLimit;
   2360 }
   2361 
   2362 
   2363 //--------------------------------------------------------------------------------
   2364 //
   2365 //     setStackLimit
   2366 //
   2367 //--------------------------------------------------------------------------------
   2368 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
   2369     if (U_FAILURE(status)) {
   2370         return;
   2371     }
   2372     if (U_FAILURE(fDeferredStatus)) {
   2373         status = fDeferredStatus;
   2374         return;
   2375     }
   2376     if (limit < 0) {
   2377         status = U_ILLEGAL_ARGUMENT_ERROR;
   2378         return;
   2379     }
   2380 
   2381     // Reset the matcher.  This is needed here in case there is a current match
   2382     //    whose final stack frame (containing the match results, pointed to by fFrame)
   2383     //    would be lost by resizing to a smaller stack size.
   2384     reset();
   2385 
   2386     if (limit == 0) {
   2387         // Unlimited stack expansion
   2388         fStack->setMaxCapacity(0);
   2389     } else {
   2390         // Change the units of the limit  from bytes to ints, and bump the size up
   2391         //   to be big enough to hold at least one stack frame for the pattern,
   2392         //   if it isn't there already.
   2393         int32_t adjustedLimit = limit / sizeof(int32_t);
   2394         if (adjustedLimit < fPattern->fFrameSize) {
   2395             adjustedLimit = fPattern->fFrameSize;
   2396         }
   2397         fStack->setMaxCapacity(adjustedLimit);
   2398     }
   2399     fStackLimit = limit;
   2400 }
   2401 
   2402 
   2403 //--------------------------------------------------------------------------------
   2404 //
   2405 //     getStackLimit
   2406 //
   2407 //--------------------------------------------------------------------------------
   2408 int32_t RegexMatcher::getStackLimit() const {
   2409     return fStackLimit;
   2410 }
   2411 
   2412 
   2413 //--------------------------------------------------------------------------------
   2414 //
   2415 //     setMatchCallback
   2416 //
   2417 //--------------------------------------------------------------------------------
   2418 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
   2419                                     const void              *context,
   2420                                     UErrorCode              &status) {
   2421     if (U_FAILURE(status)) {
   2422         return;
   2423     }
   2424     fCallbackFn = callback;
   2425     fCallbackContext = context;
   2426 }
   2427 
   2428 
   2429 //--------------------------------------------------------------------------------
   2430 //
   2431 //     getMatchCallback
   2432 //
   2433 //--------------------------------------------------------------------------------
   2434 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
   2435                                   const void              *&context,
   2436                                   UErrorCode              &status) {
   2437     if (U_FAILURE(status)) {
   2438        return;
   2439     }
   2440     callback = fCallbackFn;
   2441     context  = fCallbackContext;
   2442 }
   2443 
   2444 
   2445 //--------------------------------------------------------------------------------
   2446 //
   2447 //     setMatchCallback
   2448 //
   2449 //--------------------------------------------------------------------------------
   2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
   2451                                                 const void                      *context,
   2452                                                 UErrorCode                      &status) {
   2453     if (U_FAILURE(status)) {
   2454         return;
   2455     }
   2456     fFindProgressCallbackFn = callback;
   2457     fFindProgressCallbackContext = context;
   2458 }
   2459 
   2460 
   2461 //--------------------------------------------------------------------------------
   2462 //
   2463 //     getMatchCallback
   2464 //
   2465 //--------------------------------------------------------------------------------
   2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
   2467                                                 const void                    *&context,
   2468                                                 UErrorCode                    &status) {
   2469     if (U_FAILURE(status)) {
   2470        return;
   2471     }
   2472     callback = fFindProgressCallbackFn;
   2473     context  = fFindProgressCallbackContext;
   2474 }
   2475 
   2476 
   2477 //================================================================================
   2478 //
   2479 //    Code following this point in this file is the internal
   2480 //    Match Engine Implementation.
   2481 //
   2482 //================================================================================
   2483 
   2484 
   2485 //--------------------------------------------------------------------------------
   2486 //
   2487 //   resetStack
   2488 //           Discard any previous contents of the state save stack, and initialize a
   2489 //           new stack frame to all -1.  The -1s are needed for capture group limits,
   2490 //           where they indicate that a group has not yet matched anything.
   2491 //--------------------------------------------------------------------------------
   2492 REStackFrame *RegexMatcher::resetStack() {
   2493     // Discard any previous contents of the state save stack, and initialize a
   2494     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
   2495     //  where they indicate that a group has not yet matched anything.
   2496     fStack->removeAllElements();
   2497 
   2498     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
   2499     int32_t i;
   2500     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
   2501         iFrame->fExtra[i] = -1;
   2502     }
   2503     return iFrame;
   2504 }
   2505 
   2506 
   2507 
   2508 //--------------------------------------------------------------------------------
   2509 //
   2510 //   isWordBoundary
   2511 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
   2512 //                     For us,
   2513 //                       If the current char is a combining mark,
   2514 //                          \b is FALSE.
   2515 //                       Else Scan backwards to the first non-combining char.
   2516 //                            We are at a boundary if the this char and the original chars are
   2517 //                               opposite in membership in \w set
   2518 //
   2519 //          parameters:   pos   - the current position in the input buffer
   2520 //
   2521 //              TODO:  double-check edge cases at region boundaries.
   2522 //
   2523 //--------------------------------------------------------------------------------
   2524 UBool RegexMatcher::isWordBoundary(int64_t pos) {
   2525     UBool isBoundary = FALSE;
   2526     UBool cIsWord    = FALSE;
   2527 
   2528     if (pos >= fLookLimit) {
   2529         fHitEnd = TRUE;
   2530     } else {
   2531         // Determine whether char c at current position is a member of the word set of chars.
   2532         // If we're off the end of the string, behave as though we're not at a word char.
   2533         UTEXT_SETNATIVEINDEX(fInputText, pos);
   2534         UChar32  c = UTEXT_CURRENT32(fInputText);
   2535         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2536             // Current char is a combining one.  Not a boundary.
   2537             return FALSE;
   2538         }
   2539         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2540     }
   2541 
   2542     // Back up until we come to a non-combining char, determine whether
   2543     //  that char is a word char.
   2544     UBool prevCIsWord = FALSE;
   2545     for (;;) {
   2546         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
   2547             break;
   2548         }
   2549         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
   2550         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2551               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2552             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2553             break;
   2554         }
   2555     }
   2556     isBoundary = cIsWord ^ prevCIsWord;
   2557     return isBoundary;
   2558 }
   2559 
   2560 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
   2561     UBool isBoundary = FALSE;
   2562     UBool cIsWord    = FALSE;
   2563 
   2564     const UChar *inputBuf = fInputText->chunkContents;
   2565 
   2566     if (pos >= fLookLimit) {
   2567         fHitEnd = TRUE;
   2568     } else {
   2569         // Determine whether char c at current position is a member of the word set of chars.
   2570         // If we're off the end of the string, behave as though we're not at a word char.
   2571         UChar32 c;
   2572         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
   2573         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2574             // Current char is a combining one.  Not a boundary.
   2575             return FALSE;
   2576         }
   2577         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2578     }
   2579 
   2580     // Back up until we come to a non-combining char, determine whether
   2581     //  that char is a word char.
   2582     UBool prevCIsWord = FALSE;
   2583     for (;;) {
   2584         if (pos <= fLookStart) {
   2585             break;
   2586         }
   2587         UChar32 prevChar;
   2588         U16_PREV(inputBuf, fLookStart, pos, prevChar);
   2589         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2590               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2591             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2592             break;
   2593         }
   2594     }
   2595     isBoundary = cIsWord ^ prevCIsWord;
   2596     return isBoundary;
   2597 }
   2598 
   2599 //--------------------------------------------------------------------------------
   2600 //
   2601 //   isUWordBoundary
   2602 //
   2603 //         Test for a word boundary using RBBI word break.
   2604 //
   2605 //          parameters:   pos   - the current position in the input buffer
   2606 //
   2607 //--------------------------------------------------------------------------------
   2608 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
   2609     UBool       returnVal = FALSE;
   2610 #if UCONFIG_NO_BREAK_ITERATION==0
   2611 
   2612     // If we haven't yet created a break iterator for this matcher, do it now.
   2613     if (fWordBreakItr == NULL) {
   2614         fWordBreakItr =
   2615             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
   2616         if (U_FAILURE(fDeferredStatus)) {
   2617             return FALSE;
   2618         }
   2619         fWordBreakItr->setText(fInputText, fDeferredStatus);
   2620     }
   2621 
   2622     if (pos >= fLookLimit) {
   2623         fHitEnd = TRUE;
   2624         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
   2625                             //    words are not boundaries.  All non-word chars stand by themselves,
   2626                             //    with word boundaries on both sides.
   2627     } else {
   2628         if (!UTEXT_USES_U16(fInputText)) {
   2629             // !!!: Would like a better way to do this!
   2630             UErrorCode status = U_ZERO_ERROR;
   2631             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
   2632         }
   2633         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
   2634     }
   2635 #endif
   2636     return   returnVal;
   2637 }
   2638 
   2639 //--------------------------------------------------------------------------------
   2640 //
   2641 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
   2642 //                     saves. Increment the "time" counter, and call the
   2643 //                     user callback function if there is one installed.
   2644 //
   2645 //                     If the match operation needs to be aborted, either for a time-out
   2646 //                     or because the user callback asked for it, just set an error status.
   2647 //                     The engine will pick that up and stop in its outer loop.
   2648 //
   2649 //--------------------------------------------------------------------------------
   2650 void RegexMatcher::IncrementTime(UErrorCode &status) {
   2651     fTickCounter = TIMER_INITIAL_VALUE;
   2652     fTime++;
   2653     if (fCallbackFn != NULL) {
   2654         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
   2655             status = U_REGEX_STOPPED_BY_CALLER;
   2656             return;
   2657         }
   2658     }
   2659     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
   2660         status = U_REGEX_TIME_OUT;
   2661     }
   2662 }
   2663 
   2664 //--------------------------------------------------------------------------------
   2665 //
   2666 //   ReportFindProgress     This function is called once for each advance in the target
   2667 //                          string from the find() function, and calls the user progress callback
   2668 //                          function if there is one installed.
   2669 //
   2670 //                          NOTE:
   2671 //
   2672 //                          If the match operation needs to be aborted because the user
   2673 //                          callback asked for it, just set an error status.
   2674 //                          The engine will pick that up and stop in its outer loop.
   2675 //
   2676 //--------------------------------------------------------------------------------
   2677 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
   2678     if (fFindProgressCallbackFn != NULL) {
   2679         if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
   2680             status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
   2681             return FALSE;
   2682         }
   2683     }
   2684     return TRUE;
   2685 }
   2686 
   2687 //--------------------------------------------------------------------------------
   2688 //
   2689 //   StateSave
   2690 //       Make a new stack frame, initialized as a copy of the current stack frame.
   2691 //       Set the pattern index in the original stack frame from the operand value
   2692 //       in the opcode.  Execution of the engine continues with the state in
   2693 //       the newly created stack frame
   2694 //
   2695 //       Note that reserveBlock() may grow the stack, resulting in the
   2696 //       whole thing being relocated in memory.
   2697 //
   2698 //    Parameters:
   2699 //       fp           The top frame pointer when called.  At return, a new
   2700 //                    fame will be present
   2701 //       savePatIdx   An index into the compiled pattern.  Goes into the original
   2702 //                    (not new) frame.  If execution ever back-tracks out of the
   2703 //                    new frame, this will be where we continue from in the pattern.
   2704 //    Return
   2705 //                    The new frame pointer.
   2706 //
   2707 //--------------------------------------------------------------------------------
   2708 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
   2709     // push storage for a new frame.
   2710     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
   2711     if (newFP == NULL) {
   2712         // Failure on attempted stack expansion.
   2713         //   Stack function set some other error code, change it to a more
   2714         //   specific one for regular expressions.
   2715         status = U_REGEX_STACK_OVERFLOW;
   2716         // We need to return a writable stack frame, so just return the
   2717         //    previous frame.  The match operation will stop quickly
   2718         //    because of the error status, after which the frame will never
   2719         //    be looked at again.
   2720         return fp;
   2721     }
   2722     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
   2723 
   2724     // New stack frame = copy of old top frame.
   2725     int64_t *source = (int64_t *)fp;
   2726     int64_t *dest   = newFP;
   2727     for (;;) {
   2728         *dest++ = *source++;
   2729         if (source == newFP) {
   2730             break;
   2731         }
   2732     }
   2733 
   2734     fTickCounter--;
   2735     if (fTickCounter <= 0) {
   2736        IncrementTime(status);    // Re-initializes fTickCounter
   2737     }
   2738     fp->fPatIdx = savePatIdx;
   2739     return (REStackFrame *)newFP;
   2740 }
   2741 
   2742 
   2743 //--------------------------------------------------------------------------------
   2744 //
   2745 //   MatchAt      This is the actual matching engine.
   2746 //
   2747 //                  startIdx:    begin matching a this index.
   2748 //                  toEnd:       if true, match must extend to end of the input region
   2749 //
   2750 //--------------------------------------------------------------------------------
   2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
   2752     UBool       isMatch  = FALSE;      // True if the we have a match.
   2753 
   2754     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
   2755 
   2756     int32_t     op;                    // Operation from the compiled pattern, split into
   2757     int32_t     opType;                //    the opcode
   2758     int32_t     opValue;               //    and the operand value.
   2759 
   2760     #ifdef REGEX_RUN_DEBUG
   2761     if (fTraceDebug)
   2762     {
   2763         printf("MatchAt(startIdx=%ld)\n", startIdx);
   2764         printf("Original Pattern: ");
   2765         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   2766         while (c != U_SENTINEL) {
   2767             if (c<32 || c>256) {
   2768                 c = '.';
   2769             }
   2770             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   2771 
   2772             c = UTEXT_NEXT32(fPattern->fPattern);
   2773         }
   2774         printf("\n");
   2775         printf("Input String: ");
   2776         c = utext_next32From(fInputText, 0);
   2777         while (c != U_SENTINEL) {
   2778             if (c<32 || c>256) {
   2779                 c = '.';
   2780             }
   2781             printf("%c", c);
   2782 
   2783             c = UTEXT_NEXT32(fInputText);
   2784         }
   2785         printf("\n");
   2786         printf("\n");
   2787     }
   2788     #endif
   2789 
   2790     if (U_FAILURE(status)) {
   2791         return;
   2792     }
   2793 
   2794     //  Cache frequently referenced items from the compiled pattern
   2795     //
   2796     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   2797 
   2798     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   2799     UVector             *sets          = fPattern->fSets;
   2800 
   2801     fFrameSize = fPattern->fFrameSize;
   2802     REStackFrame        *fp            = resetStack();
   2803 
   2804     fp->fPatIdx   = 0;
   2805     fp->fInputIdx = startIdx;
   2806 
   2807     // Zero out the pattern's static data
   2808     int32_t i;
   2809     for (i = 0; i<fPattern->fDataSize; i++) {
   2810         fData[i] = 0;
   2811     }
   2812 
   2813     //
   2814     //  Main loop for interpreting the compiled pattern.
   2815     //  One iteration of the loop per pattern operation performed.
   2816     //
   2817     for (;;) {
   2818 #if 0
   2819         if (_heapchk() != _HEAPOK) {
   2820             fprintf(stderr, "Heap Trouble\n");
   2821         }
   2822 #endif
   2823 
   2824         op      = (int32_t)pat[fp->fPatIdx];
   2825         opType  = URX_TYPE(op);
   2826         opValue = URX_VAL(op);
   2827         #ifdef REGEX_RUN_DEBUG
   2828         if (fTraceDebug) {
   2829             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2830             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
   2831                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   2832             fPattern->dumpOp(fp->fPatIdx);
   2833         }
   2834         #endif
   2835         fp->fPatIdx++;
   2836 
   2837         switch (opType) {
   2838 
   2839 
   2840         case URX_NOP:
   2841             break;
   2842 
   2843 
   2844         case URX_BACKTRACK:
   2845             // Force a backtrack.  In some circumstances, the pattern compiler
   2846             //   will notice that the pattern can't possibly match anything, and will
   2847             //   emit one of these at that point.
   2848             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2849             break;
   2850 
   2851 
   2852         case URX_ONECHAR:
   2853             if (fp->fInputIdx < fActiveLimit) {
   2854                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2855                 UChar32 c = UTEXT_NEXT32(fInputText);
   2856                 if (c == opValue) {
   2857                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2858                     break;
   2859                 }
   2860             } else {
   2861                 fHitEnd = TRUE;
   2862             }
   2863             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2864             break;
   2865 
   2866 
   2867         case URX_STRING:
   2868             {
   2869                 // Test input against a literal string.
   2870                 // Strings require two slots in the compiled pattern, one for the
   2871                 //   offset to the string text, and one for the length.
   2872 
   2873                 int32_t   stringStartIdx = opValue;
   2874                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   2875                 fp->fPatIdx++;
   2876                 opType    = URX_TYPE(op);
   2877                 int32_t stringLen = URX_VAL(op);
   2878                 U_ASSERT(opType == URX_STRING_LEN);
   2879                 U_ASSERT(stringLen >= 2);
   2880 
   2881                 const UChar *patternString = litText+stringStartIdx;
   2882                 int32_t patternStringIndex = 0;
   2883                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2884                 UChar32 inputChar;
   2885                 UChar32 patternChar;
   2886                 UBool success = TRUE;
   2887                 while (patternStringIndex < stringLen) {
   2888                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
   2889                         success = FALSE;
   2890                         fHitEnd = TRUE;
   2891                         break;
   2892                     }
   2893                     inputChar = UTEXT_NEXT32(fInputText);
   2894                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
   2895                     if (patternChar != inputChar) {
   2896                         success = FALSE;
   2897                         break;
   2898                     }
   2899                 }
   2900 
   2901                 if (success) {
   2902                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2903                 } else {
   2904                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2905                 }
   2906             }
   2907             break;
   2908 
   2909 
   2910         case URX_STATE_SAVE:
   2911             fp = StateSave(fp, opValue, status);
   2912             break;
   2913 
   2914 
   2915         case URX_END:
   2916             // The match loop will exit via this path on a successful match,
   2917             //   when we reach the end of the pattern.
   2918             if (toEnd && fp->fInputIdx != fActiveLimit) {
   2919                 // The pattern matched, but not to the end of input.  Try some more.
   2920                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2921                 break;
   2922             }
   2923             isMatch = TRUE;
   2924             goto  breakFromLoop;
   2925 
   2926         // Start and End Capture stack frame variables are laid out out like this:
   2927             //  fp->fExtra[opValue]  - The start of a completed capture group
   2928             //             opValue+1 - The end   of a completed capture group
   2929             //             opValue+2 - the start of a capture group whose end
   2930             //                          has not yet been reached (and might not ever be).
   2931         case URX_START_CAPTURE:
   2932             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2933             fp->fExtra[opValue+2] = fp->fInputIdx;
   2934             break;
   2935 
   2936 
   2937         case URX_END_CAPTURE:
   2938             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2939             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   2940             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   2941             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   2942             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   2943             break;
   2944 
   2945 
   2946         case URX_DOLLAR:                   //  $, test for End of line
   2947                                            //     or for position before new line at end of input
   2948             {
   2949                 if (fp->fInputIdx >= fAnchorLimit) {
   2950                     // We really are at the end of input.  Success.
   2951                     fHitEnd = TRUE;
   2952                     fRequireEnd = TRUE;
   2953                     break;
   2954                 }
   2955 
   2956                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2957 
   2958                 // If we are positioned just before a new-line that is located at the
   2959                 //   end of input, succeed.
   2960                 UChar32 c = UTEXT_NEXT32(fInputText);
   2961                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2962                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   2963                         // If not in the middle of a CR/LF sequence
   2964                       if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
   2965                             // At new-line at end of input. Success
   2966                             fHitEnd = TRUE;
   2967                             fRequireEnd = TRUE;
   2968 
   2969                             break;
   2970                         }
   2971                     }
   2972                 } else {
   2973                     UChar32 nextC = UTEXT_NEXT32(fInputText);
   2974                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2975                         fHitEnd = TRUE;
   2976                         fRequireEnd = TRUE;
   2977                         break;                         // At CR/LF at end of input.  Success
   2978                     }
   2979                 }
   2980 
   2981                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2982             }
   2983             break;
   2984 
   2985 
   2986          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   2987             if (fp->fInputIdx >= fAnchorLimit) {
   2988                 // Off the end of input.  Success.
   2989                 fHitEnd = TRUE;
   2990                 fRequireEnd = TRUE;
   2991                 break;
   2992             } else {
   2993                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2994                 UChar32 c = UTEXT_NEXT32(fInputText);
   2995                 // Either at the last character of input, or off the end.
   2996                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
   2997                     fHitEnd = TRUE;
   2998                     fRequireEnd = TRUE;
   2999                     break;
   3000                 }
   3001             }
   3002 
   3003             // Not at end of input.  Back-track out.
   3004             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3005             break;
   3006 
   3007 
   3008          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   3009              {
   3010                  if (fp->fInputIdx >= fAnchorLimit) {
   3011                      // We really are at the end of input.  Success.
   3012                      fHitEnd = TRUE;
   3013                      fRequireEnd = TRUE;
   3014                      break;
   3015                  }
   3016                  // If we are positioned just before a new-line, succeed.
   3017                  // It makes no difference where the new-line is within the input.
   3018                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3019                  UChar32 c = UTEXT_CURRENT32(fInputText);
   3020                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   3021                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   3022                      //  In multi-line mode, hitting a new-line just before the end of input does not
   3023                      //   set the hitEnd or requireEnd flags
   3024                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
   3025                         break;
   3026                      }
   3027                  }
   3028                  // not at a new line.  Fail.
   3029                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3030              }
   3031              break;
   3032 
   3033 
   3034          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   3035              {
   3036                  if (fp->fInputIdx >= fAnchorLimit) {
   3037                      // We really are at the end of input.  Success.
   3038                      fHitEnd = TRUE;
   3039                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   3040                      break;               //   adding a new-line would not lose the match.
   3041                  }
   3042                  // If we are not positioned just before a new-line, the test fails; backtrack out.
   3043                  // It makes no difference where the new-line is within the input.
   3044                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3045                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
   3046                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3047                  }
   3048              }
   3049              break;
   3050 
   3051 
   3052        case URX_CARET:                    //  ^, test for start of line
   3053             if (fp->fInputIdx != fAnchorStart) {
   3054                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3055             }
   3056             break;
   3057 
   3058 
   3059        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   3060            {
   3061                if (fp->fInputIdx == fAnchorStart) {
   3062                    // We are at the start input.  Success.
   3063                    break;
   3064                }
   3065                // Check whether character just before the current pos is a new-line
   3066                //   unless we are at the end of input
   3067                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3068                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3069                if ((fp->fInputIdx < fAnchorLimit) &&
   3070                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3071                    //  It's a new-line.  ^ is true.  Success.
   3072                    //  TODO:  what should be done with positions between a CR and LF?
   3073                    break;
   3074                }
   3075                // Not at the start of a line.  Fail.
   3076                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3077            }
   3078            break;
   3079 
   3080 
   3081        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   3082            {
   3083                U_ASSERT(fp->fInputIdx >= fAnchorStart);
   3084                if (fp->fInputIdx <= fAnchorStart) {
   3085                    // We are at the start input.  Success.
   3086                    break;
   3087                }
   3088                // Check whether character just before the current pos is a new-line
   3089                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   3090                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3091                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3092                if (c != 0x0a) {
   3093                    // Not at the start of a line.  Back-track out.
   3094                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3095                }
   3096            }
   3097            break;
   3098 
   3099         case URX_BACKSLASH_B:          // Test for word boundaries
   3100             {
   3101                 UBool success = isWordBoundary(fp->fInputIdx);
   3102                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   3103                 if (!success) {
   3104                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3105                 }
   3106             }
   3107             break;
   3108 
   3109 
   3110         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   3111             {
   3112                 UBool success = isUWordBoundary(fp->fInputIdx);
   3113                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   3114                 if (!success) {
   3115                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3116                 }
   3117             }
   3118             break;
   3119 
   3120 
   3121         case URX_BACKSLASH_D:            // Test for decimal digit
   3122             {
   3123                 if (fp->fInputIdx >= fActiveLimit) {
   3124                     fHitEnd = TRUE;
   3125                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3126                     break;
   3127                 }
   3128 
   3129                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3130 
   3131                 UChar32 c = UTEXT_NEXT32(fInputText);
   3132                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   3133                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   3134                 success ^= (UBool)(opValue != 0);        // flip sense for \D
   3135                 if (success) {
   3136                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3137                 } else {
   3138                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3139                 }
   3140             }
   3141             break;
   3142 
   3143 
   3144         case URX_BACKSLASH_G:          // Test for position at end of previous match
   3145             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   3146                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3147             }
   3148             break;
   3149 
   3150 
   3151         case URX_BACKSLASH_X:
   3152             //  Match a Grapheme, as defined by Unicode TR 29.
   3153             //  Differs slightly from Perl, which consumes combining marks independently
   3154             //    of context.
   3155             {
   3156 
   3157                 // Fail if at end of input
   3158                 if (fp->fInputIdx >= fActiveLimit) {
   3159                     fHitEnd = TRUE;
   3160                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3161                     break;
   3162                 }
   3163 
   3164                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3165 
   3166                 // Examine (and consume) the current char.
   3167                 //   Dispatch into a little state machine, based on the char.
   3168                 UChar32  c;
   3169                 c = UTEXT_NEXT32(fInputText);
   3170                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3171                 UnicodeSet **sets = fPattern->fStaticSets;
   3172                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   3173                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   3174                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3175                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3176                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3177                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3178                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3179                 goto GC_Extend;
   3180 
   3181 
   3182 
   3183 GC_L:
   3184                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3185                 c = UTEXT_NEXT32(fInputText);
   3186                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3187                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3188                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3189                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3190                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3191                 (void)UTEXT_PREVIOUS32(fInputText);
   3192                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3193                 goto GC_Extend;
   3194 
   3195 GC_V:
   3196                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3197                 c = UTEXT_NEXT32(fInputText);
   3198                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3199                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3200                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3201                 (void)UTEXT_PREVIOUS32(fInputText);
   3202                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3203                 goto GC_Extend;
   3204 
   3205 GC_T:
   3206                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3207                 c = UTEXT_NEXT32(fInputText);
   3208                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3209                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3210                 (void)UTEXT_PREVIOUS32(fInputText);
   3211                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3212                 goto GC_Extend;
   3213 
   3214 GC_Extend:
   3215                 // Combining characters are consumed here
   3216                 for (;;) {
   3217                     if (fp->fInputIdx >= fActiveLimit) {
   3218                         break;
   3219                     }
   3220                     c = UTEXT_CURRENT32(fInputText);
   3221                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   3222                         break;
   3223                     }
   3224                     (void)UTEXT_NEXT32(fInputText);
   3225                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3226                 }
   3227                 goto GC_Done;
   3228 
   3229 GC_Control:
   3230                 // Most control chars stand alone (don't combine with combining chars),
   3231                 //   except for that CR/LF sequence is a single grapheme cluster.
   3232                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
   3233                     c = UTEXT_NEXT32(fInputText);
   3234                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3235                 }
   3236 
   3237 GC_Done:
   3238                 if (fp->fInputIdx >= fActiveLimit) {
   3239                     fHitEnd = TRUE;
   3240                 }
   3241                 break;
   3242             }
   3243 
   3244 
   3245 
   3246 
   3247         case URX_BACKSLASH_Z:          // Test for end of Input
   3248             if (fp->fInputIdx < fAnchorLimit) {
   3249                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3250             } else {
   3251                 fHitEnd = TRUE;
   3252                 fRequireEnd = TRUE;
   3253             }
   3254             break;
   3255 
   3256 
   3257 
   3258         case URX_STATIC_SETREF:
   3259             {
   3260                 // Test input character against one of the predefined sets
   3261                 //    (Word Characters, for example)
   3262                 // The high bit of the op value is a flag for the match polarity.
   3263                 //    0:   success if input char is in set.
   3264                 //    1:   success if input char is not in set.
   3265                 if (fp->fInputIdx >= fActiveLimit) {
   3266                     fHitEnd = TRUE;
   3267                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3268                     break;
   3269                 }
   3270 
   3271                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   3272                 opValue &= ~URX_NEG_SET;
   3273                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3274 
   3275                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3276                 UChar32 c = UTEXT_NEXT32(fInputText);
   3277                 if (c < 256) {
   3278                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3279                     if (s8->contains(c)) {
   3280                         success = !success;
   3281                     }
   3282                 } else {
   3283                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3284                     if (s->contains(c)) {
   3285                         success = !success;
   3286                     }
   3287                 }
   3288                 if (success) {
   3289                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3290                 } else {
   3291                     // the character wasn't in the set.
   3292                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3293                 }
   3294             }
   3295             break;
   3296 
   3297 
   3298         case URX_STAT_SETREF_N:
   3299             {
   3300                 // Test input character for NOT being a member of  one of
   3301                 //    the predefined sets (Word Characters, for example)
   3302                 if (fp->fInputIdx >= fActiveLimit) {
   3303                     fHitEnd = TRUE;
   3304                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3305                     break;
   3306                 }
   3307 
   3308                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3309 
   3310                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3311 
   3312                 UChar32 c = UTEXT_NEXT32(fInputText);
   3313                 if (c < 256) {
   3314                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3315                     if (s8->contains(c) == FALSE) {
   3316                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3317                         break;
   3318                     }
   3319                 } else {
   3320                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3321                     if (s->contains(c) == FALSE) {
   3322                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3323                         break;
   3324                     }
   3325                 }
   3326                 // the character wasn't in the set.
   3327                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3328             }
   3329             break;
   3330 
   3331 
   3332         case URX_SETREF:
   3333             if (fp->fInputIdx >= fActiveLimit) {
   3334                 fHitEnd = TRUE;
   3335                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3336                 break;
   3337             } else {
   3338                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3339 
   3340                 // There is input left.  Pick up one char and test it for set membership.
   3341                 UChar32 c = UTEXT_NEXT32(fInputText);
   3342                 U_ASSERT(opValue > 0 && opValue < sets->size());
   3343                 if (c<256) {
   3344                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3345                     if (s8->contains(c)) {
   3346                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3347                         break;
   3348                     }
   3349                 } else {
   3350                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3351                     if (s->contains(c)) {
   3352                         // The character is in the set.  A Match.
   3353                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3354                         break;
   3355                     }
   3356                 }
   3357 
   3358                 // the character wasn't in the set.
   3359                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3360             }
   3361             break;
   3362 
   3363 
   3364         case URX_DOTANY:
   3365             {
   3366                 // . matches anything, but stops at end-of-line.
   3367                 if (fp->fInputIdx >= fActiveLimit) {
   3368                     // At end of input.  Match failed.  Backtrack out.
   3369                     fHitEnd = TRUE;
   3370                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3371                     break;
   3372                 }
   3373 
   3374                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3375 
   3376                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3377                 UChar32 c = UTEXT_NEXT32(fInputText);
   3378                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   3379                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3380                     // End of line in normal mode.   . does not match.
   3381                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3382                     break;
   3383                 }
   3384                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3385             }
   3386             break;
   3387 
   3388 
   3389         case URX_DOTANY_ALL:
   3390             {
   3391                 // ., in dot-matches-all (including new lines) mode
   3392                 if (fp->fInputIdx >= fActiveLimit) {
   3393                     // At end of input.  Match failed.  Backtrack out.
   3394                     fHitEnd = TRUE;
   3395                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3396                     break;
   3397                 }
   3398 
   3399                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3400 
   3401                 // There is input left.  Advance over one char, except if we are
   3402                 //   at a cr/lf, advance over both of them.
   3403                 UChar32 c;
   3404                 c = UTEXT_NEXT32(fInputText);
   3405                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3406                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   3407                     // In the case of a CR/LF, we need to advance over both.
   3408                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
   3409                     if (nextc == 0x0a) {
   3410                         (void)UTEXT_NEXT32(fInputText);
   3411                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3412                     }
   3413                 }
   3414             }
   3415             break;
   3416 
   3417 
   3418         case URX_DOTANY_UNIX:
   3419             {
   3420                 // '.' operator, matches all, but stops at end-of-line.
   3421                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   3422                 if (fp->fInputIdx >= fActiveLimit) {
   3423                     // At end of input.  Match failed.  Backtrack out.
   3424                     fHitEnd = TRUE;
   3425                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3426                     break;
   3427                 }
   3428 
   3429                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3430 
   3431                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3432                 UChar32 c = UTEXT_NEXT32(fInputText);
   3433                 if (c == 0x0a) {
   3434                     // End of line in normal mode.   '.' does not match the \n
   3435                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3436                 } else {
   3437                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3438                 }
   3439             }
   3440             break;
   3441 
   3442 
   3443         case URX_JMP:
   3444             fp->fPatIdx = opValue;
   3445             break;
   3446 
   3447         case URX_FAIL:
   3448             isMatch = FALSE;
   3449             goto breakFromLoop;
   3450 
   3451         case URX_JMP_SAV:
   3452             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   3453             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   3454             fp->fPatIdx = opValue;                         // Then JMP.
   3455             break;
   3456 
   3457         case URX_JMP_SAV_X:
   3458             // This opcode is used with (x)+, when x can match a zero length string.
   3459             // Same as JMP_SAV, except conditional on the match having made forward progress.
   3460             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   3461             //   data address of the input position at the start of the loop.
   3462             {
   3463                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   3464                 int32_t  stoOp = (int32_t)pat[opValue-1];
   3465                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   3466                 int32_t  frameLoc = URX_VAL(stoOp);
   3467                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   3468                 int64_t prevInputIdx = fp->fExtra[frameLoc];
   3469                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   3470                 if (prevInputIdx < fp->fInputIdx) {
   3471                     // The match did make progress.  Repeat the loop.
   3472                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   3473                     fp->fPatIdx = opValue;
   3474                     fp->fExtra[frameLoc] = fp->fInputIdx;
   3475                 }
   3476                 // If the input position did not advance, we do nothing here,
   3477                 //   execution will fall out of the loop.
   3478             }
   3479             break;
   3480 
   3481         case URX_CTR_INIT:
   3482             {
   3483                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3484                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   3485 
   3486                 // Pick up the three extra operands that CTR_INIT has, and
   3487                 //    skip the pattern location counter past
   3488                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3489                 fp->fPatIdx += 3;
   3490                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3491                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3492                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3493                 U_ASSERT(minCount>=0);
   3494                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3495                 U_ASSERT(loopLoc>fp->fPatIdx);
   3496 
   3497                 if (minCount == 0) {
   3498                     fp = StateSave(fp, loopLoc+1, status);
   3499                 }
   3500                 if (maxCount == 0) {
   3501                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3502                 }
   3503             }
   3504             break;
   3505 
   3506         case URX_CTR_LOOP:
   3507             {
   3508                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3509                 int32_t initOp = (int32_t)pat[opValue];
   3510                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   3511                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3512                 int32_t minCount  = (int32_t)pat[opValue+2];
   3513                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3514                 // Increment the counter.  Note: we DIDN'T worry about counter
   3515                 //   overflow, since the data comes from UnicodeStrings, which
   3516                 //   stores its length in an int32_t. Do we have to think about
   3517                 //   this now that we're using UText? Probably not, since the length
   3518                 //   in UChar32s is still an int32_t.
   3519                 (*pCounter)++;
   3520                 U_ASSERT(*pCounter > 0);
   3521                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   3522                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   3523                     break;
   3524                 }
   3525                 if (*pCounter >= minCount) {
   3526                     fp = StateSave(fp, fp->fPatIdx, status);
   3527                 }
   3528                 fp->fPatIdx = opValue + 4;    // Loop back.
   3529             }
   3530             break;
   3531 
   3532         case URX_CTR_INIT_NG:
   3533             {
   3534                 // Initialize a non-greedy loop
   3535                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3536                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   3537 
   3538                 // Pick up the three extra operands that CTR_INIT has, and
   3539                 //    skip the pattern location counter past
   3540                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3541                 fp->fPatIdx += 3;
   3542                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3543                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3544                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3545                 U_ASSERT(minCount>=0);
   3546                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3547                 U_ASSERT(loopLoc>fp->fPatIdx);
   3548 
   3549                 if (minCount == 0) {
   3550                     if (maxCount != 0) {
   3551                         fp = StateSave(fp, fp->fPatIdx, status);
   3552                     }
   3553                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   3554                 }
   3555             }
   3556             break;
   3557 
   3558         case URX_CTR_LOOP_NG:
   3559             {
   3560                 // Non-greedy {min, max} loops
   3561                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3562                 int32_t initOp = (int32_t)pat[opValue];
   3563                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   3564                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3565                 int32_t minCount  = (int32_t)pat[opValue+2];
   3566                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3567                 // Increment the counter.  Note: we DIDN'T worry about counter
   3568                 //   overflow, since the data comes from UnicodeStrings, which
   3569                 //   stores its length in an int32_t. Do we have to think about
   3570                 //   this now that we're using UText? Probably not, since the length
   3571                 //   in UChar32s is still an int32_t.
   3572                 (*pCounter)++;
   3573                 U_ASSERT(*pCounter > 0);
   3574 
   3575                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   3576                     // The loop has matched the maximum permitted number of times.
   3577                     //   Break out of here with no action.  Matching will
   3578                     //   continue with the following pattern.
   3579                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   3580                     break;
   3581                 }
   3582 
   3583                 if (*pCounter < minCount) {
   3584                     // We haven't met the minimum number of matches yet.
   3585                     //   Loop back for another one.
   3586                     fp->fPatIdx = opValue + 4;    // Loop back.
   3587                 } else {
   3588                     // We do have the minimum number of matches.
   3589                     //   Fall into the following pattern, but first do
   3590                     //   a state save to the top of the loop, so that a failure
   3591                     //   in the following pattern will try another iteration of the loop.
   3592                     fp = StateSave(fp, opValue + 4, status);
   3593                 }
   3594             }
   3595             break;
   3596 
   3597         case URX_STO_SP:
   3598             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3599             fData[opValue] = fStack->size();
   3600             break;
   3601 
   3602         case URX_LD_SP:
   3603             {
   3604                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3605                 int32_t newStackSize = (int32_t)fData[opValue];
   3606                 U_ASSERT(newStackSize <= fStack->size());
   3607                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3608                 if (newFP == (int64_t *)fp) {
   3609                     break;
   3610                 }
   3611                 int32_t i;
   3612                 for (i=0; i<fFrameSize; i++) {
   3613                     newFP[i] = ((int64_t *)fp)[i];
   3614                 }
   3615                 fp = (REStackFrame *)newFP;
   3616                 fStack->setSize(newStackSize);
   3617             }
   3618             break;
   3619 
   3620         case URX_BACKREF:
   3621             {
   3622                 U_ASSERT(opValue < fFrameSize);
   3623                 int64_t groupStartIdx = fp->fExtra[opValue];
   3624                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3625                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3626                 if (groupStartIdx < 0) {
   3627                     // This capture group has not participated in the match thus far,
   3628                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3629                     break;
   3630                 }
   3631                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
   3632                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3633 
   3634                 //   Note: if the capture group match was of an empty string the backref
   3635                 //         match succeeds.  Verified by testing:  Perl matches succeed
   3636                 //         in this case, so we do too.
   3637 
   3638                 UBool success = TRUE;
   3639                 for (;;) {
   3640                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
   3641                         success = TRUE;
   3642                         break;
   3643                     }
   3644                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
   3645                         success = FALSE;
   3646                         fHitEnd = TRUE;
   3647                         break;
   3648                     }
   3649                     UChar32 captureGroupChar = utext_next32(fAltInputText);
   3650                     UChar32 inputChar = utext_next32(fInputText);
   3651                     if (inputChar != captureGroupChar) {
   3652                         success = FALSE;
   3653                         break;
   3654                     }
   3655                 }
   3656 
   3657                 if (success) {
   3658                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3659                 } else {
   3660                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3661                 }
   3662             }
   3663             break;
   3664 
   3665 
   3666 
   3667         case URX_BACKREF_I:
   3668             {
   3669                 U_ASSERT(opValue < fFrameSize);
   3670                 int64_t groupStartIdx = fp->fExtra[opValue];
   3671                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3672                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3673                 if (groupStartIdx < 0) {
   3674                     // This capture group has not participated in the match thus far,
   3675                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3676                     break;
   3677                 }
   3678                 utext_setNativeIndex(fAltInputText, groupStartIdx);
   3679                 utext_setNativeIndex(fInputText, fp->fInputIdx);
   3680                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
   3681                 CaseFoldingUTextIterator inputItr(*fInputText);
   3682 
   3683                 //   Note: if the capture group match was of an empty string the backref
   3684                 //         match succeeds.  Verified by testing:  Perl matches succeed
   3685                 //         in this case, so we do too.
   3686 
   3687                 UBool success = TRUE;
   3688                 for (;;) {
   3689                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
   3690                         success = TRUE;
   3691                         break;
   3692                     }
   3693                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
   3694                         success = FALSE;
   3695                         fHitEnd = TRUE;
   3696                         break;
   3697                     }
   3698                     UChar32 captureGroupChar = captureGroupItr.next();
   3699                     UChar32 inputChar = inputItr.next();
   3700                     if (inputChar != captureGroupChar) {
   3701                         success = FALSE;
   3702                         break;
   3703                     }
   3704                 }
   3705 
   3706                 if (success && inputItr.inExpansion()) {
   3707                     // We otained a match by consuming part of a string obtained from
   3708                     // case-folding a single code point of the input text.
   3709                     // This does not count as an overall match.
   3710                     success = FALSE;
   3711                 }
   3712 
   3713                 if (success) {
   3714                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3715                 } else {
   3716                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3717                 }
   3718 
   3719             }
   3720             break;
   3721 
   3722         case URX_STO_INP_LOC:
   3723             {
   3724                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   3725                 fp->fExtra[opValue] = fp->fInputIdx;
   3726             }
   3727             break;
   3728 
   3729         case URX_JMPX:
   3730             {
   3731                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3732                 fp->fPatIdx += 1;
   3733                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   3734                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   3735                 int64_t savedInputIdx = fp->fExtra[dataLoc];
   3736                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   3737                 if (savedInputIdx < fp->fInputIdx) {
   3738                     fp->fPatIdx = opValue;                               // JMP
   3739                 } else {
   3740                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   3741                 }
   3742             }
   3743             break;
   3744 
   3745         case URX_LA_START:
   3746             {
   3747                 // Entering a lookahead block.
   3748                 // Save Stack Ptr, Input Pos.
   3749                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3750                 fData[opValue]   = fStack->size();
   3751                 fData[opValue+1] = fp->fInputIdx;
   3752                 fActiveStart     = fLookStart;          // Set the match region change for
   3753                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   3754             }
   3755             break;
   3756 
   3757         case URX_LA_END:
   3758             {
   3759                 // Leaving a look-ahead block.
   3760                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   3761                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3762                 int32_t stackSize = fStack->size();
   3763                 int32_t newStackSize =(int32_t)fData[opValue];
   3764                 U_ASSERT(stackSize >= newStackSize);
   3765                 if (stackSize > newStackSize) {
   3766                     // Copy the current top frame back to the new (cut back) top frame.
   3767                     //   This makes the capture groups from within the look-ahead
   3768                     //   expression available.
   3769                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3770                     int32_t i;
   3771                     for (i=0; i<fFrameSize; i++) {
   3772                         newFP[i] = ((int64_t *)fp)[i];
   3773                     }
   3774                     fp = (REStackFrame *)newFP;
   3775                     fStack->setSize(newStackSize);
   3776                 }
   3777                 fp->fInputIdx = fData[opValue+1];
   3778 
   3779                 // Restore the active region bounds in the input string; they may have
   3780                 //    been changed because of transparent bounds on a Region.
   3781                 fActiveStart = fRegionStart;
   3782                 fActiveLimit = fRegionLimit;
   3783             }
   3784             break;
   3785 
   3786         case URX_ONECHAR_I:
   3787             // Case insensitive one char.  The char from the pattern is already case folded.
   3788             // Input text is not, but case folding the input can not reduce two or more code
   3789             // points to one.
   3790             if (fp->fInputIdx < fActiveLimit) {
   3791                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3792 
   3793                 UChar32 c = UTEXT_NEXT32(fInputText);
   3794                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3795                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3796                     break;
   3797                 }
   3798             } else {
   3799                 fHitEnd = TRUE;
   3800             }
   3801 
   3802             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3803             break;
   3804 
   3805         case URX_STRING_I:
   3806             {
   3807                 // Case-insensitive test input against a literal string.
   3808                 // Strings require two slots in the compiled pattern, one for the
   3809                 //   offset to the string text, and one for the length.
   3810                 //   The compiled string has already been case folded.
   3811                 {
   3812                     const UChar *patternString = litText + opValue;
   3813                     int32_t      patternStringIdx  = 0;
   3814 
   3815                     op      = (int32_t)pat[fp->fPatIdx];
   3816                     fp->fPatIdx++;
   3817                     opType  = URX_TYPE(op);
   3818                     opValue = URX_VAL(op);
   3819                     U_ASSERT(opType == URX_STRING_LEN);
   3820                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
   3821 
   3822 
   3823                     UChar32   cPattern;
   3824                     UChar32   cText;
   3825                     UBool     success = TRUE;
   3826 
   3827                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3828                     CaseFoldingUTextIterator inputIterator(*fInputText);
   3829                     while (patternStringIdx < patternStringLen) {
   3830                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
   3831                             success = FALSE;
   3832                             fHitEnd = TRUE;
   3833                             break;
   3834                         }
   3835                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
   3836                         cText = inputIterator.next();
   3837                         if (cText != cPattern) {
   3838                             success = FALSE;
   3839                             break;
   3840                         }
   3841                     }
   3842                     if (inputIterator.inExpansion()) {
   3843                         success = FALSE;
   3844                     }
   3845 
   3846                     if (success) {
   3847                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3848                     } else {
   3849                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3850                     }
   3851                 }
   3852             }
   3853             break;
   3854 
   3855         case URX_LB_START:
   3856             {
   3857                 // Entering a look-behind block.
   3858                 // Save Stack Ptr, Input Pos.
   3859                 //   TODO:  implement transparent bounds.  Ticket #6067
   3860                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3861                 fData[opValue]   = fStack->size();
   3862                 fData[opValue+1] = fp->fInputIdx;
   3863                 // Init the variable containing the start index for attempted matches.
   3864                 fData[opValue+2] = -1;
   3865                 // Save input string length, then reset to pin any matches to end at
   3866                 //   the current position.
   3867                 fData[opValue+3] = fActiveLimit;
   3868                 fActiveLimit     = fp->fInputIdx;
   3869             }
   3870             break;
   3871 
   3872 
   3873         case URX_LB_CONT:
   3874             {
   3875                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   3876                 //    at all possible input starting positions.
   3877 
   3878                 // Fetch the min and max possible match lengths.  They are the operands
   3879                 //   of this op in the pattern.
   3880                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   3881                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   3882                 U_ASSERT(minML <= maxML);
   3883                 U_ASSERT(minML >= 0);
   3884 
   3885                 // Fetch (from data) the last input index where a match was attempted.
   3886                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3887                 int64_t  *lbStartIdx = &fData[opValue+2];
   3888                 if (*lbStartIdx < 0) {
   3889                     // First time through loop.
   3890                     *lbStartIdx = fp->fInputIdx - minML;
   3891                 } else {
   3892                     // 2nd through nth time through the loop.
   3893                     // Back up start position for match by one.
   3894                     if (*lbStartIdx == 0) {
   3895                         (*lbStartIdx)--;
   3896                     } else {
   3897                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   3898                         (void)UTEXT_PREVIOUS32(fInputText);
   3899                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3900                     }
   3901                 }
   3902 
   3903                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   3904                     // We have tried all potential match starting points without
   3905                     //  getting a match.  Backtrack out, and out of the
   3906                     //   Look Behind altogether.
   3907                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3908                     int64_t restoreInputLen = fData[opValue+3];
   3909                     U_ASSERT(restoreInputLen >= fActiveLimit);
   3910                     U_ASSERT(restoreInputLen <= fInputLength);
   3911                     fActiveLimit = restoreInputLen;
   3912                     break;
   3913                 }
   3914 
   3915                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   3916                 //      (successful match will fall off the end of the loop.)
   3917                 fp = StateSave(fp, fp->fPatIdx-3, status);
   3918                 fp->fInputIdx = *lbStartIdx;
   3919             }
   3920             break;
   3921 
   3922         case URX_LB_END:
   3923             // End of a look-behind block, after a successful match.
   3924             {
   3925                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3926                 if (fp->fInputIdx != fActiveLimit) {
   3927                     //  The look-behind expression matched, but the match did not
   3928                     //    extend all the way to the point that we are looking behind from.
   3929                     //  FAIL out of here, which will take us back to the LB_CONT, which
   3930                     //     will retry the match starting at another position or fail
   3931                     //     the look-behind altogether, whichever is appropriate.
   3932                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3933                     break;
   3934                 }
   3935 
   3936                 // Look-behind match is good.  Restore the orignal input string length,
   3937                 //   which had been truncated to pin the end of the lookbehind match to the
   3938                 //   position being looked-behind.
   3939                 int64_t originalInputLen = fData[opValue+3];
   3940                 U_ASSERT(originalInputLen >= fActiveLimit);
   3941                 U_ASSERT(originalInputLen <= fInputLength);
   3942                 fActiveLimit = originalInputLen;
   3943             }
   3944             break;
   3945 
   3946 
   3947         case URX_LBN_CONT:
   3948             {
   3949                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   3950                 //    at all possible input starting positions.
   3951 
   3952                 // Fetch the extra parameters of this op.
   3953                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   3954                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   3955                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   3956                         continueLoc = URX_VAL(continueLoc);
   3957                 U_ASSERT(minML <= maxML);
   3958                 U_ASSERT(minML >= 0);
   3959                 U_ASSERT(continueLoc > fp->fPatIdx);
   3960 
   3961                 // Fetch (from data) the last input index where a match was attempted.
   3962                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3963                 int64_t  *lbStartIdx = &fData[opValue+2];
   3964                 if (*lbStartIdx < 0) {
   3965                     // First time through loop.
   3966                     *lbStartIdx = fp->fInputIdx - minML;
   3967                 } else {
   3968                     // 2nd through nth time through the loop.
   3969                     // Back up start position for match by one.
   3970                     if (*lbStartIdx == 0) {
   3971                         (*lbStartIdx)--;
   3972                     } else {
   3973                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   3974                         (void)UTEXT_PREVIOUS32(fInputText);
   3975                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3976                     }
   3977                 }
   3978 
   3979                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   3980                     // We have tried all potential match starting points without
   3981                     //  getting a match, which means that the negative lookbehind as
   3982                     //  a whole has succeeded.  Jump forward to the continue location
   3983                     int64_t restoreInputLen = fData[opValue+3];
   3984                     U_ASSERT(restoreInputLen >= fActiveLimit);
   3985                     U_ASSERT(restoreInputLen <= fInputLength);
   3986                     fActiveLimit = restoreInputLen;
   3987                     fp->fPatIdx = continueLoc;
   3988                     break;
   3989                 }
   3990 
   3991                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   3992                 //      (successful match will cause a FAIL out of the loop altogether.)
   3993                 fp = StateSave(fp, fp->fPatIdx-4, status);
   3994                 fp->fInputIdx = *lbStartIdx;
   3995             }
   3996             break;
   3997 
   3998         case URX_LBN_END:
   3999             // End of a negative look-behind block, after a successful match.
   4000             {
   4001                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4002                 if (fp->fInputIdx != fActiveLimit) {
   4003                     //  The look-behind expression matched, but the match did not
   4004                     //    extend all the way to the point that we are looking behind from.
   4005                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4006                     //     will retry the match starting at another position or succeed
   4007                     //     the look-behind altogether, whichever is appropriate.
   4008                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4009                     break;
   4010                 }
   4011 
   4012                 // Look-behind expression matched, which means look-behind test as
   4013                 //   a whole Fails
   4014 
   4015                 //   Restore the orignal input string length, which had been truncated
   4016                 //   inorder to pin the end of the lookbehind match
   4017                 //   to the position being looked-behind.
   4018                 int64_t originalInputLen = fData[opValue+3];
   4019                 U_ASSERT(originalInputLen >= fActiveLimit);
   4020                 U_ASSERT(originalInputLen <= fInputLength);
   4021                 fActiveLimit = originalInputLen;
   4022 
   4023                 // Restore original stack position, discarding any state saved
   4024                 //   by the successful pattern match.
   4025                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4026                 int32_t newStackSize = (int32_t)fData[opValue];
   4027                 U_ASSERT(fStack->size() > newStackSize);
   4028                 fStack->setSize(newStackSize);
   4029 
   4030                 //  FAIL, which will take control back to someplace
   4031                 //  prior to entering the look-behind test.
   4032                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4033             }
   4034             break;
   4035 
   4036 
   4037         case URX_LOOP_SR_I:
   4038             // Loop Initialization for the optimized implementation of
   4039             //     [some character set]*
   4040             //   This op scans through all matching input.
   4041             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4042             {
   4043                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4044                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4045                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   4046 
   4047                 // Loop through input, until either the input is exhausted or
   4048                 //   we reach a character that is not a member of the set.
   4049                 int64_t ix = fp->fInputIdx;
   4050                 UTEXT_SETNATIVEINDEX(fInputText, ix);
   4051                 for (;;) {
   4052                     if (ix >= fActiveLimit) {
   4053                         fHitEnd = TRUE;
   4054                         break;
   4055                     }
   4056                     UChar32 c = UTEXT_NEXT32(fInputText);
   4057                     if (c<256) {
   4058                         if (s8->contains(c) == FALSE) {
   4059                             break;
   4060                         }
   4061                     } else {
   4062                         if (s->contains(c) == FALSE) {
   4063                             break;
   4064                         }
   4065                     }
   4066                     ix = UTEXT_GETNATIVEINDEX(fInputText);
   4067                 }
   4068 
   4069                 // If there were no matching characters, skip over the loop altogether.
   4070                 //   The loop doesn't run at all, a * op always succeeds.
   4071                 if (ix == fp->fInputIdx) {
   4072                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4073                     break;
   4074                 }
   4075 
   4076                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4077                 //   must follow.  It's operand is the stack location
   4078                 //   that holds the starting input index for the match of this [set]*
   4079                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4080                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4081                 int32_t stackLoc = URX_VAL(loopcOp);
   4082                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4083                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4084                 fp->fInputIdx = ix;
   4085 
   4086                 // Save State to the URX_LOOP_C op that follows this one,
   4087                 //   so that match failures in the following code will return to there.
   4088                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4089                 fp = StateSave(fp, fp->fPatIdx, status);
   4090                 fp->fPatIdx++;
   4091             }
   4092             break;
   4093 
   4094 
   4095         case URX_LOOP_DOT_I:
   4096             // Loop Initialization for the optimized implementation of .*
   4097             //   This op scans through all remaining input.
   4098             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4099             {
   4100                 // Loop through input until the input is exhausted (we reach an end-of-line)
   4101                 // In DOTALL mode, we can just go straight to the end of the input.
   4102                 int64_t ix;
   4103                 if ((opValue & 1) == 1) {
   4104                     // Dot-matches-All mode.  Jump straight to the end of the string.
   4105                     ix = fActiveLimit;
   4106                     fHitEnd = TRUE;
   4107                 } else {
   4108                     // NOT DOT ALL mode.  Line endings do not match '.'
   4109                     // Scan forward until a line ending or end of input.
   4110                     ix = fp->fInputIdx;
   4111                     UTEXT_SETNATIVEINDEX(fInputText, ix);
   4112                     for (;;) {
   4113                         if (ix >= fActiveLimit) {
   4114                             fHitEnd = TRUE;
   4115                             break;
   4116                         }
   4117                         UChar32 c = UTEXT_NEXT32(fInputText);
   4118                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   4119                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   4120                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   4121                                     (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
   4122                                 //  char is a line ending.  Exit the scanning loop.
   4123                                 break;
   4124                             }
   4125                         }
   4126                         ix = UTEXT_GETNATIVEINDEX(fInputText);
   4127                     }
   4128                 }
   4129 
   4130                 // If there were no matching characters, skip over the loop altogether.
   4131                 //   The loop doesn't run at all, a * op always succeeds.
   4132                 if (ix == fp->fInputIdx) {
   4133                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4134                     break;
   4135                 }
   4136 
   4137                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4138                 //   must follow.  It's operand is the stack location
   4139                 //   that holds the starting input index for the match of this .*
   4140                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4141                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4142                 int32_t stackLoc = URX_VAL(loopcOp);
   4143                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4144                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4145                 fp->fInputIdx = ix;
   4146 
   4147                 // Save State to the URX_LOOP_C op that follows this one,
   4148                 //   so that match failures in the following code will return to there.
   4149                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4150                 fp = StateSave(fp, fp->fPatIdx, status);
   4151                 fp->fPatIdx++;
   4152             }
   4153             break;
   4154 
   4155 
   4156         case URX_LOOP_C:
   4157             {
   4158                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   4159                 backSearchIndex = fp->fExtra[opValue];
   4160                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   4161                 if (backSearchIndex == fp->fInputIdx) {
   4162                     // We've backed up the input idx to the point that the loop started.
   4163                     // The loop is done.  Leave here without saving state.
   4164                     //  Subsequent failures won't come back here.
   4165                     break;
   4166                 }
   4167                 // Set up for the next iteration of the loop, with input index
   4168                 //   backed up by one from the last time through,
   4169                 //   and a state save to this instruction in case the following code fails again.
   4170                 //   (We're going backwards because this loop emulates stack unwinding, not
   4171                 //    the initial scan forward.)
   4172                 U_ASSERT(fp->fInputIdx > 0);
   4173                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4174                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
   4175                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4176 
   4177                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
   4178                 if (prevC == 0x0a &&
   4179                     fp->fInputIdx > backSearchIndex &&
   4180                     twoPrevC == 0x0d) {
   4181                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   4182                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   4183                         // .*, stepping back over CRLF pair.
   4184                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4185                     }
   4186                 }
   4187 
   4188 
   4189                 fp = StateSave(fp, fp->fPatIdx-1, status);
   4190             }
   4191             break;
   4192 
   4193 
   4194 
   4195         default:
   4196             // Trouble.  The compiled pattern contains an entry with an
   4197             //           unrecognized type tag.
   4198             U_ASSERT(FALSE);
   4199         }
   4200 
   4201         if (U_FAILURE(status)) {
   4202             isMatch = FALSE;
   4203             break;
   4204         }
   4205     }
   4206 
   4207 breakFromLoop:
   4208     fMatch = isMatch;
   4209     if (isMatch) {
   4210         fLastMatchEnd = fMatchEnd;
   4211         fMatchStart   = startIdx;
   4212         fMatchEnd     = fp->fInputIdx;
   4213         if (fTraceDebug) {
   4214             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
   4215         }
   4216     }
   4217     else
   4218     {
   4219         if (fTraceDebug) {
   4220             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   4221         }
   4222     }
   4223 
   4224     fFrame = fp;                // The active stack frame when the engine stopped.
   4225                                 //   Contains the capture group results that we need to
   4226                                 //    access later.
   4227     return;
   4228 }
   4229 
   4230 
   4231 //--------------------------------------------------------------------------------
   4232 //
   4233 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
   4234 //                  assumption that the entire string is available in the UText's
   4235 //                  chunk buffer. For now, that means we can use int32_t indexes,
   4236 //                  except for anything that needs to be saved (like group starts
   4237 //                  and ends).
   4238 //
   4239 //                  startIdx:    begin matching a this index.
   4240 //                  toEnd:       if true, match must extend to end of the input region
   4241 //
   4242 //--------------------------------------------------------------------------------
   4243 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
   4244     UBool       isMatch  = FALSE;      // True if the we have a match.
   4245 
   4246     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
   4247 
   4248     int32_t     op;                    // Operation from the compiled pattern, split into
   4249     int32_t     opType;                //    the opcode
   4250     int32_t     opValue;               //    and the operand value.
   4251 
   4252 #ifdef REGEX_RUN_DEBUG
   4253     if (fTraceDebug)
   4254     {
   4255         printf("MatchAt(startIdx=%ld)\n", startIdx);
   4256         printf("Original Pattern: ");
   4257         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   4258         while (c != U_SENTINEL) {
   4259             if (c<32 || c>256) {
   4260                 c = '.';
   4261             }
   4262             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   4263 
   4264             c = UTEXT_NEXT32(fPattern->fPattern);
   4265         }
   4266         printf("\n");
   4267         printf("Input String: ");
   4268         c = utext_next32From(fInputText, 0);
   4269         while (c != U_SENTINEL) {
   4270             if (c<32 || c>256) {
   4271                 c = '.';
   4272             }
   4273             printf("%c", c);
   4274 
   4275             c = UTEXT_NEXT32(fInputText);
   4276         }
   4277         printf("\n");
   4278         printf("\n");
   4279     }
   4280 #endif
   4281 
   4282     if (U_FAILURE(status)) {
   4283         return;
   4284     }
   4285 
   4286     //  Cache frequently referenced items from the compiled pattern
   4287     //
   4288     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   4289 
   4290     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   4291     UVector             *sets          = fPattern->fSets;
   4292 
   4293     const UChar         *inputBuf      = fInputText->chunkContents;
   4294 
   4295     fFrameSize = fPattern->fFrameSize;
   4296     REStackFrame        *fp            = resetStack();
   4297 
   4298     fp->fPatIdx   = 0;
   4299     fp->fInputIdx = startIdx;
   4300 
   4301     // Zero out the pattern's static data
   4302     int32_t i;
   4303     for (i = 0; i<fPattern->fDataSize; i++) {
   4304         fData[i] = 0;
   4305     }
   4306 
   4307     //
   4308     //  Main loop for interpreting the compiled pattern.
   4309     //  One iteration of the loop per pattern operation performed.
   4310     //
   4311     for (;;) {
   4312 #if 0
   4313         if (_heapchk() != _HEAPOK) {
   4314             fprintf(stderr, "Heap Trouble\n");
   4315         }
   4316 #endif
   4317 
   4318         op      = (int32_t)pat[fp->fPatIdx];
   4319         opType  = URX_TYPE(op);
   4320         opValue = URX_VAL(op);
   4321 #ifdef REGEX_RUN_DEBUG
   4322         if (fTraceDebug) {
   4323             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4324             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
   4325                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   4326             fPattern->dumpOp(fp->fPatIdx);
   4327         }
   4328 #endif
   4329         fp->fPatIdx++;
   4330 
   4331         switch (opType) {
   4332 
   4333 
   4334         case URX_NOP:
   4335             break;
   4336 
   4337 
   4338         case URX_BACKTRACK:
   4339             // Force a backtrack.  In some circumstances, the pattern compiler
   4340             //   will notice that the pattern can't possibly match anything, and will
   4341             //   emit one of these at that point.
   4342             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4343             break;
   4344 
   4345 
   4346         case URX_ONECHAR:
   4347             if (fp->fInputIdx < fActiveLimit) {
   4348                 UChar32 c;
   4349                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4350                 if (c == opValue) {
   4351                     break;
   4352                 }
   4353             } else {
   4354                 fHitEnd = TRUE;
   4355             }
   4356             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4357             break;
   4358 
   4359 
   4360         case URX_STRING:
   4361             {
   4362                 // Test input against a literal string.
   4363                 // Strings require two slots in the compiled pattern, one for the
   4364                 //   offset to the string text, and one for the length.
   4365                 int32_t   stringStartIdx = opValue;
   4366                 int32_t   stringLen;
   4367 
   4368                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   4369                 fp->fPatIdx++;
   4370                 opType    = URX_TYPE(op);
   4371                 stringLen = URX_VAL(op);
   4372                 U_ASSERT(opType == URX_STRING_LEN);
   4373                 U_ASSERT(stringLen >= 2);
   4374 
   4375                 const UChar * pInp = inputBuf + fp->fInputIdx;
   4376                 const UChar * pInpLimit = inputBuf + fActiveLimit;
   4377                 const UChar * pPat = litText+stringStartIdx;
   4378                 const UChar * pEnd = pInp + stringLen;
   4379                 UBool success = TRUE;
   4380                 while (pInp < pEnd) {
   4381                     if (pInp >= pInpLimit) {
   4382                         fHitEnd = TRUE;
   4383                         success = FALSE;
   4384                         break;
   4385                     }
   4386                     if (*pInp++ != *pPat++) {
   4387                         success = FALSE;
   4388                         break;
   4389                     }
   4390                 }
   4391 
   4392                 if (success) {
   4393                     fp->fInputIdx += stringLen;
   4394                 } else {
   4395                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4396                 }
   4397             }
   4398             break;
   4399 
   4400 
   4401         case URX_STATE_SAVE:
   4402             fp = StateSave(fp, opValue, status);
   4403             break;
   4404 
   4405 
   4406         case URX_END:
   4407             // The match loop will exit via this path on a successful match,
   4408             //   when we reach the end of the pattern.
   4409             if (toEnd && fp->fInputIdx != fActiveLimit) {
   4410                 // The pattern matched, but not to the end of input.  Try some more.
   4411                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4412                 break;
   4413             }
   4414             isMatch = TRUE;
   4415             goto  breakFromLoop;
   4416 
   4417             // Start and End Capture stack frame variables are laid out out like this:
   4418             //  fp->fExtra[opValue]  - The start of a completed capture group
   4419             //             opValue+1 - The end   of a completed capture group
   4420             //             opValue+2 - the start of a capture group whose end
   4421             //                          has not yet been reached (and might not ever be).
   4422         case URX_START_CAPTURE:
   4423             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4424             fp->fExtra[opValue+2] = fp->fInputIdx;
   4425             break;
   4426 
   4427 
   4428         case URX_END_CAPTURE:
   4429             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4430             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   4431             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   4432             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   4433             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   4434             break;
   4435 
   4436 
   4437         case URX_DOLLAR:                   //  $, test for End of line
   4438             //     or for position before new line at end of input
   4439             if (fp->fInputIdx < fAnchorLimit-2) {
   4440                 // We are no where near the end of input.  Fail.
   4441                 //   This is the common case.  Keep it first.
   4442                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4443                 break;
   4444             }
   4445             if (fp->fInputIdx >= fAnchorLimit) {
   4446                 // We really are at the end of input.  Success.
   4447                 fHitEnd = TRUE;
   4448                 fRequireEnd = TRUE;
   4449                 break;
   4450             }
   4451 
   4452             // If we are positioned just before a new-line that is located at the
   4453             //   end of input, succeed.
   4454             if (fp->fInputIdx == fAnchorLimit-1) {
   4455                 UChar32 c;
   4456                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
   4457 
   4458                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   4459                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4460                         // At new-line at end of input. Success
   4461                         fHitEnd = TRUE;
   4462                         fRequireEnd = TRUE;
   4463                         break;
   4464                     }
   4465                 }
   4466             } else if (fp->fInputIdx == fAnchorLimit-2 &&
   4467                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
   4468                     fHitEnd = TRUE;
   4469                     fRequireEnd = TRUE;
   4470                     break;                         // At CR/LF at end of input.  Success
   4471             }
   4472 
   4473             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4474 
   4475             break;
   4476 
   4477 
   4478         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   4479             if (fp->fInputIdx >= fAnchorLimit-1) {
   4480                 // Either at the last character of input, or off the end.
   4481                 if (fp->fInputIdx == fAnchorLimit-1) {
   4482                     // At last char of input.  Success if it's a new line.
   4483                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4484                         fHitEnd = TRUE;
   4485                         fRequireEnd = TRUE;
   4486                         break;
   4487                     }
   4488                 } else {
   4489                     // Off the end of input.  Success.
   4490                     fHitEnd = TRUE;
   4491                     fRequireEnd = TRUE;
   4492                     break;
   4493                 }
   4494             }
   4495 
   4496             // Not at end of input.  Back-track out.
   4497             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4498             break;
   4499 
   4500 
   4501         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   4502             {
   4503                 if (fp->fInputIdx >= fAnchorLimit) {
   4504                     // We really are at the end of input.  Success.
   4505                     fHitEnd = TRUE;
   4506                     fRequireEnd = TRUE;
   4507                     break;
   4508                 }
   4509                 // If we are positioned just before a new-line, succeed.
   4510                 // It makes no difference where the new-line is within the input.
   4511                 UChar32 c = inputBuf[fp->fInputIdx];
   4512                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   4513                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   4514                     //  In multi-line mode, hitting a new-line just before the end of input does not
   4515                     //   set the hitEnd or requireEnd flags
   4516                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4517                         break;
   4518                     }
   4519                 }
   4520                 // not at a new line.  Fail.
   4521                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4522             }
   4523             break;
   4524 
   4525 
   4526         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   4527             {
   4528                 if (fp->fInputIdx >= fAnchorLimit) {
   4529                     // We really are at the end of input.  Success.
   4530                     fHitEnd = TRUE;
   4531                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   4532                     break;               //   adding a new-line would not lose the match.
   4533                 }
   4534                 // If we are not positioned just before a new-line, the test fails; backtrack out.
   4535                 // It makes no difference where the new-line is within the input.
   4536                 if (inputBuf[fp->fInputIdx] != 0x0a) {
   4537                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4538                 }
   4539             }
   4540             break;
   4541 
   4542 
   4543         case URX_CARET:                    //  ^, test for start of line
   4544             if (fp->fInputIdx != fAnchorStart) {
   4545                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4546             }
   4547             break;
   4548 
   4549 
   4550         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   4551             {
   4552                 if (fp->fInputIdx == fAnchorStart) {
   4553                     // We are at the start input.  Success.
   4554                     break;
   4555                 }
   4556                 // Check whether character just before the current pos is a new-line
   4557                 //   unless we are at the end of input
   4558                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4559                 if ((fp->fInputIdx < fAnchorLimit) &&
   4560                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   4561                     //  It's a new-line.  ^ is true.  Success.
   4562                     //  TODO:  what should be done with positions between a CR and LF?
   4563                     break;
   4564                 }
   4565                 // Not at the start of a line.  Fail.
   4566                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4567             }
   4568             break;
   4569 
   4570 
   4571         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   4572             {
   4573                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
   4574                 if (fp->fInputIdx <= fAnchorStart) {
   4575                     // We are at the start input.  Success.
   4576                     break;
   4577                 }
   4578                 // Check whether character just before the current pos is a new-line
   4579                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   4580                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4581                 if (c != 0x0a) {
   4582                     // Not at the start of a line.  Back-track out.
   4583                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4584                 }
   4585             }
   4586             break;
   4587 
   4588         case URX_BACKSLASH_B:          // Test for word boundaries
   4589             {
   4590                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
   4591                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   4592                 if (!success) {
   4593                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4594                 }
   4595             }
   4596             break;
   4597 
   4598 
   4599         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   4600             {
   4601                 UBool success = isUWordBoundary(fp->fInputIdx);
   4602                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   4603                 if (!success) {
   4604                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4605                 }
   4606             }
   4607             break;
   4608 
   4609 
   4610         case URX_BACKSLASH_D:            // Test for decimal digit
   4611             {
   4612                 if (fp->fInputIdx >= fActiveLimit) {
   4613                     fHitEnd = TRUE;
   4614                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4615                     break;
   4616                 }
   4617 
   4618                 UChar32 c;
   4619                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4620                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   4621                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   4622                 success ^= (UBool)(opValue != 0);        // flip sense for \D
   4623                 if (!success) {
   4624                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4625                 }
   4626             }
   4627             break;
   4628 
   4629 
   4630         case URX_BACKSLASH_G:          // Test for position at end of previous match
   4631             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   4632                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4633             }
   4634             break;
   4635 
   4636 
   4637         case URX_BACKSLASH_X:
   4638         //  Match a Grapheme, as defined by Unicode TR 29.
   4639         //  Differs slightly from Perl, which consumes combining marks independently
   4640         //    of context.
   4641         {
   4642 
   4643             // Fail if at end of input
   4644             if (fp->fInputIdx >= fActiveLimit) {
   4645                 fHitEnd = TRUE;
   4646                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4647                 break;
   4648             }
   4649 
   4650             // Examine (and consume) the current char.
   4651             //   Dispatch into a little state machine, based on the char.
   4652             UChar32  c;
   4653             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4654             UnicodeSet **sets = fPattern->fStaticSets;
   4655             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   4656             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   4657             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4658             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4659             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4660             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4661             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4662             goto GC_Extend;
   4663 
   4664 
   4665 
   4666 GC_L:
   4667             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4668             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4669             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4670             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4671             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4672             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4673             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4674             goto GC_Extend;
   4675 
   4676 GC_V:
   4677             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4678             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4679             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4680             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4681             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4682             goto GC_Extend;
   4683 
   4684 GC_T:
   4685             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4686             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4687             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4688             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4689             goto GC_Extend;
   4690 
   4691 GC_Extend:
   4692             // Combining characters are consumed here
   4693             for (;;) {
   4694                 if (fp->fInputIdx >= fActiveLimit) {
   4695                     break;
   4696                 }
   4697                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4698                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   4699                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   4700                     break;
   4701                 }
   4702             }
   4703             goto GC_Done;
   4704 
   4705 GC_Control:
   4706             // Most control chars stand alone (don't combine with combining chars),
   4707             //   except for that CR/LF sequence is a single grapheme cluster.
   4708             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
   4709                 fp->fInputIdx++;
   4710             }
   4711 
   4712 GC_Done:
   4713             if (fp->fInputIdx >= fActiveLimit) {
   4714                 fHitEnd = TRUE;
   4715             }
   4716             break;
   4717         }
   4718 
   4719 
   4720 
   4721 
   4722         case URX_BACKSLASH_Z:          // Test for end of Input
   4723             if (fp->fInputIdx < fAnchorLimit) {
   4724                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4725             } else {
   4726                 fHitEnd = TRUE;
   4727                 fRequireEnd = TRUE;
   4728             }
   4729             break;
   4730 
   4731 
   4732 
   4733         case URX_STATIC_SETREF:
   4734             {
   4735                 // Test input character against one of the predefined sets
   4736                 //    (Word Characters, for example)
   4737                 // The high bit of the op value is a flag for the match polarity.
   4738                 //    0:   success if input char is in set.
   4739                 //    1:   success if input char is not in set.
   4740                 if (fp->fInputIdx >= fActiveLimit) {
   4741                     fHitEnd = TRUE;
   4742                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4743                     break;
   4744                 }
   4745 
   4746                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   4747                 opValue &= ~URX_NEG_SET;
   4748                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4749 
   4750                 UChar32 c;
   4751                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4752                 if (c < 256) {
   4753                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4754                     if (s8->contains(c)) {
   4755                         success = !success;
   4756                     }
   4757                 } else {
   4758                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4759                     if (s->contains(c)) {
   4760                         success = !success;
   4761                     }
   4762                 }
   4763                 if (!success) {
   4764                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4765                 }
   4766             }
   4767             break;
   4768 
   4769 
   4770         case URX_STAT_SETREF_N:
   4771             {
   4772                 // Test input character for NOT being a member of  one of
   4773                 //    the predefined sets (Word Characters, for example)
   4774                 if (fp->fInputIdx >= fActiveLimit) {
   4775                     fHitEnd = TRUE;
   4776                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4777                     break;
   4778                 }
   4779 
   4780                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4781 
   4782                 UChar32  c;
   4783                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4784                 if (c < 256) {
   4785                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4786                     if (s8->contains(c) == FALSE) {
   4787                         break;
   4788                     }
   4789                 } else {
   4790                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4791                     if (s->contains(c) == FALSE) {
   4792                         break;
   4793                     }
   4794                 }
   4795                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4796             }
   4797             break;
   4798 
   4799 
   4800         case URX_SETREF:
   4801             {
   4802                 if (fp->fInputIdx >= fActiveLimit) {
   4803                     fHitEnd = TRUE;
   4804                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4805                     break;
   4806                 }
   4807 
   4808                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4809 
   4810                 // There is input left.  Pick up one char and test it for set membership.
   4811                 UChar32  c;
   4812                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4813                 if (c<256) {
   4814                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4815                     if (s8->contains(c)) {
   4816                         // The character is in the set.  A Match.
   4817                         break;
   4818                     }
   4819                 } else {
   4820                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   4821                     if (s->contains(c)) {
   4822                         // The character is in the set.  A Match.
   4823                         break;
   4824                     }
   4825                 }
   4826 
   4827                 // the character wasn't in the set.
   4828                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4829             }
   4830             break;
   4831 
   4832 
   4833         case URX_DOTANY:
   4834             {
   4835                 // . matches anything, but stops at end-of-line.
   4836                 if (fp->fInputIdx >= fActiveLimit) {
   4837                     // At end of input.  Match failed.  Backtrack out.
   4838                     fHitEnd = TRUE;
   4839                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4840                     break;
   4841                 }
   4842 
   4843                 // There is input left.  Advance over one char, unless we've hit end-of-line
   4844                 UChar32  c;
   4845                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4846                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   4847                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   4848                     // End of line in normal mode.   . does not match.
   4849                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4850                     break;
   4851                 }
   4852             }
   4853             break;
   4854 
   4855 
   4856         case URX_DOTANY_ALL:
   4857             {
   4858                 // . in dot-matches-all (including new lines) mode
   4859                 if (fp->fInputIdx >= fActiveLimit) {
   4860                     // At end of input.  Match failed.  Backtrack out.
   4861                     fHitEnd = TRUE;
   4862                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4863                     break;
   4864                 }
   4865 
   4866                 // There is input left.  Advance over one char, except if we are
   4867                 //   at a cr/lf, advance over both of them.
   4868                 UChar32 c;
   4869                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4870                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   4871                     // In the case of a CR/LF, we need to advance over both.
   4872                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4873                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
   4874                     }
   4875                 }
   4876             }
   4877             break;
   4878 
   4879 
   4880         case URX_DOTANY_UNIX:
   4881             {
   4882                 // '.' operator, matches all, but stops at end-of-line.
   4883                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   4884                 if (fp->fInputIdx >= fActiveLimit) {
   4885                     // At end of input.  Match failed.  Backtrack out.
   4886                     fHitEnd = TRUE;
   4887                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4888                     break;
   4889                 }
   4890 
   4891                 // There is input left.  Advance over one char, unless we've hit end-of-line
   4892                 UChar32 c;
   4893                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4894                 if (c == 0x0a) {
   4895                     // End of line in normal mode.   '.' does not match the \n
   4896                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4897                 }
   4898             }
   4899             break;
   4900 
   4901 
   4902         case URX_JMP:
   4903             fp->fPatIdx = opValue;
   4904             break;
   4905 
   4906         case URX_FAIL:
   4907             isMatch = FALSE;
   4908             goto breakFromLoop;
   4909 
   4910         case URX_JMP_SAV:
   4911             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   4912             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   4913             fp->fPatIdx = opValue;                         // Then JMP.
   4914             break;
   4915 
   4916         case URX_JMP_SAV_X:
   4917             // This opcode is used with (x)+, when x can match a zero length string.
   4918             // Same as JMP_SAV, except conditional on the match having made forward progress.
   4919             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   4920             //   data address of the input position at the start of the loop.
   4921             {
   4922                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   4923                 int32_t  stoOp = (int32_t)pat[opValue-1];
   4924                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   4925                 int32_t  frameLoc = URX_VAL(stoOp);
   4926                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   4927                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
   4928                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   4929                 if (prevInputIdx < fp->fInputIdx) {
   4930                     // The match did make progress.  Repeat the loop.
   4931                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   4932                     fp->fPatIdx = opValue;
   4933                     fp->fExtra[frameLoc] = fp->fInputIdx;
   4934                 }
   4935                 // If the input position did not advance, we do nothing here,
   4936                 //   execution will fall out of the loop.
   4937             }
   4938             break;
   4939 
   4940         case URX_CTR_INIT:
   4941             {
   4942                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   4943                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   4944 
   4945                 // Pick up the three extra operands that CTR_INIT has, and
   4946                 //    skip the pattern location counter past
   4947                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   4948                 fp->fPatIdx += 3;
   4949                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   4950                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   4951                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   4952                 U_ASSERT(minCount>=0);
   4953                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   4954                 U_ASSERT(loopLoc>fp->fPatIdx);
   4955 
   4956                 if (minCount == 0) {
   4957                     fp = StateSave(fp, loopLoc+1, status);
   4958                 }
   4959                 if (maxCount == 0) {
   4960                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4961                 }
   4962             }
   4963             break;
   4964 
   4965         case URX_CTR_LOOP:
   4966             {
   4967                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   4968                 int32_t initOp = (int32_t)pat[opValue];
   4969                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   4970                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   4971                 int32_t minCount  = (int32_t)pat[opValue+2];
   4972                 int32_t maxCount  = (int32_t)pat[opValue+3];
   4973                 // Increment the counter.  Note: we DIDN'T worry about counter
   4974                 //   overflow, since the data comes from UnicodeStrings, which
   4975                 //   stores its length in an int32_t. Do we have to think about
   4976                 //   this now that we're using UText? Probably not, since the length
   4977                 //   in UChar32s is still an int32_t.
   4978                 (*pCounter)++;
   4979                 U_ASSERT(*pCounter > 0);
   4980                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   4981                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   4982                     break;
   4983                 }
   4984                 if (*pCounter >= minCount) {
   4985                     fp = StateSave(fp, fp->fPatIdx, status);
   4986                 }
   4987                 fp->fPatIdx = opValue + 4;    // Loop back.
   4988             }
   4989             break;
   4990 
   4991         case URX_CTR_INIT_NG:
   4992             {
   4993                 // Initialize a non-greedy loop
   4994                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   4995                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   4996 
   4997                 // Pick up the three extra operands that CTR_INIT has, and
   4998                 //    skip the pattern location counter past
   4999                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5000                 fp->fPatIdx += 3;
   5001                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5002                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5003                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5004                 U_ASSERT(minCount>=0);
   5005                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5006                 U_ASSERT(loopLoc>fp->fPatIdx);
   5007 
   5008                 if (minCount == 0) {
   5009                     if (maxCount != 0) {
   5010                         fp = StateSave(fp, fp->fPatIdx, status);
   5011                     }
   5012                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   5013                 }
   5014             }
   5015             break;
   5016 
   5017         case URX_CTR_LOOP_NG:
   5018             {
   5019                 // Non-greedy {min, max} loops
   5020                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5021                 int32_t initOp = (int32_t)pat[opValue];
   5022                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   5023                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5024                 int32_t minCount  = (int32_t)pat[opValue+2];
   5025                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5026                 // Increment the counter.  Note: we DIDN'T worry about counter
   5027                 //   overflow, since the data comes from UnicodeStrings, which
   5028                 //   stores its length in an int32_t. Do we have to think about
   5029                 //   this now that we're using UText? Probably not, since the length
   5030                 //   in UChar32s is still an int32_t.
   5031                 (*pCounter)++;
   5032                 U_ASSERT(*pCounter > 0);
   5033 
   5034                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   5035                     // The loop has matched the maximum permitted number of times.
   5036                     //   Break out of here with no action.  Matching will
   5037                     //   continue with the following pattern.
   5038                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   5039                     break;
   5040                 }
   5041 
   5042                 if (*pCounter < minCount) {
   5043                     // We haven't met the minimum number of matches yet.
   5044                     //   Loop back for another one.
   5045                     fp->fPatIdx = opValue + 4;    // Loop back.
   5046                 } else {
   5047                     // We do have the minimum number of matches.
   5048                     //   Fall into the following pattern, but first do
   5049                     //   a state save to the top of the loop, so that a failure
   5050                     //   in the following pattern will try another iteration of the loop.
   5051                     fp = StateSave(fp, opValue + 4, status);
   5052                 }
   5053             }
   5054             break;
   5055 
   5056         case URX_STO_SP:
   5057             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5058             fData[opValue] = fStack->size();
   5059             break;
   5060 
   5061         case URX_LD_SP:
   5062             {
   5063                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5064                 int32_t newStackSize = (int32_t)fData[opValue];
   5065                 U_ASSERT(newStackSize <= fStack->size());
   5066                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5067                 if (newFP == (int64_t *)fp) {
   5068                     break;
   5069                 }
   5070                 int32_t i;
   5071                 for (i=0; i<fFrameSize; i++) {
   5072                     newFP[i] = ((int64_t *)fp)[i];
   5073                 }
   5074                 fp = (REStackFrame *)newFP;
   5075                 fStack->setSize(newStackSize);
   5076             }
   5077             break;
   5078 
   5079         case URX_BACKREF:
   5080             {
   5081                 U_ASSERT(opValue < fFrameSize);
   5082                 int64_t groupStartIdx = fp->fExtra[opValue];
   5083                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5084                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5085                 int64_t inputIndex = fp->fInputIdx;
   5086                 if (groupStartIdx < 0) {
   5087                     // This capture group has not participated in the match thus far,
   5088                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5089                     break;
   5090                 }
   5091                 UBool success = TRUE;
   5092                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
   5093                     if (inputIndex >= fActiveLimit) {
   5094                         success = FALSE;
   5095                         fHitEnd = TRUE;
   5096                         break;
   5097                     }
   5098                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
   5099                         success = FALSE;
   5100                         break;
   5101                     }
   5102                 }
   5103                 if (success) {
   5104                     fp->fInputIdx = inputIndex;
   5105                 } else {
   5106                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5107                 }
   5108             }
   5109             break;
   5110 
   5111         case URX_BACKREF_I:
   5112             {
   5113                 U_ASSERT(opValue < fFrameSize);
   5114                 int64_t groupStartIdx = fp->fExtra[opValue];
   5115                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5116                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5117                 if (groupStartIdx < 0) {
   5118                     // This capture group has not participated in the match thus far,
   5119                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5120                     break;
   5121                 }
   5122                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
   5123                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
   5124 
   5125                 //   Note: if the capture group match was of an empty string the backref
   5126                 //         match succeeds.  Verified by testing:  Perl matches succeed
   5127                 //         in this case, so we do too.
   5128 
   5129                 UBool success = TRUE;
   5130                 for (;;) {
   5131                     UChar32 captureGroupChar = captureGroupItr.next();
   5132                     if (captureGroupChar == U_SENTINEL) {
   5133                         success = TRUE;
   5134                         break;
   5135                     }
   5136                     UChar32 inputChar = inputItr.next();
   5137                     if (inputChar == U_SENTINEL) {
   5138                         success = FALSE;
   5139                         fHitEnd = TRUE;
   5140                         break;
   5141                     }
   5142                     if (inputChar != captureGroupChar) {
   5143                         success = FALSE;
   5144                         break;
   5145                     }
   5146                 }
   5147 
   5148                 if (success && inputItr.inExpansion()) {
   5149                     // We otained a match by consuming part of a string obtained from
   5150                     // case-folding a single code point of the input text.
   5151                     // This does not count as an overall match.
   5152                     success = FALSE;
   5153                 }
   5154 
   5155                 if (success) {
   5156                     fp->fInputIdx = inputItr.getIndex();
   5157                 } else {
   5158                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5159                 }
   5160             }
   5161             break;
   5162 
   5163         case URX_STO_INP_LOC:
   5164             {
   5165                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   5166                 fp->fExtra[opValue] = fp->fInputIdx;
   5167             }
   5168             break;
   5169 
   5170         case URX_JMPX:
   5171             {
   5172                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5173                 fp->fPatIdx += 1;
   5174                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   5175                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   5176                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
   5177                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   5178                 if (savedInputIdx < fp->fInputIdx) {
   5179                     fp->fPatIdx = opValue;                               // JMP
   5180                 } else {
   5181                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   5182                 }
   5183             }
   5184             break;
   5185 
   5186         case URX_LA_START:
   5187             {
   5188                 // Entering a lookahead block.
   5189                 // Save Stack Ptr, Input Pos.
   5190                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5191                 fData[opValue]   = fStack->size();
   5192                 fData[opValue+1] = fp->fInputIdx;
   5193                 fActiveStart     = fLookStart;          // Set the match region change for
   5194                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   5195             }
   5196             break;
   5197 
   5198         case URX_LA_END:
   5199             {
   5200                 // Leaving a look-ahead block.
   5201                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   5202                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5203                 int32_t stackSize = fStack->size();
   5204                 int32_t newStackSize = (int32_t)fData[opValue];
   5205                 U_ASSERT(stackSize >= newStackSize);
   5206                 if (stackSize > newStackSize) {
   5207                     // Copy the current top frame back to the new (cut back) top frame.
   5208                     //   This makes the capture groups from within the look-ahead
   5209                     //   expression available.
   5210                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5211                     int32_t i;
   5212                     for (i=0; i<fFrameSize; i++) {
   5213                         newFP[i] = ((int64_t *)fp)[i];
   5214                     }
   5215                     fp = (REStackFrame *)newFP;
   5216                     fStack->setSize(newStackSize);
   5217                 }
   5218                 fp->fInputIdx = fData[opValue+1];
   5219 
   5220                 // Restore the active region bounds in the input string; they may have
   5221                 //    been changed because of transparent bounds on a Region.
   5222                 fActiveStart = fRegionStart;
   5223                 fActiveLimit = fRegionLimit;
   5224             }
   5225             break;
   5226 
   5227         case URX_ONECHAR_I:
   5228             if (fp->fInputIdx < fActiveLimit) {
   5229                 UChar32 c;
   5230                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5231                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5232                     break;
   5233                 }
   5234             } else {
   5235                 fHitEnd = TRUE;
   5236             }
   5237             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5238             break;
   5239 
   5240         case URX_STRING_I:
   5241             // Case-insensitive test input against a literal string.
   5242             // Strings require two slots in the compiled pattern, one for the
   5243             //   offset to the string text, and one for the length.
   5244             //   The compiled string has already been case folded.
   5245             {
   5246                 const UChar *patternString = litText + opValue;
   5247 
   5248                 op      = (int32_t)pat[fp->fPatIdx];
   5249                 fp->fPatIdx++;
   5250                 opType  = URX_TYPE(op);
   5251                 opValue = URX_VAL(op);
   5252                 U_ASSERT(opType == URX_STRING_LEN);
   5253                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
   5254 
   5255                 UChar32      cText;
   5256                 UChar32      cPattern;
   5257                 UBool        success = TRUE;
   5258                 int32_t      patternStringIdx  = 0;
   5259                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
   5260                 while (patternStringIdx < patternStringLen) {
   5261                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
   5262                     cText = inputIterator.next();
   5263                     if (cText != cPattern) {
   5264                         success = FALSE;
   5265                         if (cText == U_SENTINEL) {
   5266                             fHitEnd = TRUE;
   5267                         }
   5268                         break;
   5269                     }
   5270                 }
   5271                 if (inputIterator.inExpansion()) {
   5272                     success = FALSE;
   5273                 }
   5274 
   5275                 if (success) {
   5276                     fp->fInputIdx = inputIterator.getIndex();
   5277                 } else {
   5278                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5279                 }
   5280             }
   5281             break;
   5282 
   5283         case URX_LB_START:
   5284             {
   5285                 // Entering a look-behind block.
   5286                 // Save Stack Ptr, Input Pos.
   5287                 //   TODO:  implement transparent bounds.  Ticket #6067
   5288                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5289                 fData[opValue]   = fStack->size();
   5290                 fData[opValue+1] = fp->fInputIdx;
   5291                 // Init the variable containing the start index for attempted matches.
   5292                 fData[opValue+2] = -1;
   5293                 // Save input string length, then reset to pin any matches to end at
   5294                 //   the current position.
   5295                 fData[opValue+3] = fActiveLimit;
   5296                 fActiveLimit     = fp->fInputIdx;
   5297             }
   5298             break;
   5299 
   5300 
   5301         case URX_LB_CONT:
   5302             {
   5303                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   5304                 //    at all possible input starting positions.
   5305 
   5306                 // Fetch the min and max possible match lengths.  They are the operands
   5307                 //   of this op in the pattern.
   5308                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   5309                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   5310                 U_ASSERT(minML <= maxML);
   5311                 U_ASSERT(minML >= 0);
   5312 
   5313                 // Fetch (from data) the last input index where a match was attempted.
   5314                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5315                 int64_t  *lbStartIdx = &fData[opValue+2];
   5316                 if (*lbStartIdx < 0) {
   5317                     // First time through loop.
   5318                     *lbStartIdx = fp->fInputIdx - minML;
   5319                 } else {
   5320                     // 2nd through nth time through the loop.
   5321                     // Back up start position for match by one.
   5322                     if (*lbStartIdx == 0) {
   5323                         (*lbStartIdx)--;
   5324                     } else {
   5325                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5326                     }
   5327                 }
   5328 
   5329                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5330                     // We have tried all potential match starting points without
   5331                     //  getting a match.  Backtrack out, and out of the
   5332                     //   Look Behind altogether.
   5333                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5334                     int64_t restoreInputLen = fData[opValue+3];
   5335                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5336                     U_ASSERT(restoreInputLen <= fInputLength);
   5337                     fActiveLimit = restoreInputLen;
   5338                     break;
   5339                 }
   5340 
   5341                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5342                 //      (successful match will fall off the end of the loop.)
   5343                 fp = StateSave(fp, fp->fPatIdx-3, status);
   5344                 fp->fInputIdx =  *lbStartIdx;
   5345             }
   5346             break;
   5347 
   5348         case URX_LB_END:
   5349             // End of a look-behind block, after a successful match.
   5350             {
   5351                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5352                 if (fp->fInputIdx != fActiveLimit) {
   5353                     //  The look-behind expression matched, but the match did not
   5354                     //    extend all the way to the point that we are looking behind from.
   5355                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5356                     //     will retry the match starting at another position or fail
   5357                     //     the look-behind altogether, whichever is appropriate.
   5358                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5359                     break;
   5360                 }
   5361 
   5362                 // Look-behind match is good.  Restore the orignal input string length,
   5363                 //   which had been truncated to pin the end of the lookbehind match to the
   5364                 //   position being looked-behind.
   5365                 int64_t originalInputLen = fData[opValue+3];
   5366                 U_ASSERT(originalInputLen >= fActiveLimit);
   5367                 U_ASSERT(originalInputLen <= fInputLength);
   5368                 fActiveLimit = originalInputLen;
   5369             }
   5370             break;
   5371 
   5372 
   5373         case URX_LBN_CONT:
   5374             {
   5375                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   5376                 //    at all possible input starting positions.
   5377 
   5378                 // Fetch the extra parameters of this op.
   5379                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   5380                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   5381                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   5382                 continueLoc = URX_VAL(continueLoc);
   5383                 U_ASSERT(minML <= maxML);
   5384                 U_ASSERT(minML >= 0);
   5385                 U_ASSERT(continueLoc > fp->fPatIdx);
   5386 
   5387                 // Fetch (from data) the last input index where a match was attempted.
   5388                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5389                 int64_t  *lbStartIdx = &fData[opValue+2];
   5390                 if (*lbStartIdx < 0) {
   5391                     // First time through loop.
   5392                     *lbStartIdx = fp->fInputIdx - minML;
   5393                 } else {
   5394                     // 2nd through nth time through the loop.
   5395                     // Back up start position for match by one.
   5396                     if (*lbStartIdx == 0) {
   5397                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
   5398                     } else {
   5399                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5400                     }
   5401                 }
   5402 
   5403                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5404                     // We have tried all potential match starting points without
   5405                     //  getting a match, which means that the negative lookbehind as
   5406                     //  a whole has succeeded.  Jump forward to the continue location
   5407                     int64_t restoreInputLen = fData[opValue+3];
   5408                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5409                     U_ASSERT(restoreInputLen <= fInputLength);
   5410                     fActiveLimit = restoreInputLen;
   5411                     fp->fPatIdx = continueLoc;
   5412                     break;
   5413                 }
   5414 
   5415                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5416                 //      (successful match will cause a FAIL out of the loop altogether.)
   5417                 fp = StateSave(fp, fp->fPatIdx-4, status);
   5418                 fp->fInputIdx =  *lbStartIdx;
   5419             }
   5420             break;
   5421 
   5422         case URX_LBN_END:
   5423             // End of a negative look-behind block, after a successful match.
   5424             {
   5425                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5426                 if (fp->fInputIdx != fActiveLimit) {
   5427                     //  The look-behind expression matched, but the match did not
   5428                     //    extend all the way to the point that we are looking behind from.
   5429                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5430                     //     will retry the match starting at another position or succeed
   5431                     //     the look-behind altogether, whichever is appropriate.
   5432                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5433                     break;
   5434                 }
   5435 
   5436                 // Look-behind expression matched, which means look-behind test as
   5437                 //   a whole Fails
   5438 
   5439                 //   Restore the orignal input string length, which had been truncated
   5440                 //   inorder to pin the end of the lookbehind match
   5441                 //   to the position being looked-behind.
   5442                 int64_t originalInputLen = fData[opValue+3];
   5443                 U_ASSERT(originalInputLen >= fActiveLimit);
   5444                 U_ASSERT(originalInputLen <= fInputLength);
   5445                 fActiveLimit = originalInputLen;
   5446 
   5447                 // Restore original stack position, discarding any state saved
   5448                 //   by the successful pattern match.
   5449                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5450                 int32_t newStackSize = (int32_t)fData[opValue];
   5451                 U_ASSERT(fStack->size() > newStackSize);
   5452                 fStack->setSize(newStackSize);
   5453 
   5454                 //  FAIL, which will take control back to someplace
   5455                 //  prior to entering the look-behind test.
   5456                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5457             }
   5458             break;
   5459 
   5460 
   5461         case URX_LOOP_SR_I:
   5462             // Loop Initialization for the optimized implementation of
   5463             //     [some character set]*
   5464             //   This op scans through all matching input.
   5465             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5466             {
   5467                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5468                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5469                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   5470 
   5471                 // Loop through input, until either the input is exhausted or
   5472                 //   we reach a character that is not a member of the set.
   5473                 int32_t ix = (int32_t)fp->fInputIdx;
   5474                 for (;;) {
   5475                     if (ix >= fActiveLimit) {
   5476                         fHitEnd = TRUE;
   5477                         break;
   5478                     }
   5479                     UChar32   c;
   5480                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
   5481                     if (c<256) {
   5482                         if (s8->contains(c) == FALSE) {
   5483                             U16_BACK_1(inputBuf, 0, ix);
   5484                             break;
   5485                         }
   5486                     } else {
   5487                         if (s->contains(c) == FALSE) {
   5488                             U16_BACK_1(inputBuf, 0, ix);
   5489                             break;
   5490                         }
   5491                     }
   5492                 }
   5493 
   5494                 // If there were no matching characters, skip over the loop altogether.
   5495                 //   The loop doesn't run at all, a * op always succeeds.
   5496                 if (ix == fp->fInputIdx) {
   5497                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5498                     break;
   5499                 }
   5500 
   5501                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5502                 //   must follow.  It's operand is the stack location
   5503                 //   that holds the starting input index for the match of this [set]*
   5504                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5505                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5506                 int32_t stackLoc = URX_VAL(loopcOp);
   5507                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5508                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5509                 fp->fInputIdx = ix;
   5510 
   5511                 // Save State to the URX_LOOP_C op that follows this one,
   5512                 //   so that match failures in the following code will return to there.
   5513                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5514                 fp = StateSave(fp, fp->fPatIdx, status);
   5515                 fp->fPatIdx++;
   5516             }
   5517             break;
   5518 
   5519 
   5520         case URX_LOOP_DOT_I:
   5521             // Loop Initialization for the optimized implementation of .*
   5522             //   This op scans through all remaining input.
   5523             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5524             {
   5525                 // Loop through input until the input is exhausted (we reach an end-of-line)
   5526                 // In DOTALL mode, we can just go straight to the end of the input.
   5527                 int32_t ix;
   5528                 if ((opValue & 1) == 1) {
   5529                     // Dot-matches-All mode.  Jump straight to the end of the string.
   5530                     ix = (int32_t)fActiveLimit;
   5531                     fHitEnd = TRUE;
   5532                 } else {
   5533                     // NOT DOT ALL mode.  Line endings do not match '.'
   5534                     // Scan forward until a line ending or end of input.
   5535                     ix = (int32_t)fp->fInputIdx;
   5536                     for (;;) {
   5537                         if (ix >= fActiveLimit) {
   5538                             fHitEnd = TRUE;
   5539                             break;
   5540                         }
   5541                         UChar32   c;
   5542                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
   5543                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   5544                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   5545                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   5546                                    ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
   5547                                 //  char is a line ending.  Put the input pos back to the
   5548                                 //    line ending char, and exit the scanning loop.
   5549                                 U16_BACK_1(inputBuf, 0, ix);
   5550                                 break;
   5551                             }
   5552                         }
   5553                     }
   5554                 }
   5555 
   5556                 // If there were no matching characters, skip over the loop altogether.
   5557                 //   The loop doesn't run at all, a * op always succeeds.
   5558                 if (ix == fp->fInputIdx) {
   5559                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5560                     break;
   5561                 }
   5562 
   5563                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5564                 //   must follow.  It's operand is the stack location
   5565                 //   that holds the starting input index for the match of this .*
   5566                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5567                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5568                 int32_t stackLoc = URX_VAL(loopcOp);
   5569                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5570                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5571                 fp->fInputIdx = ix;
   5572 
   5573                 // Save State to the URX_LOOP_C op that follows this one,
   5574                 //   so that match failures in the following code will return to there.
   5575                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5576                 fp = StateSave(fp, fp->fPatIdx, status);
   5577                 fp->fPatIdx++;
   5578             }
   5579             break;
   5580 
   5581 
   5582         case URX_LOOP_C:
   5583             {
   5584                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   5585                 backSearchIndex = (int32_t)fp->fExtra[opValue];
   5586                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   5587                 if (backSearchIndex == fp->fInputIdx) {
   5588                     // We've backed up the input idx to the point that the loop started.
   5589                     // The loop is done.  Leave here without saving state.
   5590                     //  Subsequent failures won't come back here.
   5591                     break;
   5592                 }
   5593                 // Set up for the next iteration of the loop, with input index
   5594                 //   backed up by one from the last time through,
   5595                 //   and a state save to this instruction in case the following code fails again.
   5596                 //   (We're going backwards because this loop emulates stack unwinding, not
   5597                 //    the initial scan forward.)
   5598                 U_ASSERT(fp->fInputIdx > 0);
   5599                 UChar32 prevC;
   5600                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
   5601 
   5602                 if (prevC == 0x0a &&
   5603                     fp->fInputIdx > backSearchIndex &&
   5604                     inputBuf[fp->fInputIdx-1] == 0x0d) {
   5605                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   5606                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   5607                         // .*, stepping back over CRLF pair.
   5608                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   5609                     }
   5610                 }
   5611 
   5612 
   5613                 fp = StateSave(fp, fp->fPatIdx-1, status);
   5614             }
   5615             break;
   5616 
   5617 
   5618 
   5619         default:
   5620             // Trouble.  The compiled pattern contains an entry with an
   5621             //           unrecognized type tag.
   5622             U_ASSERT(FALSE);
   5623         }
   5624 
   5625         if (U_FAILURE(status)) {
   5626             isMatch = FALSE;
   5627             break;
   5628         }
   5629     }
   5630 
   5631 breakFromLoop:
   5632     fMatch = isMatch;
   5633     if (isMatch) {
   5634         fLastMatchEnd = fMatchEnd;
   5635         fMatchStart   = startIdx;
   5636         fMatchEnd     = fp->fInputIdx;
   5637         if (fTraceDebug) {
   5638             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
   5639         }
   5640     }
   5641     else
   5642     {
   5643         if (fTraceDebug) {
   5644             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   5645         }
   5646     }
   5647 
   5648     fFrame = fp;                // The active stack frame when the engine stopped.
   5649     //   Contains the capture group results that we need to
   5650     //    access later.
   5651 
   5652     return;
   5653 }
   5654 
   5655 
   5656 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
   5657 
   5658 U_NAMESPACE_END
   5659 
   5660 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   5661