Home | History | Annotate | Download | only in i18n
      1 /*
      2 **************************************************************************
      3 *   Copyright (C) 2002-2011 International Business Machines Corporation  *
      4 *   and others. All rights reserved.                                     *
      5 **************************************************************************
      6 */
      7 //
      8 //  file:  rematch.cpp
      9 //
     10 //         Contains the implementation of class RegexMatcher,
     11 //         which is one of the main API classes for the ICU regular expression package.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     16 
     17 #include "unicode/regex.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/rbbi.h"
     22 #include "uassert.h"
     23 #include "cmemory.h"
     24 #include "uvector.h"
     25 #include "uvectr32.h"
     26 #include "uvectr64.h"
     27 #include "regeximp.h"
     28 #include "regexst.h"
     29 #include "regextxt.h"
     30 #include "ucase.h"
     31 
     32 // #include <malloc.h>        // Needed for heapcheck testing
     33 
     34 
     35 // Find progress callback
     36 // ----------------------
     37 // Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
     38 //
     39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
     40     (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
     41 
     42 
     43 // Smart Backtracking
     44 // ------------------
     45 // When a failure would go back to a LOOP_C instruction,
     46 // strings, characters, and setrefs scan backwards for a valid start
     47 // character themselves, pop the stack, and save state, emulating the
     48 // LOOP_C's effect but assured that the next character of input is a
     49 // possible matching character.
     50 //
     51 // Good idea in theory; unfortunately it only helps out a few specific
     52 // cases and slows the engine down a little in the rest.
     53 
     54 //#define REGEX_SMART_BACKTRACKING 1
     55 
     56 U_NAMESPACE_BEGIN
     57 
     58 // Default limit for the size of the back track stack, to avoid system
     59 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
     60 // This value puts ICU's limits higher than most other regexp implementations,
     61 //    which use recursion rather than the heap, and take more storage per
     62 //    backtrack point.
     63 //
     64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
     65 
     66 // Time limit counter constant.
     67 //   Time limits for expression evaluation are in terms of quanta of work by
     68 //   the engine, each of which is 10,000 state saves.
     69 //   This constant determines that state saves per tick number.
     70 static const int32_t TIMER_INITIAL_VALUE = 10000;
     71 
     72 //-----------------------------------------------------------------------------
     73 //
     74 //   Constructor and Destructor
     75 //
     76 //-----------------------------------------------------------------------------
     77 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
     78     fDeferredStatus = U_ZERO_ERROR;
     79     init(fDeferredStatus);
     80     if (U_FAILURE(fDeferredStatus)) {
     81         return;
     82     }
     83     if (pat==NULL) {
     84         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
     85         return;
     86     }
     87     fPattern = pat;
     88     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
     89 }
     90 
     91 
     92 
     93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
     94                            uint32_t flags, UErrorCode &status) {
     95     init(status);
     96     if (U_FAILURE(status)) {
     97         return;
     98     }
     99     UParseError    pe;
    100     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    101     fPattern           = fPatternOwned;
    102 
    103     UText inputText = UTEXT_INITIALIZER;
    104     utext_openConstUnicodeString(&inputText, &input, &status);
    105     init2(&inputText, status);
    106     utext_close(&inputText);
    107 
    108     fInputUniStrMaybeMutable = TRUE;
    109 }
    110 
    111 
    112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
    113                            uint32_t flags, UErrorCode &status) {
    114     init(status);
    115     if (U_FAILURE(status)) {
    116         return;
    117     }
    118     UParseError    pe;
    119     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    120     if (U_FAILURE(status)) {
    121         return;
    122     }
    123 
    124     fPattern           = fPatternOwned;
    125     init2(input, status);
    126 }
    127 
    128 
    129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
    130                            uint32_t flags, UErrorCode &status) {
    131     init(status);
    132     if (U_FAILURE(status)) {
    133         return;
    134     }
    135     UParseError    pe;
    136     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    137     if (U_FAILURE(status)) {
    138         return;
    139     }
    140     fPattern           = fPatternOwned;
    141     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    142 }
    143 
    144 RegexMatcher::RegexMatcher(UText *regexp,
    145                            uint32_t flags, UErrorCode &status) {
    146     init(status);
    147     if (U_FAILURE(status)) {
    148         return;
    149     }
    150     UParseError    pe;
    151     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    152         if (U_FAILURE(status)) {
    153         return;
    154     }
    155 
    156     fPattern           = fPatternOwned;
    157     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    158 }
    159 
    160 
    161 
    162 
    163 RegexMatcher::~RegexMatcher() {
    164     delete fStack;
    165     if (fData != fSmallData) {
    166         uprv_free(fData);
    167         fData = NULL;
    168     }
    169     if (fPatternOwned) {
    170         delete fPatternOwned;
    171         fPatternOwned = NULL;
    172         fPattern = NULL;
    173     }
    174 
    175     if (fInput) {
    176         delete fInput;
    177     }
    178     if (fInputText) {
    179         utext_close(fInputText);
    180     }
    181     if (fAltInputText) {
    182         utext_close(fAltInputText);
    183     }
    184 
    185     #if UCONFIG_NO_BREAK_ITERATION==0
    186     delete fWordBreakItr;
    187     #endif
    188 }
    189 
    190 //
    191 //   init()   common initialization for use by all constructors.
    192 //            Initialize all fields, get the object into a consistent state.
    193 //            This must be done even when the initial status shows an error,
    194 //            so that the object is initialized sufficiently well for the destructor
    195 //            to run safely.
    196 //
    197 void RegexMatcher::init(UErrorCode &status) {
    198     fPattern           = NULL;
    199     fPatternOwned      = NULL;
    200     fFrameSize         = 0;
    201     fRegionStart       = 0;
    202     fRegionLimit       = 0;
    203     fAnchorStart       = 0;
    204     fAnchorLimit       = 0;
    205     fLookStart         = 0;
    206     fLookLimit         = 0;
    207     fActiveStart       = 0;
    208     fActiveLimit       = 0;
    209     fTransparentBounds = FALSE;
    210     fAnchoringBounds   = TRUE;
    211     fMatch             = FALSE;
    212     fMatchStart        = 0;
    213     fMatchEnd          = 0;
    214     fLastMatchEnd      = -1;
    215     fAppendPosition    = 0;
    216     fHitEnd            = FALSE;
    217     fRequireEnd        = FALSE;
    218     fStack             = NULL;
    219     fFrame             = NULL;
    220     fTimeLimit         = 0;
    221     fTime              = 0;
    222     fTickCounter       = 0;
    223     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    224     fCallbackFn        = NULL;
    225     fCallbackContext   = NULL;
    226     fFindProgressCallbackFn      = NULL;
    227     fFindProgressCallbackContext = NULL;
    228     fTraceDebug        = FALSE;
    229     fDeferredStatus    = status;
    230     fData              = fSmallData;
    231     fWordBreakItr      = NULL;
    232 
    233     fStack             = new UVector64(status);
    234     fInputText         = NULL;
    235     fAltInputText      = NULL;
    236     fInput             = NULL;
    237     fInputLength       = 0;
    238     fInputUniStrMaybeMutable = FALSE;
    239 
    240     if (U_FAILURE(status)) {
    241         fDeferredStatus = status;
    242     }
    243 }
    244 
    245 //
    246 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
    247 //            This handles the common setup to be done after the Pattern is available.
    248 //
    249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
    250     if (U_FAILURE(status)) {
    251         fDeferredStatus = status;
    252         return;
    253     }
    254 
    255     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
    256         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
    257         if (fData == NULL) {
    258             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    259             return;
    260         }
    261     }
    262 
    263     reset(input);
    264     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
    265     if (U_FAILURE(status)) {
    266         fDeferredStatus = status;
    267         return;
    268     }
    269 }
    270 
    271 
    272 static const UChar BACKSLASH  = 0x5c;
    273 static const UChar DOLLARSIGN = 0x24;
    274 //--------------------------------------------------------------------------------
    275 //
    276 //    appendReplacement
    277 //
    278 //--------------------------------------------------------------------------------
    279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
    280                                               const UnicodeString &replacement,
    281                                               UErrorCode &status) {
    282     UText replacementText = UTEXT_INITIALIZER;
    283 
    284     utext_openConstUnicodeString(&replacementText, &replacement, &status);
    285     if (U_SUCCESS(status)) {
    286         UText resultText = UTEXT_INITIALIZER;
    287         utext_openUnicodeString(&resultText, &dest, &status);
    288 
    289         if (U_SUCCESS(status)) {
    290             appendReplacement(&resultText, &replacementText, status);
    291             utext_close(&resultText);
    292         }
    293         utext_close(&replacementText);
    294     }
    295 
    296     return *this;
    297 }
    298 
    299 //
    300 //    appendReplacement, UText mode
    301 //
    302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
    303                                               UText *replacement,
    304                                               UErrorCode &status) {
    305     if (U_FAILURE(status)) {
    306         return *this;
    307     }
    308     if (U_FAILURE(fDeferredStatus)) {
    309         status = fDeferredStatus;
    310         return *this;
    311     }
    312     if (fMatch == FALSE) {
    313         status = U_REGEX_INVALID_STATE;
    314         return *this;
    315     }
    316 
    317     // Copy input string from the end of previous match to start of current match
    318     int64_t  destLen = utext_nativeLength(dest);
    319     if (fMatchStart > fAppendPosition) {
    320         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    321             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    322                                      (int32_t)(fMatchStart-fAppendPosition), &status);
    323         } else {
    324             int32_t len16;
    325             if (UTEXT_USES_U16(fInputText)) {
    326                 len16 = (int32_t)(fMatchStart-fAppendPosition);
    327             } else {
    328                 UErrorCode lengthStatus = U_ZERO_ERROR;
    329                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
    330             }
    331             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
    332             if (inputChars == NULL) {
    333                 status = U_MEMORY_ALLOCATION_ERROR;
    334                 return *this;
    335             }
    336             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
    337             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    338             uprv_free(inputChars);
    339         }
    340     }
    341     fAppendPosition = fMatchEnd;
    342 
    343 
    344     // scan the replacement text, looking for substitutions ($n) and \escapes.
    345     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
    346     //         move entire ranges not containing substitutions.
    347     UTEXT_SETNATIVEINDEX(replacement, 0);
    348     UChar32 c = UTEXT_NEXT32(replacement);
    349     while (c != U_SENTINEL) {
    350         if (c == BACKSLASH) {
    351             // Backslash Escape.  Copy the following char out without further checks.
    352             //                    Note:  Surrogate pairs don't need any special handling
    353             //                           The second half wont be a '$' or a '\', and
    354             //                           will move to the dest normally on the next
    355             //                           loop iteration.
    356             c = UTEXT_CURRENT32(replacement);
    357             if (c == U_SENTINEL) {
    358                 break;
    359             }
    360 
    361             if (c==0x55/*U*/ || c==0x75/*u*/) {
    362                 // We have a \udddd or \Udddddddd escape sequence.
    363                 int32_t offset = 0;
    364                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
    365                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
    366                 if (escapedChar != (UChar32)0xFFFFFFFF) {
    367                     if (U_IS_BMP(escapedChar)) {
    368                         UChar c16 = (UChar)escapedChar;
    369                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    370                     } else {
    371                         UChar surrogate[2];
    372                         surrogate[0] = U16_LEAD(escapedChar);
    373                         surrogate[1] = U16_TRAIL(escapedChar);
    374                         if (U_SUCCESS(status)) {
    375                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    376                         }
    377                     }
    378                     // TODO:  Report errors for mal-formed \u escapes?
    379                     //        As this is, the original sequence is output, which may be OK.
    380                     if (context.lastOffset == offset) {
    381                         (void)UTEXT_PREVIOUS32(replacement);
    382                     } else if (context.lastOffset != offset-1) {
    383                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
    384                     }
    385                 }
    386             } else {
    387                 (void)UTEXT_NEXT32(replacement);
    388                 // Plain backslash escape.  Just put out the escaped character.
    389                 if (U_IS_BMP(c)) {
    390                     UChar c16 = (UChar)c;
    391                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    392                 } else {
    393                     UChar surrogate[2];
    394                     surrogate[0] = U16_LEAD(c);
    395                     surrogate[1] = U16_TRAIL(c);
    396                     if (U_SUCCESS(status)) {
    397                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    398                     }
    399                 }
    400             }
    401         } else if (c != DOLLARSIGN) {
    402             // Normal char, not a $.  Copy it out without further checks.
    403             if (U_IS_BMP(c)) {
    404                 UChar c16 = (UChar)c;
    405                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    406             } else {
    407                 UChar surrogate[2];
    408                 surrogate[0] = U16_LEAD(c);
    409                 surrogate[1] = U16_TRAIL(c);
    410                 if (U_SUCCESS(status)) {
    411                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    412                 }
    413             }
    414         } else {
    415             // We've got a $.  Pick up a capture group number if one follows.
    416             // Consume at most the number of digits necessary for the largest capture
    417             // number that is valid for this pattern.
    418 
    419             int32_t numDigits = 0;
    420             int32_t groupNum  = 0;
    421             UChar32 digitC;
    422             for (;;) {
    423                 digitC = UTEXT_CURRENT32(replacement);
    424                 if (digitC == U_SENTINEL) {
    425                     break;
    426                 }
    427                 if (u_isdigit(digitC) == FALSE) {
    428                     break;
    429                 }
    430                 (void)UTEXT_NEXT32(replacement);
    431                 groupNum=groupNum*10 + u_charDigitValue(digitC);
    432                 numDigits++;
    433                 if (numDigits >= fPattern->fMaxCaptureDigits) {
    434                     break;
    435                 }
    436             }
    437 
    438 
    439             if (numDigits == 0) {
    440                 // The $ didn't introduce a group number at all.
    441                 // Treat it as just part of the substitution text.
    442                 UChar c16 = DOLLARSIGN;
    443                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    444             } else {
    445                 // Finally, append the capture group data to the destination.
    446                 destLen += appendGroup(groupNum, dest, status);
    447                 if (U_FAILURE(status)) {
    448                     // Can fail if group number is out of range.
    449                     break;
    450                 }
    451             }
    452         }
    453 
    454         if (U_FAILURE(status)) {
    455             break;
    456         } else {
    457             c = UTEXT_NEXT32(replacement);
    458         }
    459     }
    460 
    461     return *this;
    462 }
    463 
    464 
    465 
    466 //--------------------------------------------------------------------------------
    467 //
    468 //    appendTail     Intended to be used in conjunction with appendReplacement()
    469 //                   To the destination string, append everything following
    470 //                   the last match position from the input string.
    471 //
    472 //                   Note:  Match ranges do not affect appendTail or appendReplacement
    473 //
    474 //--------------------------------------------------------------------------------
    475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    476     UErrorCode status = U_ZERO_ERROR;
    477     UText resultText = UTEXT_INITIALIZER;
    478     utext_openUnicodeString(&resultText, &dest, &status);
    479 
    480     if (U_SUCCESS(status)) {
    481         appendTail(&resultText, status);
    482         utext_close(&resultText);
    483     }
    484 
    485     return dest;
    486 }
    487 
    488 //
    489 //   appendTail, UText mode
    490 //
    491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
    492     UBool bailOut = FALSE;
    493     if (U_FAILURE(status)) {
    494         bailOut = TRUE;
    495     }
    496     if (U_FAILURE(fDeferredStatus)) {
    497         status = fDeferredStatus;
    498         bailOut = TRUE;
    499     }
    500 
    501     if (bailOut) {
    502         //  dest must not be NULL
    503         if (dest) {
    504             utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
    505             return dest;
    506         }
    507     }
    508 
    509     if (fInputLength > fAppendPosition) {
    510         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    511             int64_t destLen = utext_nativeLength(dest);
    512             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    513                           (int32_t)(fInputLength-fAppendPosition), &status);
    514         } else {
    515             int32_t len16;
    516             if (UTEXT_USES_U16(fInputText)) {
    517                 len16 = (int32_t)(fInputLength-fAppendPosition);
    518             } else {
    519                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
    520                 status = U_ZERO_ERROR; // buffer overflow
    521             }
    522 
    523             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
    524             if (inputChars == NULL) {
    525                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    526             } else {
    527                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
    528                 int64_t destLen = utext_nativeLength(dest);
    529                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    530                 uprv_free(inputChars);
    531             }
    532         }
    533     }
    534     return dest;
    535 }
    536 
    537 
    538 
    539 //--------------------------------------------------------------------------------
    540 //
    541 //   end
    542 //
    543 //--------------------------------------------------------------------------------
    544 int32_t RegexMatcher::end(UErrorCode &err) const {
    545     return end(0, err);
    546 }
    547 
    548 int64_t RegexMatcher::end64(UErrorCode &err) const {
    549     return end64(0, err);
    550 }
    551 
    552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
    553     if (U_FAILURE(err)) {
    554         return -1;
    555     }
    556     if (fMatch == FALSE) {
    557         err = U_REGEX_INVALID_STATE;
    558         return -1;
    559     }
    560     if (group < 0 || group > fPattern->fGroupMap->size()) {
    561         err = U_INDEX_OUTOFBOUNDS_ERROR;
    562         return -1;
    563     }
    564     int64_t e = -1;
    565     if (group == 0) {
    566         e = fMatchEnd;
    567     } else {
    568         // Get the position within the stack frame of the variables for
    569         //    this capture group.
    570         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
    571         U_ASSERT(groupOffset < fPattern->fFrameSize);
    572         U_ASSERT(groupOffset >= 0);
    573         e = fFrame->fExtra[groupOffset + 1];
    574     }
    575 
    576         return e;
    577 }
    578 
    579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    580     return (int32_t)end64(group, err);
    581 }
    582 
    583 
    584 //--------------------------------------------------------------------------------
    585 //
    586 //   find()
    587 //
    588 //--------------------------------------------------------------------------------
    589 UBool RegexMatcher::find() {
    590     // Start at the position of the last match end.  (Will be zero if the
    591     //   matcher has been reset.)
    592     //
    593     if (U_FAILURE(fDeferredStatus)) {
    594         return FALSE;
    595     }
    596 
    597     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    598         return findUsingChunk();
    599     }
    600 
    601     int64_t startPos = fMatchEnd;
    602     if (startPos==0) {
    603         startPos = fActiveStart;
    604     }
    605 
    606     if (fMatch) {
    607         // Save the position of any previous successful match.
    608         fLastMatchEnd = fMatchEnd;
    609 
    610         if (fMatchStart == fMatchEnd) {
    611             // Previous match had zero length.  Move start position up one position
    612             //  to avoid sending find() into a loop on zero-length matches.
    613             if (startPos >= fActiveLimit) {
    614                 fMatch = FALSE;
    615                 fHitEnd = TRUE;
    616                 return FALSE;
    617             }
    618             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    619             (void)UTEXT_NEXT32(fInputText);
    620             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    621         }
    622     } else {
    623         if (fLastMatchEnd >= 0) {
    624             // A previous find() failed to match.  Don't try again.
    625             //   (without this test, a pattern with a zero-length match
    626             //    could match again at the end of an input string.)
    627             fHitEnd = TRUE;
    628             return FALSE;
    629         }
    630     }
    631 
    632 
    633     // Compute the position in the input string beyond which a match can not begin, because
    634     //   the minimum length match would extend past the end of the input.
    635     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    636     //          Be aware of possible overflows if making changes here.
    637     int64_t testStartLimit;
    638     if (UTEXT_USES_U16(fInputText)) {
    639         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
    640         if (startPos > testStartLimit) {
    641             fMatch = FALSE;
    642             fHitEnd = TRUE;
    643             return FALSE;
    644         }
    645     } else {
    646         // For now, let the matcher discover that it can't match on its own
    647         // We don't know how long the match len is in native characters
    648         testStartLimit = fActiveLimit;
    649     }
    650 
    651     UChar32  c;
    652     U_ASSERT(startPos >= 0);
    653 
    654     switch (fPattern->fStartType) {
    655     case START_NO_INFO:
    656         // No optimization was found.
    657         //  Try a match at each input position.
    658         for (;;) {
    659             MatchAt(startPos, FALSE, fDeferredStatus);
    660             if (U_FAILURE(fDeferredStatus)) {
    661                 return FALSE;
    662             }
    663             if (fMatch) {
    664                 return TRUE;
    665             }
    666             if (startPos >= testStartLimit) {
    667                 fHitEnd = TRUE;
    668                 return FALSE;
    669             }
    670             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    671             (void)UTEXT_NEXT32(fInputText);
    672             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    673             // Note that it's perfectly OK for a pattern to have a zero-length
    674             //   match at the end of a string, so we must make sure that the loop
    675             //   runs with startPos == testStartLimit the last time through.
    676             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    677                 return FALSE;
    678         }
    679         U_ASSERT(FALSE);
    680 
    681     case START_START:
    682         // Matches are only possible at the start of the input string
    683         //   (pattern begins with ^ or \A)
    684         if (startPos > fActiveStart) {
    685             fMatch = FALSE;
    686             return FALSE;
    687         }
    688         MatchAt(startPos, FALSE, fDeferredStatus);
    689         if (U_FAILURE(fDeferredStatus)) {
    690             return FALSE;
    691         }
    692         return fMatch;
    693 
    694 
    695     case START_SET:
    696         {
    697             // Match may start on any char from a pre-computed set.
    698             U_ASSERT(fPattern->fMinMatchLen > 0);
    699             int64_t pos;
    700             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    701             for (;;) {
    702                 c = UTEXT_NEXT32(fInputText);
    703                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    704                 // c will be -1 (U_SENTINEL) at end of text, in which case we
    705                 // skip this next block (so we don't have a negative array index)
    706                 // and handle end of text in the following block.
    707                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    708                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
    709                     MatchAt(startPos, FALSE, fDeferredStatus);
    710                     if (U_FAILURE(fDeferredStatus)) {
    711                         return FALSE;
    712                     }
    713                     if (fMatch) {
    714                         return TRUE;
    715                     }
    716                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    717                 }
    718                 if (startPos >= testStartLimit) {
    719                     fMatch = FALSE;
    720                     fHitEnd = TRUE;
    721                     return FALSE;
    722                 }
    723                 startPos = pos;
    724 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    725                     return FALSE;
    726             }
    727         }
    728         U_ASSERT(FALSE);
    729 
    730     case START_STRING:
    731     case START_CHAR:
    732         {
    733             // Match starts on exactly one char.
    734             U_ASSERT(fPattern->fMinMatchLen > 0);
    735             UChar32 theChar = fPattern->fInitialChar;
    736             int64_t pos;
    737             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    738             for (;;) {
    739                 c = UTEXT_NEXT32(fInputText);
    740                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    741                 if (c == theChar) {
    742                     MatchAt(startPos, FALSE, fDeferredStatus);
    743                     if (U_FAILURE(fDeferredStatus)) {
    744                         return FALSE;
    745                     }
    746                     if (fMatch) {
    747                         return TRUE;
    748                     }
    749                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    750                 }
    751                 if (startPos >= testStartLimit) {
    752                     fMatch = FALSE;
    753                     fHitEnd = TRUE;
    754                     return FALSE;
    755                 }
    756                 startPos = pos;
    757 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    758                     return FALSE;
    759            }
    760         }
    761         U_ASSERT(FALSE);
    762 
    763     case START_LINE:
    764         {
    765             UChar32  c;
    766             if (startPos == fAnchorStart) {
    767                 MatchAt(startPos, FALSE, fDeferredStatus);
    768                 if (U_FAILURE(fDeferredStatus)) {
    769                     return FALSE;
    770                 }
    771                 if (fMatch) {
    772                     return TRUE;
    773                 }
    774                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    775                 c = UTEXT_NEXT32(fInputText);
    776                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    777             } else {
    778                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    779                 c = UTEXT_PREVIOUS32(fInputText);
    780                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    781             }
    782 
    783             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
    784                 for (;;) {
    785                     if (c == 0x0a) {
    786                             MatchAt(startPos, FALSE, fDeferredStatus);
    787                             if (U_FAILURE(fDeferredStatus)) {
    788                                 return FALSE;
    789                             }
    790                             if (fMatch) {
    791                                 return TRUE;
    792                             }
    793                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    794                     }
    795                     if (startPos >= testStartLimit) {
    796                         fMatch = FALSE;
    797                         fHitEnd = TRUE;
    798                         return FALSE;
    799                     }
    800                     c = UTEXT_NEXT32(fInputText);
    801                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    802                     // Note that it's perfectly OK for a pattern to have a zero-length
    803                     //   match at the end of a string, so we must make sure that the loop
    804                     //   runs with startPos == testStartLimit the last time through.
    805 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    806                         return FALSE;
    807                 }
    808             } else {
    809                 for (;;) {
    810                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
    811                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
    812                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
    813                                 (void)UTEXT_NEXT32(fInputText);
    814                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    815                             }
    816                             MatchAt(startPos, FALSE, fDeferredStatus);
    817                             if (U_FAILURE(fDeferredStatus)) {
    818                                 return FALSE;
    819                             }
    820                             if (fMatch) {
    821                                 return TRUE;
    822                             }
    823                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    824                     }
    825                     if (startPos >= testStartLimit) {
    826                         fMatch = FALSE;
    827                         fHitEnd = TRUE;
    828                         return FALSE;
    829                     }
    830                     c = UTEXT_NEXT32(fInputText);
    831                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    832                     // Note that it's perfectly OK for a pattern to have a zero-length
    833                     //   match at the end of a string, so we must make sure that the loop
    834                     //   runs with startPos == testStartLimit the last time through.
    835 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    836                         return FALSE;
    837                 }
    838             }
    839         }
    840 
    841     default:
    842         U_ASSERT(FALSE);
    843     }
    844 
    845     U_ASSERT(FALSE);
    846     return FALSE;
    847 }
    848 
    849 
    850 
    851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
    852     if (U_FAILURE(status)) {
    853         return FALSE;
    854     }
    855     if (U_FAILURE(fDeferredStatus)) {
    856         status = fDeferredStatus;
    857         return FALSE;
    858     }
    859     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
    860                                           //        This will reset the region to be the full input length.
    861     if (start < 0) {
    862         status = U_INDEX_OUTOFBOUNDS_ERROR;
    863         return FALSE;
    864     }
    865 
    866     int64_t nativeStart = start;
    867     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
    868         status = U_INDEX_OUTOFBOUNDS_ERROR;
    869         return FALSE;
    870     }
    871     fMatchEnd = nativeStart;
    872     return find();
    873 }
    874 
    875 
    876 //--------------------------------------------------------------------------------
    877 //
    878 //   findUsingChunk() -- like find(), but with the advance knowledge that the
    879 //                       entire string is available in the UText's chunk buffer.
    880 //
    881 //--------------------------------------------------------------------------------
    882 UBool RegexMatcher::findUsingChunk() {
    883     // Start at the position of the last match end.  (Will be zero if the
    884     //   matcher has been reset.
    885     //
    886 
    887     int32_t startPos = (int32_t)fMatchEnd;
    888     if (startPos==0) {
    889         startPos = (int32_t)fActiveStart;
    890     }
    891 
    892     const UChar *inputBuf = fInputText->chunkContents;
    893 
    894     if (fMatch) {
    895         // Save the position of any previous successful match.
    896         fLastMatchEnd = fMatchEnd;
    897 
    898         if (fMatchStart == fMatchEnd) {
    899             // Previous match had zero length.  Move start position up one position
    900             //  to avoid sending find() into a loop on zero-length matches.
    901             if (startPos >= fActiveLimit) {
    902                 fMatch = FALSE;
    903                 fHitEnd = TRUE;
    904                 return FALSE;
    905             }
    906             U16_FWD_1(inputBuf, startPos, fInputLength);
    907         }
    908     } else {
    909         if (fLastMatchEnd >= 0) {
    910             // A previous find() failed to match.  Don't try again.
    911             //   (without this test, a pattern with a zero-length match
    912             //    could match again at the end of an input string.)
    913             fHitEnd = TRUE;
    914             return FALSE;
    915         }
    916     }
    917 
    918 
    919     // Compute the position in the input string beyond which a match can not begin, because
    920     //   the minimum length match would extend past the end of the input.
    921     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    922     //          Be aware of possible overflows if making changes here.
    923     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
    924     if (startPos > testLen) {
    925         fMatch = FALSE;
    926         fHitEnd = TRUE;
    927         return FALSE;
    928     }
    929 
    930     UChar32  c;
    931     U_ASSERT(startPos >= 0);
    932 
    933     switch (fPattern->fStartType) {
    934     case START_NO_INFO:
    935         // No optimization was found.
    936         //  Try a match at each input position.
    937         for (;;) {
    938             MatchChunkAt(startPos, FALSE, fDeferredStatus);
    939             if (U_FAILURE(fDeferredStatus)) {
    940                 return FALSE;
    941             }
    942             if (fMatch) {
    943                 return TRUE;
    944             }
    945             if (startPos >= testLen) {
    946                 fHitEnd = TRUE;
    947                 return FALSE;
    948             }
    949             U16_FWD_1(inputBuf, startPos, fActiveLimit);
    950             // Note that it's perfectly OK for a pattern to have a zero-length
    951             //   match at the end of a string, so we must make sure that the loop
    952             //   runs with startPos == testLen the last time through.
    953             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    954                 return FALSE;
    955         }
    956         U_ASSERT(FALSE);
    957 
    958     case START_START:
    959         // Matches are only possible at the start of the input string
    960         //   (pattern begins with ^ or \A)
    961         if (startPos > fActiveStart) {
    962             fMatch = FALSE;
    963             return FALSE;
    964         }
    965         MatchChunkAt(startPos, FALSE, fDeferredStatus);
    966         if (U_FAILURE(fDeferredStatus)) {
    967             return FALSE;
    968         }
    969         return fMatch;
    970 
    971 
    972     case START_SET:
    973     {
    974         // Match may start on any char from a pre-computed set.
    975         U_ASSERT(fPattern->fMinMatchLen > 0);
    976         for (;;) {
    977             int32_t pos = startPos;
    978             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    979             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    980                 (c>=256 && fPattern->fInitialChars->contains(c))) {
    981                 MatchChunkAt(pos, FALSE, fDeferredStatus);
    982                 if (U_FAILURE(fDeferredStatus)) {
    983                     return FALSE;
    984                 }
    985                 if (fMatch) {
    986                     return TRUE;
    987                 }
    988             }
    989             if (pos >= testLen) {
    990                 fMatch = FALSE;
    991                 fHitEnd = TRUE;
    992                 return FALSE;
    993             }
    994             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    995                 return FALSE;
    996         }
    997     }
    998         U_ASSERT(FALSE);
    999 
   1000     case START_STRING:
   1001     case START_CHAR:
   1002     {
   1003         // Match starts on exactly one char.
   1004         U_ASSERT(fPattern->fMinMatchLen > 0);
   1005         UChar32 theChar = fPattern->fInitialChar;
   1006         for (;;) {
   1007             int32_t pos = startPos;
   1008             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
   1009             if (c == theChar) {
   1010                 MatchChunkAt(pos, FALSE, fDeferredStatus);
   1011                 if (U_FAILURE(fDeferredStatus)) {
   1012                     return FALSE;
   1013                 }
   1014                 if (fMatch) {
   1015                     return TRUE;
   1016                 }
   1017             }
   1018             if (pos >= testLen) {
   1019                 fMatch = FALSE;
   1020                 fHitEnd = TRUE;
   1021                 return FALSE;
   1022             }
   1023             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1024                 return FALSE;
   1025         }
   1026     }
   1027         U_ASSERT(FALSE);
   1028 
   1029     case START_LINE:
   1030     {
   1031         UChar32  c;
   1032         if (startPos == fAnchorStart) {
   1033             MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1034             if (U_FAILURE(fDeferredStatus)) {
   1035                 return FALSE;
   1036             }
   1037             if (fMatch) {
   1038                 return TRUE;
   1039             }
   1040             U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1041         }
   1042 
   1043         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
   1044             for (;;) {
   1045                 c = inputBuf[startPos-1];
   1046                 if (c == 0x0a) {
   1047                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1048                     if (U_FAILURE(fDeferredStatus)) {
   1049                         return FALSE;
   1050                     }
   1051                     if (fMatch) {
   1052                         return TRUE;
   1053                     }
   1054                 }
   1055                 if (startPos >= testLen) {
   1056                     fMatch = FALSE;
   1057                     fHitEnd = TRUE;
   1058                     return FALSE;
   1059                 }
   1060                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1061                 // Note that it's perfectly OK for a pattern to have a zero-length
   1062                 //   match at the end of a string, so we must make sure that the loop
   1063                 //   runs with startPos == testLen the last time through.
   1064 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1065                     return FALSE;
   1066             }
   1067         } else {
   1068             for (;;) {
   1069                 c = inputBuf[startPos-1];
   1070                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   1071                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
   1072                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
   1073                         startPos++;
   1074                     }
   1075                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1076                     if (U_FAILURE(fDeferredStatus)) {
   1077                         return FALSE;
   1078                     }
   1079                     if (fMatch) {
   1080                         return TRUE;
   1081                     }
   1082                 }
   1083                 if (startPos >= testLen) {
   1084                     fMatch = FALSE;
   1085                     fHitEnd = TRUE;
   1086                     return FALSE;
   1087                 }
   1088                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1089                 // Note that it's perfectly OK for a pattern to have a zero-length
   1090                 //   match at the end of a string, so we must make sure that the loop
   1091                 //   runs with startPos == testLen the last time through.
   1092 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1093                     return FALSE;
   1094             }
   1095         }
   1096     }
   1097 
   1098     default:
   1099         U_ASSERT(FALSE);
   1100     }
   1101 
   1102     U_ASSERT(FALSE);
   1103     return FALSE;
   1104 }
   1105 
   1106 
   1107 
   1108 //--------------------------------------------------------------------------------
   1109 //
   1110 //  group()
   1111 //
   1112 //--------------------------------------------------------------------------------
   1113 UnicodeString RegexMatcher::group(UErrorCode &status) const {
   1114     return group(0, status);
   1115 }
   1116 
   1117 //  Return immutable shallow clone
   1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
   1119     return group(0, dest, group_len, status);
   1120 }
   1121 
   1122 //  Return immutable shallow clone
   1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
   1124     group_len = 0;
   1125     UBool bailOut = FALSE;
   1126     if (U_FAILURE(status)) {
   1127         return dest;
   1128     }
   1129     if (U_FAILURE(fDeferredStatus)) {
   1130         status = fDeferredStatus;
   1131         bailOut = TRUE;
   1132     }
   1133     if (fMatch == FALSE) {
   1134         status = U_REGEX_INVALID_STATE;
   1135         bailOut = TRUE;
   1136     }
   1137     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1138         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1139         bailOut = TRUE;
   1140     }
   1141 
   1142     if (bailOut) {
   1143         return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
   1144     }
   1145 
   1146     int64_t s, e;
   1147     if (groupNum == 0) {
   1148         s = fMatchStart;
   1149         e = fMatchEnd;
   1150     } else {
   1151         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1152         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1153         U_ASSERT(groupOffset >= 0);
   1154         s = fFrame->fExtra[groupOffset];
   1155         e = fFrame->fExtra[groupOffset+1];
   1156     }
   1157 
   1158     if (s < 0) {
   1159         // A capture group wasn't part of the match
   1160         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1161     }
   1162     U_ASSERT(s <= e);
   1163     group_len = e - s;
   1164 
   1165     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1166     if (dest)
   1167         UTEXT_SETNATIVEINDEX(dest, s);
   1168     return dest;
   1169 }
   1170 
   1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
   1172     UnicodeString result;
   1173     if (U_FAILURE(status)) {
   1174         return result;
   1175     }
   1176     UText resultText = UTEXT_INITIALIZER;
   1177     utext_openUnicodeString(&resultText, &result, &status);
   1178     group(groupNum, &resultText, status);
   1179     utext_close(&resultText);
   1180     return result;
   1181 }
   1182 
   1183 
   1184 //  Return deep (mutable) clone
   1185 //		Technology Preview (as an API), but note that the UnicodeString API is implemented
   1186 //		using this function.
   1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1188     UBool bailOut = FALSE;
   1189     if (U_FAILURE(status)) {
   1190         return dest;
   1191     }
   1192     if (U_FAILURE(fDeferredStatus)) {
   1193         status = fDeferredStatus;
   1194         bailOut = TRUE;
   1195     }
   1196 
   1197     if (fMatch == FALSE) {
   1198         status = U_REGEX_INVALID_STATE;
   1199         bailOut = TRUE;
   1200     }
   1201     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1202         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1203         bailOut = TRUE;
   1204     }
   1205 
   1206     if (bailOut) {
   1207         if (dest) {
   1208             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1209             return dest;
   1210         } else {
   1211             return utext_openUChars(NULL, NULL, 0, &status);
   1212         }
   1213     }
   1214 
   1215     int64_t s, e;
   1216     if (groupNum == 0) {
   1217         s = fMatchStart;
   1218         e = fMatchEnd;
   1219     } else {
   1220         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1221         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1222         U_ASSERT(groupOffset >= 0);
   1223         s = fFrame->fExtra[groupOffset];
   1224         e = fFrame->fExtra[groupOffset+1];
   1225     }
   1226 
   1227     if (s < 0) {
   1228         // A capture group wasn't part of the match
   1229         if (dest) {
   1230             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1231             return dest;
   1232         } else {
   1233             return utext_openUChars(NULL, NULL, 0, &status);
   1234         }
   1235     }
   1236     U_ASSERT(s <= e);
   1237 
   1238     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1239         U_ASSERT(e <= fInputLength);
   1240         if (dest) {
   1241             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1242         } else {
   1243             UText groupText = UTEXT_INITIALIZER;
   1244             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
   1245             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1246             utext_close(&groupText);
   1247         }
   1248     } else {
   1249         int32_t len16;
   1250         if (UTEXT_USES_U16(fInputText)) {
   1251             len16 = (int32_t)(e-s);
   1252         } else {
   1253             UErrorCode lengthStatus = U_ZERO_ERROR;
   1254             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1255         }
   1256         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1257         if (groupChars == NULL) {
   1258             status = U_MEMORY_ALLOCATION_ERROR;
   1259             return dest;
   1260         }
   1261         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1262 
   1263         if (dest) {
   1264             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
   1265         } else {
   1266             UText groupText = UTEXT_INITIALIZER;
   1267             utext_openUChars(&groupText, groupChars, len16, &status);
   1268             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1269             utext_close(&groupText);
   1270         }
   1271 
   1272         uprv_free(groupChars);
   1273     }
   1274     return dest;
   1275 }
   1276 
   1277 //--------------------------------------------------------------------------------
   1278 //
   1279 //  appendGroup() -- currently internal only, appends a group to a UText rather
   1280 //                   than replacing its contents
   1281 //
   1282 //--------------------------------------------------------------------------------
   1283 
   1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1285     if (U_FAILURE(status)) {
   1286         return 0;
   1287     }
   1288     if (U_FAILURE(fDeferredStatus)) {
   1289         status = fDeferredStatus;
   1290         return 0;
   1291     }
   1292     int64_t destLen = utext_nativeLength(dest);
   1293 
   1294     if (fMatch == FALSE) {
   1295         status = U_REGEX_INVALID_STATE;
   1296         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1297     }
   1298     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1299         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1300         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1301     }
   1302 
   1303     int64_t s, e;
   1304     if (groupNum == 0) {
   1305         s = fMatchStart;
   1306         e = fMatchEnd;
   1307     } else {
   1308         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1309         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1310         U_ASSERT(groupOffset >= 0);
   1311         s = fFrame->fExtra[groupOffset];
   1312         e = fFrame->fExtra[groupOffset+1];
   1313     }
   1314 
   1315     if (s < 0) {
   1316         // A capture group wasn't part of the match
   1317         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1318     }
   1319     U_ASSERT(s <= e);
   1320 
   1321     int64_t deltaLen;
   1322     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1323         U_ASSERT(e <= fInputLength);
   1324         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1325     } else {
   1326         int32_t len16;
   1327         if (UTEXT_USES_U16(fInputText)) {
   1328             len16 = (int32_t)(e-s);
   1329         } else {
   1330             UErrorCode lengthStatus = U_ZERO_ERROR;
   1331             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1332         }
   1333         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1334         if (groupChars == NULL) {
   1335             status = U_MEMORY_ALLOCATION_ERROR;
   1336             return 0;
   1337         }
   1338         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1339 
   1340         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
   1341         uprv_free(groupChars);
   1342     }
   1343     return deltaLen;
   1344 }
   1345 
   1346 
   1347 
   1348 //--------------------------------------------------------------------------------
   1349 //
   1350 //  groupCount()
   1351 //
   1352 //--------------------------------------------------------------------------------
   1353 int32_t RegexMatcher::groupCount() const {
   1354     return fPattern->fGroupMap->size();
   1355 }
   1356 
   1357 
   1358 
   1359 //--------------------------------------------------------------------------------
   1360 //
   1361 //  hasAnchoringBounds()
   1362 //
   1363 //--------------------------------------------------------------------------------
   1364 UBool RegexMatcher::hasAnchoringBounds() const {
   1365     return fAnchoringBounds;
   1366 }
   1367 
   1368 
   1369 //--------------------------------------------------------------------------------
   1370 //
   1371 //  hasTransparentBounds()
   1372 //
   1373 //--------------------------------------------------------------------------------
   1374 UBool RegexMatcher::hasTransparentBounds() const {
   1375     return fTransparentBounds;
   1376 }
   1377 
   1378 
   1379 
   1380 //--------------------------------------------------------------------------------
   1381 //
   1382 //  hitEnd()
   1383 //
   1384 //--------------------------------------------------------------------------------
   1385 UBool RegexMatcher::hitEnd() const {
   1386     return fHitEnd;
   1387 }
   1388 
   1389 
   1390 //--------------------------------------------------------------------------------
   1391 //
   1392 //  input()
   1393 //
   1394 //--------------------------------------------------------------------------------
   1395 const UnicodeString &RegexMatcher::input() const {
   1396     if (!fInput) {
   1397         UErrorCode status = U_ZERO_ERROR;
   1398         int32_t len16;
   1399         if (UTEXT_USES_U16(fInputText)) {
   1400             len16 = (int32_t)fInputLength;
   1401         } else {
   1402             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
   1403             status = U_ZERO_ERROR; // overflow, length status
   1404         }
   1405         UnicodeString *result = new UnicodeString(len16, 0, 0);
   1406 
   1407         UChar *inputChars = result->getBuffer(len16);
   1408         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
   1409         result->releaseBuffer(len16);
   1410 
   1411         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
   1412     }
   1413 
   1414     return *fInput;
   1415 }
   1416 
   1417 //--------------------------------------------------------------------------------
   1418 //
   1419 //  inputText()
   1420 //
   1421 //--------------------------------------------------------------------------------
   1422 UText *RegexMatcher::inputText() const {
   1423     return fInputText;
   1424 }
   1425 
   1426 
   1427 //--------------------------------------------------------------------------------
   1428 //
   1429 //  getInput() -- like inputText(), but makes a clone or copies into another UText
   1430 //
   1431 //--------------------------------------------------------------------------------
   1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
   1433     UBool bailOut = FALSE;
   1434     if (U_FAILURE(status)) {
   1435         return dest;
   1436     }
   1437     if (U_FAILURE(fDeferredStatus)) {
   1438         status = fDeferredStatus;
   1439         bailOut = TRUE;
   1440     }
   1441 
   1442     if (bailOut) {
   1443         if (dest) {
   1444             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1445             return dest;
   1446         } else {
   1447             return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1448         }
   1449     }
   1450 
   1451     if (dest) {
   1452         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1453             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
   1454         } else {
   1455             int32_t input16Len;
   1456             if (UTEXT_USES_U16(fInputText)) {
   1457                 input16Len = (int32_t)fInputLength;
   1458             } else {
   1459                 UErrorCode lengthStatus = U_ZERO_ERROR;
   1460                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
   1461             }
   1462             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
   1463             if (inputChars == NULL) {
   1464                 return dest;
   1465             }
   1466 
   1467             status = U_ZERO_ERROR;
   1468             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
   1469             status = U_ZERO_ERROR;
   1470             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
   1471 
   1472             uprv_free(inputChars);
   1473         }
   1474         return dest;
   1475     } else {
   1476         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1477     }
   1478 }
   1479 
   1480 
   1481 static UBool compat_SyncMutableUTextContents(UText *ut);
   1482 static UBool compat_SyncMutableUTextContents(UText *ut) {
   1483     UBool retVal = FALSE;
   1484 
   1485     //  In the following test, we're really only interested in whether the UText should switch
   1486     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
   1487     //  will still point to the correct data.
   1488     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
   1489         UnicodeString *us=(UnicodeString *)ut->context;
   1490 
   1491         // Update to the latest length.
   1492         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
   1493         int32_t newLength = us->length();
   1494 
   1495         // Update the chunk description.
   1496         // The buffer may have switched between stack- and heap-based.
   1497         ut->chunkContents    = us->getBuffer();
   1498         ut->chunkLength      = newLength;
   1499         ut->chunkNativeLimit = newLength;
   1500         ut->nativeIndexingLimit = newLength;
   1501         retVal = TRUE;
   1502     }
   1503 
   1504     return retVal;
   1505 }
   1506 
   1507 //--------------------------------------------------------------------------------
   1508 //
   1509 //  lookingAt()
   1510 //
   1511 //--------------------------------------------------------------------------------
   1512 UBool RegexMatcher::lookingAt(UErrorCode &status) {
   1513     if (U_FAILURE(status)) {
   1514         return FALSE;
   1515     }
   1516     if (U_FAILURE(fDeferredStatus)) {
   1517         status = fDeferredStatus;
   1518         return FALSE;
   1519     }
   1520 
   1521     if (fInputUniStrMaybeMutable) {
   1522         if (compat_SyncMutableUTextContents(fInputText)) {
   1523         fInputLength = utext_nativeLength(fInputText);
   1524         reset();
   1525         }
   1526     }
   1527     else {
   1528         resetPreserveRegion();
   1529     }
   1530     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1531         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
   1532     } else {
   1533         MatchAt(fActiveStart, FALSE, status);
   1534     }
   1535     return fMatch;
   1536 }
   1537 
   1538 
   1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
   1540     if (U_FAILURE(status)) {
   1541         return FALSE;
   1542     }
   1543     if (U_FAILURE(fDeferredStatus)) {
   1544         status = fDeferredStatus;
   1545         return FALSE;
   1546     }
   1547     reset();
   1548 
   1549     if (start < 0) {
   1550         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1551         return FALSE;
   1552     }
   1553 
   1554     if (fInputUniStrMaybeMutable) {
   1555         if (compat_SyncMutableUTextContents(fInputText)) {
   1556         fInputLength = utext_nativeLength(fInputText);
   1557         reset();
   1558         }
   1559     }
   1560 
   1561     int64_t nativeStart;
   1562     nativeStart = start;
   1563     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1564         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1565         return FALSE;
   1566     }
   1567 
   1568     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1569         MatchChunkAt((int32_t)nativeStart, FALSE, status);
   1570     } else {
   1571         MatchAt(nativeStart, FALSE, status);
   1572     }
   1573     return fMatch;
   1574 }
   1575 
   1576 
   1577 
   1578 //--------------------------------------------------------------------------------
   1579 //
   1580 //  matches()
   1581 //
   1582 //--------------------------------------------------------------------------------
   1583 UBool RegexMatcher::matches(UErrorCode &status) {
   1584     if (U_FAILURE(status)) {
   1585         return FALSE;
   1586     }
   1587     if (U_FAILURE(fDeferredStatus)) {
   1588         status = fDeferredStatus;
   1589         return FALSE;
   1590     }
   1591 
   1592     if (fInputUniStrMaybeMutable) {
   1593         if (compat_SyncMutableUTextContents(fInputText)) {
   1594         fInputLength = utext_nativeLength(fInputText);
   1595         reset();
   1596         }
   1597     }
   1598     else {
   1599         resetPreserveRegion();
   1600     }
   1601 
   1602     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1603         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
   1604     } else {
   1605         MatchAt(fActiveStart, TRUE, status);
   1606     }
   1607     return fMatch;
   1608 }
   1609 
   1610 
   1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
   1612     if (U_FAILURE(status)) {
   1613         return FALSE;
   1614     }
   1615     if (U_FAILURE(fDeferredStatus)) {
   1616         status = fDeferredStatus;
   1617         return FALSE;
   1618     }
   1619     reset();
   1620 
   1621     if (start < 0) {
   1622         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1623         return FALSE;
   1624     }
   1625 
   1626     if (fInputUniStrMaybeMutable) {
   1627         if (compat_SyncMutableUTextContents(fInputText)) {
   1628         fInputLength = utext_nativeLength(fInputText);
   1629         reset();
   1630         }
   1631     }
   1632 
   1633     int64_t nativeStart;
   1634     nativeStart = start;
   1635     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1636         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1637         return FALSE;
   1638     }
   1639 
   1640     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1641         MatchChunkAt((int32_t)nativeStart, TRUE, status);
   1642     } else {
   1643         MatchAt(nativeStart, TRUE, status);
   1644     }
   1645     return fMatch;
   1646 }
   1647 
   1648 
   1649 
   1650 //--------------------------------------------------------------------------------
   1651 //
   1652 //    pattern
   1653 //
   1654 //--------------------------------------------------------------------------------
   1655 const RegexPattern &RegexMatcher::pattern() const {
   1656     return *fPattern;
   1657 }
   1658 
   1659 
   1660 
   1661 //--------------------------------------------------------------------------------
   1662 //
   1663 //    region
   1664 //
   1665 //--------------------------------------------------------------------------------
   1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
   1667     if (U_FAILURE(status)) {
   1668         return *this;
   1669     }
   1670 
   1671     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
   1672         status = U_ILLEGAL_ARGUMENT_ERROR;
   1673     }
   1674 
   1675     int64_t nativeStart = regionStart;
   1676     int64_t nativeLimit = regionLimit;
   1677     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
   1678       status = U_ILLEGAL_ARGUMENT_ERROR;
   1679     }
   1680 
   1681     if (startIndex == -1)
   1682       this->reset();
   1683     else
   1684       resetPreserveRegion();
   1685 
   1686     fRegionStart = nativeStart;
   1687     fRegionLimit = nativeLimit;
   1688     fActiveStart = nativeStart;
   1689     fActiveLimit = nativeLimit;
   1690 
   1691     if (startIndex != -1) {
   1692       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
   1693           status = U_INDEX_OUTOFBOUNDS_ERROR;
   1694       }
   1695       fMatchEnd = startIndex;
   1696     }
   1697 
   1698     if (!fTransparentBounds) {
   1699         fLookStart = nativeStart;
   1700         fLookLimit = nativeLimit;
   1701     }
   1702     if (fAnchoringBounds) {
   1703         fAnchorStart = nativeStart;
   1704         fAnchorLimit = nativeLimit;
   1705     }
   1706     return *this;
   1707 }
   1708 
   1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
   1710   return region(start, limit, -1, status);
   1711 }
   1712 
   1713 //--------------------------------------------------------------------------------
   1714 //
   1715 //    regionEnd
   1716 //
   1717 //--------------------------------------------------------------------------------
   1718 int32_t RegexMatcher::regionEnd() const {
   1719     return (int32_t)fRegionLimit;
   1720 }
   1721 
   1722 int64_t RegexMatcher::regionEnd64() const {
   1723     return fRegionLimit;
   1724 }
   1725 
   1726 //--------------------------------------------------------------------------------
   1727 //
   1728 //    regionStart
   1729 //
   1730 //--------------------------------------------------------------------------------
   1731 int32_t RegexMatcher::regionStart() const {
   1732     return (int32_t)fRegionStart;
   1733 }
   1734 
   1735 int64_t RegexMatcher::regionStart64() const {
   1736     return fRegionStart;
   1737 }
   1738 
   1739 
   1740 //--------------------------------------------------------------------------------
   1741 //
   1742 //    replaceAll
   1743 //
   1744 //--------------------------------------------------------------------------------
   1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
   1746     UText replacementText = UTEXT_INITIALIZER;
   1747     UText resultText = UTEXT_INITIALIZER;
   1748     UnicodeString resultString;
   1749     if (U_FAILURE(status)) {
   1750         return resultString;
   1751     }
   1752 
   1753     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1754     utext_openUnicodeString(&resultText, &resultString, &status);
   1755 
   1756     replaceAll(&replacementText, &resultText, status);
   1757 
   1758     utext_close(&resultText);
   1759     utext_close(&replacementText);
   1760 
   1761     return resultString;
   1762 }
   1763 
   1764 
   1765 //
   1766 //    replaceAll, UText mode
   1767 //
   1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
   1769     if (U_FAILURE(status)) {
   1770         return dest;
   1771     }
   1772     if (U_FAILURE(fDeferredStatus)) {
   1773         status = fDeferredStatus;
   1774         return dest;
   1775     }
   1776 
   1777     if (dest == NULL) {
   1778         UnicodeString emptyString;
   1779         UText empty = UTEXT_INITIALIZER;
   1780 
   1781         utext_openUnicodeString(&empty, &emptyString, &status);
   1782         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1783         utext_close(&empty);
   1784     }
   1785 
   1786     if (U_SUCCESS(status)) {
   1787         reset();
   1788         while (find()) {
   1789             appendReplacement(dest, replacement, status);
   1790             if (U_FAILURE(status)) {
   1791                 break;
   1792             }
   1793         }
   1794         appendTail(dest, status);
   1795     }
   1796 
   1797     return dest;
   1798 }
   1799 
   1800 
   1801 //--------------------------------------------------------------------------------
   1802 //
   1803 //    replaceFirst
   1804 //
   1805 //--------------------------------------------------------------------------------
   1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
   1807     UText replacementText = UTEXT_INITIALIZER;
   1808     UText resultText = UTEXT_INITIALIZER;
   1809     UnicodeString resultString;
   1810 
   1811     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1812     utext_openUnicodeString(&resultText, &resultString, &status);
   1813 
   1814     replaceFirst(&replacementText, &resultText, status);
   1815 
   1816     utext_close(&resultText);
   1817     utext_close(&replacementText);
   1818 
   1819     return resultString;
   1820 }
   1821 
   1822 //
   1823 //    replaceFirst, UText mode
   1824 //
   1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
   1826     if (U_FAILURE(status)) {
   1827         return dest;
   1828     }
   1829     if (U_FAILURE(fDeferredStatus)) {
   1830         status = fDeferredStatus;
   1831         return dest;
   1832     }
   1833 
   1834     reset();
   1835     if (!find()) {
   1836         return getInput(dest, status);
   1837     }
   1838 
   1839     if (dest == NULL) {
   1840         UnicodeString emptyString;
   1841         UText empty = UTEXT_INITIALIZER;
   1842 
   1843         utext_openUnicodeString(&empty, &emptyString, &status);
   1844         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1845         utext_close(&empty);
   1846     }
   1847 
   1848     appendReplacement(dest, replacement, status);
   1849     appendTail(dest, status);
   1850 
   1851     return dest;
   1852 }
   1853 
   1854 
   1855 //--------------------------------------------------------------------------------
   1856 //
   1857 //     requireEnd
   1858 //
   1859 //--------------------------------------------------------------------------------
   1860 UBool RegexMatcher::requireEnd() const {
   1861     return fRequireEnd;
   1862 }
   1863 
   1864 
   1865 //--------------------------------------------------------------------------------
   1866 //
   1867 //     reset
   1868 //
   1869 //--------------------------------------------------------------------------------
   1870 RegexMatcher &RegexMatcher::reset() {
   1871     fRegionStart    = 0;
   1872     fRegionLimit    = fInputLength;
   1873     fActiveStart    = 0;
   1874     fActiveLimit    = fInputLength;
   1875     fAnchorStart    = 0;
   1876     fAnchorLimit    = fInputLength;
   1877     fLookStart      = 0;
   1878     fLookLimit      = fInputLength;
   1879     resetPreserveRegion();
   1880     return *this;
   1881 }
   1882 
   1883 
   1884 
   1885 void RegexMatcher::resetPreserveRegion() {
   1886     fMatchStart     = 0;
   1887     fMatchEnd       = 0;
   1888     fLastMatchEnd   = -1;
   1889     fAppendPosition = 0;
   1890     fMatch          = FALSE;
   1891     fHitEnd         = FALSE;
   1892     fRequireEnd     = FALSE;
   1893     fTime           = 0;
   1894     fTickCounter    = TIMER_INITIAL_VALUE;
   1895     //resetStack(); // more expensive than it looks...
   1896 }
   1897 
   1898 
   1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
   1900     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
   1901     if (fPattern->fNeedsAltInput) {
   1902         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1903     }
   1904     fInputLength = utext_nativeLength(fInputText);
   1905 
   1906     reset();
   1907     delete fInput;
   1908     fInput = NULL;
   1909 
   1910     //  Do the following for any UnicodeString.
   1911     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
   1912     fInputUniStrMaybeMutable = TRUE;
   1913 
   1914     if (fWordBreakItr != NULL) {
   1915 #if UCONFIG_NO_BREAK_ITERATION==0
   1916         UErrorCode status = U_ZERO_ERROR;
   1917         fWordBreakItr->setText(fInputText, status);
   1918 #endif
   1919     }
   1920     return *this;
   1921 }
   1922 
   1923 
   1924 RegexMatcher &RegexMatcher::reset(UText *input) {
   1925     if (fInputText != input) {
   1926         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
   1927         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1928         fInputLength = utext_nativeLength(fInputText);
   1929 
   1930         delete fInput;
   1931         fInput = NULL;
   1932 
   1933         if (fWordBreakItr != NULL) {
   1934 #if UCONFIG_NO_BREAK_ITERATION==0
   1935             UErrorCode status = U_ZERO_ERROR;
   1936             fWordBreakItr->setText(input, status);
   1937 #endif
   1938         }
   1939     }
   1940     reset();
   1941     fInputUniStrMaybeMutable = FALSE;
   1942 
   1943     return *this;
   1944 }
   1945 
   1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
   1947     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
   1948     return *this;
   1949 }*/
   1950 
   1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
   1952     if (U_FAILURE(status)) {
   1953         return *this;
   1954     }
   1955     reset();       // Reset also resets the region to be the entire string.
   1956 
   1957     if (position < 0 || position > fActiveLimit) {
   1958         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1959         return *this;
   1960     }
   1961     fMatchEnd = position;
   1962     return *this;
   1963 }
   1964 
   1965 
   1966 //--------------------------------------------------------------------------------
   1967 //
   1968 //    refresh
   1969 //
   1970 //--------------------------------------------------------------------------------
   1971 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
   1972     if (U_FAILURE(status)) {
   1973         return *this;
   1974     }
   1975     if (input == NULL) {
   1976         status = U_ILLEGAL_ARGUMENT_ERROR;
   1977         return *this;
   1978     }
   1979     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
   1980         status = U_ILLEGAL_ARGUMENT_ERROR;
   1981         return *this;
   1982     }
   1983     int64_t  pos = utext_getNativeIndex(fInputText);
   1984     //  Shallow read-only clone of the new UText into the existing input UText
   1985     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
   1986     if (U_FAILURE(status)) {
   1987         return *this;
   1988     }
   1989     utext_setNativeIndex(fInputText, pos);
   1990 
   1991     if (fAltInputText != NULL) {
   1992         pos = utext_getNativeIndex(fAltInputText);
   1993         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
   1994         if (U_FAILURE(status)) {
   1995             return *this;
   1996         }
   1997         utext_setNativeIndex(fAltInputText, pos);
   1998     }
   1999     return *this;
   2000 }
   2001 
   2002 
   2003 
   2004 //--------------------------------------------------------------------------------
   2005 //
   2006 //    setTrace
   2007 //
   2008 //--------------------------------------------------------------------------------
   2009 void RegexMatcher::setTrace(UBool state) {
   2010     fTraceDebug = state;
   2011 }
   2012 
   2013 
   2014 
   2015 //---------------------------------------------------------------------
   2016 //
   2017 //   split
   2018 //
   2019 //---------------------------------------------------------------------
   2020 int32_t  RegexMatcher::split(const UnicodeString &input,
   2021         UnicodeString    dest[],
   2022         int32_t          destCapacity,
   2023         UErrorCode      &status)
   2024 {
   2025     UText inputText = UTEXT_INITIALIZER;
   2026     utext_openConstUnicodeString(&inputText, &input, &status);
   2027     if (U_FAILURE(status)) {
   2028         return 0;
   2029     }
   2030 
   2031     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
   2032     if (destText == NULL) {
   2033         status = U_MEMORY_ALLOCATION_ERROR;
   2034         return 0;
   2035     }
   2036     int32_t i;
   2037     for (i = 0; i < destCapacity; i++) {
   2038         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
   2039     }
   2040 
   2041     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
   2042 
   2043     for (i = 0; i < destCapacity; i++) {
   2044         utext_close(destText[i]);
   2045     }
   2046 
   2047     uprv_free(destText);
   2048     utext_close(&inputText);
   2049     return fieldCount;
   2050 }
   2051 
   2052 //
   2053 //   split, UText mode
   2054 //
   2055 int32_t  RegexMatcher::split(UText *input,
   2056         UText           *dest[],
   2057         int32_t          destCapacity,
   2058         UErrorCode      &status)
   2059 {
   2060     //
   2061     // Check arguements for validity
   2062     //
   2063     if (U_FAILURE(status)) {
   2064         return 0;
   2065     };
   2066 
   2067     if (destCapacity < 1) {
   2068         status = U_ILLEGAL_ARGUMENT_ERROR;
   2069         return 0;
   2070     }
   2071 
   2072     //
   2073     // Reset for the input text
   2074     //
   2075     reset(input);
   2076     int64_t   nextOutputStringStart = 0;
   2077     if (fActiveLimit == 0) {
   2078         return 0;
   2079     }
   2080 
   2081     //
   2082     // Loop through the input text, searching for the delimiter pattern
   2083     //
   2084     int32_t i;
   2085     int32_t numCaptureGroups = fPattern->fGroupMap->size();
   2086     for (i=0; ; i++) {
   2087         if (i>=destCapacity-1) {
   2088             // There is one or zero output string left.
   2089             // Fill the last output string with whatever is left from the input, then exit the loop.
   2090             //  ( i will be == destCapacity if we filled the output array while processing
   2091             //    capture groups of the delimiter expression, in which case we will discard the
   2092             //    last capture group saved in favor of the unprocessed remainder of the
   2093             //    input string.)
   2094             i = destCapacity-1;
   2095             if (fActiveLimit > nextOutputStringStart) {
   2096                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2097                     if (dest[i]) {
   2098                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2099                                       input->chunkContents+nextOutputStringStart,
   2100                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2101                     } else {
   2102                         UText remainingText = UTEXT_INITIALIZER;
   2103                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2104                                          fActiveLimit-nextOutputStringStart, &status);
   2105                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2106                         utext_close(&remainingText);
   2107                     }
   2108                 } else {
   2109                     UErrorCode lengthStatus = U_ZERO_ERROR;
   2110                     int32_t remaining16Length =
   2111                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2112                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2113                     if (remainingChars == NULL) {
   2114                         status = U_MEMORY_ALLOCATION_ERROR;
   2115                         break;
   2116                     }
   2117 
   2118                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2119                     if (dest[i]) {
   2120                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2121                     } else {
   2122                         UText remainingText = UTEXT_INITIALIZER;
   2123                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2124                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2125                         utext_close(&remainingText);
   2126                     }
   2127 
   2128                     uprv_free(remainingChars);
   2129                 }
   2130             }
   2131             break;
   2132         }
   2133         if (find()) {
   2134             // We found another delimiter.  Move everything from where we started looking
   2135             //  up until the start of the delimiter into the next output string.
   2136             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2137                 if (dest[i]) {
   2138                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2139                                   input->chunkContents+nextOutputStringStart,
   2140                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
   2141                 } else {
   2142                     UText remainingText = UTEXT_INITIALIZER;
   2143                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2144                                       fMatchStart-nextOutputStringStart, &status);
   2145                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2146                     utext_close(&remainingText);
   2147                 }
   2148             } else {
   2149                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2150                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
   2151                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2152                 if (remainingChars == NULL) {
   2153                     status = U_MEMORY_ALLOCATION_ERROR;
   2154                     break;
   2155                 }
   2156                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
   2157                 if (dest[i]) {
   2158                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2159                 } else {
   2160                     UText remainingText = UTEXT_INITIALIZER;
   2161                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2162                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2163                     utext_close(&remainingText);
   2164                 }
   2165 
   2166                 uprv_free(remainingChars);
   2167             }
   2168             nextOutputStringStart = fMatchEnd;
   2169 
   2170             // If the delimiter pattern has capturing parentheses, the captured
   2171             //  text goes out into the next n destination strings.
   2172             int32_t groupNum;
   2173             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   2174                 if (i >= destCapacity-2) {
   2175                     // Never fill the last available output string with capture group text.
   2176                     // It will filled with the last field, the remainder of the
   2177                     //  unsplit input text.
   2178                     break;
   2179                 }
   2180                 i++;
   2181                 dest[i] = group(groupNum, dest[i], status);
   2182             }
   2183 
   2184             if (nextOutputStringStart == fActiveLimit) {
   2185                 // The delimiter was at the end of the string.  We're done, but first
   2186                 // we output one last empty string, for the empty field following
   2187                 //   the delimiter at the end of input.
   2188                 if (i+1 < destCapacity) {
   2189                     ++i;
   2190                     if (dest[i] == NULL) {
   2191                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
   2192                     } else {
   2193                         static UChar emptyString[] = {(UChar)0};
   2194                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
   2195                     }
   2196                 }
   2197                 break;
   2198 
   2199             }
   2200         }
   2201         else
   2202         {
   2203             // We ran off the end of the input while looking for the next delimiter.
   2204             // All the remaining text goes into the current output string.
   2205             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2206                 if (dest[i]) {
   2207                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2208                                   input->chunkContents+nextOutputStringStart,
   2209                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2210                 } else {
   2211                     UText remainingText = UTEXT_INITIALIZER;
   2212                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2213                                      fActiveLimit-nextOutputStringStart, &status);
   2214                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2215                     utext_close(&remainingText);
   2216                 }
   2217             } else {
   2218                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2219                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2220                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2221                 if (remainingChars == NULL) {
   2222                     status = U_MEMORY_ALLOCATION_ERROR;
   2223                     break;
   2224                 }
   2225 
   2226                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2227                 if (dest[i]) {
   2228                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2229                 } else {
   2230                     UText remainingText = UTEXT_INITIALIZER;
   2231                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2232                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2233                     utext_close(&remainingText);
   2234                 }
   2235 
   2236                 uprv_free(remainingChars);
   2237             }
   2238             break;
   2239         }
   2240         if (U_FAILURE(status)) {
   2241             break;
   2242         }
   2243     }   // end of for loop
   2244     return i+1;
   2245 }
   2246 
   2247 
   2248 //--------------------------------------------------------------------------------
   2249 //
   2250 //     start
   2251 //
   2252 //--------------------------------------------------------------------------------
   2253 int32_t RegexMatcher::start(UErrorCode &status) const {
   2254     return start(0, status);
   2255 }
   2256 
   2257 int64_t RegexMatcher::start64(UErrorCode &status) const {
   2258     return start64(0, status);
   2259 }
   2260 
   2261 //--------------------------------------------------------------------------------
   2262 //
   2263 //     start(int32_t group, UErrorCode &status)
   2264 //
   2265 //--------------------------------------------------------------------------------
   2266 
   2267 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
   2268     if (U_FAILURE(status)) {
   2269         return -1;
   2270     }
   2271     if (U_FAILURE(fDeferredStatus)) {
   2272         status = fDeferredStatus;
   2273         return -1;
   2274     }
   2275     if (fMatch == FALSE) {
   2276         status = U_REGEX_INVALID_STATE;
   2277         return -1;
   2278     }
   2279     if (group < 0 || group > fPattern->fGroupMap->size()) {
   2280         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2281         return -1;
   2282     }
   2283     int64_t s;
   2284     if (group == 0) {
   2285         s = fMatchStart;
   2286     } else {
   2287         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   2288         U_ASSERT(groupOffset < fPattern->fFrameSize);
   2289         U_ASSERT(groupOffset >= 0);
   2290         s = fFrame->fExtra[groupOffset];
   2291     }
   2292 
   2293     return s;
   2294 }
   2295 
   2296 
   2297 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
   2298     return (int32_t)start64(group, status);
   2299 }
   2300 
   2301 //--------------------------------------------------------------------------------
   2302 //
   2303 //     useAnchoringBounds
   2304 //
   2305 //--------------------------------------------------------------------------------
   2306 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
   2307     fAnchoringBounds = b;
   2308     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
   2309     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
   2310     return *this;
   2311 }
   2312 
   2313 
   2314 //--------------------------------------------------------------------------------
   2315 //
   2316 //     useTransparentBounds
   2317 //
   2318 //--------------------------------------------------------------------------------
   2319 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
   2320     fTransparentBounds = b;
   2321     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
   2322     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
   2323     return *this;
   2324 }
   2325 
   2326 //--------------------------------------------------------------------------------
   2327 //
   2328 //     setTimeLimit
   2329 //
   2330 //--------------------------------------------------------------------------------
   2331 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
   2332     if (U_FAILURE(status)) {
   2333         return;
   2334     }
   2335     if (U_FAILURE(fDeferredStatus)) {
   2336         status = fDeferredStatus;
   2337         return;
   2338     }
   2339     if (limit < 0) {
   2340         status = U_ILLEGAL_ARGUMENT_ERROR;
   2341         return;
   2342     }
   2343     fTimeLimit = limit;
   2344 }
   2345 
   2346 
   2347 //--------------------------------------------------------------------------------
   2348 //
   2349 //     getTimeLimit
   2350 //
   2351 //--------------------------------------------------------------------------------
   2352 int32_t RegexMatcher::getTimeLimit() const {
   2353     return fTimeLimit;
   2354 }
   2355 
   2356 
   2357 //--------------------------------------------------------------------------------
   2358 //
   2359 //     setStackLimit
   2360 //
   2361 //--------------------------------------------------------------------------------
   2362 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
   2363     if (U_FAILURE(status)) {
   2364         return;
   2365     }
   2366     if (U_FAILURE(fDeferredStatus)) {
   2367         status = fDeferredStatus;
   2368         return;
   2369     }
   2370     if (limit < 0) {
   2371         status = U_ILLEGAL_ARGUMENT_ERROR;
   2372         return;
   2373     }
   2374 
   2375     // Reset the matcher.  This is needed here in case there is a current match
   2376     //    whose final stack frame (containing the match results, pointed to by fFrame)
   2377     //    would be lost by resizing to a smaller stack size.
   2378     reset();
   2379 
   2380     if (limit == 0) {
   2381         // Unlimited stack expansion
   2382         fStack->setMaxCapacity(0);
   2383     } else {
   2384         // Change the units of the limit  from bytes to ints, and bump the size up
   2385         //   to be big enough to hold at least one stack frame for the pattern,
   2386         //   if it isn't there already.
   2387         int32_t adjustedLimit = limit / sizeof(int32_t);
   2388         if (adjustedLimit < fPattern->fFrameSize) {
   2389             adjustedLimit = fPattern->fFrameSize;
   2390         }
   2391         fStack->setMaxCapacity(adjustedLimit);
   2392     }
   2393     fStackLimit = limit;
   2394 }
   2395 
   2396 
   2397 //--------------------------------------------------------------------------------
   2398 //
   2399 //     getStackLimit
   2400 //
   2401 //--------------------------------------------------------------------------------
   2402 int32_t RegexMatcher::getStackLimit() const {
   2403     return fStackLimit;
   2404 }
   2405 
   2406 
   2407 //--------------------------------------------------------------------------------
   2408 //
   2409 //     setMatchCallback
   2410 //
   2411 //--------------------------------------------------------------------------------
   2412 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
   2413                                     const void              *context,
   2414                                     UErrorCode              &status) {
   2415     if (U_FAILURE(status)) {
   2416         return;
   2417     }
   2418     fCallbackFn = callback;
   2419     fCallbackContext = context;
   2420 }
   2421 
   2422 
   2423 //--------------------------------------------------------------------------------
   2424 //
   2425 //     getMatchCallback
   2426 //
   2427 //--------------------------------------------------------------------------------
   2428 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
   2429                                   const void              *&context,
   2430                                   UErrorCode              &status) {
   2431     if (U_FAILURE(status)) {
   2432        return;
   2433     }
   2434     callback = fCallbackFn;
   2435     context  = fCallbackContext;
   2436 }
   2437 
   2438 
   2439 //--------------------------------------------------------------------------------
   2440 //
   2441 //     setMatchCallback
   2442 //
   2443 //--------------------------------------------------------------------------------
   2444 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
   2445                                                 const void                      *context,
   2446                                                 UErrorCode                      &status) {
   2447     if (U_FAILURE(status)) {
   2448         return;
   2449     }
   2450     fFindProgressCallbackFn = callback;
   2451     fFindProgressCallbackContext = context;
   2452 }
   2453 
   2454 
   2455 //--------------------------------------------------------------------------------
   2456 //
   2457 //     getMatchCallback
   2458 //
   2459 //--------------------------------------------------------------------------------
   2460 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
   2461                                                 const void                    *&context,
   2462                                                 UErrorCode                    &status) {
   2463     if (U_FAILURE(status)) {
   2464        return;
   2465     }
   2466     callback = fFindProgressCallbackFn;
   2467     context  = fFindProgressCallbackContext;
   2468 }
   2469 
   2470 
   2471 //================================================================================
   2472 //
   2473 //    Code following this point in this file is the internal
   2474 //    Match Engine Implementation.
   2475 //
   2476 //================================================================================
   2477 
   2478 
   2479 //--------------------------------------------------------------------------------
   2480 //
   2481 //   resetStack
   2482 //           Discard any previous contents of the state save stack, and initialize a
   2483 //           new stack frame to all -1.  The -1s are needed for capture group limits,
   2484 //           where they indicate that a group has not yet matched anything.
   2485 //--------------------------------------------------------------------------------
   2486 REStackFrame *RegexMatcher::resetStack() {
   2487     // Discard any previous contents of the state save stack, and initialize a
   2488     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
   2489     //  where they indicate that a group has not yet matched anything.
   2490     fStack->removeAllElements();
   2491 
   2492     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
   2493     int32_t i;
   2494     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
   2495         iFrame->fExtra[i] = -1;
   2496     }
   2497     return iFrame;
   2498 }
   2499 
   2500 
   2501 
   2502 //--------------------------------------------------------------------------------
   2503 //
   2504 //   isWordBoundary
   2505 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
   2506 //                     For us,
   2507 //                       If the current char is a combining mark,
   2508 //                          \b is FALSE.
   2509 //                       Else Scan backwards to the first non-combining char.
   2510 //                            We are at a boundary if the this char and the original chars are
   2511 //                               opposite in membership in \w set
   2512 //
   2513 //          parameters:   pos   - the current position in the input buffer
   2514 //
   2515 //              TODO:  double-check edge cases at region boundaries.
   2516 //
   2517 //--------------------------------------------------------------------------------
   2518 UBool RegexMatcher::isWordBoundary(int64_t pos) {
   2519     UBool isBoundary = FALSE;
   2520     UBool cIsWord    = FALSE;
   2521 
   2522     if (pos >= fLookLimit) {
   2523         fHitEnd = TRUE;
   2524     } else {
   2525         // Determine whether char c at current position is a member of the word set of chars.
   2526         // If we're off the end of the string, behave as though we're not at a word char.
   2527         UTEXT_SETNATIVEINDEX(fInputText, pos);
   2528         UChar32  c = UTEXT_CURRENT32(fInputText);
   2529         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2530             // Current char is a combining one.  Not a boundary.
   2531             return FALSE;
   2532         }
   2533         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2534     }
   2535 
   2536     // Back up until we come to a non-combining char, determine whether
   2537     //  that char is a word char.
   2538     UBool prevCIsWord = FALSE;
   2539     for (;;) {
   2540         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
   2541             break;
   2542         }
   2543         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
   2544         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2545               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2546             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2547             break;
   2548         }
   2549     }
   2550     isBoundary = cIsWord ^ prevCIsWord;
   2551     return isBoundary;
   2552 }
   2553 
   2554 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
   2555     UBool isBoundary = FALSE;
   2556     UBool cIsWord    = FALSE;
   2557 
   2558     const UChar *inputBuf = fInputText->chunkContents;
   2559 
   2560     if (pos >= fLookLimit) {
   2561         fHitEnd = TRUE;
   2562     } else {
   2563         // Determine whether char c at current position is a member of the word set of chars.
   2564         // If we're off the end of the string, behave as though we're not at a word char.
   2565         UChar32 c;
   2566         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
   2567         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2568             // Current char is a combining one.  Not a boundary.
   2569             return FALSE;
   2570         }
   2571         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2572     }
   2573 
   2574     // Back up until we come to a non-combining char, determine whether
   2575     //  that char is a word char.
   2576     UBool prevCIsWord = FALSE;
   2577     for (;;) {
   2578         if (pos <= fLookStart) {
   2579             break;
   2580         }
   2581         UChar32 prevChar;
   2582         U16_PREV(inputBuf, fLookStart, pos, prevChar);
   2583         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2584               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2585             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2586             break;
   2587         }
   2588     }
   2589     isBoundary = cIsWord ^ prevCIsWord;
   2590     return isBoundary;
   2591 }
   2592 
   2593 //--------------------------------------------------------------------------------
   2594 //
   2595 //   isUWordBoundary
   2596 //
   2597 //         Test for a word boundary using RBBI word break.
   2598 //
   2599 //          parameters:   pos   - the current position in the input buffer
   2600 //
   2601 //--------------------------------------------------------------------------------
   2602 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
   2603     UBool       returnVal = FALSE;
   2604 #if UCONFIG_NO_BREAK_ITERATION==0
   2605 
   2606     // If we haven't yet created a break iterator for this matcher, do it now.
   2607     if (fWordBreakItr == NULL) {
   2608         fWordBreakItr =
   2609             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
   2610         if (U_FAILURE(fDeferredStatus)) {
   2611             return FALSE;
   2612         }
   2613         fWordBreakItr->setText(fInputText, fDeferredStatus);
   2614     }
   2615 
   2616     if (pos >= fLookLimit) {
   2617         fHitEnd = TRUE;
   2618         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
   2619                             //    words are not boundaries.  All non-word chars stand by themselves,
   2620                             //    with word boundaries on both sides.
   2621     } else {
   2622         if (!UTEXT_USES_U16(fInputText)) {
   2623             // !!!: Would like a better way to do this!
   2624             UErrorCode status = U_ZERO_ERROR;
   2625             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
   2626         }
   2627         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
   2628     }
   2629 #endif
   2630     return   returnVal;
   2631 }
   2632 
   2633 //--------------------------------------------------------------------------------
   2634 //
   2635 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
   2636 //                     saves. Increment the "time" counter, and call the
   2637 //                     user callback function if there is one installed.
   2638 //
   2639 //                     If the match operation needs to be aborted, either for a time-out
   2640 //                     or because the user callback asked for it, just set an error status.
   2641 //                     The engine will pick that up and stop in its outer loop.
   2642 //
   2643 //--------------------------------------------------------------------------------
   2644 void RegexMatcher::IncrementTime(UErrorCode &status) {
   2645     fTickCounter = TIMER_INITIAL_VALUE;
   2646     fTime++;
   2647     if (fCallbackFn != NULL) {
   2648         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
   2649             status = U_REGEX_STOPPED_BY_CALLER;
   2650             return;
   2651         }
   2652     }
   2653     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
   2654         status = U_REGEX_TIME_OUT;
   2655     }
   2656 }
   2657 
   2658 //--------------------------------------------------------------------------------
   2659 //
   2660 //   ReportFindProgress     This function is called once for each advance in the target
   2661 //                          string from the find() function, and calls the user progress callback
   2662 //                          function if there is one installed.
   2663 //
   2664 //                          NOTE:
   2665 //
   2666 //                          If the match operation needs to be aborted because the user
   2667 //                          callback asked for it, just set an error status.
   2668 //                          The engine will pick that up and stop in its outer loop.
   2669 //
   2670 //--------------------------------------------------------------------------------
   2671 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
   2672     if (fFindProgressCallbackFn != NULL) {
   2673         if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
   2674             status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
   2675             return FALSE;
   2676         }
   2677     }
   2678     return TRUE;
   2679 }
   2680 
   2681 //--------------------------------------------------------------------------------
   2682 //
   2683 //   StateSave
   2684 //       Make a new stack frame, initialized as a copy of the current stack frame.
   2685 //       Set the pattern index in the original stack frame from the operand value
   2686 //       in the opcode.  Execution of the engine continues with the state in
   2687 //       the newly created stack frame
   2688 //
   2689 //       Note that reserveBlock() may grow the stack, resulting in the
   2690 //       whole thing being relocated in memory.
   2691 //
   2692 //    Parameters:
   2693 //       fp           The top frame pointer when called.  At return, a new
   2694 //                    fame will be present
   2695 //       savePatIdx   An index into the compiled pattern.  Goes into the original
   2696 //                    (not new) frame.  If execution ever back-tracks out of the
   2697 //                    new frame, this will be where we continue from in the pattern.
   2698 //    Return
   2699 //                    The new frame pointer.
   2700 //
   2701 //--------------------------------------------------------------------------------
   2702 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
   2703     // push storage for a new frame.
   2704     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
   2705     if (newFP == NULL) {
   2706         // Failure on attempted stack expansion.
   2707         //   Stack function set some other error code, change it to a more
   2708         //   specific one for regular expressions.
   2709         status = U_REGEX_STACK_OVERFLOW;
   2710         // We need to return a writable stack frame, so just return the
   2711         //    previous frame.  The match operation will stop quickly
   2712         //    because of the error status, after which the frame will never
   2713         //    be looked at again.
   2714         return fp;
   2715     }
   2716     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
   2717 
   2718     // New stack frame = copy of old top frame.
   2719     int64_t *source = (int64_t *)fp;
   2720     int64_t *dest   = newFP;
   2721     for (;;) {
   2722         *dest++ = *source++;
   2723         if (source == newFP) {
   2724             break;
   2725         }
   2726     }
   2727 
   2728     fTickCounter--;
   2729     if (fTickCounter <= 0) {
   2730        IncrementTime(status);    // Re-initializes fTickCounter
   2731     }
   2732     fp->fPatIdx = savePatIdx;
   2733     return (REStackFrame *)newFP;
   2734 }
   2735 
   2736 
   2737 //--------------------------------------------------------------------------------
   2738 //
   2739 //   MatchAt      This is the actual matching engine.
   2740 //
   2741 //                  startIdx:    begin matching a this index.
   2742 //                  toEnd:       if true, match must extend to end of the input region
   2743 //
   2744 //--------------------------------------------------------------------------------
   2745 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
   2746     UBool       isMatch  = FALSE;      // True if the we have a match.
   2747 
   2748     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
   2749 
   2750     int32_t     op;                    // Operation from the compiled pattern, split into
   2751     int32_t     opType;                //    the opcode
   2752     int32_t     opValue;               //    and the operand value.
   2753 
   2754     #ifdef REGEX_RUN_DEBUG
   2755     if (fTraceDebug)
   2756     {
   2757         printf("MatchAt(startIdx=%ld)\n", startIdx);
   2758         printf("Original Pattern: ");
   2759         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   2760         while (c != U_SENTINEL) {
   2761             if (c<32 || c>256) {
   2762                 c = '.';
   2763             }
   2764             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   2765 
   2766             c = UTEXT_NEXT32(fPattern->fPattern);
   2767         }
   2768         printf("\n");
   2769         printf("Input String: ");
   2770         c = utext_next32From(fInputText, 0);
   2771         while (c != U_SENTINEL) {
   2772             if (c<32 || c>256) {
   2773                 c = '.';
   2774             }
   2775             printf("%c", c);
   2776 
   2777             c = UTEXT_NEXT32(fInputText);
   2778         }
   2779         printf("\n");
   2780         printf("\n");
   2781     }
   2782     #endif
   2783 
   2784     if (U_FAILURE(status)) {
   2785         return;
   2786     }
   2787 
   2788     //  Cache frequently referenced items from the compiled pattern
   2789     //
   2790     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   2791 
   2792     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   2793     UVector             *sets          = fPattern->fSets;
   2794 
   2795     fFrameSize = fPattern->fFrameSize;
   2796     REStackFrame        *fp            = resetStack();
   2797 
   2798     fp->fPatIdx   = 0;
   2799     fp->fInputIdx = startIdx;
   2800 
   2801     // Zero out the pattern's static data
   2802     int32_t i;
   2803     for (i = 0; i<fPattern->fDataSize; i++) {
   2804         fData[i] = 0;
   2805     }
   2806 
   2807     //
   2808     //  Main loop for interpreting the compiled pattern.
   2809     //  One iteration of the loop per pattern operation performed.
   2810     //
   2811     for (;;) {
   2812 #if 0
   2813         if (_heapchk() != _HEAPOK) {
   2814             fprintf(stderr, "Heap Trouble\n");
   2815         }
   2816 #endif
   2817 
   2818         op      = (int32_t)pat[fp->fPatIdx];
   2819         opType  = URX_TYPE(op);
   2820         opValue = URX_VAL(op);
   2821         #ifdef REGEX_RUN_DEBUG
   2822         if (fTraceDebug) {
   2823             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2824             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
   2825                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   2826             fPattern->dumpOp(fp->fPatIdx);
   2827         }
   2828         #endif
   2829         fp->fPatIdx++;
   2830 
   2831         switch (opType) {
   2832 
   2833 
   2834         case URX_NOP:
   2835             break;
   2836 
   2837 
   2838         case URX_BACKTRACK:
   2839             // Force a backtrack.  In some circumstances, the pattern compiler
   2840             //   will notice that the pattern can't possibly match anything, and will
   2841             //   emit one of these at that point.
   2842             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2843             break;
   2844 
   2845 
   2846         case URX_ONECHAR:
   2847             if (fp->fInputIdx < fActiveLimit) {
   2848                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2849                 UChar32 c = UTEXT_NEXT32(fInputText);
   2850                 if (c == opValue) {
   2851                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2852                     break;
   2853                 }
   2854             } else {
   2855                 fHitEnd = TRUE;
   2856             }
   2857 
   2858             #ifdef REGEX_SMART_BACKTRACKING
   2859             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   2860                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   2861                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   2862                     UBool success = FALSE;
   2863                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
   2864                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
   2865                         if (c == opValue) {
   2866                             success = TRUE;
   2867                             break;
   2868                         } else if (c == U_SENTINEL) {
   2869                             break;
   2870                         }
   2871                         c = UTEXT_PREVIOUS32(fInputText);
   2872                     }
   2873                     if (success) {
   2874                         fHitEnd = FALSE;
   2875                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2876                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2877                         if (fp->fInputIdx > backSearchIndex) {
   2878                             fp = StateSave(fp, fp->fPatIdx, status);
   2879                         }
   2880                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   2881                         break;
   2882                     }
   2883                 }
   2884             }
   2885             #endif
   2886 
   2887             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2888             break;
   2889 
   2890 
   2891         case URX_STRING:
   2892             {
   2893                 // Test input against a literal string.
   2894                 // Strings require two slots in the compiled pattern, one for the
   2895                 //   offset to the string text, and one for the length.
   2896                 int32_t   stringStartIdx = opValue;
   2897                 int32_t   stringLen;
   2898 
   2899                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   2900                 fp->fPatIdx++;
   2901                 opType    = URX_TYPE(op);
   2902                 stringLen = URX_VAL(op);
   2903                 U_ASSERT(opType == URX_STRING_LEN);
   2904                 U_ASSERT(stringLen >= 2);
   2905 
   2906                 const UChar *patternChars = litText+stringStartIdx;
   2907                 const UChar *patternEnd = patternChars+stringLen;
   2908 
   2909                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2910                 UChar32 c;
   2911                 UBool success = TRUE;
   2912 
   2913                 while (patternChars < patternEnd && success) {
   2914                     c = UTEXT_NEXT32(fInputText);
   2915 
   2916                     if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
   2917                         if (U_IS_BMP(c)) {
   2918                             success = (*patternChars == c);
   2919                             patternChars += 1;
   2920                         } else if (patternChars+1 < patternEnd) {
   2921                             success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   2922                             patternChars += 2;
   2923                         }
   2924                     } else {
   2925                         success = FALSE;
   2926                         fHitEnd = TRUE;          //   TODO:  See ticket 6074
   2927                     }
   2928                 }
   2929 
   2930                 if (success) {
   2931                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2932                 } else {
   2933                     #ifdef REGEX_SMART_BACKTRACKING
   2934                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   2935                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   2936                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   2937                             // Reset to last start point
   2938                             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2939                             patternChars = litText+stringStartIdx;
   2940 
   2941                             // Search backwards for a possible start
   2942                             do {
   2943                                 c = UTEXT_PREVIOUS32(fInputText);
   2944                                 if (c == U_SENTINEL) {
   2945                                     break;
   2946                                 } else if ((U_IS_BMP(c) && *patternChars == c) ||
   2947                                     (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   2948                                     success = TRUE;
   2949                                     break;
   2950                                 }
   2951                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   2952 
   2953                             // And try again
   2954                             if (success) {
   2955                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2956                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2957                                 if (fp->fInputIdx > backSearchIndex) {
   2958                                     fp = StateSave(fp, fp->fPatIdx, status);
   2959                                 }
   2960                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   2961                                 break;
   2962                             }
   2963                         }
   2964                     }
   2965                     #endif
   2966                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2967                 }
   2968             }
   2969             break;
   2970 
   2971 
   2972         case URX_STATE_SAVE:
   2973             fp = StateSave(fp, opValue, status);
   2974             break;
   2975 
   2976 
   2977         case URX_END:
   2978             // The match loop will exit via this path on a successful match,
   2979             //   when we reach the end of the pattern.
   2980             if (toEnd && fp->fInputIdx != fActiveLimit) {
   2981                 // The pattern matched, but not to the end of input.  Try some more.
   2982                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2983                 break;
   2984             }
   2985             isMatch = TRUE;
   2986             goto  breakFromLoop;
   2987 
   2988         // Start and End Capture stack frame variables are laid out out like this:
   2989             //  fp->fExtra[opValue]  - The start of a completed capture group
   2990             //             opValue+1 - The end   of a completed capture group
   2991             //             opValue+2 - the start of a capture group whose end
   2992             //                          has not yet been reached (and might not ever be).
   2993         case URX_START_CAPTURE:
   2994             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2995             fp->fExtra[opValue+2] = fp->fInputIdx;
   2996             break;
   2997 
   2998 
   2999         case URX_END_CAPTURE:
   3000             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   3001             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   3002             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   3003             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   3004             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   3005             break;
   3006 
   3007 
   3008         case URX_DOLLAR:                   //  $, test for End of line
   3009                                            //     or for position before new line at end of input
   3010             {
   3011                 if (fp->fInputIdx >= fAnchorLimit) {
   3012                     // We really are at the end of input.  Success.
   3013                     fHitEnd = TRUE;
   3014                     fRequireEnd = TRUE;
   3015                     break;
   3016                 }
   3017 
   3018                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3019 
   3020                 // If we are positioned just before a new-line that is located at the
   3021                 //   end of input, succeed.
   3022                 UChar32 c = UTEXT_NEXT32(fInputText);
   3023                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   3024                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   3025                         // If not in the middle of a CR/LF sequence
   3026                       if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
   3027                             // At new-line at end of input. Success
   3028                             fHitEnd = TRUE;
   3029                             fRequireEnd = TRUE;
   3030 
   3031                             break;
   3032                         }
   3033                     }
   3034                 } else {
   3035                     UChar32 nextC = UTEXT_NEXT32(fInputText);
   3036                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   3037                         fHitEnd = TRUE;
   3038                         fRequireEnd = TRUE;
   3039                         break;                         // At CR/LF at end of input.  Success
   3040                     }
   3041                 }
   3042 
   3043                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3044             }
   3045             break;
   3046 
   3047 
   3048          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   3049             if (fp->fInputIdx >= fAnchorLimit) {
   3050                 // Off the end of input.  Success.
   3051                 fHitEnd = TRUE;
   3052                 fRequireEnd = TRUE;
   3053                 break;
   3054             } else {
   3055                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3056                 UChar32 c = UTEXT_NEXT32(fInputText);
   3057                 // Either at the last character of input, or off the end.
   3058                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
   3059                     fHitEnd = TRUE;
   3060                     fRequireEnd = TRUE;
   3061                     break;
   3062                 }
   3063             }
   3064 
   3065             // Not at end of input.  Back-track out.
   3066             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3067             break;
   3068 
   3069 
   3070          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   3071              {
   3072                  if (fp->fInputIdx >= fAnchorLimit) {
   3073                      // We really are at the end of input.  Success.
   3074                      fHitEnd = TRUE;
   3075                      fRequireEnd = TRUE;
   3076                      break;
   3077                  }
   3078                  // If we are positioned just before a new-line, succeed.
   3079                  // It makes no difference where the new-line is within the input.
   3080                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3081                  UChar32 c = UTEXT_CURRENT32(fInputText);
   3082                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   3083                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   3084                      //  In multi-line mode, hitting a new-line just before the end of input does not
   3085                      //   set the hitEnd or requireEnd flags
   3086                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
   3087                         break;
   3088                      }
   3089                  }
   3090                  // not at a new line.  Fail.
   3091                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3092              }
   3093              break;
   3094 
   3095 
   3096          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   3097              {
   3098                  if (fp->fInputIdx >= fAnchorLimit) {
   3099                      // We really are at the end of input.  Success.
   3100                      fHitEnd = TRUE;
   3101                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   3102                      break;               //   adding a new-line would not lose the match.
   3103                  }
   3104                  // If we are not positioned just before a new-line, the test fails; backtrack out.
   3105                  // It makes no difference where the new-line is within the input.
   3106                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3107                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
   3108                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3109                  }
   3110              }
   3111              break;
   3112 
   3113 
   3114        case URX_CARET:                    //  ^, test for start of line
   3115             if (fp->fInputIdx != fAnchorStart) {
   3116                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3117             }
   3118             break;
   3119 
   3120 
   3121        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   3122            {
   3123                if (fp->fInputIdx == fAnchorStart) {
   3124                    // We are at the start input.  Success.
   3125                    break;
   3126                }
   3127                // Check whether character just before the current pos is a new-line
   3128                //   unless we are at the end of input
   3129                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3130                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3131                if ((fp->fInputIdx < fAnchorLimit) &&
   3132                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3133                    //  It's a new-line.  ^ is true.  Success.
   3134                    //  TODO:  what should be done with positions between a CR and LF?
   3135                    break;
   3136                }
   3137                // Not at the start of a line.  Fail.
   3138                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3139            }
   3140            break;
   3141 
   3142 
   3143        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   3144            {
   3145                U_ASSERT(fp->fInputIdx >= fAnchorStart);
   3146                if (fp->fInputIdx <= fAnchorStart) {
   3147                    // We are at the start input.  Success.
   3148                    break;
   3149                }
   3150                // Check whether character just before the current pos is a new-line
   3151                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   3152                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3153                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3154                if (c != 0x0a) {
   3155                    // Not at the start of a line.  Back-track out.
   3156                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3157                }
   3158            }
   3159            break;
   3160 
   3161         case URX_BACKSLASH_B:          // Test for word boundaries
   3162             {
   3163                 UBool success = isWordBoundary(fp->fInputIdx);
   3164                 success ^= (opValue != 0);     // flip sense for \B
   3165                 if (!success) {
   3166                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3167                 }
   3168             }
   3169             break;
   3170 
   3171 
   3172         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   3173             {
   3174                 UBool success = isUWordBoundary(fp->fInputIdx);
   3175                 success ^= (opValue != 0);     // flip sense for \B
   3176                 if (!success) {
   3177                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3178                 }
   3179             }
   3180             break;
   3181 
   3182 
   3183         case URX_BACKSLASH_D:            // Test for decimal digit
   3184             {
   3185                 if (fp->fInputIdx >= fActiveLimit) {
   3186                     fHitEnd = TRUE;
   3187                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3188                     break;
   3189                 }
   3190 
   3191                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3192 
   3193                 UChar32 c = UTEXT_NEXT32(fInputText);
   3194                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   3195                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   3196                 success ^= (opValue != 0);        // flip sense for \D
   3197                 if (success) {
   3198                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3199                 } else {
   3200                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3201                 }
   3202             }
   3203             break;
   3204 
   3205 
   3206         case URX_BACKSLASH_G:          // Test for position at end of previous match
   3207             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   3208                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3209             }
   3210             break;
   3211 
   3212 
   3213         case URX_BACKSLASH_X:
   3214             //  Match a Grapheme, as defined by Unicode TR 29.
   3215             //  Differs slightly from Perl, which consumes combining marks independently
   3216             //    of context.
   3217             {
   3218 
   3219                 // Fail if at end of input
   3220                 if (fp->fInputIdx >= fActiveLimit) {
   3221                     fHitEnd = TRUE;
   3222                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3223                     break;
   3224                 }
   3225 
   3226                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3227 
   3228                 // Examine (and consume) the current char.
   3229                 //   Dispatch into a little state machine, based on the char.
   3230                 UChar32  c;
   3231                 c = UTEXT_NEXT32(fInputText);
   3232                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3233                 UnicodeSet **sets = fPattern->fStaticSets;
   3234                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   3235                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   3236                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3237                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3238                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3239                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3240                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3241                 goto GC_Extend;
   3242 
   3243 
   3244 
   3245 GC_L:
   3246                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3247                 c = UTEXT_NEXT32(fInputText);
   3248                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3249                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3250                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3251                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3252                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3253                 (void)UTEXT_PREVIOUS32(fInputText);
   3254                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3255                 goto GC_Extend;
   3256 
   3257 GC_V:
   3258                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3259                 c = UTEXT_NEXT32(fInputText);
   3260                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3261                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3262                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3263                 (void)UTEXT_PREVIOUS32(fInputText);
   3264                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3265                 goto GC_Extend;
   3266 
   3267 GC_T:
   3268                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3269                 c = UTEXT_NEXT32(fInputText);
   3270                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3271                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3272                 (void)UTEXT_PREVIOUS32(fInputText);
   3273                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3274                 goto GC_Extend;
   3275 
   3276 GC_Extend:
   3277                 // Combining characters are consumed here
   3278                 for (;;) {
   3279                     if (fp->fInputIdx >= fActiveLimit) {
   3280                         break;
   3281                     }
   3282                     c = UTEXT_CURRENT32(fInputText);
   3283                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   3284                         break;
   3285                     }
   3286                     (void)UTEXT_NEXT32(fInputText);
   3287                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3288                 }
   3289                 goto GC_Done;
   3290 
   3291 GC_Control:
   3292                 // Most control chars stand alone (don't combine with combining chars),
   3293                 //   except for that CR/LF sequence is a single grapheme cluster.
   3294                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
   3295                     c = UTEXT_NEXT32(fInputText);
   3296                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3297                 }
   3298 
   3299 GC_Done:
   3300                 if (fp->fInputIdx >= fActiveLimit) {
   3301                     fHitEnd = TRUE;
   3302                 }
   3303                 break;
   3304             }
   3305 
   3306 
   3307 
   3308 
   3309         case URX_BACKSLASH_Z:          // Test for end of Input
   3310             if (fp->fInputIdx < fAnchorLimit) {
   3311                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3312             } else {
   3313                 fHitEnd = TRUE;
   3314                 fRequireEnd = TRUE;
   3315             }
   3316             break;
   3317 
   3318 
   3319 
   3320         case URX_STATIC_SETREF:
   3321             {
   3322                 // Test input character against one of the predefined sets
   3323                 //    (Word Characters, for example)
   3324                 // The high bit of the op value is a flag for the match polarity.
   3325                 //    0:   success if input char is in set.
   3326                 //    1:   success if input char is not in set.
   3327                 if (fp->fInputIdx >= fActiveLimit) {
   3328                     fHitEnd = TRUE;
   3329                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3330                     break;
   3331                 }
   3332 
   3333                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   3334                 opValue &= ~URX_NEG_SET;
   3335                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3336 
   3337                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3338                 UChar32 c = UTEXT_NEXT32(fInputText);
   3339                 if (c < 256) {
   3340                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3341                     if (s8->contains(c)) {
   3342                         success = !success;
   3343                     }
   3344                 } else {
   3345                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3346                     if (s->contains(c)) {
   3347                         success = !success;
   3348                     }
   3349                 }
   3350                 if (success) {
   3351                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3352                 } else {
   3353                     // the character wasn't in the set.
   3354                     #ifdef REGEX_SMART_BACKTRACKING
   3355                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3356                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3357                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3358                             // Try to find it, backwards
   3359                             UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3360                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
   3361                             do {
   3362                                 c = UTEXT_PREVIOUS32(fInputText);
   3363                                 if (c == U_SENTINEL) {
   3364                                     break;
   3365                                 } else if (c < 256) {
   3366                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3367                                     if (s8->contains(c)) {
   3368                                         success = !success;
   3369                                     }
   3370                                 } else {
   3371                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3372                                     if (s->contains(c)) {
   3373                                         success = !success;
   3374                                     }
   3375                                 }
   3376                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
   3377 
   3378                             if (success && c != U_SENTINEL) {
   3379                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3380                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3381                                 if (fp->fInputIdx > backSearchIndex) {
   3382                                     fp = StateSave(fp, fp->fPatIdx, status);
   3383                                 }
   3384                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3385                                 break;
   3386                             }
   3387                         }
   3388                     }
   3389                     #endif
   3390                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3391                 }
   3392             }
   3393             break;
   3394 
   3395 
   3396         case URX_STAT_SETREF_N:
   3397             {
   3398                 // Test input character for NOT being a member of  one of
   3399                 //    the predefined sets (Word Characters, for example)
   3400                 if (fp->fInputIdx >= fActiveLimit) {
   3401                     fHitEnd = TRUE;
   3402                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3403                     break;
   3404                 }
   3405 
   3406                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3407 
   3408                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3409 
   3410                 UChar32 c = UTEXT_NEXT32(fInputText);
   3411                 if (c < 256) {
   3412                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3413                     if (s8->contains(c) == FALSE) {
   3414                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3415                         break;
   3416                     }
   3417                 } else {
   3418                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3419                     if (s->contains(c) == FALSE) {
   3420                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3421                         break;
   3422                     }
   3423                 }
   3424                 // the character wasn't in the set.
   3425                 #ifdef REGEX_SMART_BACKTRACKING
   3426                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3427                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3428                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3429                         // Try to find it, backwards
   3430                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3431                         UBool success = FALSE;
   3432                         do {
   3433                             c = UTEXT_PREVIOUS32(fInputText);
   3434                             if (c == U_SENTINEL) {
   3435                                 break;
   3436                             } else if (c < 256) {
   3437                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3438                                 if (s8->contains(c) == FALSE) {
   3439                                     success = TRUE;
   3440                                     break;
   3441                                 }
   3442                             } else {
   3443                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3444                                 if (s->contains(c) == FALSE) {
   3445                                     success = TRUE;
   3446                                     break;
   3447                                 }
   3448                             }
   3449                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   3450 
   3451                         if (success) {
   3452                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3453                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3454                             if (fp->fInputIdx > backSearchIndex) {
   3455                                 fp = StateSave(fp, fp->fPatIdx, status);
   3456                             }
   3457                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3458                             break;
   3459                         }
   3460                     }
   3461                 }
   3462                 #endif
   3463                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3464             }
   3465             break;
   3466 
   3467 
   3468         case URX_SETREF:
   3469             if (fp->fInputIdx >= fActiveLimit) {
   3470                 fHitEnd = TRUE;
   3471                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3472                 break;
   3473             } else {
   3474                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3475 
   3476                 // There is input left.  Pick up one char and test it for set membership.
   3477                 UChar32 c = UTEXT_NEXT32(fInputText);
   3478                 U_ASSERT(opValue > 0 && opValue < sets->size());
   3479                 if (c<256) {
   3480                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3481                     if (s8->contains(c)) {
   3482                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3483                         break;
   3484                     }
   3485                 } else {
   3486                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3487                     if (s->contains(c)) {
   3488                         // The character is in the set.  A Match.
   3489                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3490                         break;
   3491                     }
   3492                 }
   3493 
   3494                 // the character wasn't in the set.
   3495                 #ifdef REGEX_SMART_BACKTRACKING
   3496                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3497                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3498                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3499                         // Try to find it, backwards
   3500                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3501                         UBool success = FALSE;
   3502                         do {
   3503                             c = UTEXT_PREVIOUS32(fInputText);
   3504                             if (c == U_SENTINEL) {
   3505                                 break;
   3506                             } else if (c < 256) {
   3507                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3508                                 if (s8->contains(c)) {
   3509                                     success = TRUE;
   3510                                     break;
   3511                                 }
   3512                             } else {
   3513                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3514                                 if (s->contains(c)) {
   3515                                     success = TRUE;
   3516                                     break;
   3517                                 }
   3518                             }
   3519                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   3520 
   3521                         if (success) {
   3522                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3523                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3524                             if (fp->fInputIdx > backSearchIndex) {
   3525                                 fp = StateSave(fp, fp->fPatIdx, status);
   3526                             }
   3527                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3528                             break;
   3529                         }
   3530                     }
   3531                 }
   3532                 #endif
   3533                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3534             }
   3535             break;
   3536 
   3537 
   3538         case URX_DOTANY:
   3539             {
   3540                 // . matches anything, but stops at end-of-line.
   3541                 if (fp->fInputIdx >= fActiveLimit) {
   3542                     // At end of input.  Match failed.  Backtrack out.
   3543                     fHitEnd = TRUE;
   3544                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3545                     break;
   3546                 }
   3547 
   3548                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3549 
   3550                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3551                 UChar32 c = UTEXT_NEXT32(fInputText);
   3552                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   3553                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3554                     // End of line in normal mode.   . does not match.
   3555                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3556                     break;
   3557                 }
   3558                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3559             }
   3560             break;
   3561 
   3562 
   3563         case URX_DOTANY_ALL:
   3564             {
   3565                 // ., in dot-matches-all (including new lines) mode
   3566                 if (fp->fInputIdx >= fActiveLimit) {
   3567                     // At end of input.  Match failed.  Backtrack out.
   3568                     fHitEnd = TRUE;
   3569                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3570                     break;
   3571                 }
   3572 
   3573                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3574 
   3575                 // There is input left.  Advance over one char, except if we are
   3576                 //   at a cr/lf, advance over both of them.
   3577                 UChar32 c;
   3578                 c = UTEXT_NEXT32(fInputText);
   3579                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3580                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   3581                     // In the case of a CR/LF, we need to advance over both.
   3582                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
   3583                     if (nextc == 0x0a) {
   3584                         (void)UTEXT_NEXT32(fInputText);
   3585                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3586                     }
   3587                 }
   3588             }
   3589             break;
   3590 
   3591 
   3592         case URX_DOTANY_UNIX:
   3593             {
   3594                 // '.' operator, matches all, but stops at end-of-line.
   3595                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   3596                 if (fp->fInputIdx >= fActiveLimit) {
   3597                     // At end of input.  Match failed.  Backtrack out.
   3598                     fHitEnd = TRUE;
   3599                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3600                     break;
   3601                 }
   3602 
   3603                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3604 
   3605                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3606                 UChar32 c = UTEXT_NEXT32(fInputText);
   3607                 if (c == 0x0a) {
   3608                     // End of line in normal mode.   '.' does not match the \n
   3609                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3610                 } else {
   3611                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3612                 }
   3613             }
   3614             break;
   3615 
   3616 
   3617         case URX_JMP:
   3618             fp->fPatIdx = opValue;
   3619             break;
   3620 
   3621         case URX_FAIL:
   3622             isMatch = FALSE;
   3623             goto breakFromLoop;
   3624 
   3625         case URX_JMP_SAV:
   3626             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   3627             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   3628             fp->fPatIdx = opValue;                         // Then JMP.
   3629             break;
   3630 
   3631         case URX_JMP_SAV_X:
   3632             // This opcode is used with (x)+, when x can match a zero length string.
   3633             // Same as JMP_SAV, except conditional on the match having made forward progress.
   3634             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   3635             //   data address of the input position at the start of the loop.
   3636             {
   3637                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   3638                 int32_t  stoOp = (int32_t)pat[opValue-1];
   3639                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   3640                 int32_t  frameLoc = URX_VAL(stoOp);
   3641                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   3642                 int64_t prevInputIdx = fp->fExtra[frameLoc];
   3643                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   3644                 if (prevInputIdx < fp->fInputIdx) {
   3645                     // The match did make progress.  Repeat the loop.
   3646                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   3647                     fp->fPatIdx = opValue;
   3648                     fp->fExtra[frameLoc] = fp->fInputIdx;
   3649                 }
   3650                 // If the input position did not advance, we do nothing here,
   3651                 //   execution will fall out of the loop.
   3652             }
   3653             break;
   3654 
   3655         case URX_CTR_INIT:
   3656             {
   3657                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3658                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   3659 
   3660                 // Pick up the three extra operands that CTR_INIT has, and
   3661                 //    skip the pattern location counter past
   3662                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3663                 fp->fPatIdx += 3;
   3664                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3665                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3666                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3667                 U_ASSERT(minCount>=0);
   3668                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3669                 U_ASSERT(loopLoc>fp->fPatIdx);
   3670 
   3671                 if (minCount == 0) {
   3672                     fp = StateSave(fp, loopLoc+1, status);
   3673                 }
   3674                 if (maxCount == 0) {
   3675                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3676                 }
   3677             }
   3678             break;
   3679 
   3680         case URX_CTR_LOOP:
   3681             {
   3682                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3683                 int32_t initOp = (int32_t)pat[opValue];
   3684                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   3685                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3686                 int32_t minCount  = (int32_t)pat[opValue+2];
   3687                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3688                 // Increment the counter.  Note: we DIDN'T worry about counter
   3689                 //   overflow, since the data comes from UnicodeStrings, which
   3690                 //   stores its length in an int32_t. Do we have to think about
   3691                 //   this now that we're using UText? Probably not, since the length
   3692                 //   in UChar32s is still an int32_t.
   3693                 (*pCounter)++;
   3694                 U_ASSERT(*pCounter > 0);
   3695                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   3696                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   3697                     break;
   3698                 }
   3699                 if (*pCounter >= minCount) {
   3700                     fp = StateSave(fp, fp->fPatIdx, status);
   3701                 }
   3702                 fp->fPatIdx = opValue + 4;    // Loop back.
   3703             }
   3704             break;
   3705 
   3706         case URX_CTR_INIT_NG:
   3707             {
   3708                 // Initialize a non-greedy loop
   3709                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3710                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   3711 
   3712                 // Pick up the three extra operands that CTR_INIT has, and
   3713                 //    skip the pattern location counter past
   3714                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3715                 fp->fPatIdx += 3;
   3716                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3717                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3718                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3719                 U_ASSERT(minCount>=0);
   3720                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3721                 U_ASSERT(loopLoc>fp->fPatIdx);
   3722 
   3723                 if (minCount == 0) {
   3724                     if (maxCount != 0) {
   3725                         fp = StateSave(fp, fp->fPatIdx, status);
   3726                     }
   3727                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   3728                 }
   3729             }
   3730             break;
   3731 
   3732         case URX_CTR_LOOP_NG:
   3733             {
   3734                 // Non-greedy {min, max} loops
   3735                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3736                 int32_t initOp = (int32_t)pat[opValue];
   3737                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   3738                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3739                 int32_t minCount  = (int32_t)pat[opValue+2];
   3740                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3741                 // Increment the counter.  Note: we DIDN'T worry about counter
   3742                 //   overflow, since the data comes from UnicodeStrings, which
   3743                 //   stores its length in an int32_t. Do we have to think about
   3744                 //   this now that we're using UText? Probably not, since the length
   3745                 //   in UChar32s is still an int32_t.
   3746                 (*pCounter)++;
   3747                 U_ASSERT(*pCounter > 0);
   3748 
   3749                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   3750                     // The loop has matched the maximum permitted number of times.
   3751                     //   Break out of here with no action.  Matching will
   3752                     //   continue with the following pattern.
   3753                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   3754                     break;
   3755                 }
   3756 
   3757                 if (*pCounter < minCount) {
   3758                     // We haven't met the minimum number of matches yet.
   3759                     //   Loop back for another one.
   3760                     fp->fPatIdx = opValue + 4;    // Loop back.
   3761                 } else {
   3762                     // We do have the minimum number of matches.
   3763                     //   Fall into the following pattern, but first do
   3764                     //   a state save to the top of the loop, so that a failure
   3765                     //   in the following pattern will try another iteration of the loop.
   3766                     fp = StateSave(fp, opValue + 4, status);
   3767                 }
   3768             }
   3769             break;
   3770 
   3771         case URX_STO_SP:
   3772             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3773             fData[opValue] = fStack->size();
   3774             break;
   3775 
   3776         case URX_LD_SP:
   3777             {
   3778                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3779                 int32_t newStackSize = (int32_t)fData[opValue];
   3780                 U_ASSERT(newStackSize <= fStack->size());
   3781                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3782                 if (newFP == (int64_t *)fp) {
   3783                     break;
   3784                 }
   3785                 int32_t i;
   3786                 for (i=0; i<fFrameSize; i++) {
   3787                     newFP[i] = ((int64_t *)fp)[i];
   3788                 }
   3789                 fp = (REStackFrame *)newFP;
   3790                 fStack->setSize(newStackSize);
   3791             }
   3792             break;
   3793 
   3794         case URX_BACKREF:
   3795         case URX_BACKREF_I:
   3796             {
   3797                 U_ASSERT(opValue < fFrameSize);
   3798                 int64_t groupStartIdx = fp->fExtra[opValue];
   3799                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3800                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3801                 if (groupStartIdx < 0) {
   3802                     // This capture group has not participated in the match thus far,
   3803                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3804                 }
   3805 
   3806                 if (groupEndIdx == groupStartIdx) {
   3807                     //   The capture group match was of an empty string.
   3808                     //   Verified by testing:  Perl matches succeed in this case, so
   3809                     //   we do too.
   3810                     break;
   3811                 }
   3812 
   3813                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
   3814                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3815 
   3816                 UBool haveMatch = (opType == URX_BACKREF ?
   3817                     (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
   3818                     (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
   3819                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3820 
   3821                 if (fp->fInputIdx > fActiveLimit) {
   3822                     fHitEnd = TRUE;
   3823                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3824                 } else if (!haveMatch) {
   3825                     if (fp->fInputIdx == fActiveLimit) {
   3826                         fHitEnd = TRUE;
   3827                     }
   3828                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3829                 }
   3830             }
   3831             break;
   3832 
   3833         case URX_STO_INP_LOC:
   3834             {
   3835                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   3836                 fp->fExtra[opValue] = fp->fInputIdx;
   3837             }
   3838             break;
   3839 
   3840         case URX_JMPX:
   3841             {
   3842                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3843                 fp->fPatIdx += 1;
   3844                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   3845                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   3846                 int64_t savedInputIdx = fp->fExtra[dataLoc];
   3847                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   3848                 if (savedInputIdx < fp->fInputIdx) {
   3849                     fp->fPatIdx = opValue;                               // JMP
   3850                 } else {
   3851                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   3852                 }
   3853             }
   3854             break;
   3855 
   3856         case URX_LA_START:
   3857             {
   3858                 // Entering a lookahead block.
   3859                 // Save Stack Ptr, Input Pos.
   3860                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3861                 fData[opValue]   = fStack->size();
   3862                 fData[opValue+1] = fp->fInputIdx;
   3863                 fActiveStart     = fLookStart;          // Set the match region change for
   3864                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   3865             }
   3866             break;
   3867 
   3868         case URX_LA_END:
   3869             {
   3870                 // Leaving a look-ahead block.
   3871                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   3872                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3873                 int32_t stackSize = fStack->size();
   3874                 int32_t newStackSize =(int32_t)fData[opValue];
   3875                 U_ASSERT(stackSize >= newStackSize);
   3876                 if (stackSize > newStackSize) {
   3877                     // Copy the current top frame back to the new (cut back) top frame.
   3878                     //   This makes the capture groups from within the look-ahead
   3879                     //   expression available.
   3880                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3881                     int32_t i;
   3882                     for (i=0; i<fFrameSize; i++) {
   3883                         newFP[i] = ((int64_t *)fp)[i];
   3884                     }
   3885                     fp = (REStackFrame *)newFP;
   3886                     fStack->setSize(newStackSize);
   3887                 }
   3888                 fp->fInputIdx = fData[opValue+1];
   3889 
   3890                 // Restore the active region bounds in the input string; they may have
   3891                 //    been changed because of transparent bounds on a Region.
   3892                 fActiveStart = fRegionStart;
   3893                 fActiveLimit = fRegionLimit;
   3894             }
   3895             break;
   3896 
   3897         case URX_ONECHAR_I:
   3898             if (fp->fInputIdx < fActiveLimit) {
   3899                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3900 
   3901                 UChar32 c = UTEXT_NEXT32(fInputText);
   3902                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3903                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3904                     break;
   3905                 }
   3906             } else {
   3907                 fHitEnd = TRUE;
   3908             }
   3909 
   3910             #ifdef REGEX_SMART_BACKTRACKING
   3911             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3912                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3913                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3914                     UBool success = FALSE;
   3915                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
   3916                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
   3917                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3918                             success = TRUE;
   3919                             break;
   3920                         } else if (c == U_SENTINEL) {
   3921                             break;
   3922                         }
   3923                         c = UTEXT_PREVIOUS32(fInputText);
   3924                     }
   3925                     if (success) {
   3926                         fHitEnd = FALSE;
   3927                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3928                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3929                         if (fp->fInputIdx > backSearchIndex) {
   3930                             fp = StateSave(fp, fp->fPatIdx, status);
   3931                         }
   3932                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3933                         break;
   3934                     }
   3935                 }
   3936             }
   3937             #endif
   3938 
   3939             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3940             break;
   3941 
   3942         case URX_STRING_I:
   3943             {
   3944                 // Test input against a literal string.
   3945                 // Strings require two slots in the compiled pattern, one for the
   3946                 //   offset to the string text, and one for the length.
   3947                 const UCaseProps *csp = ucase_getSingleton();
   3948                 {
   3949                     int32_t stringStartIdx, stringLen;
   3950                     stringStartIdx = opValue;
   3951 
   3952                     op      = (int32_t)pat[fp->fPatIdx];
   3953                     fp->fPatIdx++;
   3954                     opType  = URX_TYPE(op);
   3955                     opValue = URX_VAL(op);
   3956                     U_ASSERT(opType == URX_STRING_LEN);
   3957                     stringLen = opValue;
   3958 
   3959                     const UChar *patternChars = litText+stringStartIdx;
   3960                     const UChar *patternEnd = patternChars+stringLen;
   3961 
   3962                     const UChar *foldChars = NULL;
   3963                     int32_t foldOffset, foldLength;
   3964                     UChar32 c;
   3965 
   3966                     foldOffset = foldLength = 0;
   3967                     UBool success = TRUE;
   3968 
   3969                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3970                     while (patternChars < patternEnd && success) {
   3971                         if(foldOffset < foldLength) {
   3972                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   3973                         } else {
   3974                             c = UTEXT_NEXT32(fInputText);
   3975                             if (c != U_SENTINEL) {
   3976                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   3977                                 if(foldLength >= 0) {
   3978                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   3979                                         foldOffset = 0;
   3980                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   3981                                     } else {
   3982                                         c = foldLength;
   3983                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
   3984                                     }
   3985                                 }
   3986                             }
   3987 
   3988                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3989                         }
   3990 
   3991                         success = FALSE;
   3992                         if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
   3993                             if (U_IS_BMP(c)) {
   3994                                 success = (*patternChars == c);
   3995                                 patternChars += 1;
   3996                             } else if (patternChars+1 < patternEnd) {
   3997                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   3998                                 patternChars += 2;
   3999                             }
   4000                         } else {
   4001                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
   4002                         }
   4003                     }
   4004 
   4005                     if (!success) {
   4006                         #ifdef REGEX_SMART_BACKTRACKING
   4007                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   4008                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4009                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4010                                 // Reset to last start point
   4011                                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4012                                 patternChars = litText+stringStartIdx;
   4013 
   4014                                 // Search backwards for a possible start
   4015                                 do {
   4016                                     c = UTEXT_PREVIOUS32(fInputText);
   4017                                     if (c == U_SENTINEL) {
   4018                                         break;
   4019                                     } else {
   4020                                         foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   4021                                         if(foldLength >= 0) {
   4022                                             if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   4023                                                 foldOffset = 0;
   4024                                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   4025                                             } else {
   4026                                                 c = foldLength;
   4027                                                 foldLength = foldOffset; // to avoid reading chars from the folding buffer
   4028                                             }
   4029                                         }
   4030 
   4031                                         if ((U_IS_BMP(c) && *patternChars == c) ||
   4032                                                (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   4033                                             success = TRUE;
   4034                                             break;
   4035                                         }
   4036                                     }
   4037                                 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   4038 
   4039                                 // And try again
   4040                                 if (success) {
   4041                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4042                                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4043                                     if (fp->fInputIdx > backSearchIndex) {
   4044                                         fp = StateSave(fp, fp->fPatIdx, status);
   4045                                     }
   4046                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4047                                     break;
   4048                                 }
   4049                             }
   4050                         }
   4051                         #endif
   4052                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4053                     }
   4054                 }
   4055             }
   4056             break;
   4057 
   4058         case URX_LB_START:
   4059             {
   4060                 // Entering a look-behind block.
   4061                 // Save Stack Ptr, Input Pos.
   4062                 //   TODO:  implement transparent bounds.  Ticket #6067
   4063                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4064                 fData[opValue]   = fStack->size();
   4065                 fData[opValue+1] = fp->fInputIdx;
   4066                 // Init the variable containing the start index for attempted matches.
   4067                 fData[opValue+2] = -1;
   4068                 // Save input string length, then reset to pin any matches to end at
   4069                 //   the current position.
   4070                 fData[opValue+3] = fActiveLimit;
   4071                 fActiveLimit     = fp->fInputIdx;
   4072             }
   4073             break;
   4074 
   4075 
   4076         case URX_LB_CONT:
   4077             {
   4078                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   4079                 //    at all possible input starting positions.
   4080 
   4081                 // Fetch the min and max possible match lengths.  They are the operands
   4082                 //   of this op in the pattern.
   4083                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   4084                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   4085                 U_ASSERT(minML <= maxML);
   4086                 U_ASSERT(minML >= 0);
   4087 
   4088                 // Fetch (from data) the last input index where a match was attempted.
   4089                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4090                 int64_t  *lbStartIdx = &fData[opValue+2];
   4091                 if (*lbStartIdx < 0) {
   4092                     // First time through loop.
   4093                     *lbStartIdx = fp->fInputIdx - minML;
   4094                 } else {
   4095                     // 2nd through nth time through the loop.
   4096                     // Back up start position for match by one.
   4097                     if (*lbStartIdx == 0) {
   4098                         (*lbStartIdx)--;
   4099                     } else {
   4100                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   4101                         (void)UTEXT_PREVIOUS32(fInputText);
   4102                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4103                     }
   4104                 }
   4105 
   4106                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   4107                     // We have tried all potential match starting points without
   4108                     //  getting a match.  Backtrack out, and out of the
   4109                     //   Look Behind altogether.
   4110                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4111                     int64_t restoreInputLen = fData[opValue+3];
   4112                     U_ASSERT(restoreInputLen >= fActiveLimit);
   4113                     U_ASSERT(restoreInputLen <= fInputLength);
   4114                     fActiveLimit = restoreInputLen;
   4115                     break;
   4116                 }
   4117 
   4118                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   4119                 //      (successful match will fall off the end of the loop.)
   4120                 fp = StateSave(fp, fp->fPatIdx-3, status);
   4121                 fp->fInputIdx = *lbStartIdx;
   4122             }
   4123             break;
   4124 
   4125         case URX_LB_END:
   4126             // End of a look-behind block, after a successful match.
   4127             {
   4128                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4129                 if (fp->fInputIdx != fActiveLimit) {
   4130                     //  The look-behind expression matched, but the match did not
   4131                     //    extend all the way to the point that we are looking behind from.
   4132                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4133                     //     will retry the match starting at another position or fail
   4134                     //     the look-behind altogether, whichever is appropriate.
   4135                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4136                     break;
   4137                 }
   4138 
   4139                 // Look-behind match is good.  Restore the orignal input string length,
   4140                 //   which had been truncated to pin the end of the lookbehind match to the
   4141                 //   position being looked-behind.
   4142                 int64_t originalInputLen = fData[opValue+3];
   4143                 U_ASSERT(originalInputLen >= fActiveLimit);
   4144                 U_ASSERT(originalInputLen <= fInputLength);
   4145                 fActiveLimit = originalInputLen;
   4146             }
   4147             break;
   4148 
   4149 
   4150         case URX_LBN_CONT:
   4151             {
   4152                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   4153                 //    at all possible input starting positions.
   4154 
   4155                 // Fetch the extra parameters of this op.
   4156                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   4157                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   4158                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   4159                         continueLoc = URX_VAL(continueLoc);
   4160                 U_ASSERT(minML <= maxML);
   4161                 U_ASSERT(minML >= 0);
   4162                 U_ASSERT(continueLoc > fp->fPatIdx);
   4163 
   4164                 // Fetch (from data) the last input index where a match was attempted.
   4165                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4166                 int64_t  *lbStartIdx = &fData[opValue+2];
   4167                 if (*lbStartIdx < 0) {
   4168                     // First time through loop.
   4169                     *lbStartIdx = fp->fInputIdx - minML;
   4170                 } else {
   4171                     // 2nd through nth time through the loop.
   4172                     // Back up start position for match by one.
   4173                     if (*lbStartIdx == 0) {
   4174                         (*lbStartIdx)--;
   4175                     } else {
   4176                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   4177                         (void)UTEXT_PREVIOUS32(fInputText);
   4178                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4179                     }
   4180                 }
   4181 
   4182                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   4183                     // We have tried all potential match starting points without
   4184                     //  getting a match, which means that the negative lookbehind as
   4185                     //  a whole has succeeded.  Jump forward to the continue location
   4186                     int64_t restoreInputLen = fData[opValue+3];
   4187                     U_ASSERT(restoreInputLen >= fActiveLimit);
   4188                     U_ASSERT(restoreInputLen <= fInputLength);
   4189                     fActiveLimit = restoreInputLen;
   4190                     fp->fPatIdx = continueLoc;
   4191                     break;
   4192                 }
   4193 
   4194                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   4195                 //      (successful match will cause a FAIL out of the loop altogether.)
   4196                 fp = StateSave(fp, fp->fPatIdx-4, status);
   4197                 fp->fInputIdx = *lbStartIdx;
   4198             }
   4199             break;
   4200 
   4201         case URX_LBN_END:
   4202             // End of a negative look-behind block, after a successful match.
   4203             {
   4204                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4205                 if (fp->fInputIdx != fActiveLimit) {
   4206                     //  The look-behind expression matched, but the match did not
   4207                     //    extend all the way to the point that we are looking behind from.
   4208                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4209                     //     will retry the match starting at another position or succeed
   4210                     //     the look-behind altogether, whichever is appropriate.
   4211                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4212                     break;
   4213                 }
   4214 
   4215                 // Look-behind expression matched, which means look-behind test as
   4216                 //   a whole Fails
   4217 
   4218                 //   Restore the orignal input string length, which had been truncated
   4219                 //   inorder to pin the end of the lookbehind match
   4220                 //   to the position being looked-behind.
   4221                 int64_t originalInputLen = fData[opValue+3];
   4222                 U_ASSERT(originalInputLen >= fActiveLimit);
   4223                 U_ASSERT(originalInputLen <= fInputLength);
   4224                 fActiveLimit = originalInputLen;
   4225 
   4226                 // Restore original stack position, discarding any state saved
   4227                 //   by the successful pattern match.
   4228                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4229                 int32_t newStackSize = (int32_t)fData[opValue];
   4230                 U_ASSERT(fStack->size() > newStackSize);
   4231                 fStack->setSize(newStackSize);
   4232 
   4233                 //  FAIL, which will take control back to someplace
   4234                 //  prior to entering the look-behind test.
   4235                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4236             }
   4237             break;
   4238 
   4239 
   4240         case URX_LOOP_SR_I:
   4241             // Loop Initialization for the optimized implementation of
   4242             //     [some character set]*
   4243             //   This op scans through all matching input.
   4244             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4245             {
   4246                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4247                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4248                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   4249 
   4250                 // Loop through input, until either the input is exhausted or
   4251                 //   we reach a character that is not a member of the set.
   4252                 int64_t ix = fp->fInputIdx;
   4253                 UTEXT_SETNATIVEINDEX(fInputText, ix);
   4254                 for (;;) {
   4255                     if (ix >= fActiveLimit) {
   4256                         fHitEnd = TRUE;
   4257                         break;
   4258                     }
   4259                     UChar32 c = UTEXT_NEXT32(fInputText);
   4260                     if (c<256) {
   4261                         if (s8->contains(c) == FALSE) {
   4262                             break;
   4263                         }
   4264                     } else {
   4265                         if (s->contains(c) == FALSE) {
   4266                             break;
   4267                         }
   4268                     }
   4269                     ix = UTEXT_GETNATIVEINDEX(fInputText);
   4270                 }
   4271 
   4272                 // If there were no matching characters, skip over the loop altogether.
   4273                 //   The loop doesn't run at all, a * op always succeeds.
   4274                 if (ix == fp->fInputIdx) {
   4275                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4276                     break;
   4277                 }
   4278 
   4279                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4280                 //   must follow.  It's operand is the stack location
   4281                 //   that holds the starting input index for the match of this [set]*
   4282                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4283                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4284                 int32_t stackLoc = URX_VAL(loopcOp);
   4285                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4286                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4287                 #ifdef REGEX_SMART_BACKTRACKING
   4288                 backSearchIndex = fp->fInputIdx;
   4289                 #endif
   4290                 fp->fInputIdx = ix;
   4291 
   4292                 // Save State to the URX_LOOP_C op that follows this one,
   4293                 //   so that match failures in the following code will return to there.
   4294                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4295                 fp = StateSave(fp, fp->fPatIdx, status);
   4296                 fp->fPatIdx++;
   4297             }
   4298             break;
   4299 
   4300 
   4301         case URX_LOOP_DOT_I:
   4302             // Loop Initialization for the optimized implementation of .*
   4303             //   This op scans through all remaining input.
   4304             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4305             {
   4306                 // Loop through input until the input is exhausted (we reach an end-of-line)
   4307                 // In DOTALL mode, we can just go straight to the end of the input.
   4308                 int64_t ix;
   4309                 if ((opValue & 1) == 1) {
   4310                     // Dot-matches-All mode.  Jump straight to the end of the string.
   4311                     ix = fActiveLimit;
   4312                     fHitEnd = TRUE;
   4313                 } else {
   4314                     // NOT DOT ALL mode.  Line endings do not match '.'
   4315                     // Scan forward until a line ending or end of input.
   4316                     ix = fp->fInputIdx;
   4317                     UTEXT_SETNATIVEINDEX(fInputText, ix);
   4318                     for (;;) {
   4319                         if (ix >= fActiveLimit) {
   4320                             fHitEnd = TRUE;
   4321                             break;
   4322                         }
   4323                         UChar32 c = UTEXT_NEXT32(fInputText);
   4324                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   4325                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   4326                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   4327                                     (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
   4328                                 //  char is a line ending.  Exit the scanning loop.
   4329                                 break;
   4330                             }
   4331                         }
   4332                         ix = UTEXT_GETNATIVEINDEX(fInputText);
   4333                     }
   4334                 }
   4335 
   4336                 // If there were no matching characters, skip over the loop altogether.
   4337                 //   The loop doesn't run at all, a * op always succeeds.
   4338                 if (ix == fp->fInputIdx) {
   4339                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4340                     break;
   4341                 }
   4342 
   4343                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4344                 //   must follow.  It's operand is the stack location
   4345                 //   that holds the starting input index for the match of this .*
   4346                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4347                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4348                 int32_t stackLoc = URX_VAL(loopcOp);
   4349                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4350                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4351                 #ifdef REGEX_SMART_BACKTRACKING
   4352                 backSearchIndex = fp->fInputIdx;
   4353                 #endif
   4354                 fp->fInputIdx = ix;
   4355 
   4356                 // Save State to the URX_LOOP_C op that follows this one,
   4357                 //   so that match failures in the following code will return to there.
   4358                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4359                 fp = StateSave(fp, fp->fPatIdx, status);
   4360                 fp->fPatIdx++;
   4361             }
   4362             break;
   4363 
   4364 
   4365         case URX_LOOP_C:
   4366             {
   4367                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   4368                 backSearchIndex = fp->fExtra[opValue];
   4369                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   4370                 if (backSearchIndex == fp->fInputIdx) {
   4371                     // We've backed up the input idx to the point that the loop started.
   4372                     // The loop is done.  Leave here without saving state.
   4373                     //  Subsequent failures won't come back here.
   4374                     break;
   4375                 }
   4376                 // Set up for the next iteration of the loop, with input index
   4377                 //   backed up by one from the last time through,
   4378                 //   and a state save to this instruction in case the following code fails again.
   4379                 //   (We're going backwards because this loop emulates stack unwinding, not
   4380                 //    the initial scan forward.)
   4381                 U_ASSERT(fp->fInputIdx > 0);
   4382                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4383                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
   4384                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4385 
   4386                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
   4387                 if (prevC == 0x0a &&
   4388                     fp->fInputIdx > backSearchIndex &&
   4389                     twoPrevC == 0x0d) {
   4390                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   4391                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   4392                         // .*, stepping back over CRLF pair.
   4393                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4394                     }
   4395                 }
   4396 
   4397 
   4398                 fp = StateSave(fp, fp->fPatIdx-1, status);
   4399             }
   4400             break;
   4401 
   4402 
   4403 
   4404         default:
   4405             // Trouble.  The compiled pattern contains an entry with an
   4406             //           unrecognized type tag.
   4407             U_ASSERT(FALSE);
   4408         }
   4409 
   4410         if (U_FAILURE(status)) {
   4411             isMatch = FALSE;
   4412             break;
   4413         }
   4414     }
   4415 
   4416 breakFromLoop:
   4417     fMatch = isMatch;
   4418     if (isMatch) {
   4419         fLastMatchEnd = fMatchEnd;
   4420         fMatchStart   = startIdx;
   4421         fMatchEnd     = fp->fInputIdx;
   4422         if (fTraceDebug) {
   4423             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
   4424         }
   4425     }
   4426     else
   4427     {
   4428         if (fTraceDebug) {
   4429             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   4430         }
   4431     }
   4432 
   4433     fFrame = fp;                // The active stack frame when the engine stopped.
   4434                                 //   Contains the capture group results that we need to
   4435                                 //    access later.
   4436     return;
   4437 }
   4438 
   4439 
   4440 //--------------------------------------------------------------------------------
   4441 //
   4442 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
   4443 //                  assumption that the entire string is available in the UText's
   4444 //                  chunk buffer. For now, that means we can use int32_t indexes,
   4445 //                  except for anything that needs to be saved (like group starts
   4446 //                  and ends).
   4447 //
   4448 //                  startIdx:    begin matching a this index.
   4449 //                  toEnd:       if true, match must extend to end of the input region
   4450 //
   4451 //--------------------------------------------------------------------------------
   4452 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
   4453     UBool       isMatch  = FALSE;      // True if the we have a match.
   4454 
   4455     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
   4456 
   4457     int32_t     op;                    // Operation from the compiled pattern, split into
   4458     int32_t     opType;                //    the opcode
   4459     int32_t     opValue;               //    and the operand value.
   4460 
   4461 #ifdef REGEX_RUN_DEBUG
   4462     if (fTraceDebug)
   4463     {
   4464         printf("MatchAt(startIdx=%ld)\n", startIdx);
   4465         printf("Original Pattern: ");
   4466         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   4467         while (c != U_SENTINEL) {
   4468             if (c<32 || c>256) {
   4469                 c = '.';
   4470             }
   4471             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   4472 
   4473             c = UTEXT_NEXT32(fPattern->fPattern);
   4474         }
   4475         printf("\n");
   4476         printf("Input String: ");
   4477         c = utext_next32From(fInputText, 0);
   4478         while (c != U_SENTINEL) {
   4479             if (c<32 || c>256) {
   4480                 c = '.';
   4481             }
   4482             printf("%c", c);
   4483 
   4484             c = UTEXT_NEXT32(fInputText);
   4485         }
   4486         printf("\n");
   4487         printf("\n");
   4488     }
   4489 #endif
   4490 
   4491     if (U_FAILURE(status)) {
   4492         return;
   4493     }
   4494 
   4495     //  Cache frequently referenced items from the compiled pattern
   4496     //
   4497     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   4498 
   4499     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   4500     UVector             *sets          = fPattern->fSets;
   4501 
   4502     const UChar         *inputBuf      = fInputText->chunkContents;
   4503 
   4504     fFrameSize = fPattern->fFrameSize;
   4505     REStackFrame        *fp            = resetStack();
   4506 
   4507     fp->fPatIdx   = 0;
   4508     fp->fInputIdx = startIdx;
   4509 
   4510     // Zero out the pattern's static data
   4511     int32_t i;
   4512     for (i = 0; i<fPattern->fDataSize; i++) {
   4513         fData[i] = 0;
   4514     }
   4515 
   4516     //
   4517     //  Main loop for interpreting the compiled pattern.
   4518     //  One iteration of the loop per pattern operation performed.
   4519     //
   4520     for (;;) {
   4521 #if 0
   4522         if (_heapchk() != _HEAPOK) {
   4523             fprintf(stderr, "Heap Trouble\n");
   4524         }
   4525 #endif
   4526 
   4527         op      = (int32_t)pat[fp->fPatIdx];
   4528         opType  = URX_TYPE(op);
   4529         opValue = URX_VAL(op);
   4530 #ifdef REGEX_RUN_DEBUG
   4531         if (fTraceDebug) {
   4532             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4533             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
   4534                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   4535             fPattern->dumpOp(fp->fPatIdx);
   4536         }
   4537 #endif
   4538         fp->fPatIdx++;
   4539 
   4540         switch (opType) {
   4541 
   4542 
   4543         case URX_NOP:
   4544             break;
   4545 
   4546 
   4547         case URX_BACKTRACK:
   4548             // Force a backtrack.  In some circumstances, the pattern compiler
   4549             //   will notice that the pattern can't possibly match anything, and will
   4550             //   emit one of these at that point.
   4551             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4552             break;
   4553 
   4554 
   4555         case URX_ONECHAR:
   4556             if (fp->fInputIdx < fActiveLimit) {
   4557                 UChar32 c;
   4558                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4559                 if (c == opValue) {
   4560                     break;
   4561                 }
   4562             } else {
   4563                 fHitEnd = TRUE;
   4564             }
   4565 
   4566             #ifdef REGEX_SMART_BACKTRACKING
   4567             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   4568                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4569                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4570                     int64_t reverseIndex = fp->fInputIdx;
   4571                     UChar32 c;
   4572                     do {
   4573                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   4574                         if (c == opValue) {
   4575                             break;
   4576                         }
   4577                     } while (reverseIndex > backSearchIndex);
   4578                     if (c == opValue) {
   4579                         fHitEnd = FALSE;
   4580                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4581                         fp->fInputIdx = reverseIndex;
   4582                         if (fp->fInputIdx > backSearchIndex) {
   4583                             fp = StateSave(fp, fp->fPatIdx, status);
   4584                         }
   4585                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4586                         break;
   4587                     }
   4588                 }
   4589             }
   4590             #endif
   4591 
   4592             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4593             break;
   4594 
   4595 
   4596         case URX_STRING:
   4597             {
   4598                 // Test input against a literal string.
   4599                 // Strings require two slots in the compiled pattern, one for the
   4600                 //   offset to the string text, and one for the length.
   4601                 int32_t   stringStartIdx = opValue;
   4602                 int32_t   stringLen;
   4603 
   4604                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   4605                 fp->fPatIdx++;
   4606                 opType    = URX_TYPE(op);
   4607                 stringLen = URX_VAL(op);
   4608                 U_ASSERT(opType == URX_STRING_LEN);
   4609                 U_ASSERT(stringLen >= 2);
   4610 
   4611                 if (fp->fInputIdx + stringLen > fActiveLimit) {
   4612                     // No match.  String is longer than the remaining input text.
   4613                     fHitEnd = TRUE;          //   TODO:  See ticket 6074
   4614                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4615                     break;
   4616                 }
   4617 
   4618                 const UChar * pInp = inputBuf + fp->fInputIdx;
   4619                 const UChar * pPat = litText+stringStartIdx;
   4620                 const UChar * pEnd = pInp + stringLen;
   4621                 UBool success = FALSE;
   4622                 for(;;) {
   4623                     if (*pInp == *pPat) {
   4624                         pInp++;
   4625                         pPat++;
   4626                         if (pInp == pEnd) {
   4627                             // Successful Match.
   4628                             success = TRUE;
   4629                             break;
   4630                         }
   4631                     } else {
   4632                         // Match failed.
   4633                         break;
   4634                     }
   4635                 }
   4636 
   4637                 if (success) {
   4638                     fp->fInputIdx += stringLen;
   4639                 } else {
   4640                     #ifdef REGEX_SMART_BACKTRACKING
   4641                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   4642                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4643                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4644                             // Reset to last start point
   4645                             int64_t reverseIndex = fp->fInputIdx;
   4646                             UChar32 c;
   4647                             pPat = litText+stringStartIdx;
   4648 
   4649                             // Search backwards for a possible start
   4650                             do {
   4651                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   4652                                 if ((U_IS_BMP(c) && *pPat == c) ||
   4653                                     (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
   4654                                     success = TRUE;
   4655                                     break;
   4656                                 }
   4657                             } while (reverseIndex > backSearchIndex);
   4658 
   4659                             // And try again
   4660                             if (success) {
   4661                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4662                                 fp->fInputIdx = reverseIndex;
   4663                                 if (fp->fInputIdx > backSearchIndex) {
   4664                                     fp = StateSave(fp, fp->fPatIdx, status);
   4665                                 }
   4666                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4667                                 break;
   4668                             }
   4669                         }
   4670                     }
   4671                     #endif
   4672                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4673                 }
   4674             }
   4675             break;
   4676 
   4677 
   4678         case URX_STATE_SAVE:
   4679             fp = StateSave(fp, opValue, status);
   4680             break;
   4681 
   4682 
   4683         case URX_END:
   4684             // The match loop will exit via this path on a successful match,
   4685             //   when we reach the end of the pattern.
   4686             if (toEnd && fp->fInputIdx != fActiveLimit) {
   4687                 // The pattern matched, but not to the end of input.  Try some more.
   4688                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4689                 break;
   4690             }
   4691             isMatch = TRUE;
   4692             goto  breakFromLoop;
   4693 
   4694             // Start and End Capture stack frame variables are laid out out like this:
   4695             //  fp->fExtra[opValue]  - The start of a completed capture group
   4696             //             opValue+1 - The end   of a completed capture group
   4697             //             opValue+2 - the start of a capture group whose end
   4698             //                          has not yet been reached (and might not ever be).
   4699         case URX_START_CAPTURE:
   4700             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4701             fp->fExtra[opValue+2] = fp->fInputIdx;
   4702             break;
   4703 
   4704 
   4705         case URX_END_CAPTURE:
   4706             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4707             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   4708             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   4709             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   4710             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   4711             break;
   4712 
   4713 
   4714         case URX_DOLLAR:                   //  $, test for End of line
   4715             //     or for position before new line at end of input
   4716             if (fp->fInputIdx < fAnchorLimit-2) {
   4717                 // We are no where near the end of input.  Fail.
   4718                 //   This is the common case.  Keep it first.
   4719                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4720                 break;
   4721             }
   4722             if (fp->fInputIdx >= fAnchorLimit) {
   4723                 // We really are at the end of input.  Success.
   4724                 fHitEnd = TRUE;
   4725                 fRequireEnd = TRUE;
   4726                 break;
   4727             }
   4728 
   4729             // If we are positioned just before a new-line that is located at the
   4730             //   end of input, succeed.
   4731             if (fp->fInputIdx == fAnchorLimit-1) {
   4732                 UChar32 c;
   4733                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
   4734 
   4735                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   4736                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4737                         // At new-line at end of input. Success
   4738                         fHitEnd = TRUE;
   4739                         fRequireEnd = TRUE;
   4740                         break;
   4741                     }
   4742                 }
   4743             } else if (fp->fInputIdx == fAnchorLimit-2 &&
   4744                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
   4745                     fHitEnd = TRUE;
   4746                     fRequireEnd = TRUE;
   4747                     break;                         // At CR/LF at end of input.  Success
   4748             }
   4749 
   4750             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4751 
   4752             break;
   4753 
   4754 
   4755         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   4756             if (fp->fInputIdx >= fAnchorLimit-1) {
   4757                 // Either at the last character of input, or off the end.
   4758                 if (fp->fInputIdx == fAnchorLimit-1) {
   4759                     // At last char of input.  Success if it's a new line.
   4760                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4761                         fHitEnd = TRUE;
   4762                         fRequireEnd = TRUE;
   4763                         break;
   4764                     }
   4765                 } else {
   4766                     // Off the end of input.  Success.
   4767                     fHitEnd = TRUE;
   4768                     fRequireEnd = TRUE;
   4769                     break;
   4770                 }
   4771             }
   4772 
   4773             // Not at end of input.  Back-track out.
   4774             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4775             break;
   4776 
   4777 
   4778         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   4779             {
   4780                 if (fp->fInputIdx >= fAnchorLimit) {
   4781                     // We really are at the end of input.  Success.
   4782                     fHitEnd = TRUE;
   4783                     fRequireEnd = TRUE;
   4784                     break;
   4785                 }
   4786                 // If we are positioned just before a new-line, succeed.
   4787                 // It makes no difference where the new-line is within the input.
   4788                 UChar32 c = inputBuf[fp->fInputIdx];
   4789                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   4790                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   4791                     //  In multi-line mode, hitting a new-line just before the end of input does not
   4792                     //   set the hitEnd or requireEnd flags
   4793                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4794                         break;
   4795                     }
   4796                 }
   4797                 // not at a new line.  Fail.
   4798                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4799             }
   4800             break;
   4801 
   4802 
   4803         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   4804             {
   4805                 if (fp->fInputIdx >= fAnchorLimit) {
   4806                     // We really are at the end of input.  Success.
   4807                     fHitEnd = TRUE;
   4808                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   4809                     break;               //   adding a new-line would not lose the match.
   4810                 }
   4811                 // If we are not positioned just before a new-line, the test fails; backtrack out.
   4812                 // It makes no difference where the new-line is within the input.
   4813                 if (inputBuf[fp->fInputIdx] != 0x0a) {
   4814                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4815                 }
   4816             }
   4817             break;
   4818 
   4819 
   4820         case URX_CARET:                    //  ^, test for start of line
   4821             if (fp->fInputIdx != fAnchorStart) {
   4822                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4823             }
   4824             break;
   4825 
   4826 
   4827         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   4828             {
   4829                 if (fp->fInputIdx == fAnchorStart) {
   4830                     // We are at the start input.  Success.
   4831                     break;
   4832                 }
   4833                 // Check whether character just before the current pos is a new-line
   4834                 //   unless we are at the end of input
   4835                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4836                 if ((fp->fInputIdx < fAnchorLimit) &&
   4837                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   4838                     //  It's a new-line.  ^ is true.  Success.
   4839                     //  TODO:  what should be done with positions between a CR and LF?
   4840                     break;
   4841                 }
   4842                 // Not at the start of a line.  Fail.
   4843                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4844             }
   4845             break;
   4846 
   4847 
   4848         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   4849             {
   4850                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
   4851                 if (fp->fInputIdx <= fAnchorStart) {
   4852                     // We are at the start input.  Success.
   4853                     break;
   4854                 }
   4855                 // Check whether character just before the current pos is a new-line
   4856                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   4857                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4858                 if (c != 0x0a) {
   4859                     // Not at the start of a line.  Back-track out.
   4860                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4861                 }
   4862             }
   4863             break;
   4864 
   4865         case URX_BACKSLASH_B:          // Test for word boundaries
   4866             {
   4867                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
   4868                 success ^= (opValue != 0);     // flip sense for \B
   4869                 if (!success) {
   4870                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4871                 }
   4872             }
   4873             break;
   4874 
   4875 
   4876         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   4877             {
   4878                 UBool success = isUWordBoundary(fp->fInputIdx);
   4879                 success ^= (opValue != 0);     // flip sense for \B
   4880                 if (!success) {
   4881                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4882                 }
   4883             }
   4884             break;
   4885 
   4886 
   4887         case URX_BACKSLASH_D:            // Test for decimal digit
   4888             {
   4889                 if (fp->fInputIdx >= fActiveLimit) {
   4890                     fHitEnd = TRUE;
   4891                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4892                     break;
   4893                 }
   4894 
   4895                 UChar32 c;
   4896                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4897                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   4898                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   4899                 success ^= (opValue != 0);        // flip sense for \D
   4900                 if (!success) {
   4901                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4902                 }
   4903             }
   4904             break;
   4905 
   4906 
   4907         case URX_BACKSLASH_G:          // Test for position at end of previous match
   4908             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   4909                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4910             }
   4911             break;
   4912 
   4913 
   4914         case URX_BACKSLASH_X:
   4915         //  Match a Grapheme, as defined by Unicode TR 29.
   4916         //  Differs slightly from Perl, which consumes combining marks independently
   4917         //    of context.
   4918         {
   4919 
   4920             // Fail if at end of input
   4921             if (fp->fInputIdx >= fActiveLimit) {
   4922                 fHitEnd = TRUE;
   4923                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4924                 break;
   4925             }
   4926 
   4927             // Examine (and consume) the current char.
   4928             //   Dispatch into a little state machine, based on the char.
   4929             UChar32  c;
   4930             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4931             UnicodeSet **sets = fPattern->fStaticSets;
   4932             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   4933             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   4934             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4935             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4936             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4937             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4938             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4939             goto GC_Extend;
   4940 
   4941 
   4942 
   4943 GC_L:
   4944             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4945             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4946             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4947             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4948             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4949             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4950             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4951             goto GC_Extend;
   4952 
   4953 GC_V:
   4954             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4955             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4956             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4957             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4958             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4959             goto GC_Extend;
   4960 
   4961 GC_T:
   4962             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4963             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4964             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4965             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4966             goto GC_Extend;
   4967 
   4968 GC_Extend:
   4969             // Combining characters are consumed here
   4970             for (;;) {
   4971                 if (fp->fInputIdx >= fActiveLimit) {
   4972                     break;
   4973                 }
   4974                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4975                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   4976                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   4977                     break;
   4978                 }
   4979             }
   4980             goto GC_Done;
   4981 
   4982 GC_Control:
   4983             // Most control chars stand alone (don't combine with combining chars),
   4984             //   except for that CR/LF sequence is a single grapheme cluster.
   4985             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
   4986                 fp->fInputIdx++;
   4987             }
   4988 
   4989 GC_Done:
   4990             if (fp->fInputIdx >= fActiveLimit) {
   4991                 fHitEnd = TRUE;
   4992             }
   4993             break;
   4994         }
   4995 
   4996 
   4997 
   4998 
   4999         case URX_BACKSLASH_Z:          // Test for end of Input
   5000             if (fp->fInputIdx < fAnchorLimit) {
   5001                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5002             } else {
   5003                 fHitEnd = TRUE;
   5004                 fRequireEnd = TRUE;
   5005             }
   5006             break;
   5007 
   5008 
   5009 
   5010         case URX_STATIC_SETREF:
   5011             {
   5012                 // Test input character against one of the predefined sets
   5013                 //    (Word Characters, for example)
   5014                 // The high bit of the op value is a flag for the match polarity.
   5015                 //    0:   success if input char is in set.
   5016                 //    1:   success if input char is not in set.
   5017                 if (fp->fInputIdx >= fActiveLimit) {
   5018                     fHitEnd = TRUE;
   5019                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5020                     break;
   5021                 }
   5022 
   5023                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   5024                 opValue &= ~URX_NEG_SET;
   5025                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   5026 
   5027                 UChar32 c;
   5028                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5029                 if (c < 256) {
   5030                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   5031                     if (s8->contains(c)) {
   5032                         success = !success;
   5033                     }
   5034                 } else {
   5035                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5036                     if (s->contains(c)) {
   5037                         success = !success;
   5038                     }
   5039                 }
   5040                 if (!success) {
   5041                     #ifdef REGEX_SMART_BACKTRACKING
   5042                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5043                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5044                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5045                             // Try to find it, backwards
   5046                             int64_t reverseIndex = fp->fInputIdx;
   5047                             U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   5048                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
   5049                             do {
   5050                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5051                                 if (c < 256) {
   5052                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   5053                                     if (s8->contains(c)) {
   5054                                         success = !success;
   5055                                     }
   5056                                 } else {
   5057                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5058                                     if (s->contains(c)) {
   5059                                         success = !success;
   5060                                     }
   5061                                 }
   5062                             } while (reverseIndex > backSearchIndex && !success);
   5063 
   5064                             if (success) {
   5065                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5066                                 fp->fInputIdx = reverseIndex;
   5067                                 if (fp->fInputIdx > backSearchIndex) {
   5068                                     fp = StateSave(fp, fp->fPatIdx, status);
   5069                                 }
   5070                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5071                                 break;
   5072                             }
   5073                         }
   5074                     }
   5075                     #endif
   5076                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5077                 }
   5078             }
   5079             break;
   5080 
   5081 
   5082         case URX_STAT_SETREF_N:
   5083             {
   5084                 // Test input character for NOT being a member of  one of
   5085                 //    the predefined sets (Word Characters, for example)
   5086                 if (fp->fInputIdx >= fActiveLimit) {
   5087                     fHitEnd = TRUE;
   5088                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5089                     break;
   5090                 }
   5091 
   5092                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   5093 
   5094                 UChar32  c;
   5095                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5096                 if (c < 256) {
   5097                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   5098                     if (s8->contains(c) == FALSE) {
   5099                         break;
   5100                     }
   5101                 } else {
   5102                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5103                     if (s->contains(c) == FALSE) {
   5104                         break;
   5105                     }
   5106                 }
   5107 
   5108                 #ifdef REGEX_SMART_BACKTRACKING
   5109                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5110                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5111                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5112                         // Try to find it, backwards
   5113                         int64_t reverseIndex = fp->fInputIdx;
   5114                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   5115                         UBool success = FALSE;
   5116                         do {
   5117                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5118                             if (c < 256) {
   5119                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   5120                                 if (s8->contains(c) == FALSE) {
   5121                                     success = TRUE;
   5122                                     break;
   5123                                 }
   5124                             } else {
   5125                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5126                                 if (s->contains(c) == FALSE) {
   5127                                     success = TRUE;
   5128                                     break;
   5129                                 }
   5130                             }
   5131                         } while (reverseIndex > backSearchIndex);
   5132 
   5133                         if (success) {
   5134                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5135                             fp->fInputIdx = reverseIndex;
   5136                             if (fp->fInputIdx > backSearchIndex) {
   5137                                 fp = StateSave(fp, fp->fPatIdx, status);
   5138                             }
   5139                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5140                             break;
   5141                         }
   5142                     }
   5143                 }
   5144                 #endif
   5145                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5146             }
   5147             break;
   5148 
   5149 
   5150         case URX_SETREF:
   5151             {
   5152                 if (fp->fInputIdx >= fActiveLimit) {
   5153                     fHitEnd = TRUE;
   5154                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5155                     break;
   5156                 }
   5157 
   5158                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5159 
   5160                 // There is input left.  Pick up one char and test it for set membership.
   5161                 UChar32  c;
   5162                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5163                 if (c<256) {
   5164                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5165                     if (s8->contains(c)) {
   5166                         // The character is in the set.  A Match.
   5167                         break;
   5168                     }
   5169                 } else {
   5170                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   5171                     if (s->contains(c)) {
   5172                         // The character is in the set.  A Match.
   5173                         break;
   5174                     }
   5175                 }
   5176 
   5177                 // the character wasn't in the set.
   5178                 #ifdef REGEX_SMART_BACKTRACKING
   5179                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5180                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5181                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5182                         // Try to find it, backwards
   5183                         int64_t reverseIndex = fp->fInputIdx;
   5184                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   5185                         UBool success = FALSE;
   5186                         do {
   5187                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5188                             if (c < 256) {
   5189                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5190                                 if (s8->contains(c)) {
   5191                                     success = TRUE;
   5192                                     break;
   5193                                 }
   5194                             } else {
   5195                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   5196                                 if (s->contains(c)) {
   5197                                     success = TRUE;
   5198                                     break;
   5199                                 }
   5200                             }
   5201                         } while (reverseIndex > backSearchIndex);
   5202 
   5203                         if (success) {
   5204                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5205                             fp->fInputIdx = reverseIndex;
   5206                             if (fp->fInputIdx > reverseIndex) {
   5207                                 fp = StateSave(fp, fp->fPatIdx, status);
   5208                             }
   5209                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5210                             break;
   5211                         }
   5212                     }
   5213                 }
   5214                 #endif
   5215                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5216             }
   5217             break;
   5218 
   5219 
   5220         case URX_DOTANY:
   5221             {
   5222                 // . matches anything, but stops at end-of-line.
   5223                 if (fp->fInputIdx >= fActiveLimit) {
   5224                     // At end of input.  Match failed.  Backtrack out.
   5225                     fHitEnd = TRUE;
   5226                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5227                     break;
   5228                 }
   5229 
   5230                 // There is input left.  Advance over one char, unless we've hit end-of-line
   5231                 UChar32  c;
   5232                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5233                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   5234                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   5235                     // End of line in normal mode.   . does not match.
   5236                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5237                     break;
   5238                 }
   5239             }
   5240             break;
   5241 
   5242 
   5243         case URX_DOTANY_ALL:
   5244             {
   5245                 // . in dot-matches-all (including new lines) mode
   5246                 if (fp->fInputIdx >= fActiveLimit) {
   5247                     // At end of input.  Match failed.  Backtrack out.
   5248                     fHitEnd = TRUE;
   5249                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5250                     break;
   5251                 }
   5252 
   5253                 // There is input left.  Advance over one char, except if we are
   5254                 //   at a cr/lf, advance over both of them.
   5255                 UChar32 c;
   5256                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5257                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   5258                     // In the case of a CR/LF, we need to advance over both.
   5259                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   5260                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
   5261                     }
   5262                 }
   5263             }
   5264             break;
   5265 
   5266 
   5267         case URX_DOTANY_UNIX:
   5268             {
   5269                 // '.' operator, matches all, but stops at end-of-line.
   5270                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   5271                 if (fp->fInputIdx >= fActiveLimit) {
   5272                     // At end of input.  Match failed.  Backtrack out.
   5273                     fHitEnd = TRUE;
   5274                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5275                     break;
   5276                 }
   5277 
   5278                 // There is input left.  Advance over one char, unless we've hit end-of-line
   5279                 UChar32 c;
   5280                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5281                 if (c == 0x0a) {
   5282                     // End of line in normal mode.   '.' does not match the \n
   5283                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5284                 }
   5285             }
   5286             break;
   5287 
   5288 
   5289         case URX_JMP:
   5290             fp->fPatIdx = opValue;
   5291             break;
   5292 
   5293         case URX_FAIL:
   5294             isMatch = FALSE;
   5295             goto breakFromLoop;
   5296 
   5297         case URX_JMP_SAV:
   5298             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   5299             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   5300             fp->fPatIdx = opValue;                         // Then JMP.
   5301             break;
   5302 
   5303         case URX_JMP_SAV_X:
   5304             // This opcode is used with (x)+, when x can match a zero length string.
   5305             // Same as JMP_SAV, except conditional on the match having made forward progress.
   5306             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   5307             //   data address of the input position at the start of the loop.
   5308             {
   5309                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   5310                 int32_t  stoOp = (int32_t)pat[opValue-1];
   5311                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   5312                 int32_t  frameLoc = URX_VAL(stoOp);
   5313                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   5314                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
   5315                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   5316                 if (prevInputIdx < fp->fInputIdx) {
   5317                     // The match did make progress.  Repeat the loop.
   5318                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   5319                     fp->fPatIdx = opValue;
   5320                     fp->fExtra[frameLoc] = fp->fInputIdx;
   5321                 }
   5322                 // If the input position did not advance, we do nothing here,
   5323                 //   execution will fall out of the loop.
   5324             }
   5325             break;
   5326 
   5327         case URX_CTR_INIT:
   5328             {
   5329                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5330                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   5331 
   5332                 // Pick up the three extra operands that CTR_INIT has, and
   5333                 //    skip the pattern location counter past
   5334                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5335                 fp->fPatIdx += 3;
   5336                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5337                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5338                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5339                 U_ASSERT(minCount>=0);
   5340                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5341                 U_ASSERT(loopLoc>fp->fPatIdx);
   5342 
   5343                 if (minCount == 0) {
   5344                     fp = StateSave(fp, loopLoc+1, status);
   5345                 }
   5346                 if (maxCount == 0) {
   5347                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5348                 }
   5349             }
   5350             break;
   5351 
   5352         case URX_CTR_LOOP:
   5353             {
   5354                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5355                 int32_t initOp = (int32_t)pat[opValue];
   5356                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   5357                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5358                 int32_t minCount  = (int32_t)pat[opValue+2];
   5359                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5360                 // Increment the counter.  Note: we DIDN'T worry about counter
   5361                 //   overflow, since the data comes from UnicodeStrings, which
   5362                 //   stores its length in an int32_t. Do we have to think about
   5363                 //   this now that we're using UText? Probably not, since the length
   5364                 //   in UChar32s is still an int32_t.
   5365                 (*pCounter)++;
   5366                 U_ASSERT(*pCounter > 0);
   5367                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   5368                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   5369                     break;
   5370                 }
   5371                 if (*pCounter >= minCount) {
   5372                     fp = StateSave(fp, fp->fPatIdx, status);
   5373                 }
   5374                 fp->fPatIdx = opValue + 4;    // Loop back.
   5375             }
   5376             break;
   5377 
   5378         case URX_CTR_INIT_NG:
   5379             {
   5380                 // Initialize a non-greedy loop
   5381                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5382                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   5383 
   5384                 // Pick up the three extra operands that CTR_INIT has, and
   5385                 //    skip the pattern location counter past
   5386                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5387                 fp->fPatIdx += 3;
   5388                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5389                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5390                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5391                 U_ASSERT(minCount>=0);
   5392                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5393                 U_ASSERT(loopLoc>fp->fPatIdx);
   5394 
   5395                 if (minCount == 0) {
   5396                     if (maxCount != 0) {
   5397                         fp = StateSave(fp, fp->fPatIdx, status);
   5398                     }
   5399                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   5400                 }
   5401             }
   5402             break;
   5403 
   5404         case URX_CTR_LOOP_NG:
   5405             {
   5406                 // Non-greedy {min, max} loops
   5407                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5408                 int32_t initOp = (int32_t)pat[opValue];
   5409                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   5410                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5411                 int32_t minCount  = (int32_t)pat[opValue+2];
   5412                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5413                 // Increment the counter.  Note: we DIDN'T worry about counter
   5414                 //   overflow, since the data comes from UnicodeStrings, which
   5415                 //   stores its length in an int32_t. Do we have to think about
   5416                 //   this now that we're using UText? Probably not, since the length
   5417                 //   in UChar32s is still an int32_t.
   5418                 (*pCounter)++;
   5419                 U_ASSERT(*pCounter > 0);
   5420 
   5421                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   5422                     // The loop has matched the maximum permitted number of times.
   5423                     //   Break out of here with no action.  Matching will
   5424                     //   continue with the following pattern.
   5425                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   5426                     break;
   5427                 }
   5428 
   5429                 if (*pCounter < minCount) {
   5430                     // We haven't met the minimum number of matches yet.
   5431                     //   Loop back for another one.
   5432                     fp->fPatIdx = opValue + 4;    // Loop back.
   5433                 } else {
   5434                     // We do have the minimum number of matches.
   5435                     //   Fall into the following pattern, but first do
   5436                     //   a state save to the top of the loop, so that a failure
   5437                     //   in the following pattern will try another iteration of the loop.
   5438                     fp = StateSave(fp, opValue + 4, status);
   5439                 }
   5440             }
   5441             break;
   5442 
   5443         case URX_STO_SP:
   5444             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5445             fData[opValue] = fStack->size();
   5446             break;
   5447 
   5448         case URX_LD_SP:
   5449             {
   5450                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5451                 int32_t newStackSize = (int32_t)fData[opValue];
   5452                 U_ASSERT(newStackSize <= fStack->size());
   5453                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5454                 if (newFP == (int64_t *)fp) {
   5455                     break;
   5456                 }
   5457                 int32_t i;
   5458                 for (i=0; i<fFrameSize; i++) {
   5459                     newFP[i] = ((int64_t *)fp)[i];
   5460                 }
   5461                 fp = (REStackFrame *)newFP;
   5462                 fStack->setSize(newStackSize);
   5463             }
   5464             break;
   5465 
   5466         case URX_BACKREF:
   5467         case URX_BACKREF_I:
   5468             {
   5469                 U_ASSERT(opValue < fFrameSize);
   5470                 int64_t groupStartIdx = fp->fExtra[opValue];
   5471                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5472                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5473                 int64_t len = groupEndIdx-groupStartIdx;
   5474                 if (groupStartIdx < 0) {
   5475                     // This capture group has not participated in the match thus far,
   5476                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5477                 }
   5478 
   5479                 if (len == 0) {
   5480                         //   The capture group match was of an empty string.
   5481                         //   Verified by testing:  Perl matches succeed in this case, so
   5482                         //   we do too.
   5483                         break;
   5484                     }
   5485 
   5486                 UBool  haveMatch = FALSE;
   5487                 if (fp->fInputIdx + len <= fActiveLimit) {
   5488                     if (opType == URX_BACKREF) {
   5489                         if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
   5490                             haveMatch = TRUE;
   5491                         }
   5492                     } else {
   5493                         if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
   5494                                   (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
   5495                             haveMatch = TRUE;
   5496                         }
   5497                     }
   5498                 } else {
   5499                     // TODO: probably need to do a partial string comparison, and only
   5500                     //       set HitEnd if the available input matched.  Ticket #6074
   5501                     fHitEnd = TRUE;
   5502                 }
   5503                 if (haveMatch) {
   5504                     fp->fInputIdx += len;     // Match.  Advance current input position.
   5505                 } else {
   5506                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5507                 }
   5508             }
   5509             break;
   5510 
   5511         case URX_STO_INP_LOC:
   5512             {
   5513                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   5514                 fp->fExtra[opValue] = fp->fInputIdx;
   5515             }
   5516             break;
   5517 
   5518         case URX_JMPX:
   5519             {
   5520                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5521                 fp->fPatIdx += 1;
   5522                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   5523                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   5524                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
   5525                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   5526                 if (savedInputIdx < fp->fInputIdx) {
   5527                     fp->fPatIdx = opValue;                               // JMP
   5528                 } else {
   5529                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   5530                 }
   5531             }
   5532             break;
   5533 
   5534         case URX_LA_START:
   5535             {
   5536                 // Entering a lookahead block.
   5537                 // Save Stack Ptr, Input Pos.
   5538                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5539                 fData[opValue]   = fStack->size();
   5540                 fData[opValue+1] = fp->fInputIdx;
   5541                 fActiveStart     = fLookStart;          // Set the match region change for
   5542                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   5543             }
   5544             break;
   5545 
   5546         case URX_LA_END:
   5547             {
   5548                 // Leaving a look-ahead block.
   5549                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   5550                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5551                 int32_t stackSize = fStack->size();
   5552                 int32_t newStackSize = (int32_t)fData[opValue];
   5553                 U_ASSERT(stackSize >= newStackSize);
   5554                 if (stackSize > newStackSize) {
   5555                     // Copy the current top frame back to the new (cut back) top frame.
   5556                     //   This makes the capture groups from within the look-ahead
   5557                     //   expression available.
   5558                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5559                     int32_t i;
   5560                     for (i=0; i<fFrameSize; i++) {
   5561                         newFP[i] = ((int64_t *)fp)[i];
   5562                     }
   5563                     fp = (REStackFrame *)newFP;
   5564                     fStack->setSize(newStackSize);
   5565                 }
   5566                 fp->fInputIdx = fData[opValue+1];
   5567 
   5568                 // Restore the active region bounds in the input string; they may have
   5569                 //    been changed because of transparent bounds on a Region.
   5570                 fActiveStart = fRegionStart;
   5571                 fActiveLimit = fRegionLimit;
   5572             }
   5573             break;
   5574 
   5575         case URX_ONECHAR_I:
   5576             if (fp->fInputIdx < fActiveLimit) {
   5577                 UChar32 c;
   5578                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5579                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5580                     break;
   5581                 }
   5582             } else {
   5583                 fHitEnd = TRUE;
   5584             }
   5585 
   5586             #ifdef REGEX_SMART_BACKTRACKING
   5587             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5588                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5589                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5590                     UBool success = FALSE;
   5591                     int64_t reverseIndex = fp->fInputIdx;
   5592                     UChar32 c;
   5593                     while (reverseIndex > backSearchIndex) {
   5594                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5595                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5596                             success = TRUE;
   5597                             break;
   5598                         } else if (c == U_SENTINEL) {
   5599                             break;
   5600                         }
   5601                     }
   5602                     if (success) {
   5603                         fHitEnd = FALSE;
   5604                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5605                         fp->fInputIdx = reverseIndex;
   5606                         if (fp->fInputIdx > backSearchIndex) {
   5607                             fp = StateSave(fp, fp->fPatIdx, status);
   5608                         }
   5609                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5610                         break;
   5611                     }
   5612                 }
   5613             }
   5614             #endif
   5615 
   5616             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5617             break;
   5618 
   5619         case URX_STRING_I:
   5620             {
   5621                 // Test input against a literal string.
   5622                 // Strings require two slots in the compiled pattern, one for the
   5623                 //   offset to the string text, and one for the length.
   5624                 const UCaseProps *csp = ucase_getSingleton();
   5625                 {
   5626                     int32_t stringStartIdx, stringLen;
   5627                     stringStartIdx = opValue;
   5628 
   5629                     op      = (int32_t)pat[fp->fPatIdx];
   5630                     fp->fPatIdx++;
   5631                     opType  = URX_TYPE(op);
   5632                     opValue = URX_VAL(op);
   5633                     U_ASSERT(opType == URX_STRING_LEN);
   5634                     stringLen = opValue;
   5635 
   5636                     const UChar *patternChars = litText+stringStartIdx;
   5637                     const UChar *patternEnd = patternChars+stringLen;
   5638 
   5639                     const UChar *foldChars = NULL;
   5640                     int32_t foldOffset, foldLength;
   5641                     UChar32 c;
   5642                     UBool c_is_valid = FALSE;
   5643 
   5644                     #ifdef REGEX_SMART_BACKTRACKING
   5645                     int32_t originalInputIdx = fp->fInputIdx;
   5646                     #endif
   5647                     UBool success = TRUE;
   5648 
   5649                     foldOffset = foldLength = 0;
   5650 
   5651                     while (patternChars < patternEnd && success) {
   5652                         if (fp->fInputIdx < fActiveLimit) {  // don't read past end of string
   5653                             if(foldOffset < foldLength) {
   5654                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5655                                 c_is_valid = TRUE;
   5656                             } else {
   5657                                 // test pre-condition of U16_NEXT: i < length
   5658                                 U_ASSERT(fp->fInputIdx < fActiveLimit);
   5659                                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5660                                 c_is_valid = TRUE;
   5661                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   5662                                 if(foldLength >= 0) {
   5663                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   5664                                         foldOffset = 0;
   5665                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5666                                     } else {
   5667                                         c = foldLength;
   5668                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
   5669                                     }
   5670                                 }
   5671                             }
   5672                         } else {
   5673                           c_is_valid = FALSE;
   5674                         }
   5675 
   5676                         if (fp->fInputIdx <= fActiveLimit && c_is_valid) {
   5677                             if (U_IS_BMP(c)) {
   5678                                 success = (*patternChars == c);
   5679                                 patternChars += 1;
   5680                             } else if (patternChars+1 < patternEnd) {
   5681                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   5682                                 patternChars += 2;
   5683                             }
   5684                         } else {
   5685                             success = FALSE;
   5686                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
   5687                         }
   5688                     }
   5689 
   5690                     if (!success) {
   5691                         #ifdef REGEX_SMART_BACKTRACKING
   5692                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   5693                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5694                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5695                                 // Reset to last start point
   5696                                 int64_t reverseIndex = originalInputIdx;
   5697                                 patternChars = litText+stringStartIdx;
   5698 
   5699                                 // Search backwards for a possible start
   5700                                 do {
   5701                                     U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5702                                     foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   5703                                     if(foldLength >= 0) {
   5704                                         if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   5705                                             foldOffset = 0;
   5706                                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5707                                         } else {
   5708                                             c = foldLength;
   5709                                             foldLength = foldOffset; // to avoid reading chars from the folding buffer
   5710                                         }
   5711                                     }
   5712 
   5713                                     if ((U_IS_BMP(c) && *patternChars == c) ||
   5714                                            (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   5715                                         success = TRUE;
   5716                                         break;
   5717                                     }
   5718                                 } while (reverseIndex > backSearchIndex);
   5719 
   5720                                 // And try again
   5721                                 if (success) {
   5722                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5723                                     fp->fInputIdx = reverseIndex;
   5724                                     if (fp->fInputIdx > backSearchIndex) {
   5725                                         fp = StateSave(fp, fp->fPatIdx, status);
   5726                                     }
   5727                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5728                                     break;
   5729                                 }
   5730                             }
   5731                         }
   5732                         #endif
   5733                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5734                     }
   5735                 }
   5736             }
   5737             break;
   5738 
   5739         case URX_LB_START:
   5740             {
   5741                 // Entering a look-behind block.
   5742                 // Save Stack Ptr, Input Pos.
   5743                 //   TODO:  implement transparent bounds.  Ticket #6067
   5744                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5745                 fData[opValue]   = fStack->size();
   5746                 fData[opValue+1] = fp->fInputIdx;
   5747                 // Init the variable containing the start index for attempted matches.
   5748                 fData[opValue+2] = -1;
   5749                 // Save input string length, then reset to pin any matches to end at
   5750                 //   the current position.
   5751                 fData[opValue+3] = fActiveLimit;
   5752                 fActiveLimit     = fp->fInputIdx;
   5753             }
   5754             break;
   5755 
   5756 
   5757         case URX_LB_CONT:
   5758             {
   5759                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   5760                 //    at all possible input starting positions.
   5761 
   5762                 // Fetch the min and max possible match lengths.  They are the operands
   5763                 //   of this op in the pattern.
   5764                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   5765                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   5766                 U_ASSERT(minML <= maxML);
   5767                 U_ASSERT(minML >= 0);
   5768 
   5769                 // Fetch (from data) the last input index where a match was attempted.
   5770                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5771                 int64_t  *lbStartIdx = &fData[opValue+2];
   5772                 if (*lbStartIdx < 0) {
   5773                     // First time through loop.
   5774                     *lbStartIdx = fp->fInputIdx - minML;
   5775                 } else {
   5776                     // 2nd through nth time through the loop.
   5777                     // Back up start position for match by one.
   5778                     if (*lbStartIdx == 0) {
   5779                         (*lbStartIdx)--;
   5780                     } else {
   5781                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5782                     }
   5783                 }
   5784 
   5785                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5786                     // We have tried all potential match starting points without
   5787                     //  getting a match.  Backtrack out, and out of the
   5788                     //   Look Behind altogether.
   5789                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5790                     int64_t restoreInputLen = fData[opValue+3];
   5791                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5792                     U_ASSERT(restoreInputLen <= fInputLength);
   5793                     fActiveLimit = restoreInputLen;
   5794                     break;
   5795                 }
   5796 
   5797                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5798                 //      (successful match will fall off the end of the loop.)
   5799                 fp = StateSave(fp, fp->fPatIdx-3, status);
   5800                 fp->fInputIdx =  *lbStartIdx;
   5801             }
   5802             break;
   5803 
   5804         case URX_LB_END:
   5805             // End of a look-behind block, after a successful match.
   5806             {
   5807                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5808                 if (fp->fInputIdx != fActiveLimit) {
   5809                     //  The look-behind expression matched, but the match did not
   5810                     //    extend all the way to the point that we are looking behind from.
   5811                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5812                     //     will retry the match starting at another position or fail
   5813                     //     the look-behind altogether, whichever is appropriate.
   5814                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5815                     break;
   5816                 }
   5817 
   5818                 // Look-behind match is good.  Restore the orignal input string length,
   5819                 //   which had been truncated to pin the end of the lookbehind match to the
   5820                 //   position being looked-behind.
   5821                 int64_t originalInputLen = fData[opValue+3];
   5822                 U_ASSERT(originalInputLen >= fActiveLimit);
   5823                 U_ASSERT(originalInputLen <= fInputLength);
   5824                 fActiveLimit = originalInputLen;
   5825             }
   5826             break;
   5827 
   5828 
   5829         case URX_LBN_CONT:
   5830             {
   5831                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   5832                 //    at all possible input starting positions.
   5833 
   5834                 // Fetch the extra parameters of this op.
   5835                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   5836                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   5837                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   5838                 continueLoc = URX_VAL(continueLoc);
   5839                 U_ASSERT(minML <= maxML);
   5840                 U_ASSERT(minML >= 0);
   5841                 U_ASSERT(continueLoc > fp->fPatIdx);
   5842 
   5843                 // Fetch (from data) the last input index where a match was attempted.
   5844                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5845                 int64_t  *lbStartIdx = &fData[opValue+2];
   5846                 if (*lbStartIdx < 0) {
   5847                     // First time through loop.
   5848                     *lbStartIdx = fp->fInputIdx - minML;
   5849                 } else {
   5850                     // 2nd through nth time through the loop.
   5851                     // Back up start position for match by one.
   5852                     if (*lbStartIdx == 0) {
   5853                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
   5854                     } else {
   5855                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5856                     }
   5857                 }
   5858 
   5859                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5860                     // We have tried all potential match starting points without
   5861                     //  getting a match, which means that the negative lookbehind as
   5862                     //  a whole has succeeded.  Jump forward to the continue location
   5863                     int64_t restoreInputLen = fData[opValue+3];
   5864                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5865                     U_ASSERT(restoreInputLen <= fInputLength);
   5866                     fActiveLimit = restoreInputLen;
   5867                     fp->fPatIdx = continueLoc;
   5868                     break;
   5869                 }
   5870 
   5871                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5872                 //      (successful match will cause a FAIL out of the loop altogether.)
   5873                 fp = StateSave(fp, fp->fPatIdx-4, status);
   5874                 fp->fInputIdx =  *lbStartIdx;
   5875             }
   5876             break;
   5877 
   5878         case URX_LBN_END:
   5879             // End of a negative look-behind block, after a successful match.
   5880             {
   5881                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5882                 if (fp->fInputIdx != fActiveLimit) {
   5883                     //  The look-behind expression matched, but the match did not
   5884                     //    extend all the way to the point that we are looking behind from.
   5885                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5886                     //     will retry the match starting at another position or succeed
   5887                     //     the look-behind altogether, whichever is appropriate.
   5888                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5889                     break;
   5890                 }
   5891 
   5892                 // Look-behind expression matched, which means look-behind test as
   5893                 //   a whole Fails
   5894 
   5895                 //   Restore the orignal input string length, which had been truncated
   5896                 //   inorder to pin the end of the lookbehind match
   5897                 //   to the position being looked-behind.
   5898                 int64_t originalInputLen = fData[opValue+3];
   5899                 U_ASSERT(originalInputLen >= fActiveLimit);
   5900                 U_ASSERT(originalInputLen <= fInputLength);
   5901                 fActiveLimit = originalInputLen;
   5902 
   5903                 // Restore original stack position, discarding any state saved
   5904                 //   by the successful pattern match.
   5905                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5906                 int32_t newStackSize = (int32_t)fData[opValue];
   5907                 U_ASSERT(fStack->size() > newStackSize);
   5908                 fStack->setSize(newStackSize);
   5909 
   5910                 //  FAIL, which will take control back to someplace
   5911                 //  prior to entering the look-behind test.
   5912                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5913             }
   5914             break;
   5915 
   5916 
   5917         case URX_LOOP_SR_I:
   5918             // Loop Initialization for the optimized implementation of
   5919             //     [some character set]*
   5920             //   This op scans through all matching input.
   5921             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5922             {
   5923                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5924                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5925                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   5926 
   5927                 // Loop through input, until either the input is exhausted or
   5928                 //   we reach a character that is not a member of the set.
   5929                 int32_t ix = (int32_t)fp->fInputIdx;
   5930                 for (;;) {
   5931                     if (ix >= fActiveLimit) {
   5932                         fHitEnd = TRUE;
   5933                         break;
   5934                     }
   5935                     UChar32   c;
   5936                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
   5937                     if (c<256) {
   5938                         if (s8->contains(c) == FALSE) {
   5939                             U16_BACK_1(inputBuf, 0, ix);
   5940                             break;
   5941                         }
   5942                     } else {
   5943                         if (s->contains(c) == FALSE) {
   5944                             U16_BACK_1(inputBuf, 0, ix);
   5945                             break;
   5946                         }
   5947                     }
   5948                 }
   5949 
   5950                 // If there were no matching characters, skip over the loop altogether.
   5951                 //   The loop doesn't run at all, a * op always succeeds.
   5952                 if (ix == fp->fInputIdx) {
   5953                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5954                     break;
   5955                 }
   5956 
   5957                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5958                 //   must follow.  It's operand is the stack location
   5959                 //   that holds the starting input index for the match of this [set]*
   5960                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5961                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5962                 int32_t stackLoc = URX_VAL(loopcOp);
   5963                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5964                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5965                 #ifdef REGEX_SMART_BACKTRACKING
   5966                 backSearchIndex = fp->fInputIdx;
   5967                 #endif
   5968                 fp->fInputIdx = ix;
   5969 
   5970                 // Save State to the URX_LOOP_C op that follows this one,
   5971                 //   so that match failures in the following code will return to there.
   5972                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5973                 fp = StateSave(fp, fp->fPatIdx, status);
   5974                 fp->fPatIdx++;
   5975             }
   5976             break;
   5977 
   5978 
   5979         case URX_LOOP_DOT_I:
   5980             // Loop Initialization for the optimized implementation of .*
   5981             //   This op scans through all remaining input.
   5982             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5983             {
   5984                 // Loop through input until the input is exhausted (we reach an end-of-line)
   5985                 // In DOTALL mode, we can just go straight to the end of the input.
   5986                 int32_t ix;
   5987                 if ((opValue & 1) == 1) {
   5988                     // Dot-matches-All mode.  Jump straight to the end of the string.
   5989                     ix = (int32_t)fActiveLimit;
   5990                     fHitEnd = TRUE;
   5991                 } else {
   5992                     // NOT DOT ALL mode.  Line endings do not match '.'
   5993                     // Scan forward until a line ending or end of input.
   5994                     ix = (int32_t)fp->fInputIdx;
   5995                     for (;;) {
   5996                         if (ix >= fActiveLimit) {
   5997                             fHitEnd = TRUE;
   5998                             break;
   5999                         }
   6000                         UChar32   c;
   6001                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
   6002                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   6003                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   6004                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   6005                                    ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
   6006                                 //  char is a line ending.  Put the input pos back to the
   6007                                 //    line ending char, and exit the scanning loop.
   6008                                 U16_BACK_1(inputBuf, 0, ix);
   6009                                 break;
   6010                             }
   6011                         }
   6012                     }
   6013                 }
   6014 
   6015                 // If there were no matching characters, skip over the loop altogether.
   6016                 //   The loop doesn't run at all, a * op always succeeds.
   6017                 if (ix == fp->fInputIdx) {
   6018                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   6019                     break;
   6020                 }
   6021 
   6022                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   6023                 //   must follow.  It's operand is the stack location
   6024                 //   that holds the starting input index for the match of this .*
   6025                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   6026                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   6027                 int32_t stackLoc = URX_VAL(loopcOp);
   6028                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   6029                 fp->fExtra[stackLoc] = fp->fInputIdx;
   6030                 #ifdef REGEX_SMART_BACKTRACKING
   6031                 backSearchIndex = fp->fInputIdx;
   6032                 #endif
   6033                 fp->fInputIdx = ix;
   6034 
   6035                 // Save State to the URX_LOOP_C op that follows this one,
   6036                 //   so that match failures in the following code will return to there.
   6037                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   6038                 fp = StateSave(fp, fp->fPatIdx, status);
   6039                 fp->fPatIdx++;
   6040             }
   6041             break;
   6042 
   6043 
   6044         case URX_LOOP_C:
   6045             {
   6046                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   6047                 backSearchIndex = (int32_t)fp->fExtra[opValue];
   6048                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   6049                 if (backSearchIndex == fp->fInputIdx) {
   6050                     // We've backed up the input idx to the point that the loop started.
   6051                     // The loop is done.  Leave here without saving state.
   6052                     //  Subsequent failures won't come back here.
   6053                     break;
   6054                 }
   6055                 // Set up for the next iteration of the loop, with input index
   6056                 //   backed up by one from the last time through,
   6057                 //   and a state save to this instruction in case the following code fails again.
   6058                 //   (We're going backwards because this loop emulates stack unwinding, not
   6059                 //    the initial scan forward.)
   6060                 U_ASSERT(fp->fInputIdx > 0);
   6061                 UChar32 prevC;
   6062                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
   6063 
   6064                 if (prevC == 0x0a &&
   6065                     fp->fInputIdx > backSearchIndex &&
   6066                     inputBuf[fp->fInputIdx-1] == 0x0d) {
   6067                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   6068                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   6069                         // .*, stepping back over CRLF pair.
   6070                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   6071                     }
   6072                 }
   6073 
   6074 
   6075                 fp = StateSave(fp, fp->fPatIdx-1, status);
   6076             }
   6077             break;
   6078 
   6079 
   6080 
   6081         default:
   6082             // Trouble.  The compiled pattern contains an entry with an
   6083             //           unrecognized type tag.
   6084             U_ASSERT(FALSE);
   6085         }
   6086 
   6087         if (U_FAILURE(status)) {
   6088             isMatch = FALSE;
   6089             break;
   6090         }
   6091     }
   6092 
   6093 breakFromLoop:
   6094     fMatch = isMatch;
   6095     if (isMatch) {
   6096         fLastMatchEnd = fMatchEnd;
   6097         fMatchStart   = startIdx;
   6098         fMatchEnd     = fp->fInputIdx;
   6099         if (fTraceDebug) {
   6100             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
   6101         }
   6102     }
   6103     else
   6104     {
   6105         if (fTraceDebug) {
   6106             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   6107         }
   6108     }
   6109 
   6110     fFrame = fp;                // The active stack frame when the engine stopped.
   6111     //   Contains the capture group results that we need to
   6112     //    access later.
   6113 
   6114     return;
   6115 }
   6116 
   6117 
   6118 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
   6119 
   6120 U_NAMESPACE_END
   6121 
   6122 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   6123