Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **************************************************************************
      5 *   Copyright (C) 2002-2016 International Business Machines Corporation
      6 *   and others. All rights reserved.
      7 **************************************************************************
      8 */
      9 //
     10 //  file:  rematch.cpp
     11 //
     12 //         Contains the implementation of class RegexMatcher,
     13 //         which is one of the main API classes for the ICU regular expression package.
     14 //
     15 
     16 #include "unicode/utypes.h"
     17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     18 
     19 #include "unicode/regex.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/ustring.h"
     23 #include "unicode/rbbi.h"
     24 #include "unicode/utf.h"
     25 #include "unicode/utf16.h"
     26 #include "uassert.h"
     27 #include "cmemory.h"
     28 #include "cstr.h"
     29 #include "uvector.h"
     30 #include "uvectr32.h"
     31 #include "uvectr64.h"
     32 #include "regeximp.h"
     33 #include "regexst.h"
     34 #include "regextxt.h"
     35 #include "ucase.h"
     36 
     37 // #include <malloc.h>        // Needed for heapcheck testing
     38 
     39 
     40 U_NAMESPACE_BEGIN
     41 
     42 // Default limit for the size of the back track stack, to avoid system
     43 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
     44 // This value puts ICU's limits higher than most other regexp implementations,
     45 //    which use recursion rather than the heap, and take more storage per
     46 //    backtrack point.
     47 //
     48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
     49 
     50 // Time limit counter constant.
     51 //   Time limits for expression evaluation are in terms of quanta of work by
     52 //   the engine, each of which is 10,000 state saves.
     53 //   This constant determines that state saves per tick number.
     54 static const int32_t TIMER_INITIAL_VALUE = 10000;
     55 
     56 
     57 // Test for any of the Unicode line terminating characters.
     58 static inline UBool isLineTerminator(UChar32 c) {
     59     if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
     60         return false;
     61     }
     62     return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
     63 }
     64 
     65 //-----------------------------------------------------------------------------
     66 //
     67 //   Constructor and Destructor
     68 //
     69 //-----------------------------------------------------------------------------
     70 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
     71     fDeferredStatus = U_ZERO_ERROR;
     72     init(fDeferredStatus);
     73     if (U_FAILURE(fDeferredStatus)) {
     74         return;
     75     }
     76     if (pat==NULL) {
     77         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
     78         return;
     79     }
     80     fPattern = pat;
     81     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
     82 }
     83 
     84 
     85 
     86 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
     87                            uint32_t flags, UErrorCode &status) {
     88     init(status);
     89     if (U_FAILURE(status)) {
     90         return;
     91     }
     92     UParseError    pe;
     93     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
     94     fPattern           = fPatternOwned;
     95 
     96     UText inputText = UTEXT_INITIALIZER;
     97     utext_openConstUnicodeString(&inputText, &input, &status);
     98     init2(&inputText, status);
     99     utext_close(&inputText);
    100 
    101     fInputUniStrMaybeMutable = TRUE;
    102 }
    103 
    104 
    105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
    106                            uint32_t flags, UErrorCode &status) {
    107     init(status);
    108     if (U_FAILURE(status)) {
    109         return;
    110     }
    111     UParseError    pe;
    112     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    113     if (U_FAILURE(status)) {
    114         return;
    115     }
    116 
    117     fPattern           = fPatternOwned;
    118     init2(input, status);
    119 }
    120 
    121 
    122 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
    123                            uint32_t flags, UErrorCode &status) {
    124     init(status);
    125     if (U_FAILURE(status)) {
    126         return;
    127     }
    128     UParseError    pe;
    129     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    130     if (U_FAILURE(status)) {
    131         return;
    132     }
    133     fPattern           = fPatternOwned;
    134     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    135 }
    136 
    137 RegexMatcher::RegexMatcher(UText *regexp,
    138                            uint32_t flags, UErrorCode &status) {
    139     init(status);
    140     if (U_FAILURE(status)) {
    141         return;
    142     }
    143     UParseError    pe;
    144     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    145         if (U_FAILURE(status)) {
    146         return;
    147     }
    148 
    149     fPattern           = fPatternOwned;
    150     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    151 }
    152 
    153 
    154 
    155 
    156 RegexMatcher::~RegexMatcher() {
    157     delete fStack;
    158     if (fData != fSmallData) {
    159         uprv_free(fData);
    160         fData = NULL;
    161     }
    162     if (fPatternOwned) {
    163         delete fPatternOwned;
    164         fPatternOwned = NULL;
    165         fPattern = NULL;
    166     }
    167 
    168     if (fInput) {
    169         delete fInput;
    170     }
    171     if (fInputText) {
    172         utext_close(fInputText);
    173     }
    174     if (fAltInputText) {
    175         utext_close(fAltInputText);
    176     }
    177 
    178     #if UCONFIG_NO_BREAK_ITERATION==0
    179     delete fWordBreakItr;
    180     #endif
    181 }
    182 
    183 //
    184 //   init()   common initialization for use by all constructors.
    185 //            Initialize all fields, get the object into a consistent state.
    186 //            This must be done even when the initial status shows an error,
    187 //            so that the object is initialized sufficiently well for the destructor
    188 //            to run safely.
    189 //
    190 void RegexMatcher::init(UErrorCode &status) {
    191     fPattern           = NULL;
    192     fPatternOwned      = NULL;
    193     fFrameSize         = 0;
    194     fRegionStart       = 0;
    195     fRegionLimit       = 0;
    196     fAnchorStart       = 0;
    197     fAnchorLimit       = 0;
    198     fLookStart         = 0;
    199     fLookLimit         = 0;
    200     fActiveStart       = 0;
    201     fActiveLimit       = 0;
    202     fTransparentBounds = FALSE;
    203     fAnchoringBounds   = TRUE;
    204     fMatch             = FALSE;
    205     fMatchStart        = 0;
    206     fMatchEnd          = 0;
    207     fLastMatchEnd      = -1;
    208     fAppendPosition    = 0;
    209     fHitEnd            = FALSE;
    210     fRequireEnd        = FALSE;
    211     fStack             = NULL;
    212     fFrame             = NULL;
    213     fTimeLimit         = 0;
    214     fTime              = 0;
    215     fTickCounter       = 0;
    216     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    217     fCallbackFn        = NULL;
    218     fCallbackContext   = NULL;
    219     fFindProgressCallbackFn      = NULL;
    220     fFindProgressCallbackContext = NULL;
    221     fTraceDebug        = FALSE;
    222     fDeferredStatus    = status;
    223     fData              = fSmallData;
    224     fWordBreakItr      = NULL;
    225 
    226     fStack             = NULL;
    227     fInputText         = NULL;
    228     fAltInputText      = NULL;
    229     fInput             = NULL;
    230     fInputLength       = 0;
    231     fInputUniStrMaybeMutable = FALSE;
    232 }
    233 
    234 //
    235 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
    236 //            This handles the common setup to be done after the Pattern is available.
    237 //
    238 void RegexMatcher::init2(UText *input, UErrorCode &status) {
    239     if (U_FAILURE(status)) {
    240         fDeferredStatus = status;
    241         return;
    242     }
    243 
    244     if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
    245         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
    246         if (fData == NULL) {
    247             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    248             return;
    249         }
    250     }
    251 
    252     fStack = new UVector64(status);
    253     if (fStack == NULL) {
    254         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    255         return;
    256     }
    257 
    258     reset(input);
    259     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
    260     if (U_FAILURE(status)) {
    261         fDeferredStatus = status;
    262         return;
    263     }
    264 }
    265 
    266 
    267 static const UChar BACKSLASH  = 0x5c;
    268 static const UChar DOLLARSIGN = 0x24;
    269 static const UChar LEFTBRACKET = 0x7b;
    270 static const UChar RIGHTBRACKET = 0x7d;
    271 
    272 //--------------------------------------------------------------------------------
    273 //
    274 //    appendReplacement
    275 //
    276 //--------------------------------------------------------------------------------
    277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
    278                                               const UnicodeString &replacement,
    279                                               UErrorCode &status) {
    280     UText replacementText = UTEXT_INITIALIZER;
    281 
    282     utext_openConstUnicodeString(&replacementText, &replacement, &status);
    283     if (U_SUCCESS(status)) {
    284         UText resultText = UTEXT_INITIALIZER;
    285         utext_openUnicodeString(&resultText, &dest, &status);
    286 
    287         if (U_SUCCESS(status)) {
    288             appendReplacement(&resultText, &replacementText, status);
    289             utext_close(&resultText);
    290         }
    291         utext_close(&replacementText);
    292     }
    293 
    294     return *this;
    295 }
    296 
    297 //
    298 //    appendReplacement, UText mode
    299 //
    300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
    301                                               UText *replacement,
    302                                               UErrorCode &status) {
    303     if (U_FAILURE(status)) {
    304         return *this;
    305     }
    306     if (U_FAILURE(fDeferredStatus)) {
    307         status = fDeferredStatus;
    308         return *this;
    309     }
    310     if (fMatch == FALSE) {
    311         status = U_REGEX_INVALID_STATE;
    312         return *this;
    313     }
    314 
    315     // Copy input string from the end of previous match to start of current match
    316     int64_t  destLen = utext_nativeLength(dest);
    317     if (fMatchStart > fAppendPosition) {
    318         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    319             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    320                                      (int32_t)(fMatchStart-fAppendPosition), &status);
    321         } else {
    322             int32_t len16;
    323             if (UTEXT_USES_U16(fInputText)) {
    324                 len16 = (int32_t)(fMatchStart-fAppendPosition);
    325             } else {
    326                 UErrorCode lengthStatus = U_ZERO_ERROR;
    327                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
    328             }
    329             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
    330             if (inputChars == NULL) {
    331                 status = U_MEMORY_ALLOCATION_ERROR;
    332                 return *this;
    333             }
    334             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
    335             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    336             uprv_free(inputChars);
    337         }
    338     }
    339     fAppendPosition = fMatchEnd;
    340 
    341 
    342     // scan the replacement text, looking for substitutions ($n) and \escapes.
    343     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
    344     //         move entire ranges not containing substitutions.
    345     UTEXT_SETNATIVEINDEX(replacement, 0);
    346     for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
    347         if (c == BACKSLASH) {
    348             // Backslash Escape.  Copy the following char out without further checks.
    349             //                    Note:  Surrogate pairs don't need any special handling
    350             //                           The second half wont be a '$' or a '\', and
    351             //                           will move to the dest normally on the next
    352             //                           loop iteration.
    353             c = UTEXT_CURRENT32(replacement);
    354             if (c == U_SENTINEL) {
    355                 break;
    356             }
    357 
    358             if (c==0x55/*U*/ || c==0x75/*u*/) {
    359                 // We have a \udddd or \Udddddddd escape sequence.
    360                 int32_t offset = 0;
    361                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
    362                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
    363                 if (escapedChar != (UChar32)0xFFFFFFFF) {
    364                     if (U_IS_BMP(escapedChar)) {
    365                         UChar c16 = (UChar)escapedChar;
    366                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    367                     } else {
    368                         UChar surrogate[2];
    369                         surrogate[0] = U16_LEAD(escapedChar);
    370                         surrogate[1] = U16_TRAIL(escapedChar);
    371                         if (U_SUCCESS(status)) {
    372                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    373                         }
    374                     }
    375                     // TODO:  Report errors for mal-formed \u escapes?
    376                     //        As this is, the original sequence is output, which may be OK.
    377                     if (context.lastOffset == offset) {
    378                         (void)UTEXT_PREVIOUS32(replacement);
    379                     } else if (context.lastOffset != offset-1) {
    380                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
    381                     }
    382                 }
    383             } else {
    384                 (void)UTEXT_NEXT32(replacement);
    385                 // Plain backslash escape.  Just put out the escaped character.
    386                 if (U_IS_BMP(c)) {
    387                     UChar c16 = (UChar)c;
    388                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    389                 } else {
    390                     UChar surrogate[2];
    391                     surrogate[0] = U16_LEAD(c);
    392                     surrogate[1] = U16_TRAIL(c);
    393                     if (U_SUCCESS(status)) {
    394                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    395                     }
    396                 }
    397             }
    398         } else if (c != DOLLARSIGN) {
    399             // Normal char, not a $.  Copy it out without further checks.
    400             if (U_IS_BMP(c)) {
    401                 UChar c16 = (UChar)c;
    402                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    403             } else {
    404                 UChar surrogate[2];
    405                 surrogate[0] = U16_LEAD(c);
    406                 surrogate[1] = U16_TRAIL(c);
    407                 if (U_SUCCESS(status)) {
    408                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    409                 }
    410             }
    411         } else {
    412             // We've got a $.  Pick up a capture group name or number if one follows.
    413             // Consume digits so long as the resulting group number <= the number of
    414             // number of capture groups in the pattern.
    415 
    416             int32_t groupNum  = 0;
    417             int32_t numDigits = 0;
    418             UChar32 nextChar = utext_current32(replacement);
    419             if (nextChar == LEFTBRACKET) {
    420                 // Scan for a Named Capture Group, ${name}.
    421                 UnicodeString groupName;
    422                 utext_next32(replacement);
    423                 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
    424                     nextChar = utext_next32(replacement);
    425                     if (nextChar == U_SENTINEL) {
    426                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
    427                     } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
    428                                (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
    429                                (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
    430                         groupName.append(nextChar);
    431                     } else if (nextChar == RIGHTBRACKET) {
    432                         groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
    433                         if (groupNum == 0) {
    434                             status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
    435                         }
    436                     } else {
    437                         // Character was something other than a name char or a closing '}'
    438                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
    439                     }
    440                 }
    441 
    442             } else if (u_isdigit(nextChar)) {
    443                 // $n    Scan for a capture group number
    444                 int32_t numCaptureGroups = fPattern->fGroupMap->size();
    445                 for (;;) {
    446                     nextChar = UTEXT_CURRENT32(replacement);
    447                     if (nextChar == U_SENTINEL) {
    448                         break;
    449                     }
    450                     if (u_isdigit(nextChar) == FALSE) {
    451                         break;
    452                     }
    453                     int32_t nextDigitVal = u_charDigitValue(nextChar);
    454                     if (groupNum*10 + nextDigitVal > numCaptureGroups) {
    455                         // Don't consume the next digit if it makes the capture group number too big.
    456                         if (numDigits == 0) {
    457                             status = U_INDEX_OUTOFBOUNDS_ERROR;
    458                         }
    459                         break;
    460                     }
    461                     (void)UTEXT_NEXT32(replacement);
    462                     groupNum=groupNum*10 + nextDigitVal;
    463                     ++numDigits;
    464                 }
    465             } else {
    466                 // $ not followed by capture group name or number.
    467                 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
    468             }
    469 
    470             if (U_SUCCESS(status)) {
    471                 destLen += appendGroup(groupNum, dest, status);
    472             }
    473         }  // End of $ capture group handling
    474     }  // End of per-character loop through the replacement string.
    475 
    476     return *this;
    477 }
    478 
    479 
    480 
    481 //--------------------------------------------------------------------------------
    482 //
    483 //    appendTail     Intended to be used in conjunction with appendReplacement()
    484 //                   To the destination string, append everything following
    485 //                   the last match position from the input string.
    486 //
    487 //                   Note:  Match ranges do not affect appendTail or appendReplacement
    488 //
    489 //--------------------------------------------------------------------------------
    490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    491     UErrorCode status = U_ZERO_ERROR;
    492     UText resultText = UTEXT_INITIALIZER;
    493     utext_openUnicodeString(&resultText, &dest, &status);
    494 
    495     if (U_SUCCESS(status)) {
    496         appendTail(&resultText, status);
    497         utext_close(&resultText);
    498     }
    499 
    500     return dest;
    501 }
    502 
    503 //
    504 //   appendTail, UText mode
    505 //
    506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
    507     if (U_FAILURE(status)) {
    508         return dest;
    509     }
    510     if (U_FAILURE(fDeferredStatus)) {
    511         status = fDeferredStatus;
    512         return dest;
    513     }
    514 
    515     if (fInputLength > fAppendPosition) {
    516         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    517             int64_t destLen = utext_nativeLength(dest);
    518             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    519                           (int32_t)(fInputLength-fAppendPosition), &status);
    520         } else {
    521             int32_t len16;
    522             if (UTEXT_USES_U16(fInputText)) {
    523                 len16 = (int32_t)(fInputLength-fAppendPosition);
    524             } else {
    525                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
    526                 status = U_ZERO_ERROR; // buffer overflow
    527             }
    528 
    529             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
    530             if (inputChars == NULL) {
    531                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    532             } else {
    533                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
    534                 int64_t destLen = utext_nativeLength(dest);
    535                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    536                 uprv_free(inputChars);
    537             }
    538         }
    539     }
    540     return dest;
    541 }
    542 
    543 
    544 
    545 //--------------------------------------------------------------------------------
    546 //
    547 //   end
    548 //
    549 //--------------------------------------------------------------------------------
    550 int32_t RegexMatcher::end(UErrorCode &err) const {
    551     return end(0, err);
    552 }
    553 
    554 int64_t RegexMatcher::end64(UErrorCode &err) const {
    555     return end64(0, err);
    556 }
    557 
    558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
    559     if (U_FAILURE(err)) {
    560         return -1;
    561     }
    562     if (fMatch == FALSE) {
    563         err = U_REGEX_INVALID_STATE;
    564         return -1;
    565     }
    566     if (group < 0 || group > fPattern->fGroupMap->size()) {
    567         err = U_INDEX_OUTOFBOUNDS_ERROR;
    568         return -1;
    569     }
    570     int64_t e = -1;
    571     if (group == 0) {
    572         e = fMatchEnd;
    573     } else {
    574         // Get the position within the stack frame of the variables for
    575         //    this capture group.
    576         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
    577         U_ASSERT(groupOffset < fPattern->fFrameSize);
    578         U_ASSERT(groupOffset >= 0);
    579         e = fFrame->fExtra[groupOffset + 1];
    580     }
    581 
    582         return e;
    583 }
    584 
    585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    586     return (int32_t)end64(group, err);
    587 }
    588 
    589 //--------------------------------------------------------------------------------
    590 //
    591 //   findProgressInterrupt  This function is called once for each advance in the target
    592 //                          string from the find() function, and calls the user progress callback
    593 //                          function if there is one installed.
    594 //
    595 //         Return:  TRUE if the find operation is to be terminated.
    596 //                  FALSE if the find operation is to continue running.
    597 //
    598 //--------------------------------------------------------------------------------
    599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
    600     if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
    601         status = U_REGEX_STOPPED_BY_CALLER;
    602         return TRUE;
    603     }
    604     return FALSE;
    605 }
    606 
    607 //--------------------------------------------------------------------------------
    608 //
    609 //   find()
    610 //
    611 //--------------------------------------------------------------------------------
    612 UBool RegexMatcher::find() {
    613     if (U_FAILURE(fDeferredStatus)) {
    614         return FALSE;
    615     }
    616     UErrorCode status = U_ZERO_ERROR;
    617     UBool result = find(status);
    618     return result;
    619 }
    620 
    621 //--------------------------------------------------------------------------------
    622 //
    623 //   find()
    624 //
    625 //--------------------------------------------------------------------------------
    626 UBool RegexMatcher::find(UErrorCode &status) {
    627     // Start at the position of the last match end.  (Will be zero if the
    628     //   matcher has been reset.)
    629     //
    630     if (U_FAILURE(status)) {
    631         return FALSE;
    632     }
    633     if (U_FAILURE(fDeferredStatus)) {
    634         status = fDeferredStatus;
    635         return FALSE;
    636     }
    637 
    638     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    639         return findUsingChunk(status);
    640     }
    641 
    642     int64_t startPos = fMatchEnd;
    643     if (startPos==0) {
    644         startPos = fActiveStart;
    645     }
    646 
    647     if (fMatch) {
    648         // Save the position of any previous successful match.
    649         fLastMatchEnd = fMatchEnd;
    650 
    651         if (fMatchStart == fMatchEnd) {
    652             // Previous match had zero length.  Move start position up one position
    653             //  to avoid sending find() into a loop on zero-length matches.
    654             if (startPos >= fActiveLimit) {
    655                 fMatch = FALSE;
    656                 fHitEnd = TRUE;
    657                 return FALSE;
    658             }
    659             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    660             (void)UTEXT_NEXT32(fInputText);
    661             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    662         }
    663     } else {
    664         if (fLastMatchEnd >= 0) {
    665             // A previous find() failed to match.  Don't try again.
    666             //   (without this test, a pattern with a zero-length match
    667             //    could match again at the end of an input string.)
    668             fHitEnd = TRUE;
    669             return FALSE;
    670         }
    671     }
    672 
    673 
    674     // Compute the position in the input string beyond which a match can not begin, because
    675     //   the minimum length match would extend past the end of the input.
    676     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    677     //          Be aware of possible overflows if making changes here.
    678     int64_t testStartLimit;
    679     if (UTEXT_USES_U16(fInputText)) {
    680         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
    681         if (startPos > testStartLimit) {
    682             fMatch = FALSE;
    683             fHitEnd = TRUE;
    684             return FALSE;
    685         }
    686     } else {
    687         // We don't know exactly how long the minimum match length is in native characters.
    688         // Treat anything > 0 as 1.
    689         testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
    690     }
    691 
    692     UChar32  c;
    693     U_ASSERT(startPos >= 0);
    694 
    695     switch (fPattern->fStartType) {
    696     case START_NO_INFO:
    697         // No optimization was found.
    698         //  Try a match at each input position.
    699         for (;;) {
    700             MatchAt(startPos, FALSE, status);
    701             if (U_FAILURE(status)) {
    702                 return FALSE;
    703             }
    704             if (fMatch) {
    705                 return TRUE;
    706             }
    707             if (startPos >= testStartLimit) {
    708                 fHitEnd = TRUE;
    709                 return FALSE;
    710             }
    711             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    712             (void)UTEXT_NEXT32(fInputText);
    713             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    714             // Note that it's perfectly OK for a pattern to have a zero-length
    715             //   match at the end of a string, so we must make sure that the loop
    716             //   runs with startPos == testStartLimit the last time through.
    717             if  (findProgressInterrupt(startPos, status))
    718                 return FALSE;
    719         }
    720         U_ASSERT(FALSE);
    721 
    722     case START_START:
    723         // Matches are only possible at the start of the input string
    724         //   (pattern begins with ^ or \A)
    725         if (startPos > fActiveStart) {
    726             fMatch = FALSE;
    727             return FALSE;
    728         }
    729         MatchAt(startPos, FALSE, status);
    730         if (U_FAILURE(status)) {
    731             return FALSE;
    732         }
    733         return fMatch;
    734 
    735 
    736     case START_SET:
    737         {
    738             // Match may start on any char from a pre-computed set.
    739             U_ASSERT(fPattern->fMinMatchLen > 0);
    740             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    741             for (;;) {
    742                 int64_t pos = startPos;
    743                 c = UTEXT_NEXT32(fInputText);
    744                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    745                 // c will be -1 (U_SENTINEL) at end of text, in which case we
    746                 // skip this next block (so we don't have a negative array index)
    747                 // and handle end of text in the following block.
    748                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    749                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
    750                     MatchAt(pos, FALSE, status);
    751                     if (U_FAILURE(status)) {
    752                         return FALSE;
    753                     }
    754                     if (fMatch) {
    755                         return TRUE;
    756                     }
    757                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    758                 }
    759                 if (startPos > testStartLimit) {
    760                     fMatch = FALSE;
    761                     fHitEnd = TRUE;
    762                     return FALSE;
    763                 }
    764                 if  (findProgressInterrupt(startPos, status))
    765                     return FALSE;
    766             }
    767         }
    768         U_ASSERT(FALSE);
    769 
    770     case START_STRING:
    771     case START_CHAR:
    772         {
    773             // Match starts on exactly one char.
    774             U_ASSERT(fPattern->fMinMatchLen > 0);
    775             UChar32 theChar = fPattern->fInitialChar;
    776             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    777             for (;;) {
    778                 int64_t pos = startPos;
    779                 c = UTEXT_NEXT32(fInputText);
    780                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    781                 if (c == theChar) {
    782                     MatchAt(pos, FALSE, status);
    783                     if (U_FAILURE(status)) {
    784                         return FALSE;
    785                     }
    786                     if (fMatch) {
    787                         return TRUE;
    788                     }
    789                     UTEXT_SETNATIVEINDEX(fInputText, startPos);
    790                 }
    791                 if (startPos > testStartLimit) {
    792                     fMatch = FALSE;
    793                     fHitEnd = TRUE;
    794                     return FALSE;
    795                 }
    796                 if  (findProgressInterrupt(startPos, status))
    797                     return FALSE;
    798            }
    799         }
    800         U_ASSERT(FALSE);
    801 
    802     case START_LINE:
    803         {
    804             UChar32  c;
    805             if (startPos == fAnchorStart) {
    806                 MatchAt(startPos, FALSE, status);
    807                 if (U_FAILURE(status)) {
    808                     return FALSE;
    809                 }
    810                 if (fMatch) {
    811                     return TRUE;
    812                 }
    813                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    814                 c = UTEXT_NEXT32(fInputText);
    815                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    816             } else {
    817                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    818                 c = UTEXT_PREVIOUS32(fInputText);
    819                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    820             }
    821 
    822             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
    823                 for (;;) {
    824                     if (c == 0x0a) {
    825                             MatchAt(startPos, FALSE, status);
    826                             if (U_FAILURE(status)) {
    827                                 return FALSE;
    828                             }
    829                             if (fMatch) {
    830                                 return TRUE;
    831                             }
    832                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    833                     }
    834                     if (startPos >= testStartLimit) {
    835                         fMatch = FALSE;
    836                         fHitEnd = TRUE;
    837                         return FALSE;
    838                     }
    839                     c = UTEXT_NEXT32(fInputText);
    840                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    841                     // Note that it's perfectly OK for a pattern to have a zero-length
    842                     //   match at the end of a string, so we must make sure that the loop
    843                     //   runs with startPos == testStartLimit the last time through.
    844                     if  (findProgressInterrupt(startPos, status))
    845                         return FALSE;
    846                 }
    847             } else {
    848                 for (;;) {
    849                     if (isLineTerminator(c)) {
    850                         if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
    851                             (void)UTEXT_NEXT32(fInputText);
    852                             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    853                         }
    854                         MatchAt(startPos, FALSE, status);
    855                         if (U_FAILURE(status)) {
    856                             return FALSE;
    857                         }
    858                         if (fMatch) {
    859                             return TRUE;
    860                         }
    861                         UTEXT_SETNATIVEINDEX(fInputText, startPos);
    862                     }
    863                     if (startPos >= testStartLimit) {
    864                         fMatch = FALSE;
    865                         fHitEnd = TRUE;
    866                         return FALSE;
    867                     }
    868                     c = UTEXT_NEXT32(fInputText);
    869                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    870                     // Note that it's perfectly OK for a pattern to have a zero-length
    871                     //   match at the end of a string, so we must make sure that the loop
    872                     //   runs with startPos == testStartLimit the last time through.
    873                     if  (findProgressInterrupt(startPos, status))
    874                         return FALSE;
    875                 }
    876             }
    877         }
    878 
    879     default:
    880         U_ASSERT(FALSE);
    881     }
    882 
    883     U_ASSERT(FALSE);
    884     return FALSE;
    885 }
    886 
    887 
    888 
    889 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
    890     if (U_FAILURE(status)) {
    891         return FALSE;
    892     }
    893     if (U_FAILURE(fDeferredStatus)) {
    894         status = fDeferredStatus;
    895         return FALSE;
    896     }
    897     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
    898                                           //        This will reset the region to be the full input length.
    899     if (start < 0) {
    900         status = U_INDEX_OUTOFBOUNDS_ERROR;
    901         return FALSE;
    902     }
    903 
    904     int64_t nativeStart = start;
    905     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
    906         status = U_INDEX_OUTOFBOUNDS_ERROR;
    907         return FALSE;
    908     }
    909     fMatchEnd = nativeStart;
    910     return find(status);
    911 }
    912 
    913 
    914 //--------------------------------------------------------------------------------
    915 //
    916 //   findUsingChunk() -- like find(), but with the advance knowledge that the
    917 //                       entire string is available in the UText's chunk buffer.
    918 //
    919 //--------------------------------------------------------------------------------
    920 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
    921     // Start at the position of the last match end.  (Will be zero if the
    922     //   matcher has been reset.
    923     //
    924 
    925     int32_t startPos = (int32_t)fMatchEnd;
    926     if (startPos==0) {
    927         startPos = (int32_t)fActiveStart;
    928     }
    929 
    930     const UChar *inputBuf = fInputText->chunkContents;
    931 
    932     if (fMatch) {
    933         // Save the position of any previous successful match.
    934         fLastMatchEnd = fMatchEnd;
    935 
    936         if (fMatchStart == fMatchEnd) {
    937             // Previous match had zero length.  Move start position up one position
    938             //  to avoid sending find() into a loop on zero-length matches.
    939             if (startPos >= fActiveLimit) {
    940                 fMatch = FALSE;
    941                 fHitEnd = TRUE;
    942                 return FALSE;
    943             }
    944             U16_FWD_1(inputBuf, startPos, fInputLength);
    945         }
    946     } else {
    947         if (fLastMatchEnd >= 0) {
    948             // A previous find() failed to match.  Don't try again.
    949             //   (without this test, a pattern with a zero-length match
    950             //    could match again at the end of an input string.)
    951             fHitEnd = TRUE;
    952             return FALSE;
    953         }
    954     }
    955 
    956 
    957     // Compute the position in the input string beyond which a match can not begin, because
    958     //   the minimum length match would extend past the end of the input.
    959     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    960     //          Be aware of possible overflows if making changes here.
    961     //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
    962     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
    963     if (startPos > testLen) {
    964         fMatch = FALSE;
    965         fHitEnd = TRUE;
    966         return FALSE;
    967     }
    968 
    969     UChar32  c;
    970     U_ASSERT(startPos >= 0);
    971 
    972     switch (fPattern->fStartType) {
    973     case START_NO_INFO:
    974         // No optimization was found.
    975         //  Try a match at each input position.
    976         for (;;) {
    977             MatchChunkAt(startPos, FALSE, status);
    978             if (U_FAILURE(status)) {
    979                 return FALSE;
    980             }
    981             if (fMatch) {
    982                 return TRUE;
    983             }
    984             if (startPos >= testLen) {
    985                 fHitEnd = TRUE;
    986                 return FALSE;
    987             }
    988             U16_FWD_1(inputBuf, startPos, fActiveLimit);
    989             // Note that it's perfectly OK for a pattern to have a zero-length
    990             //   match at the end of a string, so we must make sure that the loop
    991             //   runs with startPos == testLen the last time through.
    992             if  (findProgressInterrupt(startPos, status))
    993                 return FALSE;
    994         }
    995         U_ASSERT(FALSE);
    996 
    997     case START_START:
    998         // Matches are only possible at the start of the input string
    999         //   (pattern begins with ^ or \A)
   1000         if (startPos > fActiveStart) {
   1001             fMatch = FALSE;
   1002             return FALSE;
   1003         }
   1004         MatchChunkAt(startPos, FALSE, status);
   1005         if (U_FAILURE(status)) {
   1006             return FALSE;
   1007         }
   1008         return fMatch;
   1009 
   1010 
   1011     case START_SET:
   1012     {
   1013         // Match may start on any char from a pre-computed set.
   1014         U_ASSERT(fPattern->fMinMatchLen > 0);
   1015         for (;;) {
   1016             int32_t pos = startPos;
   1017             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
   1018             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
   1019                 (c>=256 && fPattern->fInitialChars->contains(c))) {
   1020                 MatchChunkAt(pos, FALSE, status);
   1021                 if (U_FAILURE(status)) {
   1022                     return FALSE;
   1023                 }
   1024                 if (fMatch) {
   1025                     return TRUE;
   1026                 }
   1027             }
   1028             if (startPos > testLen) {
   1029                 fMatch = FALSE;
   1030                 fHitEnd = TRUE;
   1031                 return FALSE;
   1032             }
   1033             if  (findProgressInterrupt(startPos, status))
   1034                 return FALSE;
   1035         }
   1036     }
   1037         U_ASSERT(FALSE);
   1038 
   1039     case START_STRING:
   1040     case START_CHAR:
   1041     {
   1042         // Match starts on exactly one char.
   1043         U_ASSERT(fPattern->fMinMatchLen > 0);
   1044         UChar32 theChar = fPattern->fInitialChar;
   1045         for (;;) {
   1046             int32_t pos = startPos;
   1047             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
   1048             if (c == theChar) {
   1049                 MatchChunkAt(pos, FALSE, status);
   1050                 if (U_FAILURE(status)) {
   1051                     return FALSE;
   1052                 }
   1053                 if (fMatch) {
   1054                     return TRUE;
   1055                 }
   1056             }
   1057             if (startPos > testLen) {
   1058                 fMatch = FALSE;
   1059                 fHitEnd = TRUE;
   1060                 return FALSE;
   1061             }
   1062             if  (findProgressInterrupt(startPos, status))
   1063                 return FALSE;
   1064         }
   1065     }
   1066     U_ASSERT(FALSE);
   1067 
   1068     case START_LINE:
   1069     {
   1070         UChar32  c;
   1071         if (startPos == fAnchorStart) {
   1072             MatchChunkAt(startPos, FALSE, status);
   1073             if (U_FAILURE(status)) {
   1074                 return FALSE;
   1075             }
   1076             if (fMatch) {
   1077                 return TRUE;
   1078             }
   1079             U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1080         }
   1081 
   1082         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
   1083             for (;;) {
   1084                 c = inputBuf[startPos-1];
   1085                 if (c == 0x0a) {
   1086                     MatchChunkAt(startPos, FALSE, status);
   1087                     if (U_FAILURE(status)) {
   1088                         return FALSE;
   1089                     }
   1090                     if (fMatch) {
   1091                         return TRUE;
   1092                     }
   1093                 }
   1094                 if (startPos >= testLen) {
   1095                     fMatch = FALSE;
   1096                     fHitEnd = TRUE;
   1097                     return FALSE;
   1098                 }
   1099                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1100                 // Note that it's perfectly OK for a pattern to have a zero-length
   1101                 //   match at the end of a string, so we must make sure that the loop
   1102                 //   runs with startPos == testLen the last time through.
   1103                 if  (findProgressInterrupt(startPos, status))
   1104                     return FALSE;
   1105             }
   1106         } else {
   1107             for (;;) {
   1108                 c = inputBuf[startPos-1];
   1109                 if (isLineTerminator(c)) {
   1110                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
   1111                         startPos++;
   1112                     }
   1113                     MatchChunkAt(startPos, FALSE, status);
   1114                     if (U_FAILURE(status)) {
   1115                         return FALSE;
   1116                     }
   1117                     if (fMatch) {
   1118                         return TRUE;
   1119                     }
   1120                 }
   1121                 if (startPos >= testLen) {
   1122                     fMatch = FALSE;
   1123                     fHitEnd = TRUE;
   1124                     return FALSE;
   1125                 }
   1126                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1127                 // Note that it's perfectly OK for a pattern to have a zero-length
   1128                 //   match at the end of a string, so we must make sure that the loop
   1129                 //   runs with startPos == testLen the last time through.
   1130                 if  (findProgressInterrupt(startPos, status))
   1131                     return FALSE;
   1132             }
   1133         }
   1134     }
   1135 
   1136     default:
   1137         U_ASSERT(FALSE);
   1138     }
   1139 
   1140     U_ASSERT(FALSE);
   1141     return FALSE;
   1142 }
   1143 
   1144 
   1145 
   1146 //--------------------------------------------------------------------------------
   1147 //
   1148 //  group()
   1149 //
   1150 //--------------------------------------------------------------------------------
   1151 UnicodeString RegexMatcher::group(UErrorCode &status) const {
   1152     return group(0, status);
   1153 }
   1154 
   1155 //  Return immutable shallow clone
   1156 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
   1157     return group(0, dest, group_len, status);
   1158 }
   1159 
   1160 //  Return immutable shallow clone
   1161 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
   1162     group_len = 0;
   1163     if (U_FAILURE(status)) {
   1164         return dest;
   1165     }
   1166     if (U_FAILURE(fDeferredStatus)) {
   1167         status = fDeferredStatus;
   1168     } else if (fMatch == FALSE) {
   1169         status = U_REGEX_INVALID_STATE;
   1170     } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1171         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1172     }
   1173 
   1174     if (U_FAILURE(status)) {
   1175         return dest;
   1176     }
   1177 
   1178     int64_t s, e;
   1179     if (groupNum == 0) {
   1180         s = fMatchStart;
   1181         e = fMatchEnd;
   1182     } else {
   1183         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1184         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1185         U_ASSERT(groupOffset >= 0);
   1186         s = fFrame->fExtra[groupOffset];
   1187         e = fFrame->fExtra[groupOffset+1];
   1188     }
   1189 
   1190     if (s < 0) {
   1191         // A capture group wasn't part of the match
   1192         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1193     }
   1194     U_ASSERT(s <= e);
   1195     group_len = e - s;
   1196 
   1197     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1198     if (dest)
   1199         UTEXT_SETNATIVEINDEX(dest, s);
   1200     return dest;
   1201 }
   1202 
   1203 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
   1204     UnicodeString result;
   1205     int64_t groupStart = start64(groupNum, status);
   1206     int64_t groupEnd = end64(groupNum, status);
   1207     if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
   1208         return result;
   1209     }
   1210 
   1211     // Get the group length using a utext_extract preflight.
   1212     //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
   1213     int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
   1214     if (status != U_BUFFER_OVERFLOW_ERROR) {
   1215         return result;
   1216     }
   1217 
   1218     status = U_ZERO_ERROR;
   1219     UChar *buf = result.getBuffer(length);
   1220     if (buf == NULL) {
   1221         status = U_MEMORY_ALLOCATION_ERROR;
   1222     } else {
   1223         int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
   1224         result.releaseBuffer(extractLength);
   1225         U_ASSERT(length == extractLength);
   1226     }
   1227     return result;
   1228 }
   1229 
   1230 
   1231 //--------------------------------------------------------------------------------
   1232 //
   1233 //  appendGroup() -- currently internal only, appends a group to a UText rather
   1234 //                   than replacing its contents
   1235 //
   1236 //--------------------------------------------------------------------------------
   1237 
   1238 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1239     if (U_FAILURE(status)) {
   1240         return 0;
   1241     }
   1242     if (U_FAILURE(fDeferredStatus)) {
   1243         status = fDeferredStatus;
   1244         return 0;
   1245     }
   1246     int64_t destLen = utext_nativeLength(dest);
   1247 
   1248     if (fMatch == FALSE) {
   1249         status = U_REGEX_INVALID_STATE;
   1250         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1251     }
   1252     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1253         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1254         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1255     }
   1256 
   1257     int64_t s, e;
   1258     if (groupNum == 0) {
   1259         s = fMatchStart;
   1260         e = fMatchEnd;
   1261     } else {
   1262         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1263         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1264         U_ASSERT(groupOffset >= 0);
   1265         s = fFrame->fExtra[groupOffset];
   1266         e = fFrame->fExtra[groupOffset+1];
   1267     }
   1268 
   1269     if (s < 0) {
   1270         // A capture group wasn't part of the match
   1271         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1272     }
   1273     U_ASSERT(s <= e);
   1274 
   1275     int64_t deltaLen;
   1276     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1277         U_ASSERT(e <= fInputLength);
   1278         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1279     } else {
   1280         int32_t len16;
   1281         if (UTEXT_USES_U16(fInputText)) {
   1282             len16 = (int32_t)(e-s);
   1283         } else {
   1284             UErrorCode lengthStatus = U_ZERO_ERROR;
   1285             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1286         }
   1287         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1288         if (groupChars == NULL) {
   1289             status = U_MEMORY_ALLOCATION_ERROR;
   1290             return 0;
   1291         }
   1292         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1293 
   1294         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
   1295         uprv_free(groupChars);
   1296     }
   1297     return deltaLen;
   1298 }
   1299 
   1300 
   1301 
   1302 //--------------------------------------------------------------------------------
   1303 //
   1304 //  groupCount()
   1305 //
   1306 //--------------------------------------------------------------------------------
   1307 int32_t RegexMatcher::groupCount() const {
   1308     return fPattern->fGroupMap->size();
   1309 }
   1310 
   1311 //--------------------------------------------------------------------------------
   1312 //
   1313 //  hasAnchoringBounds()
   1314 //
   1315 //--------------------------------------------------------------------------------
   1316 UBool RegexMatcher::hasAnchoringBounds() const {
   1317     return fAnchoringBounds;
   1318 }
   1319 
   1320 
   1321 //--------------------------------------------------------------------------------
   1322 //
   1323 //  hasTransparentBounds()
   1324 //
   1325 //--------------------------------------------------------------------------------
   1326 UBool RegexMatcher::hasTransparentBounds() const {
   1327     return fTransparentBounds;
   1328 }
   1329 
   1330 
   1331 
   1332 //--------------------------------------------------------------------------------
   1333 //
   1334 //  hitEnd()
   1335 //
   1336 //--------------------------------------------------------------------------------
   1337 UBool RegexMatcher::hitEnd() const {
   1338     return fHitEnd;
   1339 }
   1340 
   1341 
   1342 //--------------------------------------------------------------------------------
   1343 //
   1344 //  input()
   1345 //
   1346 //--------------------------------------------------------------------------------
   1347 const UnicodeString &RegexMatcher::input() const {
   1348     if (!fInput) {
   1349         UErrorCode status = U_ZERO_ERROR;
   1350         int32_t len16;
   1351         if (UTEXT_USES_U16(fInputText)) {
   1352             len16 = (int32_t)fInputLength;
   1353         } else {
   1354             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
   1355             status = U_ZERO_ERROR; // overflow, length status
   1356         }
   1357         UnicodeString *result = new UnicodeString(len16, 0, 0);
   1358 
   1359         UChar *inputChars = result->getBuffer(len16);
   1360         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
   1361         result->releaseBuffer(len16);
   1362 
   1363         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
   1364     }
   1365 
   1366     return *fInput;
   1367 }
   1368 
   1369 //--------------------------------------------------------------------------------
   1370 //
   1371 //  inputText()
   1372 //
   1373 //--------------------------------------------------------------------------------
   1374 UText *RegexMatcher::inputText() const {
   1375     return fInputText;
   1376 }
   1377 
   1378 
   1379 //--------------------------------------------------------------------------------
   1380 //
   1381 //  getInput() -- like inputText(), but makes a clone or copies into another UText
   1382 //
   1383 //--------------------------------------------------------------------------------
   1384 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
   1385     if (U_FAILURE(status)) {
   1386         return dest;
   1387     }
   1388     if (U_FAILURE(fDeferredStatus)) {
   1389         status = fDeferredStatus;
   1390         return dest;
   1391     }
   1392 
   1393     if (dest) {
   1394         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1395             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
   1396         } else {
   1397             int32_t input16Len;
   1398             if (UTEXT_USES_U16(fInputText)) {
   1399                 input16Len = (int32_t)fInputLength;
   1400             } else {
   1401                 UErrorCode lengthStatus = U_ZERO_ERROR;
   1402                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
   1403             }
   1404             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
   1405             if (inputChars == NULL) {
   1406                 return dest;
   1407             }
   1408 
   1409             status = U_ZERO_ERROR;
   1410             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
   1411             status = U_ZERO_ERROR;
   1412             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
   1413 
   1414             uprv_free(inputChars);
   1415         }
   1416         return dest;
   1417     } else {
   1418         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1419     }
   1420 }
   1421 
   1422 
   1423 static UBool compat_SyncMutableUTextContents(UText *ut);
   1424 static UBool compat_SyncMutableUTextContents(UText *ut) {
   1425     UBool retVal = FALSE;
   1426 
   1427     //  In the following test, we're really only interested in whether the UText should switch
   1428     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
   1429     //  will still point to the correct data.
   1430     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
   1431         UnicodeString *us=(UnicodeString *)ut->context;
   1432 
   1433         // Update to the latest length.
   1434         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
   1435         int32_t newLength = us->length();
   1436 
   1437         // Update the chunk description.
   1438         // The buffer may have switched between stack- and heap-based.
   1439         ut->chunkContents    = us->getBuffer();
   1440         ut->chunkLength      = newLength;
   1441         ut->chunkNativeLimit = newLength;
   1442         ut->nativeIndexingLimit = newLength;
   1443         retVal = TRUE;
   1444     }
   1445 
   1446     return retVal;
   1447 }
   1448 
   1449 //--------------------------------------------------------------------------------
   1450 //
   1451 //  lookingAt()
   1452 //
   1453 //--------------------------------------------------------------------------------
   1454 UBool RegexMatcher::lookingAt(UErrorCode &status) {
   1455     if (U_FAILURE(status)) {
   1456         return FALSE;
   1457     }
   1458     if (U_FAILURE(fDeferredStatus)) {
   1459         status = fDeferredStatus;
   1460         return FALSE;
   1461     }
   1462 
   1463     if (fInputUniStrMaybeMutable) {
   1464         if (compat_SyncMutableUTextContents(fInputText)) {
   1465         fInputLength = utext_nativeLength(fInputText);
   1466         reset();
   1467         }
   1468     }
   1469     else {
   1470         resetPreserveRegion();
   1471     }
   1472     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1473         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
   1474     } else {
   1475         MatchAt(fActiveStart, FALSE, status);
   1476     }
   1477     return fMatch;
   1478 }
   1479 
   1480 
   1481 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
   1482     if (U_FAILURE(status)) {
   1483         return FALSE;
   1484     }
   1485     if (U_FAILURE(fDeferredStatus)) {
   1486         status = fDeferredStatus;
   1487         return FALSE;
   1488     }
   1489     reset();
   1490 
   1491     if (start < 0) {
   1492         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1493         return FALSE;
   1494     }
   1495 
   1496     if (fInputUniStrMaybeMutable) {
   1497         if (compat_SyncMutableUTextContents(fInputText)) {
   1498         fInputLength = utext_nativeLength(fInputText);
   1499         reset();
   1500         }
   1501     }
   1502 
   1503     int64_t nativeStart;
   1504     nativeStart = start;
   1505     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1506         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1507         return FALSE;
   1508     }
   1509 
   1510     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1511         MatchChunkAt((int32_t)nativeStart, FALSE, status);
   1512     } else {
   1513         MatchAt(nativeStart, FALSE, status);
   1514     }
   1515     return fMatch;
   1516 }
   1517 
   1518 
   1519 
   1520 //--------------------------------------------------------------------------------
   1521 //
   1522 //  matches()
   1523 //
   1524 //--------------------------------------------------------------------------------
   1525 UBool RegexMatcher::matches(UErrorCode &status) {
   1526     if (U_FAILURE(status)) {
   1527         return FALSE;
   1528     }
   1529     if (U_FAILURE(fDeferredStatus)) {
   1530         status = fDeferredStatus;
   1531         return FALSE;
   1532     }
   1533 
   1534     if (fInputUniStrMaybeMutable) {
   1535         if (compat_SyncMutableUTextContents(fInputText)) {
   1536         fInputLength = utext_nativeLength(fInputText);
   1537         reset();
   1538         }
   1539     }
   1540     else {
   1541         resetPreserveRegion();
   1542     }
   1543 
   1544     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1545         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
   1546     } else {
   1547         MatchAt(fActiveStart, TRUE, status);
   1548     }
   1549     return fMatch;
   1550 }
   1551 
   1552 
   1553 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
   1554     if (U_FAILURE(status)) {
   1555         return FALSE;
   1556     }
   1557     if (U_FAILURE(fDeferredStatus)) {
   1558         status = fDeferredStatus;
   1559         return FALSE;
   1560     }
   1561     reset();
   1562 
   1563     if (start < 0) {
   1564         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1565         return FALSE;
   1566     }
   1567 
   1568     if (fInputUniStrMaybeMutable) {
   1569         if (compat_SyncMutableUTextContents(fInputText)) {
   1570         fInputLength = utext_nativeLength(fInputText);
   1571         reset();
   1572         }
   1573     }
   1574 
   1575     int64_t nativeStart;
   1576     nativeStart = start;
   1577     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1578         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1579         return FALSE;
   1580     }
   1581 
   1582     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1583         MatchChunkAt((int32_t)nativeStart, TRUE, status);
   1584     } else {
   1585         MatchAt(nativeStart, TRUE, status);
   1586     }
   1587     return fMatch;
   1588 }
   1589 
   1590 
   1591 
   1592 //--------------------------------------------------------------------------------
   1593 //
   1594 //    pattern
   1595 //
   1596 //--------------------------------------------------------------------------------
   1597 const RegexPattern &RegexMatcher::pattern() const {
   1598     return *fPattern;
   1599 }
   1600 
   1601 
   1602 
   1603 //--------------------------------------------------------------------------------
   1604 //
   1605 //    region
   1606 //
   1607 //--------------------------------------------------------------------------------
   1608 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
   1609     if (U_FAILURE(status)) {
   1610         return *this;
   1611     }
   1612 
   1613     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
   1614         status = U_ILLEGAL_ARGUMENT_ERROR;
   1615     }
   1616 
   1617     int64_t nativeStart = regionStart;
   1618     int64_t nativeLimit = regionLimit;
   1619     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
   1620       status = U_ILLEGAL_ARGUMENT_ERROR;
   1621     }
   1622 
   1623     if (startIndex == -1)
   1624       this->reset();
   1625     else
   1626       resetPreserveRegion();
   1627 
   1628     fRegionStart = nativeStart;
   1629     fRegionLimit = nativeLimit;
   1630     fActiveStart = nativeStart;
   1631     fActiveLimit = nativeLimit;
   1632 
   1633     if (startIndex != -1) {
   1634       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
   1635           status = U_INDEX_OUTOFBOUNDS_ERROR;
   1636       }
   1637       fMatchEnd = startIndex;
   1638     }
   1639 
   1640     if (!fTransparentBounds) {
   1641         fLookStart = nativeStart;
   1642         fLookLimit = nativeLimit;
   1643     }
   1644     if (fAnchoringBounds) {
   1645         fAnchorStart = nativeStart;
   1646         fAnchorLimit = nativeLimit;
   1647     }
   1648     return *this;
   1649 }
   1650 
   1651 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
   1652   return region(start, limit, -1, status);
   1653 }
   1654 
   1655 //--------------------------------------------------------------------------------
   1656 //
   1657 //    regionEnd
   1658 //
   1659 //--------------------------------------------------------------------------------
   1660 int32_t RegexMatcher::regionEnd() const {
   1661     return (int32_t)fRegionLimit;
   1662 }
   1663 
   1664 int64_t RegexMatcher::regionEnd64() const {
   1665     return fRegionLimit;
   1666 }
   1667 
   1668 //--------------------------------------------------------------------------------
   1669 //
   1670 //    regionStart
   1671 //
   1672 //--------------------------------------------------------------------------------
   1673 int32_t RegexMatcher::regionStart() const {
   1674     return (int32_t)fRegionStart;
   1675 }
   1676 
   1677 int64_t RegexMatcher::regionStart64() const {
   1678     return fRegionStart;
   1679 }
   1680 
   1681 
   1682 //--------------------------------------------------------------------------------
   1683 //
   1684 //    replaceAll
   1685 //
   1686 //--------------------------------------------------------------------------------
   1687 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
   1688     UText replacementText = UTEXT_INITIALIZER;
   1689     UText resultText = UTEXT_INITIALIZER;
   1690     UnicodeString resultString;
   1691     if (U_FAILURE(status)) {
   1692         return resultString;
   1693     }
   1694 
   1695     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1696     utext_openUnicodeString(&resultText, &resultString, &status);
   1697 
   1698     replaceAll(&replacementText, &resultText, status);
   1699 
   1700     utext_close(&resultText);
   1701     utext_close(&replacementText);
   1702 
   1703     return resultString;
   1704 }
   1705 
   1706 
   1707 //
   1708 //    replaceAll, UText mode
   1709 //
   1710 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
   1711     if (U_FAILURE(status)) {
   1712         return dest;
   1713     }
   1714     if (U_FAILURE(fDeferredStatus)) {
   1715         status = fDeferredStatus;
   1716         return dest;
   1717     }
   1718 
   1719     if (dest == NULL) {
   1720         UnicodeString emptyString;
   1721         UText empty = UTEXT_INITIALIZER;
   1722 
   1723         utext_openUnicodeString(&empty, &emptyString, &status);
   1724         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1725         utext_close(&empty);
   1726     }
   1727 
   1728     if (U_SUCCESS(status)) {
   1729         reset();
   1730         while (find()) {
   1731             appendReplacement(dest, replacement, status);
   1732             if (U_FAILURE(status)) {
   1733                 break;
   1734             }
   1735         }
   1736         appendTail(dest, status);
   1737     }
   1738 
   1739     return dest;
   1740 }
   1741 
   1742 
   1743 //--------------------------------------------------------------------------------
   1744 //
   1745 //    replaceFirst
   1746 //
   1747 //--------------------------------------------------------------------------------
   1748 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
   1749     UText replacementText = UTEXT_INITIALIZER;
   1750     UText resultText = UTEXT_INITIALIZER;
   1751     UnicodeString resultString;
   1752 
   1753     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1754     utext_openUnicodeString(&resultText, &resultString, &status);
   1755 
   1756     replaceFirst(&replacementText, &resultText, status);
   1757 
   1758     utext_close(&resultText);
   1759     utext_close(&replacementText);
   1760 
   1761     return resultString;
   1762 }
   1763 
   1764 //
   1765 //    replaceFirst, UText mode
   1766 //
   1767 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
   1768     if (U_FAILURE(status)) {
   1769         return dest;
   1770     }
   1771     if (U_FAILURE(fDeferredStatus)) {
   1772         status = fDeferredStatus;
   1773         return dest;
   1774     }
   1775 
   1776     reset();
   1777     if (!find()) {
   1778         return getInput(dest, status);
   1779     }
   1780 
   1781     if (dest == NULL) {
   1782         UnicodeString emptyString;
   1783         UText empty = UTEXT_INITIALIZER;
   1784 
   1785         utext_openUnicodeString(&empty, &emptyString, &status);
   1786         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1787         utext_close(&empty);
   1788     }
   1789 
   1790     appendReplacement(dest, replacement, status);
   1791     appendTail(dest, status);
   1792 
   1793     return dest;
   1794 }
   1795 
   1796 
   1797 //--------------------------------------------------------------------------------
   1798 //
   1799 //     requireEnd
   1800 //
   1801 //--------------------------------------------------------------------------------
   1802 UBool RegexMatcher::requireEnd() const {
   1803     return fRequireEnd;
   1804 }
   1805 
   1806 
   1807 //--------------------------------------------------------------------------------
   1808 //
   1809 //     reset
   1810 //
   1811 //--------------------------------------------------------------------------------
   1812 RegexMatcher &RegexMatcher::reset() {
   1813     fRegionStart    = 0;
   1814     fRegionLimit    = fInputLength;
   1815     fActiveStart    = 0;
   1816     fActiveLimit    = fInputLength;
   1817     fAnchorStart    = 0;
   1818     fAnchorLimit    = fInputLength;
   1819     fLookStart      = 0;
   1820     fLookLimit      = fInputLength;
   1821     resetPreserveRegion();
   1822     return *this;
   1823 }
   1824 
   1825 
   1826 
   1827 void RegexMatcher::resetPreserveRegion() {
   1828     fMatchStart     = 0;
   1829     fMatchEnd       = 0;
   1830     fLastMatchEnd   = -1;
   1831     fAppendPosition = 0;
   1832     fMatch          = FALSE;
   1833     fHitEnd         = FALSE;
   1834     fRequireEnd     = FALSE;
   1835     fTime           = 0;
   1836     fTickCounter    = TIMER_INITIAL_VALUE;
   1837     //resetStack(); // more expensive than it looks...
   1838 }
   1839 
   1840 
   1841 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
   1842     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
   1843     if (fPattern->fNeedsAltInput) {
   1844         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1845     }
   1846     if (U_FAILURE(fDeferredStatus)) {
   1847         return *this;
   1848     }
   1849     fInputLength = utext_nativeLength(fInputText);
   1850 
   1851     reset();
   1852     delete fInput;
   1853     fInput = NULL;
   1854 
   1855     //  Do the following for any UnicodeString.
   1856     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
   1857     fInputUniStrMaybeMutable = TRUE;
   1858 
   1859     if (fWordBreakItr != NULL) {
   1860 #if UCONFIG_NO_BREAK_ITERATION==0
   1861         UErrorCode status = U_ZERO_ERROR;
   1862         fWordBreakItr->setText(fInputText, status);
   1863 #endif
   1864     }
   1865     return *this;
   1866 }
   1867 
   1868 
   1869 RegexMatcher &RegexMatcher::reset(UText *input) {
   1870     if (fInputText != input) {
   1871         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
   1872         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1873         if (U_FAILURE(fDeferredStatus)) {
   1874             return *this;
   1875         }
   1876         fInputLength = utext_nativeLength(fInputText);
   1877 
   1878         delete fInput;
   1879         fInput = NULL;
   1880 
   1881         if (fWordBreakItr != NULL) {
   1882 #if UCONFIG_NO_BREAK_ITERATION==0
   1883             UErrorCode status = U_ZERO_ERROR;
   1884             fWordBreakItr->setText(input, status);
   1885 #endif
   1886         }
   1887     }
   1888     reset();
   1889     fInputUniStrMaybeMutable = FALSE;
   1890 
   1891     return *this;
   1892 }
   1893 
   1894 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
   1895     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
   1896     return *this;
   1897 }*/
   1898 
   1899 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
   1900     if (U_FAILURE(status)) {
   1901         return *this;
   1902     }
   1903     reset();       // Reset also resets the region to be the entire string.
   1904 
   1905     if (position < 0 || position > fActiveLimit) {
   1906         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1907         return *this;
   1908     }
   1909     fMatchEnd = position;
   1910     return *this;
   1911 }
   1912 
   1913 
   1914 //--------------------------------------------------------------------------------
   1915 //
   1916 //    refresh
   1917 //
   1918 //--------------------------------------------------------------------------------
   1919 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
   1920     if (U_FAILURE(status)) {
   1921         return *this;
   1922     }
   1923     if (input == NULL) {
   1924         status = U_ILLEGAL_ARGUMENT_ERROR;
   1925         return *this;
   1926     }
   1927     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
   1928         status = U_ILLEGAL_ARGUMENT_ERROR;
   1929         return *this;
   1930     }
   1931     int64_t  pos = utext_getNativeIndex(fInputText);
   1932     //  Shallow read-only clone of the new UText into the existing input UText
   1933     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
   1934     if (U_FAILURE(status)) {
   1935         return *this;
   1936     }
   1937     utext_setNativeIndex(fInputText, pos);
   1938 
   1939     if (fAltInputText != NULL) {
   1940         pos = utext_getNativeIndex(fAltInputText);
   1941         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
   1942         if (U_FAILURE(status)) {
   1943             return *this;
   1944         }
   1945         utext_setNativeIndex(fAltInputText, pos);
   1946     }
   1947     return *this;
   1948 }
   1949 
   1950 
   1951 
   1952 //--------------------------------------------------------------------------------
   1953 //
   1954 //    setTrace
   1955 //
   1956 //--------------------------------------------------------------------------------
   1957 void RegexMatcher::setTrace(UBool state) {
   1958     fTraceDebug = state;
   1959 }
   1960 
   1961 
   1962 
   1963 /**
   1964   *  UText, replace entire contents of the destination UText with a substring of the source UText.
   1965   *
   1966   *     @param src    The source UText
   1967   *     @param dest   The destination UText. Must be writable.
   1968   *                   May be NULL, in which case a new UText will be allocated.
   1969   *     @param start  Start index of source substring.
   1970   *     @param limit  Limit index of source substring.
   1971   *     @param status An error code.
   1972   */
   1973 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
   1974     if (U_FAILURE(*status)) {
   1975         return dest;
   1976     }
   1977     if (start == limit) {
   1978         if (dest) {
   1979             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
   1980             return dest;
   1981         } else {
   1982             return utext_openUChars(NULL, NULL, 0, status);
   1983         }
   1984     }
   1985     int32_t length = utext_extract(src, start, limit, NULL, 0, status);
   1986     if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
   1987         return dest;
   1988     }
   1989     *status = U_ZERO_ERROR;
   1990     MaybeStackArray<UChar, 40> buffer;
   1991     if (length >= buffer.getCapacity()) {
   1992         UChar *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
   1993         if (newBuf == NULL) {
   1994             *status = U_MEMORY_ALLOCATION_ERROR;
   1995         }
   1996     }
   1997     utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
   1998     if (dest) {
   1999         utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
   2000         return dest;
   2001     }
   2002 
   2003     // Caller did not provide a prexisting UText.
   2004     // Open a new one, and have it adopt the text buffer storage.
   2005     if (U_FAILURE(*status)) {
   2006         return NULL;
   2007     }
   2008     int32_t ownedLength = 0;
   2009     UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
   2010     if (ownedBuf == NULL) {
   2011         *status = U_MEMORY_ALLOCATION_ERROR;
   2012         return NULL;
   2013     }
   2014     UText *result = utext_openUChars(NULL, ownedBuf, length, status);
   2015     if (U_FAILURE(*status)) {
   2016         uprv_free(ownedBuf);
   2017         return NULL;
   2018     }
   2019     result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
   2020     return result;
   2021 }
   2022 
   2023 
   2024 //---------------------------------------------------------------------
   2025 //
   2026 //   split
   2027 //
   2028 //---------------------------------------------------------------------
   2029 int32_t  RegexMatcher::split(const UnicodeString &input,
   2030         UnicodeString    dest[],
   2031         int32_t          destCapacity,
   2032         UErrorCode      &status)
   2033 {
   2034     UText inputText = UTEXT_INITIALIZER;
   2035     utext_openConstUnicodeString(&inputText, &input, &status);
   2036     if (U_FAILURE(status)) {
   2037         return 0;
   2038     }
   2039 
   2040     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
   2041     if (destText == NULL) {
   2042         status = U_MEMORY_ALLOCATION_ERROR;
   2043         return 0;
   2044     }
   2045     int32_t i;
   2046     for (i = 0; i < destCapacity; i++) {
   2047         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
   2048     }
   2049 
   2050     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
   2051 
   2052     for (i = 0; i < destCapacity; i++) {
   2053         utext_close(destText[i]);
   2054     }
   2055 
   2056     uprv_free(destText);
   2057     utext_close(&inputText);
   2058     return fieldCount;
   2059 }
   2060 
   2061 //
   2062 //   split, UText mode
   2063 //
   2064 int32_t  RegexMatcher::split(UText *input,
   2065         UText           *dest[],
   2066         int32_t          destCapacity,
   2067         UErrorCode      &status)
   2068 {
   2069     //
   2070     // Check arguements for validity
   2071     //
   2072     if (U_FAILURE(status)) {
   2073         return 0;
   2074     };
   2075 
   2076     if (destCapacity < 1) {
   2077         status = U_ILLEGAL_ARGUMENT_ERROR;
   2078         return 0;
   2079     }
   2080 
   2081     //
   2082     // Reset for the input text
   2083     //
   2084     reset(input);
   2085     int64_t   nextOutputStringStart = 0;
   2086     if (fActiveLimit == 0) {
   2087         return 0;
   2088     }
   2089 
   2090     //
   2091     // Loop through the input text, searching for the delimiter pattern
   2092     //
   2093     int32_t i;
   2094     int32_t numCaptureGroups = fPattern->fGroupMap->size();
   2095     for (i=0; ; i++) {
   2096         if (i>=destCapacity-1) {
   2097             // There is one or zero output string left.
   2098             // Fill the last output string with whatever is left from the input, then exit the loop.
   2099             //  ( i will be == destCapacity if we filled the output array while processing
   2100             //    capture groups of the delimiter expression, in which case we will discard the
   2101             //    last capture group saved in favor of the unprocessed remainder of the
   2102             //    input string.)
   2103             i = destCapacity-1;
   2104             if (fActiveLimit > nextOutputStringStart) {
   2105                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2106                     if (dest[i]) {
   2107                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2108                                       input->chunkContents+nextOutputStringStart,
   2109                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2110                     } else {
   2111                         UText remainingText = UTEXT_INITIALIZER;
   2112                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2113                                          fActiveLimit-nextOutputStringStart, &status);
   2114                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2115                         utext_close(&remainingText);
   2116                     }
   2117                 } else {
   2118                     UErrorCode lengthStatus = U_ZERO_ERROR;
   2119                     int32_t remaining16Length =
   2120                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2121                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2122                     if (remainingChars == NULL) {
   2123                         status = U_MEMORY_ALLOCATION_ERROR;
   2124                         break;
   2125                     }
   2126 
   2127                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2128                     if (dest[i]) {
   2129                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2130                     } else {
   2131                         UText remainingText = UTEXT_INITIALIZER;
   2132                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2133                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2134                         utext_close(&remainingText);
   2135                     }
   2136 
   2137                     uprv_free(remainingChars);
   2138                 }
   2139             }
   2140             break;
   2141         }
   2142         if (find()) {
   2143             // We found another delimiter.  Move everything from where we started looking
   2144             //  up until the start of the delimiter into the next output string.
   2145             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2146                 if (dest[i]) {
   2147                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2148                                   input->chunkContents+nextOutputStringStart,
   2149                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
   2150                 } else {
   2151                     UText remainingText = UTEXT_INITIALIZER;
   2152                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2153                                       fMatchStart-nextOutputStringStart, &status);
   2154                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2155                     utext_close(&remainingText);
   2156                 }
   2157             } else {
   2158                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2159                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
   2160                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2161                 if (remainingChars == NULL) {
   2162                     status = U_MEMORY_ALLOCATION_ERROR;
   2163                     break;
   2164                 }
   2165                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
   2166                 if (dest[i]) {
   2167                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2168                 } else {
   2169                     UText remainingText = UTEXT_INITIALIZER;
   2170                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2171                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2172                     utext_close(&remainingText);
   2173                 }
   2174 
   2175                 uprv_free(remainingChars);
   2176             }
   2177             nextOutputStringStart = fMatchEnd;
   2178 
   2179             // If the delimiter pattern has capturing parentheses, the captured
   2180             //  text goes out into the next n destination strings.
   2181             int32_t groupNum;
   2182             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   2183                 if (i >= destCapacity-2) {
   2184                     // Never fill the last available output string with capture group text.
   2185                     // It will filled with the last field, the remainder of the
   2186                     //  unsplit input text.
   2187                     break;
   2188                 }
   2189                 i++;
   2190                 dest[i] = utext_extract_replace(fInputText, dest[i],
   2191                                                start64(groupNum, status), end64(groupNum, status), &status);
   2192             }
   2193 
   2194             if (nextOutputStringStart == fActiveLimit) {
   2195                 // The delimiter was at the end of the string.  We're done, but first
   2196                 // we output one last empty string, for the empty field following
   2197                 //   the delimiter at the end of input.
   2198                 if (i+1 < destCapacity) {
   2199                     ++i;
   2200                     if (dest[i] == NULL) {
   2201                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
   2202                     } else {
   2203                         static const UChar emptyString[] = {(UChar)0};
   2204                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
   2205                     }
   2206                 }
   2207                 break;
   2208 
   2209             }
   2210         }
   2211         else
   2212         {
   2213             // We ran off the end of the input while looking for the next delimiter.
   2214             // All the remaining text goes into the current output string.
   2215             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2216                 if (dest[i]) {
   2217                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2218                                   input->chunkContents+nextOutputStringStart,
   2219                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2220                 } else {
   2221                     UText remainingText = UTEXT_INITIALIZER;
   2222                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2223                                      fActiveLimit-nextOutputStringStart, &status);
   2224                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2225                     utext_close(&remainingText);
   2226                 }
   2227             } else {
   2228                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2229                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2230                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2231                 if (remainingChars == NULL) {
   2232                     status = U_MEMORY_ALLOCATION_ERROR;
   2233                     break;
   2234                 }
   2235 
   2236                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2237                 if (dest[i]) {
   2238                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2239                 } else {
   2240                     UText remainingText = UTEXT_INITIALIZER;
   2241                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2242                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2243                     utext_close(&remainingText);
   2244                 }
   2245 
   2246                 uprv_free(remainingChars);
   2247             }
   2248             break;
   2249         }
   2250         if (U_FAILURE(status)) {
   2251             break;
   2252         }
   2253     }   // end of for loop
   2254     return i+1;
   2255 }
   2256 
   2257 
   2258 //--------------------------------------------------------------------------------
   2259 //
   2260 //     start
   2261 //
   2262 //--------------------------------------------------------------------------------
   2263 int32_t RegexMatcher::start(UErrorCode &status) const {
   2264     return start(0, status);
   2265 }
   2266 
   2267 int64_t RegexMatcher::start64(UErrorCode &status) const {
   2268     return start64(0, status);
   2269 }
   2270 
   2271 //--------------------------------------------------------------------------------
   2272 //
   2273 //     start(int32_t group, UErrorCode &status)
   2274 //
   2275 //--------------------------------------------------------------------------------
   2276 
   2277 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
   2278     if (U_FAILURE(status)) {
   2279         return -1;
   2280     }
   2281     if (U_FAILURE(fDeferredStatus)) {
   2282         status = fDeferredStatus;
   2283         return -1;
   2284     }
   2285     if (fMatch == FALSE) {
   2286         status = U_REGEX_INVALID_STATE;
   2287         return -1;
   2288     }
   2289     if (group < 0 || group > fPattern->fGroupMap->size()) {
   2290         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2291         return -1;
   2292     }
   2293     int64_t s;
   2294     if (group == 0) {
   2295         s = fMatchStart;
   2296     } else {
   2297         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   2298         U_ASSERT(groupOffset < fPattern->fFrameSize);
   2299         U_ASSERT(groupOffset >= 0);
   2300         s = fFrame->fExtra[groupOffset];
   2301     }
   2302 
   2303     return s;
   2304 }
   2305 
   2306 
   2307 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
   2308     return (int32_t)start64(group, status);
   2309 }
   2310 
   2311 //--------------------------------------------------------------------------------
   2312 //
   2313 //     useAnchoringBounds
   2314 //
   2315 //--------------------------------------------------------------------------------
   2316 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
   2317     fAnchoringBounds = b;
   2318     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
   2319     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
   2320     return *this;
   2321 }
   2322 
   2323 
   2324 //--------------------------------------------------------------------------------
   2325 //
   2326 //     useTransparentBounds
   2327 //
   2328 //--------------------------------------------------------------------------------
   2329 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
   2330     fTransparentBounds = b;
   2331     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
   2332     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
   2333     return *this;
   2334 }
   2335 
   2336 //--------------------------------------------------------------------------------
   2337 //
   2338 //     setTimeLimit
   2339 //
   2340 //--------------------------------------------------------------------------------
   2341 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
   2342     if (U_FAILURE(status)) {
   2343         return;
   2344     }
   2345     if (U_FAILURE(fDeferredStatus)) {
   2346         status = fDeferredStatus;
   2347         return;
   2348     }
   2349     if (limit < 0) {
   2350         status = U_ILLEGAL_ARGUMENT_ERROR;
   2351         return;
   2352     }
   2353     fTimeLimit = limit;
   2354 }
   2355 
   2356 
   2357 //--------------------------------------------------------------------------------
   2358 //
   2359 //     getTimeLimit
   2360 //
   2361 //--------------------------------------------------------------------------------
   2362 int32_t RegexMatcher::getTimeLimit() const {
   2363     return fTimeLimit;
   2364 }
   2365 
   2366 
   2367 //--------------------------------------------------------------------------------
   2368 //
   2369 //     setStackLimit
   2370 //
   2371 //--------------------------------------------------------------------------------
   2372 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
   2373     if (U_FAILURE(status)) {
   2374         return;
   2375     }
   2376     if (U_FAILURE(fDeferredStatus)) {
   2377         status = fDeferredStatus;
   2378         return;
   2379     }
   2380     if (limit < 0) {
   2381         status = U_ILLEGAL_ARGUMENT_ERROR;
   2382         return;
   2383     }
   2384 
   2385     // Reset the matcher.  This is needed here in case there is a current match
   2386     //    whose final stack frame (containing the match results, pointed to by fFrame)
   2387     //    would be lost by resizing to a smaller stack size.
   2388     reset();
   2389 
   2390     if (limit == 0) {
   2391         // Unlimited stack expansion
   2392         fStack->setMaxCapacity(0);
   2393     } else {
   2394         // Change the units of the limit  from bytes to ints, and bump the size up
   2395         //   to be big enough to hold at least one stack frame for the pattern,
   2396         //   if it isn't there already.
   2397         int32_t adjustedLimit = limit / sizeof(int32_t);
   2398         if (adjustedLimit < fPattern->fFrameSize) {
   2399             adjustedLimit = fPattern->fFrameSize;
   2400         }
   2401         fStack->setMaxCapacity(adjustedLimit);
   2402     }
   2403     fStackLimit = limit;
   2404 }
   2405 
   2406 
   2407 //--------------------------------------------------------------------------------
   2408 //
   2409 //     getStackLimit
   2410 //
   2411 //--------------------------------------------------------------------------------
   2412 int32_t RegexMatcher::getStackLimit() const {
   2413     return fStackLimit;
   2414 }
   2415 
   2416 
   2417 //--------------------------------------------------------------------------------
   2418 //
   2419 //     setMatchCallback
   2420 //
   2421 //--------------------------------------------------------------------------------
   2422 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
   2423                                     const void              *context,
   2424                                     UErrorCode              &status) {
   2425     if (U_FAILURE(status)) {
   2426         return;
   2427     }
   2428     fCallbackFn = callback;
   2429     fCallbackContext = context;
   2430 }
   2431 
   2432 
   2433 //--------------------------------------------------------------------------------
   2434 //
   2435 //     getMatchCallback
   2436 //
   2437 //--------------------------------------------------------------------------------
   2438 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
   2439                                   const void              *&context,
   2440                                   UErrorCode              &status) {
   2441     if (U_FAILURE(status)) {
   2442        return;
   2443     }
   2444     callback = fCallbackFn;
   2445     context  = fCallbackContext;
   2446 }
   2447 
   2448 
   2449 //--------------------------------------------------------------------------------
   2450 //
   2451 //     setMatchCallback
   2452 //
   2453 //--------------------------------------------------------------------------------
   2454 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
   2455                                                 const void                      *context,
   2456                                                 UErrorCode                      &status) {
   2457     if (U_FAILURE(status)) {
   2458         return;
   2459     }
   2460     fFindProgressCallbackFn = callback;
   2461     fFindProgressCallbackContext = context;
   2462 }
   2463 
   2464 
   2465 //--------------------------------------------------------------------------------
   2466 //
   2467 //     getMatchCallback
   2468 //
   2469 //--------------------------------------------------------------------------------
   2470 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
   2471                                                 const void                    *&context,
   2472                                                 UErrorCode                    &status) {
   2473     if (U_FAILURE(status)) {
   2474        return;
   2475     }
   2476     callback = fFindProgressCallbackFn;
   2477     context  = fFindProgressCallbackContext;
   2478 }
   2479 
   2480 
   2481 //================================================================================
   2482 //
   2483 //    Code following this point in this file is the internal
   2484 //    Match Engine Implementation.
   2485 //
   2486 //================================================================================
   2487 
   2488 
   2489 //--------------------------------------------------------------------------------
   2490 //
   2491 //   resetStack
   2492 //           Discard any previous contents of the state save stack, and initialize a
   2493 //           new stack frame to all -1.  The -1s are needed for capture group limits,
   2494 //           where they indicate that a group has not yet matched anything.
   2495 //--------------------------------------------------------------------------------
   2496 REStackFrame *RegexMatcher::resetStack() {
   2497     // Discard any previous contents of the state save stack, and initialize a
   2498     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
   2499     //  where they indicate that a group has not yet matched anything.
   2500     fStack->removeAllElements();
   2501 
   2502     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
   2503     if(U_FAILURE(fDeferredStatus)) {
   2504         return NULL;
   2505     }
   2506 
   2507     int32_t i;
   2508     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
   2509         iFrame->fExtra[i] = -1;
   2510     }
   2511     return iFrame;
   2512 }
   2513 
   2514 
   2515 
   2516 //--------------------------------------------------------------------------------
   2517 //
   2518 //   isWordBoundary
   2519 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
   2520 //                     For us,
   2521 //                       If the current char is a combining mark,
   2522 //                          \b is FALSE.
   2523 //                       Else Scan backwards to the first non-combining char.
   2524 //                            We are at a boundary if the this char and the original chars are
   2525 //                               opposite in membership in \w set
   2526 //
   2527 //          parameters:   pos   - the current position in the input buffer
   2528 //
   2529 //              TODO:  double-check edge cases at region boundaries.
   2530 //
   2531 //--------------------------------------------------------------------------------
   2532 UBool RegexMatcher::isWordBoundary(int64_t pos) {
   2533     UBool isBoundary = FALSE;
   2534     UBool cIsWord    = FALSE;
   2535 
   2536     if (pos >= fLookLimit) {
   2537         fHitEnd = TRUE;
   2538     } else {
   2539         // Determine whether char c at current position is a member of the word set of chars.
   2540         // If we're off the end of the string, behave as though we're not at a word char.
   2541         UTEXT_SETNATIVEINDEX(fInputText, pos);
   2542         UChar32  c = UTEXT_CURRENT32(fInputText);
   2543         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2544             // Current char is a combining one.  Not a boundary.
   2545             return FALSE;
   2546         }
   2547         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2548     }
   2549 
   2550     // Back up until we come to a non-combining char, determine whether
   2551     //  that char is a word char.
   2552     UBool prevCIsWord = FALSE;
   2553     for (;;) {
   2554         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
   2555             break;
   2556         }
   2557         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
   2558         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2559               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2560             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2561             break;
   2562         }
   2563     }
   2564     isBoundary = cIsWord ^ prevCIsWord;
   2565     return isBoundary;
   2566 }
   2567 
   2568 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
   2569     UBool isBoundary = FALSE;
   2570     UBool cIsWord    = FALSE;
   2571 
   2572     const UChar *inputBuf = fInputText->chunkContents;
   2573 
   2574     if (pos >= fLookLimit) {
   2575         fHitEnd = TRUE;
   2576     } else {
   2577         // Determine whether char c at current position is a member of the word set of chars.
   2578         // If we're off the end of the string, behave as though we're not at a word char.
   2579         UChar32 c;
   2580         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
   2581         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2582             // Current char is a combining one.  Not a boundary.
   2583             return FALSE;
   2584         }
   2585         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2586     }
   2587 
   2588     // Back up until we come to a non-combining char, determine whether
   2589     //  that char is a word char.
   2590     UBool prevCIsWord = FALSE;
   2591     for (;;) {
   2592         if (pos <= fLookStart) {
   2593             break;
   2594         }
   2595         UChar32 prevChar;
   2596         U16_PREV(inputBuf, fLookStart, pos, prevChar);
   2597         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2598               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2599             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2600             break;
   2601         }
   2602     }
   2603     isBoundary = cIsWord ^ prevCIsWord;
   2604     return isBoundary;
   2605 }
   2606 
   2607 //--------------------------------------------------------------------------------
   2608 //
   2609 //   isUWordBoundary
   2610 //
   2611 //         Test for a word boundary using RBBI word break.
   2612 //
   2613 //          parameters:   pos   - the current position in the input buffer
   2614 //
   2615 //--------------------------------------------------------------------------------
   2616 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
   2617     UBool       returnVal = FALSE;
   2618 #if UCONFIG_NO_BREAK_ITERATION==0
   2619 
   2620     // If we haven't yet created a break iterator for this matcher, do it now.
   2621     if (fWordBreakItr == NULL) {
   2622         fWordBreakItr =
   2623             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
   2624         if (U_FAILURE(fDeferredStatus)) {
   2625             return FALSE;
   2626         }
   2627         fWordBreakItr->setText(fInputText, fDeferredStatus);
   2628     }
   2629 
   2630     if (pos >= fLookLimit) {
   2631         fHitEnd = TRUE;
   2632         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
   2633                             //    words are not boundaries.  All non-word chars stand by themselves,
   2634                             //    with word boundaries on both sides.
   2635     } else {
   2636         if (!UTEXT_USES_U16(fInputText)) {
   2637             // !!!: Would like a better way to do this!
   2638             UErrorCode status = U_ZERO_ERROR;
   2639             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
   2640         }
   2641         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
   2642     }
   2643 #endif
   2644     return   returnVal;
   2645 }
   2646 
   2647 //--------------------------------------------------------------------------------
   2648 //
   2649 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
   2650 //                     saves. Increment the "time" counter, and call the
   2651 //                     user callback function if there is one installed.
   2652 //
   2653 //                     If the match operation needs to be aborted, either for a time-out
   2654 //                     or because the user callback asked for it, just set an error status.
   2655 //                     The engine will pick that up and stop in its outer loop.
   2656 //
   2657 //--------------------------------------------------------------------------------
   2658 void RegexMatcher::IncrementTime(UErrorCode &status) {
   2659     fTickCounter = TIMER_INITIAL_VALUE;
   2660     fTime++;
   2661     if (fCallbackFn != NULL) {
   2662         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
   2663             status = U_REGEX_STOPPED_BY_CALLER;
   2664             return;
   2665         }
   2666     }
   2667     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
   2668         status = U_REGEX_TIME_OUT;
   2669     }
   2670 }
   2671 
   2672 //--------------------------------------------------------------------------------
   2673 //
   2674 //   StateSave
   2675 //       Make a new stack frame, initialized as a copy of the current stack frame.
   2676 //       Set the pattern index in the original stack frame from the operand value
   2677 //       in the opcode.  Execution of the engine continues with the state in
   2678 //       the newly created stack frame
   2679 //
   2680 //       Note that reserveBlock() may grow the stack, resulting in the
   2681 //       whole thing being relocated in memory.
   2682 //
   2683 //    Parameters:
   2684 //       fp           The top frame pointer when called.  At return, a new
   2685 //                    fame will be present
   2686 //       savePatIdx   An index into the compiled pattern.  Goes into the original
   2687 //                    (not new) frame.  If execution ever back-tracks out of the
   2688 //                    new frame, this will be where we continue from in the pattern.
   2689 //    Return
   2690 //                    The new frame pointer.
   2691 //
   2692 //--------------------------------------------------------------------------------
   2693 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
   2694     if (U_FAILURE(status)) {
   2695         return fp;
   2696     }
   2697     // push storage for a new frame.
   2698     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
   2699     if (U_FAILURE(status)) {
   2700         // Failure on attempted stack expansion.
   2701         //   Stack function set some other error code, change it to a more
   2702         //   specific one for regular expressions.
   2703         status = U_REGEX_STACK_OVERFLOW;
   2704         // We need to return a writable stack frame, so just return the
   2705         //    previous frame.  The match operation will stop quickly
   2706         //    because of the error status, after which the frame will never
   2707         //    be looked at again.
   2708         return fp;
   2709     }
   2710     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
   2711 
   2712     // New stack frame = copy of old top frame.
   2713     int64_t *source = (int64_t *)fp;
   2714     int64_t *dest   = newFP;
   2715     for (;;) {
   2716         *dest++ = *source++;
   2717         if (source == newFP) {
   2718             break;
   2719         }
   2720     }
   2721 
   2722     fTickCounter--;
   2723     if (fTickCounter <= 0) {
   2724        IncrementTime(status);    // Re-initializes fTickCounter
   2725     }
   2726     fp->fPatIdx = savePatIdx;
   2727     return (REStackFrame *)newFP;
   2728 }
   2729 
   2730 #if defined(REGEX_DEBUG)
   2731 namespace {
   2732 UnicodeString StringFromUText(UText *ut) {
   2733     UnicodeString result;
   2734     for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
   2735         result.append(c);
   2736     }
   2737     return result;
   2738 }
   2739 }
   2740 #endif // REGEX_DEBUG
   2741 
   2742 
   2743 //--------------------------------------------------------------------------------
   2744 //
   2745 //   MatchAt      This is the actual matching engine.
   2746 //
   2747 //                  startIdx:    begin matching a this index.
   2748 //                  toEnd:       if true, match must extend to end of the input region
   2749 //
   2750 //--------------------------------------------------------------------------------
   2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
   2752     UBool       isMatch  = FALSE;      // True if the we have a match.
   2753 
   2754     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
   2755 
   2756     int32_t     op;                    // Operation from the compiled pattern, split into
   2757     int32_t     opType;                //    the opcode
   2758     int32_t     opValue;               //    and the operand value.
   2759 
   2760 #ifdef REGEX_RUN_DEBUG
   2761     if (fTraceDebug) {
   2762         printf("MatchAt(startIdx=%ld)\n", startIdx);
   2763         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
   2764         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
   2765     }
   2766 #endif
   2767 
   2768     if (U_FAILURE(status)) {
   2769         return;
   2770     }
   2771 
   2772     //  Cache frequently referenced items from the compiled pattern
   2773     //
   2774     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   2775 
   2776     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   2777     UVector             *sets          = fPattern->fSets;
   2778 
   2779     fFrameSize = fPattern->fFrameSize;
   2780     REStackFrame        *fp            = resetStack();
   2781     if (U_FAILURE(fDeferredStatus)) {
   2782         status = fDeferredStatus;
   2783         return;
   2784     }
   2785 
   2786     fp->fPatIdx   = 0;
   2787     fp->fInputIdx = startIdx;
   2788 
   2789     // Zero out the pattern's static data
   2790     int32_t i;
   2791     for (i = 0; i<fPattern->fDataSize; i++) {
   2792         fData[i] = 0;
   2793     }
   2794 
   2795     //
   2796     //  Main loop for interpreting the compiled pattern.
   2797     //  One iteration of the loop per pattern operation performed.
   2798     //
   2799     for (;;) {
   2800         op      = (int32_t)pat[fp->fPatIdx];
   2801         opType  = URX_TYPE(op);
   2802         opValue = URX_VAL(op);
   2803 #ifdef REGEX_RUN_DEBUG
   2804         if (fTraceDebug) {
   2805             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2806             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
   2807                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   2808             fPattern->dumpOp(fp->fPatIdx);
   2809         }
   2810 #endif
   2811         fp->fPatIdx++;
   2812 
   2813         switch (opType) {
   2814 
   2815 
   2816         case URX_NOP:
   2817             break;
   2818 
   2819 
   2820         case URX_BACKTRACK:
   2821             // Force a backtrack.  In some circumstances, the pattern compiler
   2822             //   will notice that the pattern can't possibly match anything, and will
   2823             //   emit one of these at that point.
   2824             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2825             break;
   2826 
   2827 
   2828         case URX_ONECHAR:
   2829             if (fp->fInputIdx < fActiveLimit) {
   2830                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2831                 UChar32 c = UTEXT_NEXT32(fInputText);
   2832                 if (c == opValue) {
   2833                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2834                     break;
   2835                 }
   2836             } else {
   2837                 fHitEnd = TRUE;
   2838             }
   2839             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2840             break;
   2841 
   2842 
   2843         case URX_STRING:
   2844             {
   2845                 // Test input against a literal string.
   2846                 // Strings require two slots in the compiled pattern, one for the
   2847                 //   offset to the string text, and one for the length.
   2848 
   2849                 int32_t   stringStartIdx = opValue;
   2850                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   2851                 fp->fPatIdx++;
   2852                 opType    = URX_TYPE(op);
   2853                 int32_t stringLen = URX_VAL(op);
   2854                 U_ASSERT(opType == URX_STRING_LEN);
   2855                 U_ASSERT(stringLen >= 2);
   2856 
   2857                 const UChar *patternString = litText+stringStartIdx;
   2858                 int32_t patternStringIndex = 0;
   2859                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2860                 UChar32 inputChar;
   2861                 UChar32 patternChar;
   2862                 UBool success = TRUE;
   2863                 while (patternStringIndex < stringLen) {
   2864                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
   2865                         success = FALSE;
   2866                         fHitEnd = TRUE;
   2867                         break;
   2868                     }
   2869                     inputChar = UTEXT_NEXT32(fInputText);
   2870                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
   2871                     if (patternChar != inputChar) {
   2872                         success = FALSE;
   2873                         break;
   2874                     }
   2875                 }
   2876 
   2877                 if (success) {
   2878                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2879                 } else {
   2880                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2881                 }
   2882             }
   2883             break;
   2884 
   2885 
   2886         case URX_STATE_SAVE:
   2887             fp = StateSave(fp, opValue, status);
   2888             break;
   2889 
   2890 
   2891         case URX_END:
   2892             // The match loop will exit via this path on a successful match,
   2893             //   when we reach the end of the pattern.
   2894             if (toEnd && fp->fInputIdx != fActiveLimit) {
   2895                 // The pattern matched, but not to the end of input.  Try some more.
   2896                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2897                 break;
   2898             }
   2899             isMatch = TRUE;
   2900             goto  breakFromLoop;
   2901 
   2902         // Start and End Capture stack frame variables are laid out out like this:
   2903             //  fp->fExtra[opValue]  - The start of a completed capture group
   2904             //             opValue+1 - The end   of a completed capture group
   2905             //             opValue+2 - the start of a capture group whose end
   2906             //                          has not yet been reached (and might not ever be).
   2907         case URX_START_CAPTURE:
   2908             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2909             fp->fExtra[opValue+2] = fp->fInputIdx;
   2910             break;
   2911 
   2912 
   2913         case URX_END_CAPTURE:
   2914             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2915             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   2916             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   2917             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   2918             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   2919             break;
   2920 
   2921 
   2922         case URX_DOLLAR:                   //  $, test for End of line
   2923                                            //     or for position before new line at end of input
   2924             {
   2925                 if (fp->fInputIdx >= fAnchorLimit) {
   2926                     // We really are at the end of input.  Success.
   2927                     fHitEnd = TRUE;
   2928                     fRequireEnd = TRUE;
   2929                     break;
   2930                 }
   2931 
   2932                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2933 
   2934                 // If we are positioned just before a new-line that is located at the
   2935                 //   end of input, succeed.
   2936                 UChar32 c = UTEXT_NEXT32(fInputText);
   2937                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2938                     if (isLineTerminator(c)) {
   2939                         // If not in the middle of a CR/LF sequence
   2940                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
   2941                             // At new-line at end of input. Success
   2942                             fHitEnd = TRUE;
   2943                             fRequireEnd = TRUE;
   2944 
   2945                             break;
   2946                         }
   2947                     }
   2948                 } else {
   2949                     UChar32 nextC = UTEXT_NEXT32(fInputText);
   2950                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2951                         fHitEnd = TRUE;
   2952                         fRequireEnd = TRUE;
   2953                         break;                         // At CR/LF at end of input.  Success
   2954                     }
   2955                 }
   2956 
   2957                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2958             }
   2959             break;
   2960 
   2961 
   2962          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   2963             if (fp->fInputIdx >= fAnchorLimit) {
   2964                 // Off the end of input.  Success.
   2965                 fHitEnd = TRUE;
   2966                 fRequireEnd = TRUE;
   2967                 break;
   2968             } else {
   2969                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2970                 UChar32 c = UTEXT_NEXT32(fInputText);
   2971                 // Either at the last character of input, or off the end.
   2972                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
   2973                     fHitEnd = TRUE;
   2974                     fRequireEnd = TRUE;
   2975                     break;
   2976                 }
   2977             }
   2978 
   2979             // Not at end of input.  Back-track out.
   2980             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2981             break;
   2982 
   2983 
   2984          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   2985              {
   2986                  if (fp->fInputIdx >= fAnchorLimit) {
   2987                      // We really are at the end of input.  Success.
   2988                      fHitEnd = TRUE;
   2989                      fRequireEnd = TRUE;
   2990                      break;
   2991                  }
   2992                  // If we are positioned just before a new-line, succeed.
   2993                  // It makes no difference where the new-line is within the input.
   2994                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2995                  UChar32 c = UTEXT_CURRENT32(fInputText);
   2996                  if (isLineTerminator(c)) {
   2997                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   2998                      //  In multi-line mode, hitting a new-line just before the end of input does not
   2999                      //   set the hitEnd or requireEnd flags
   3000                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
   3001                         break;
   3002                      }
   3003                  }
   3004                  // not at a new line.  Fail.
   3005                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3006              }
   3007              break;
   3008 
   3009 
   3010          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   3011              {
   3012                  if (fp->fInputIdx >= fAnchorLimit) {
   3013                      // We really are at the end of input.  Success.
   3014                      fHitEnd = TRUE;
   3015                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   3016                      break;               //   adding a new-line would not lose the match.
   3017                  }
   3018                  // If we are not positioned just before a new-line, the test fails; backtrack out.
   3019                  // It makes no difference where the new-line is within the input.
   3020                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3021                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
   3022                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3023                  }
   3024              }
   3025              break;
   3026 
   3027 
   3028        case URX_CARET:                    //  ^, test for start of line
   3029             if (fp->fInputIdx != fAnchorStart) {
   3030                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3031             }
   3032             break;
   3033 
   3034 
   3035        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   3036            {
   3037                if (fp->fInputIdx == fAnchorStart) {
   3038                    // We are at the start input.  Success.
   3039                    break;
   3040                }
   3041                // Check whether character just before the current pos is a new-line
   3042                //   unless we are at the end of input
   3043                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3044                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3045                if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
   3046                    //  It's a new-line.  ^ is true.  Success.
   3047                    //  TODO:  what should be done with positions between a CR and LF?
   3048                    break;
   3049                }
   3050                // Not at the start of a line.  Fail.
   3051                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3052            }
   3053            break;
   3054 
   3055 
   3056        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   3057            {
   3058                U_ASSERT(fp->fInputIdx >= fAnchorStart);
   3059                if (fp->fInputIdx <= fAnchorStart) {
   3060                    // We are at the start input.  Success.
   3061                    break;
   3062                }
   3063                // Check whether character just before the current pos is a new-line
   3064                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   3065                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3066                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3067                if (c != 0x0a) {
   3068                    // Not at the start of a line.  Back-track out.
   3069                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3070                }
   3071            }
   3072            break;
   3073 
   3074         case URX_BACKSLASH_B:          // Test for word boundaries
   3075             {
   3076                 UBool success = isWordBoundary(fp->fInputIdx);
   3077                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   3078                 if (!success) {
   3079                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3080                 }
   3081             }
   3082             break;
   3083 
   3084 
   3085         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   3086             {
   3087                 UBool success = isUWordBoundary(fp->fInputIdx);
   3088                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   3089                 if (!success) {
   3090                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3091                 }
   3092             }
   3093             break;
   3094 
   3095 
   3096         case URX_BACKSLASH_D:            // Test for decimal digit
   3097             {
   3098                 if (fp->fInputIdx >= fActiveLimit) {
   3099                     fHitEnd = TRUE;
   3100                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3101                     break;
   3102                 }
   3103 
   3104                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3105 
   3106                 UChar32 c = UTEXT_NEXT32(fInputText);
   3107                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   3108                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   3109                 success ^= (UBool)(opValue != 0);        // flip sense for \D
   3110                 if (success) {
   3111                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3112                 } else {
   3113                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3114                 }
   3115             }
   3116             break;
   3117 
   3118 
   3119         case URX_BACKSLASH_G:          // Test for position at end of previous match
   3120             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   3121                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3122             }
   3123             break;
   3124 
   3125 
   3126         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
   3127             {
   3128                 if (fp->fInputIdx >= fActiveLimit) {
   3129                     fHitEnd = TRUE;
   3130                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3131                     break;
   3132                 }
   3133                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3134                 UChar32 c = UTEXT_NEXT32(fInputText);
   3135                 int8_t ctype = u_charType(c);
   3136                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
   3137                 success ^= (UBool)(opValue != 0);        // flip sense for \H
   3138                 if (success) {
   3139                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3140                 } else {
   3141                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3142                 }
   3143             }
   3144             break;
   3145 
   3146 
   3147         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
   3148             {
   3149                 if (fp->fInputIdx >= fActiveLimit) {
   3150                     fHitEnd = TRUE;
   3151                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3152                     break;
   3153                 }
   3154                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3155                 UChar32 c = UTEXT_NEXT32(fInputText);
   3156                 if (isLineTerminator(c)) {
   3157                     if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
   3158                         utext_next32(fInputText);
   3159                     }
   3160                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3161                 } else {
   3162                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3163                 }
   3164             }
   3165             break;
   3166 
   3167 
   3168         case URX_BACKSLASH_V:            // \v, any single line ending character.
   3169             {
   3170                 if (fp->fInputIdx >= fActiveLimit) {
   3171                     fHitEnd = TRUE;
   3172                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3173                     break;
   3174                 }
   3175                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3176                 UChar32 c = UTEXT_NEXT32(fInputText);
   3177                 UBool success = isLineTerminator(c);
   3178                 success ^= (UBool)(opValue != 0);        // flip sense for \V
   3179                 if (success) {
   3180                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3181                 } else {
   3182                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3183                 }
   3184             }
   3185             break;
   3186 
   3187 
   3188         case URX_BACKSLASH_X:
   3189             //  Match a Grapheme, as defined by Unicode TR 29.
   3190             //  Differs slightly from Perl, which consumes combining marks independently
   3191             //    of context.
   3192             {
   3193 
   3194                 // Fail if at end of input
   3195                 if (fp->fInputIdx >= fActiveLimit) {
   3196                     fHitEnd = TRUE;
   3197                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3198                     break;
   3199                 }
   3200 
   3201                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3202 
   3203                 // Examine (and consume) the current char.
   3204                 //   Dispatch into a little state machine, based on the char.
   3205                 UChar32  c;
   3206                 c = UTEXT_NEXT32(fInputText);
   3207                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3208                 UnicodeSet **sets = fPattern->fStaticSets;
   3209                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   3210                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   3211                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3212                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3213                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3214                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3215                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3216                 goto GC_Extend;
   3217 
   3218 
   3219 
   3220 GC_L:
   3221                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3222                 c = UTEXT_NEXT32(fInputText);
   3223                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3224                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3225                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3226                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3227                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3228                 (void)UTEXT_PREVIOUS32(fInputText);
   3229                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3230                 goto GC_Extend;
   3231 
   3232 GC_V:
   3233                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3234                 c = UTEXT_NEXT32(fInputText);
   3235                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3236                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3237                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3238                 (void)UTEXT_PREVIOUS32(fInputText);
   3239                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3240                 goto GC_Extend;
   3241 
   3242 GC_T:
   3243                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3244                 c = UTEXT_NEXT32(fInputText);
   3245                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3246                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3247                 (void)UTEXT_PREVIOUS32(fInputText);
   3248                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3249                 goto GC_Extend;
   3250 
   3251 GC_Extend:
   3252                 // Combining characters are consumed here
   3253                 for (;;) {
   3254                     if (fp->fInputIdx >= fActiveLimit) {
   3255                         break;
   3256                     }
   3257                     c = UTEXT_CURRENT32(fInputText);
   3258                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   3259                         break;
   3260                     }
   3261                     (void)UTEXT_NEXT32(fInputText);
   3262                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3263                 }
   3264                 goto GC_Done;
   3265 
   3266 GC_Control:
   3267                 // Most control chars stand alone (don't combine with combining chars),
   3268                 //   except for that CR/LF sequence is a single grapheme cluster.
   3269                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
   3270                     c = UTEXT_NEXT32(fInputText);
   3271                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3272                 }
   3273 
   3274 GC_Done:
   3275                 if (fp->fInputIdx >= fActiveLimit) {
   3276                     fHitEnd = TRUE;
   3277                 }
   3278                 break;
   3279             }
   3280 
   3281 
   3282 
   3283 
   3284         case URX_BACKSLASH_Z:          // Test for end of Input
   3285             if (fp->fInputIdx < fAnchorLimit) {
   3286                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3287             } else {
   3288                 fHitEnd = TRUE;
   3289                 fRequireEnd = TRUE;
   3290             }
   3291             break;
   3292 
   3293 
   3294 
   3295         case URX_STATIC_SETREF:
   3296             {
   3297                 // Test input character against one of the predefined sets
   3298                 //    (Word Characters, for example)
   3299                 // The high bit of the op value is a flag for the match polarity.
   3300                 //    0:   success if input char is in set.
   3301                 //    1:   success if input char is not in set.
   3302                 if (fp->fInputIdx >= fActiveLimit) {
   3303                     fHitEnd = TRUE;
   3304                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3305                     break;
   3306                 }
   3307 
   3308                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   3309                 opValue &= ~URX_NEG_SET;
   3310                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3311 
   3312                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3313                 UChar32 c = UTEXT_NEXT32(fInputText);
   3314                 if (c < 256) {
   3315                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3316                     if (s8->contains(c)) {
   3317                         success = !success;
   3318                     }
   3319                 } else {
   3320                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3321                     if (s->contains(c)) {
   3322                         success = !success;
   3323                     }
   3324                 }
   3325                 if (success) {
   3326                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3327                 } else {
   3328                     // the character wasn't in the set.
   3329                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3330                 }
   3331             }
   3332             break;
   3333 
   3334 
   3335         case URX_STAT_SETREF_N:
   3336             {
   3337                 // Test input character for NOT being a member of  one of
   3338                 //    the predefined sets (Word Characters, for example)
   3339                 if (fp->fInputIdx >= fActiveLimit) {
   3340                     fHitEnd = TRUE;
   3341                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3342                     break;
   3343                 }
   3344 
   3345                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3346 
   3347                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3348 
   3349                 UChar32 c = UTEXT_NEXT32(fInputText);
   3350                 if (c < 256) {
   3351                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3352                     if (s8->contains(c) == FALSE) {
   3353                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3354                         break;
   3355                     }
   3356                 } else {
   3357                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3358                     if (s->contains(c) == FALSE) {
   3359                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3360                         break;
   3361                     }
   3362                 }
   3363                 // the character wasn't in the set.
   3364                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3365             }
   3366             break;
   3367 
   3368 
   3369         case URX_SETREF:
   3370             if (fp->fInputIdx >= fActiveLimit) {
   3371                 fHitEnd = TRUE;
   3372                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3373                 break;
   3374             } else {
   3375                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3376 
   3377                 // There is input left.  Pick up one char and test it for set membership.
   3378                 UChar32 c = UTEXT_NEXT32(fInputText);
   3379                 U_ASSERT(opValue > 0 && opValue < sets->size());
   3380                 if (c<256) {
   3381                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3382                     if (s8->contains(c)) {
   3383                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3384                         break;
   3385                     }
   3386                 } else {
   3387                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3388                     if (s->contains(c)) {
   3389                         // The character is in the set.  A Match.
   3390                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3391                         break;
   3392                     }
   3393                 }
   3394 
   3395                 // the character wasn't in the set.
   3396                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3397             }
   3398             break;
   3399 
   3400 
   3401         case URX_DOTANY:
   3402             {
   3403                 // . matches anything, but stops at end-of-line.
   3404                 if (fp->fInputIdx >= fActiveLimit) {
   3405                     // At end of input.  Match failed.  Backtrack out.
   3406                     fHitEnd = TRUE;
   3407                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3408                     break;
   3409                 }
   3410 
   3411                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3412 
   3413                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3414                 UChar32 c = UTEXT_NEXT32(fInputText);
   3415                 if (isLineTerminator(c)) {
   3416                     // End of line in normal mode.   . does not match.
   3417                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3418                     break;
   3419                 }
   3420                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3421             }
   3422             break;
   3423 
   3424 
   3425         case URX_DOTANY_ALL:
   3426             {
   3427                 // ., in dot-matches-all (including new lines) mode
   3428                 if (fp->fInputIdx >= fActiveLimit) {
   3429                     // At end of input.  Match failed.  Backtrack out.
   3430                     fHitEnd = TRUE;
   3431                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3432                     break;
   3433                 }
   3434 
   3435                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3436 
   3437                 // There is input left.  Advance over one char, except if we are
   3438                 //   at a cr/lf, advance over both of them.
   3439                 UChar32 c;
   3440                 c = UTEXT_NEXT32(fInputText);
   3441                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3442                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   3443                     // In the case of a CR/LF, we need to advance over both.
   3444                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
   3445                     if (nextc == 0x0a) {
   3446                         (void)UTEXT_NEXT32(fInputText);
   3447                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3448                     }
   3449                 }
   3450             }
   3451             break;
   3452 
   3453 
   3454         case URX_DOTANY_UNIX:
   3455             {
   3456                 // '.' operator, matches all, but stops at end-of-line.
   3457                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   3458                 if (fp->fInputIdx >= fActiveLimit) {
   3459                     // At end of input.  Match failed.  Backtrack out.
   3460                     fHitEnd = TRUE;
   3461                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3462                     break;
   3463                 }
   3464 
   3465                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3466 
   3467                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3468                 UChar32 c = UTEXT_NEXT32(fInputText);
   3469                 if (c == 0x0a) {
   3470                     // End of line in normal mode.   '.' does not match the \n
   3471                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3472                 } else {
   3473                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3474                 }
   3475             }
   3476             break;
   3477 
   3478 
   3479         case URX_JMP:
   3480             fp->fPatIdx = opValue;
   3481             break;
   3482 
   3483         case URX_FAIL:
   3484             isMatch = FALSE;
   3485             goto breakFromLoop;
   3486 
   3487         case URX_JMP_SAV:
   3488             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   3489             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   3490             fp->fPatIdx = opValue;                         // Then JMP.
   3491             break;
   3492 
   3493         case URX_JMP_SAV_X:
   3494             // This opcode is used with (x)+, when x can match a zero length string.
   3495             // Same as JMP_SAV, except conditional on the match having made forward progress.
   3496             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   3497             //   data address of the input position at the start of the loop.
   3498             {
   3499                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   3500                 int32_t  stoOp = (int32_t)pat[opValue-1];
   3501                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   3502                 int32_t  frameLoc = URX_VAL(stoOp);
   3503                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   3504                 int64_t prevInputIdx = fp->fExtra[frameLoc];
   3505                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   3506                 if (prevInputIdx < fp->fInputIdx) {
   3507                     // The match did make progress.  Repeat the loop.
   3508                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   3509                     fp->fPatIdx = opValue;
   3510                     fp->fExtra[frameLoc] = fp->fInputIdx;
   3511                 }
   3512                 // If the input position did not advance, we do nothing here,
   3513                 //   execution will fall out of the loop.
   3514             }
   3515             break;
   3516 
   3517         case URX_CTR_INIT:
   3518             {
   3519                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3520                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   3521 
   3522                 // Pick up the three extra operands that CTR_INIT has, and
   3523                 //    skip the pattern location counter past
   3524                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3525                 fp->fPatIdx += 3;
   3526                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3527                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3528                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3529                 U_ASSERT(minCount>=0);
   3530                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3531                 U_ASSERT(loopLoc>=fp->fPatIdx);
   3532 
   3533                 if (minCount == 0) {
   3534                     fp = StateSave(fp, loopLoc+1, status);
   3535                 }
   3536                 if (maxCount == -1) {
   3537                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
   3538                 } else if (maxCount == 0) {
   3539                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3540                 }
   3541             }
   3542             break;
   3543 
   3544         case URX_CTR_LOOP:
   3545             {
   3546                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3547                 int32_t initOp = (int32_t)pat[opValue];
   3548                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   3549                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3550                 int32_t minCount  = (int32_t)pat[opValue+2];
   3551                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3552                 (*pCounter)++;
   3553                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   3554                     U_ASSERT(*pCounter == maxCount);
   3555                     break;
   3556                 }
   3557                 if (*pCounter >= minCount) {
   3558                     if (maxCount == -1) {
   3559                         // Loop has no hard upper bound.
   3560                         // Check that it is progressing through the input, break if it is not.
   3561                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   3562                         if (fp->fInputIdx == *pLastInputIdx) {
   3563                             break;
   3564                         } else {
   3565                             *pLastInputIdx = fp->fInputIdx;
   3566                         }
   3567                     }
   3568                     fp = StateSave(fp, fp->fPatIdx, status);
   3569                 } else {
   3570                     // Increment time-out counter. (StateSave() does it if count >= minCount)
   3571                     fTickCounter--;
   3572                     if (fTickCounter <= 0) {
   3573                         IncrementTime(status);    // Re-initializes fTickCounter
   3574                     }
   3575                 }
   3576 
   3577                 fp->fPatIdx = opValue + 4;    // Loop back.
   3578             }
   3579             break;
   3580 
   3581         case URX_CTR_INIT_NG:
   3582             {
   3583                 // Initialize a non-greedy loop
   3584                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3585                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   3586 
   3587                 // Pick up the three extra operands that CTR_INIT_NG has, and
   3588                 //    skip the pattern location counter past
   3589                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3590                 fp->fPatIdx += 3;
   3591                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3592                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3593                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3594                 U_ASSERT(minCount>=0);
   3595                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3596                 U_ASSERT(loopLoc>fp->fPatIdx);
   3597                 if (maxCount == -1) {
   3598                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
   3599                 }
   3600 
   3601                 if (minCount == 0) {
   3602                     if (maxCount != 0) {
   3603                         fp = StateSave(fp, fp->fPatIdx, status);
   3604                     }
   3605                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   3606                 }
   3607             }
   3608             break;
   3609 
   3610         case URX_CTR_LOOP_NG:
   3611             {
   3612                 // Non-greedy {min, max} loops
   3613                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3614                 int32_t initOp = (int32_t)pat[opValue];
   3615                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   3616                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3617                 int32_t minCount  = (int32_t)pat[opValue+2];
   3618                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3619 
   3620                 (*pCounter)++;
   3621                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   3622                     // The loop has matched the maximum permitted number of times.
   3623                     //   Break out of here with no action.  Matching will
   3624                     //   continue with the following pattern.
   3625                     U_ASSERT(*pCounter == maxCount);
   3626                     break;
   3627                 }
   3628 
   3629                 if (*pCounter < minCount) {
   3630                     // We haven't met the minimum number of matches yet.
   3631                     //   Loop back for another one.
   3632                     fp->fPatIdx = opValue + 4;    // Loop back.
   3633                     // Increment time-out counter. (StateSave() does it if count >= minCount)
   3634                     fTickCounter--;
   3635                     if (fTickCounter <= 0) {
   3636                         IncrementTime(status);    // Re-initializes fTickCounter
   3637                     }
   3638                 } else {
   3639                     // We do have the minimum number of matches.
   3640 
   3641                     // If there is no upper bound on the loop iterations, check that the input index
   3642                     // is progressing, and stop the loop if it is not.
   3643                     if (maxCount == -1) {
   3644                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   3645                         if (fp->fInputIdx == *pLastInputIdx) {
   3646                             break;
   3647                         }
   3648                         *pLastInputIdx = fp->fInputIdx;
   3649                     }
   3650 
   3651                     // Loop Continuation: we will fall into the pattern following the loop
   3652                     //   (non-greedy, don't execute loop body first), but first do
   3653                     //   a state save to the top of the loop, so that a match failure
   3654                     //   in the following pattern will try another iteration of the loop.
   3655                     fp = StateSave(fp, opValue + 4, status);
   3656                 }
   3657             }
   3658             break;
   3659 
   3660         case URX_STO_SP:
   3661             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3662             fData[opValue] = fStack->size();
   3663             break;
   3664 
   3665         case URX_LD_SP:
   3666             {
   3667                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3668                 int32_t newStackSize = (int32_t)fData[opValue];
   3669                 U_ASSERT(newStackSize <= fStack->size());
   3670                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3671                 if (newFP == (int64_t *)fp) {
   3672                     break;
   3673                 }
   3674                 int32_t i;
   3675                 for (i=0; i<fFrameSize; i++) {
   3676                     newFP[i] = ((int64_t *)fp)[i];
   3677                 }
   3678                 fp = (REStackFrame *)newFP;
   3679                 fStack->setSize(newStackSize);
   3680             }
   3681             break;
   3682 
   3683         case URX_BACKREF:
   3684             {
   3685                 U_ASSERT(opValue < fFrameSize);
   3686                 int64_t groupStartIdx = fp->fExtra[opValue];
   3687                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3688                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3689                 if (groupStartIdx < 0) {
   3690                     // This capture group has not participated in the match thus far,
   3691                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3692                     break;
   3693                 }
   3694                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
   3695                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3696 
   3697                 //   Note: if the capture group match was of an empty string the backref
   3698                 //         match succeeds.  Verified by testing:  Perl matches succeed
   3699                 //         in this case, so we do too.
   3700 
   3701                 UBool success = TRUE;
   3702                 for (;;) {
   3703                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
   3704                         success = TRUE;
   3705                         break;
   3706                     }
   3707                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
   3708                         success = FALSE;
   3709                         fHitEnd = TRUE;
   3710                         break;
   3711                     }
   3712                     UChar32 captureGroupChar = utext_next32(fAltInputText);
   3713                     UChar32 inputChar = utext_next32(fInputText);
   3714                     if (inputChar != captureGroupChar) {
   3715                         success = FALSE;
   3716                         break;
   3717                     }
   3718                 }
   3719 
   3720                 if (success) {
   3721                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3722                 } else {
   3723                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3724                 }
   3725             }
   3726             break;
   3727 
   3728 
   3729 
   3730         case URX_BACKREF_I:
   3731             {
   3732                 U_ASSERT(opValue < fFrameSize);
   3733                 int64_t groupStartIdx = fp->fExtra[opValue];
   3734                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3735                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3736                 if (groupStartIdx < 0) {
   3737                     // This capture group has not participated in the match thus far,
   3738                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3739                     break;
   3740                 }
   3741                 utext_setNativeIndex(fAltInputText, groupStartIdx);
   3742                 utext_setNativeIndex(fInputText, fp->fInputIdx);
   3743                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
   3744                 CaseFoldingUTextIterator inputItr(*fInputText);
   3745 
   3746                 //   Note: if the capture group match was of an empty string the backref
   3747                 //         match succeeds.  Verified by testing:  Perl matches succeed
   3748                 //         in this case, so we do too.
   3749 
   3750                 UBool success = TRUE;
   3751                 for (;;) {
   3752                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
   3753                         success = TRUE;
   3754                         break;
   3755                     }
   3756                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
   3757                         success = FALSE;
   3758                         fHitEnd = TRUE;
   3759                         break;
   3760                     }
   3761                     UChar32 captureGroupChar = captureGroupItr.next();
   3762                     UChar32 inputChar = inputItr.next();
   3763                     if (inputChar != captureGroupChar) {
   3764                         success = FALSE;
   3765                         break;
   3766                     }
   3767                 }
   3768 
   3769                 if (success && inputItr.inExpansion()) {
   3770                     // We otained a match by consuming part of a string obtained from
   3771                     // case-folding a single code point of the input text.
   3772                     // This does not count as an overall match.
   3773                     success = FALSE;
   3774                 }
   3775 
   3776                 if (success) {
   3777                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3778                 } else {
   3779                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3780                 }
   3781 
   3782             }
   3783             break;
   3784 
   3785         case URX_STO_INP_LOC:
   3786             {
   3787                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   3788                 fp->fExtra[opValue] = fp->fInputIdx;
   3789             }
   3790             break;
   3791 
   3792         case URX_JMPX:
   3793             {
   3794                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3795                 fp->fPatIdx += 1;
   3796                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   3797                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   3798                 int64_t savedInputIdx = fp->fExtra[dataLoc];
   3799                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   3800                 if (savedInputIdx < fp->fInputIdx) {
   3801                     fp->fPatIdx = opValue;                               // JMP
   3802                 } else {
   3803                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   3804                 }
   3805             }
   3806             break;
   3807 
   3808         case URX_LA_START:
   3809             {
   3810                 // Entering a lookahead block.
   3811                 // Save Stack Ptr, Input Pos.
   3812                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3813                 fData[opValue]   = fStack->size();
   3814                 fData[opValue+1] = fp->fInputIdx;
   3815                 fActiveStart     = fLookStart;          // Set the match region change for
   3816                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   3817             }
   3818             break;
   3819 
   3820         case URX_LA_END:
   3821             {
   3822                 // Leaving a look-ahead block.
   3823                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   3824                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3825                 int32_t stackSize = fStack->size();
   3826                 int32_t newStackSize =(int32_t)fData[opValue];
   3827                 U_ASSERT(stackSize >= newStackSize);
   3828                 if (stackSize > newStackSize) {
   3829                     // Copy the current top frame back to the new (cut back) top frame.
   3830                     //   This makes the capture groups from within the look-ahead
   3831                     //   expression available.
   3832                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3833                     int32_t i;
   3834                     for (i=0; i<fFrameSize; i++) {
   3835                         newFP[i] = ((int64_t *)fp)[i];
   3836                     }
   3837                     fp = (REStackFrame *)newFP;
   3838                     fStack->setSize(newStackSize);
   3839                 }
   3840                 fp->fInputIdx = fData[opValue+1];
   3841 
   3842                 // Restore the active region bounds in the input string; they may have
   3843                 //    been changed because of transparent bounds on a Region.
   3844                 fActiveStart = fRegionStart;
   3845                 fActiveLimit = fRegionLimit;
   3846             }
   3847             break;
   3848 
   3849         case URX_ONECHAR_I:
   3850             // Case insensitive one char.  The char from the pattern is already case folded.
   3851             // Input text is not, but case folding the input can not reduce two or more code
   3852             // points to one.
   3853             if (fp->fInputIdx < fActiveLimit) {
   3854                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3855 
   3856                 UChar32 c = UTEXT_NEXT32(fInputText);
   3857                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3858                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3859                     break;
   3860                 }
   3861             } else {
   3862                 fHitEnd = TRUE;
   3863             }
   3864 
   3865             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3866             break;
   3867 
   3868         case URX_STRING_I:
   3869             {
   3870                 // Case-insensitive test input against a literal string.
   3871                 // Strings require two slots in the compiled pattern, one for the
   3872                 //   offset to the string text, and one for the length.
   3873                 //   The compiled string has already been case folded.
   3874                 {
   3875                     const UChar *patternString = litText + opValue;
   3876                     int32_t      patternStringIdx  = 0;
   3877 
   3878                     op      = (int32_t)pat[fp->fPatIdx];
   3879                     fp->fPatIdx++;
   3880                     opType  = URX_TYPE(op);
   3881                     opValue = URX_VAL(op);
   3882                     U_ASSERT(opType == URX_STRING_LEN);
   3883                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
   3884 
   3885 
   3886                     UChar32   cPattern;
   3887                     UChar32   cText;
   3888                     UBool     success = TRUE;
   3889 
   3890                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3891                     CaseFoldingUTextIterator inputIterator(*fInputText);
   3892                     while (patternStringIdx < patternStringLen) {
   3893                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
   3894                             success = FALSE;
   3895                             fHitEnd = TRUE;
   3896                             break;
   3897                         }
   3898                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
   3899                         cText = inputIterator.next();
   3900                         if (cText != cPattern) {
   3901                             success = FALSE;
   3902                             break;
   3903                         }
   3904                     }
   3905                     if (inputIterator.inExpansion()) {
   3906                         success = FALSE;
   3907                     }
   3908 
   3909                     if (success) {
   3910                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3911                     } else {
   3912                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3913                     }
   3914                 }
   3915             }
   3916             break;
   3917 
   3918         case URX_LB_START:
   3919             {
   3920                 // Entering a look-behind block.
   3921                 // Save Stack Ptr, Input Pos.
   3922                 //   TODO:  implement transparent bounds.  Ticket #6067
   3923                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3924                 fData[opValue]   = fStack->size();
   3925                 fData[opValue+1] = fp->fInputIdx;
   3926                 // Init the variable containing the start index for attempted matches.
   3927                 fData[opValue+2] = -1;
   3928                 // Save input string length, then reset to pin any matches to end at
   3929                 //   the current position.
   3930                 fData[opValue+3] = fActiveLimit;
   3931                 fActiveLimit     = fp->fInputIdx;
   3932             }
   3933             break;
   3934 
   3935 
   3936         case URX_LB_CONT:
   3937             {
   3938                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   3939                 //    at all possible input starting positions.
   3940 
   3941                 // Fetch the min and max possible match lengths.  They are the operands
   3942                 //   of this op in the pattern.
   3943                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   3944                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   3945                 if (!UTEXT_USES_U16(fInputText)) {
   3946                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
   3947                     // The max length need not be exact; it just needs to be >= actual maximum.
   3948                     maxML *= 3;
   3949                 }
   3950                 U_ASSERT(minML <= maxML);
   3951                 U_ASSERT(minML >= 0);
   3952 
   3953                 // Fetch (from data) the last input index where a match was attempted.
   3954                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3955                 int64_t  &lbStartIdx = fData[opValue+2];
   3956                 if (lbStartIdx < 0) {
   3957                     // First time through loop.
   3958                     lbStartIdx = fp->fInputIdx - minML;
   3959                     if (lbStartIdx > 0) {
   3960                         // move index to a code point boudary, if it's not on one already.
   3961                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
   3962                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3963                     }
   3964                 } else {
   3965                     // 2nd through nth time through the loop.
   3966                     // Back up start position for match by one.
   3967                     if (lbStartIdx == 0) {
   3968                         (lbStartIdx)--;
   3969                     } else {
   3970                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
   3971                         (void)UTEXT_PREVIOUS32(fInputText);
   3972                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3973                     }
   3974                 }
   3975 
   3976                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
   3977                     // We have tried all potential match starting points without
   3978                     //  getting a match.  Backtrack out, and out of the
   3979                     //   Look Behind altogether.
   3980                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3981                     int64_t restoreInputLen = fData[opValue+3];
   3982                     U_ASSERT(restoreInputLen >= fActiveLimit);
   3983                     U_ASSERT(restoreInputLen <= fInputLength);
   3984                     fActiveLimit = restoreInputLen;
   3985                     break;
   3986                 }
   3987 
   3988                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   3989                 //      (successful match will fall off the end of the loop.)
   3990                 fp = StateSave(fp, fp->fPatIdx-3, status);
   3991                 fp->fInputIdx = lbStartIdx;
   3992             }
   3993             break;
   3994 
   3995         case URX_LB_END:
   3996             // End of a look-behind block, after a successful match.
   3997             {
   3998                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3999                 if (fp->fInputIdx != fActiveLimit) {
   4000                     //  The look-behind expression matched, but the match did not
   4001                     //    extend all the way to the point that we are looking behind from.
   4002                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4003                     //     will retry the match starting at another position or fail
   4004                     //     the look-behind altogether, whichever is appropriate.
   4005                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4006                     break;
   4007                 }
   4008 
   4009                 // Look-behind match is good.  Restore the orignal input string length,
   4010                 //   which had been truncated to pin the end of the lookbehind match to the
   4011                 //   position being looked-behind.
   4012                 int64_t originalInputLen = fData[opValue+3];
   4013                 U_ASSERT(originalInputLen >= fActiveLimit);
   4014                 U_ASSERT(originalInputLen <= fInputLength);
   4015                 fActiveLimit = originalInputLen;
   4016             }
   4017             break;
   4018 
   4019 
   4020         case URX_LBN_CONT:
   4021             {
   4022                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   4023                 //    at all possible input starting positions.
   4024 
   4025                 // Fetch the extra parameters of this op.
   4026                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   4027                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   4028                 if (!UTEXT_USES_U16(fInputText)) {
   4029                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
   4030                     // The max length need not be exact; it just needs to be >= actual maximum.
   4031                     maxML *= 3;
   4032                 }
   4033                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   4034                         continueLoc = URX_VAL(continueLoc);
   4035                 U_ASSERT(minML <= maxML);
   4036                 U_ASSERT(minML >= 0);
   4037                 U_ASSERT(continueLoc > fp->fPatIdx);
   4038 
   4039                 // Fetch (from data) the last input index where a match was attempted.
   4040                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4041                 int64_t  &lbStartIdx = fData[opValue+2];
   4042                 if (lbStartIdx < 0) {
   4043                     // First time through loop.
   4044                     lbStartIdx = fp->fInputIdx - minML;
   4045                     if (lbStartIdx > 0) {
   4046                         // move index to a code point boudary, if it's not on one already.
   4047                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
   4048                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4049                     }
   4050                 } else {
   4051                     // 2nd through nth time through the loop.
   4052                     // Back up start position for match by one.
   4053                     if (lbStartIdx == 0) {
   4054                         (lbStartIdx)--;
   4055                     } else {
   4056                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
   4057                         (void)UTEXT_PREVIOUS32(fInputText);
   4058                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4059                     }
   4060                 }
   4061 
   4062                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
   4063                     // We have tried all potential match starting points without
   4064                     //  getting a match, which means that the negative lookbehind as
   4065                     //  a whole has succeeded.  Jump forward to the continue location
   4066                     int64_t restoreInputLen = fData[opValue+3];
   4067                     U_ASSERT(restoreInputLen >= fActiveLimit);
   4068                     U_ASSERT(restoreInputLen <= fInputLength);
   4069                     fActiveLimit = restoreInputLen;
   4070                     fp->fPatIdx = continueLoc;
   4071                     break;
   4072                 }
   4073 
   4074                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   4075                 //      (successful match will cause a FAIL out of the loop altogether.)
   4076                 fp = StateSave(fp, fp->fPatIdx-4, status);
   4077                 fp->fInputIdx = lbStartIdx;
   4078             }
   4079             break;
   4080 
   4081         case URX_LBN_END:
   4082             // End of a negative look-behind block, after a successful match.
   4083             {
   4084                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4085                 if (fp->fInputIdx != fActiveLimit) {
   4086                     //  The look-behind expression matched, but the match did not
   4087                     //    extend all the way to the point that we are looking behind from.
   4088                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4089                     //     will retry the match starting at another position or succeed
   4090                     //     the look-behind altogether, whichever is appropriate.
   4091                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4092                     break;
   4093                 }
   4094 
   4095                 // Look-behind expression matched, which means look-behind test as
   4096                 //   a whole Fails
   4097 
   4098                 //   Restore the orignal input string length, which had been truncated
   4099                 //   inorder to pin the end of the lookbehind match
   4100                 //   to the position being looked-behind.
   4101                 int64_t originalInputLen = fData[opValue+3];
   4102                 U_ASSERT(originalInputLen >= fActiveLimit);
   4103                 U_ASSERT(originalInputLen <= fInputLength);
   4104                 fActiveLimit = originalInputLen;
   4105 
   4106                 // Restore original stack position, discarding any state saved
   4107                 //   by the successful pattern match.
   4108                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4109                 int32_t newStackSize = (int32_t)fData[opValue];
   4110                 U_ASSERT(fStack->size() > newStackSize);
   4111                 fStack->setSize(newStackSize);
   4112 
   4113                 //  FAIL, which will take control back to someplace
   4114                 //  prior to entering the look-behind test.
   4115                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4116             }
   4117             break;
   4118 
   4119 
   4120         case URX_LOOP_SR_I:
   4121             // Loop Initialization for the optimized implementation of
   4122             //     [some character set]*
   4123             //   This op scans through all matching input.
   4124             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4125             {
   4126                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4127                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4128                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   4129 
   4130                 // Loop through input, until either the input is exhausted or
   4131                 //   we reach a character that is not a member of the set.
   4132                 int64_t ix = fp->fInputIdx;
   4133                 UTEXT_SETNATIVEINDEX(fInputText, ix);
   4134                 for (;;) {
   4135                     if (ix >= fActiveLimit) {
   4136                         fHitEnd = TRUE;
   4137                         break;
   4138                     }
   4139                     UChar32 c = UTEXT_NEXT32(fInputText);
   4140                     if (c<256) {
   4141                         if (s8->contains(c) == FALSE) {
   4142                             break;
   4143                         }
   4144                     } else {
   4145                         if (s->contains(c) == FALSE) {
   4146                             break;
   4147                         }
   4148                     }
   4149                     ix = UTEXT_GETNATIVEINDEX(fInputText);
   4150                 }
   4151 
   4152                 // If there were no matching characters, skip over the loop altogether.
   4153                 //   The loop doesn't run at all, a * op always succeeds.
   4154                 if (ix == fp->fInputIdx) {
   4155                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4156                     break;
   4157                 }
   4158 
   4159                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4160                 //   must follow.  It's operand is the stack location
   4161                 //   that holds the starting input index for the match of this [set]*
   4162                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4163                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4164                 int32_t stackLoc = URX_VAL(loopcOp);
   4165                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4166                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4167                 fp->fInputIdx = ix;
   4168 
   4169                 // Save State to the URX_LOOP_C op that follows this one,
   4170                 //   so that match failures in the following code will return to there.
   4171                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4172                 fp = StateSave(fp, fp->fPatIdx, status);
   4173                 fp->fPatIdx++;
   4174             }
   4175             break;
   4176 
   4177 
   4178         case URX_LOOP_DOT_I:
   4179             // Loop Initialization for the optimized implementation of .*
   4180             //   This op scans through all remaining input.
   4181             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4182             {
   4183                 // Loop through input until the input is exhausted (we reach an end-of-line)
   4184                 // In DOTALL mode, we can just go straight to the end of the input.
   4185                 int64_t ix;
   4186                 if ((opValue & 1) == 1) {
   4187                     // Dot-matches-All mode.  Jump straight to the end of the string.
   4188                     ix = fActiveLimit;
   4189                     fHitEnd = TRUE;
   4190                 } else {
   4191                     // NOT DOT ALL mode.  Line endings do not match '.'
   4192                     // Scan forward until a line ending or end of input.
   4193                     ix = fp->fInputIdx;
   4194                     UTEXT_SETNATIVEINDEX(fInputText, ix);
   4195                     for (;;) {
   4196                         if (ix >= fActiveLimit) {
   4197                             fHitEnd = TRUE;
   4198                             break;
   4199                         }
   4200                         UChar32 c = UTEXT_NEXT32(fInputText);
   4201                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   4202                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   4203                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   4204                                     isLineTerminator(c))) {
   4205                                 //  char is a line ending.  Exit the scanning loop.
   4206                                 break;
   4207                             }
   4208                         }
   4209                         ix = UTEXT_GETNATIVEINDEX(fInputText);
   4210                     }
   4211                 }
   4212 
   4213                 // If there were no matching characters, skip over the loop altogether.
   4214                 //   The loop doesn't run at all, a * op always succeeds.
   4215                 if (ix == fp->fInputIdx) {
   4216                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4217                     break;
   4218                 }
   4219 
   4220                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4221                 //   must follow.  It's operand is the stack location
   4222                 //   that holds the starting input index for the match of this .*
   4223                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4224                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4225                 int32_t stackLoc = URX_VAL(loopcOp);
   4226                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4227                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4228                 fp->fInputIdx = ix;
   4229 
   4230                 // Save State to the URX_LOOP_C op that follows this one,
   4231                 //   so that match failures in the following code will return to there.
   4232                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4233                 fp = StateSave(fp, fp->fPatIdx, status);
   4234                 fp->fPatIdx++;
   4235             }
   4236             break;
   4237 
   4238 
   4239         case URX_LOOP_C:
   4240             {
   4241                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   4242                 backSearchIndex = fp->fExtra[opValue];
   4243                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   4244                 if (backSearchIndex == fp->fInputIdx) {
   4245                     // We've backed up the input idx to the point that the loop started.
   4246                     // The loop is done.  Leave here without saving state.
   4247                     //  Subsequent failures won't come back here.
   4248                     break;
   4249                 }
   4250                 // Set up for the next iteration of the loop, with input index
   4251                 //   backed up by one from the last time through,
   4252                 //   and a state save to this instruction in case the following code fails again.
   4253                 //   (We're going backwards because this loop emulates stack unwinding, not
   4254                 //    the initial scan forward.)
   4255                 U_ASSERT(fp->fInputIdx > 0);
   4256                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4257                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
   4258                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4259 
   4260                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
   4261                 if (prevC == 0x0a &&
   4262                     fp->fInputIdx > backSearchIndex &&
   4263                     twoPrevC == 0x0d) {
   4264                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   4265                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   4266                         // .*, stepping back over CRLF pair.
   4267                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4268                     }
   4269                 }
   4270 
   4271 
   4272                 fp = StateSave(fp, fp->fPatIdx-1, status);
   4273             }
   4274             break;
   4275 
   4276 
   4277 
   4278         default:
   4279             // Trouble.  The compiled pattern contains an entry with an
   4280             //           unrecognized type tag.
   4281             U_ASSERT(FALSE);
   4282         }
   4283 
   4284         if (U_FAILURE(status)) {
   4285             isMatch = FALSE;
   4286             break;
   4287         }
   4288     }
   4289 
   4290 breakFromLoop:
   4291     fMatch = isMatch;
   4292     if (isMatch) {
   4293         fLastMatchEnd = fMatchEnd;
   4294         fMatchStart   = startIdx;
   4295         fMatchEnd     = fp->fInputIdx;
   4296     }
   4297 
   4298 #ifdef REGEX_RUN_DEBUG
   4299     if (fTraceDebug) {
   4300         if (isMatch) {
   4301             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
   4302         } else {
   4303             printf("No match\n\n");
   4304         }
   4305     }
   4306 #endif
   4307 
   4308     fFrame = fp;                // The active stack frame when the engine stopped.
   4309                                 //   Contains the capture group results that we need to
   4310                                 //    access later.
   4311     return;
   4312 }
   4313 
   4314 
   4315 //--------------------------------------------------------------------------------
   4316 //
   4317 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
   4318 //                  assumption that the entire string is available in the UText's
   4319 //                  chunk buffer. For now, that means we can use int32_t indexes,
   4320 //                  except for anything that needs to be saved (like group starts
   4321 //                  and ends).
   4322 //
   4323 //                  startIdx:    begin matching a this index.
   4324 //                  toEnd:       if true, match must extend to end of the input region
   4325 //
   4326 //--------------------------------------------------------------------------------
   4327 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
   4328     UBool       isMatch  = FALSE;      // True if the we have a match.
   4329 
   4330     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
   4331 
   4332     int32_t     op;                    // Operation from the compiled pattern, split into
   4333     int32_t     opType;                //    the opcode
   4334     int32_t     opValue;               //    and the operand value.
   4335 
   4336 #ifdef REGEX_RUN_DEBUG
   4337     if (fTraceDebug) {
   4338         printf("MatchAt(startIdx=%d)\n", startIdx);
   4339         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
   4340         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
   4341     }
   4342 #endif
   4343 
   4344     if (U_FAILURE(status)) {
   4345         return;
   4346     }
   4347 
   4348     //  Cache frequently referenced items from the compiled pattern
   4349     //
   4350     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   4351 
   4352     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   4353     UVector             *sets          = fPattern->fSets;
   4354 
   4355     const UChar         *inputBuf      = fInputText->chunkContents;
   4356 
   4357     fFrameSize = fPattern->fFrameSize;
   4358     REStackFrame        *fp            = resetStack();
   4359     if (U_FAILURE(fDeferredStatus)) {
   4360         status = fDeferredStatus;
   4361         return;
   4362     }
   4363 
   4364     fp->fPatIdx   = 0;
   4365     fp->fInputIdx = startIdx;
   4366 
   4367     // Zero out the pattern's static data
   4368     int32_t i;
   4369     for (i = 0; i<fPattern->fDataSize; i++) {
   4370         fData[i] = 0;
   4371     }
   4372 
   4373     //
   4374     //  Main loop for interpreting the compiled pattern.
   4375     //  One iteration of the loop per pattern operation performed.
   4376     //
   4377     for (;;) {
   4378         op      = (int32_t)pat[fp->fPatIdx];
   4379         opType  = URX_TYPE(op);
   4380         opValue = URX_VAL(op);
   4381 #ifdef REGEX_RUN_DEBUG
   4382         if (fTraceDebug) {
   4383             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4384             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
   4385                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   4386             fPattern->dumpOp(fp->fPatIdx);
   4387         }
   4388 #endif
   4389         fp->fPatIdx++;
   4390 
   4391         switch (opType) {
   4392 
   4393 
   4394         case URX_NOP:
   4395             break;
   4396 
   4397 
   4398         case URX_BACKTRACK:
   4399             // Force a backtrack.  In some circumstances, the pattern compiler
   4400             //   will notice that the pattern can't possibly match anything, and will
   4401             //   emit one of these at that point.
   4402             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4403             break;
   4404 
   4405 
   4406         case URX_ONECHAR:
   4407             if (fp->fInputIdx < fActiveLimit) {
   4408                 UChar32 c;
   4409                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4410                 if (c == opValue) {
   4411                     break;
   4412                 }
   4413             } else {
   4414                 fHitEnd = TRUE;
   4415             }
   4416             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4417             break;
   4418 
   4419 
   4420         case URX_STRING:
   4421             {
   4422                 // Test input against a literal string.
   4423                 // Strings require two slots in the compiled pattern, one for the
   4424                 //   offset to the string text, and one for the length.
   4425                 int32_t   stringStartIdx = opValue;
   4426                 int32_t   stringLen;
   4427 
   4428                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   4429                 fp->fPatIdx++;
   4430                 opType    = URX_TYPE(op);
   4431                 stringLen = URX_VAL(op);
   4432                 U_ASSERT(opType == URX_STRING_LEN);
   4433                 U_ASSERT(stringLen >= 2);
   4434 
   4435                 const UChar * pInp = inputBuf + fp->fInputIdx;
   4436                 const UChar * pInpLimit = inputBuf + fActiveLimit;
   4437                 const UChar * pPat = litText+stringStartIdx;
   4438                 const UChar * pEnd = pInp + stringLen;
   4439                 UBool success = TRUE;
   4440                 while (pInp < pEnd) {
   4441                     if (pInp >= pInpLimit) {
   4442                         fHitEnd = TRUE;
   4443                         success = FALSE;
   4444                         break;
   4445                     }
   4446                     if (*pInp++ != *pPat++) {
   4447                         success = FALSE;
   4448                         break;
   4449                     }
   4450                 }
   4451 
   4452                 if (success) {
   4453                     fp->fInputIdx += stringLen;
   4454                 } else {
   4455                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4456                 }
   4457             }
   4458             break;
   4459 
   4460 
   4461         case URX_STATE_SAVE:
   4462             fp = StateSave(fp, opValue, status);
   4463             break;
   4464 
   4465 
   4466         case URX_END:
   4467             // The match loop will exit via this path on a successful match,
   4468             //   when we reach the end of the pattern.
   4469             if (toEnd && fp->fInputIdx != fActiveLimit) {
   4470                 // The pattern matched, but not to the end of input.  Try some more.
   4471                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4472                 break;
   4473             }
   4474             isMatch = TRUE;
   4475             goto  breakFromLoop;
   4476 
   4477             // Start and End Capture stack frame variables are laid out out like this:
   4478             //  fp->fExtra[opValue]  - The start of a completed capture group
   4479             //             opValue+1 - The end   of a completed capture group
   4480             //             opValue+2 - the start of a capture group whose end
   4481             //                          has not yet been reached (and might not ever be).
   4482         case URX_START_CAPTURE:
   4483             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4484             fp->fExtra[opValue+2] = fp->fInputIdx;
   4485             break;
   4486 
   4487 
   4488         case URX_END_CAPTURE:
   4489             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4490             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   4491             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   4492             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   4493             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   4494             break;
   4495 
   4496 
   4497         case URX_DOLLAR:                   //  $, test for End of line
   4498             //     or for position before new line at end of input
   4499             if (fp->fInputIdx < fAnchorLimit-2) {
   4500                 // We are no where near the end of input.  Fail.
   4501                 //   This is the common case.  Keep it first.
   4502                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4503                 break;
   4504             }
   4505             if (fp->fInputIdx >= fAnchorLimit) {
   4506                 // We really are at the end of input.  Success.
   4507                 fHitEnd = TRUE;
   4508                 fRequireEnd = TRUE;
   4509                 break;
   4510             }
   4511 
   4512             // If we are positioned just before a new-line that is located at the
   4513             //   end of input, succeed.
   4514             if (fp->fInputIdx == fAnchorLimit-1) {
   4515                 UChar32 c;
   4516                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
   4517 
   4518                 if (isLineTerminator(c)) {
   4519                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4520                         // At new-line at end of input. Success
   4521                         fHitEnd = TRUE;
   4522                         fRequireEnd = TRUE;
   4523                         break;
   4524                     }
   4525                 }
   4526             } else if (fp->fInputIdx == fAnchorLimit-2 &&
   4527                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
   4528                     fHitEnd = TRUE;
   4529                     fRequireEnd = TRUE;
   4530                     break;                         // At CR/LF at end of input.  Success
   4531             }
   4532 
   4533             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4534 
   4535             break;
   4536 
   4537 
   4538         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   4539             if (fp->fInputIdx >= fAnchorLimit-1) {
   4540                 // Either at the last character of input, or off the end.
   4541                 if (fp->fInputIdx == fAnchorLimit-1) {
   4542                     // At last char of input.  Success if it's a new line.
   4543                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4544                         fHitEnd = TRUE;
   4545                         fRequireEnd = TRUE;
   4546                         break;
   4547                     }
   4548                 } else {
   4549                     // Off the end of input.  Success.
   4550                     fHitEnd = TRUE;
   4551                     fRequireEnd = TRUE;
   4552                     break;
   4553                 }
   4554             }
   4555 
   4556             // Not at end of input.  Back-track out.
   4557             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4558             break;
   4559 
   4560 
   4561         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   4562             {
   4563                 if (fp->fInputIdx >= fAnchorLimit) {
   4564                     // We really are at the end of input.  Success.
   4565                     fHitEnd = TRUE;
   4566                     fRequireEnd = TRUE;
   4567                     break;
   4568                 }
   4569                 // If we are positioned just before a new-line, succeed.
   4570                 // It makes no difference where the new-line is within the input.
   4571                 UChar32 c = inputBuf[fp->fInputIdx];
   4572                 if (isLineTerminator(c)) {
   4573                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   4574                     //  In multi-line mode, hitting a new-line just before the end of input does not
   4575                     //   set the hitEnd or requireEnd flags
   4576                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4577                         break;
   4578                     }
   4579                 }
   4580                 // not at a new line.  Fail.
   4581                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4582             }
   4583             break;
   4584 
   4585 
   4586         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   4587             {
   4588                 if (fp->fInputIdx >= fAnchorLimit) {
   4589                     // We really are at the end of input.  Success.
   4590                     fHitEnd = TRUE;
   4591                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   4592                     break;               //   adding a new-line would not lose the match.
   4593                 }
   4594                 // If we are not positioned just before a new-line, the test fails; backtrack out.
   4595                 // It makes no difference where the new-line is within the input.
   4596                 if (inputBuf[fp->fInputIdx] != 0x0a) {
   4597                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4598                 }
   4599             }
   4600             break;
   4601 
   4602 
   4603         case URX_CARET:                    //  ^, test for start of line
   4604             if (fp->fInputIdx != fAnchorStart) {
   4605                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4606             }
   4607             break;
   4608 
   4609 
   4610         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   4611             {
   4612                 if (fp->fInputIdx == fAnchorStart) {
   4613                     // We are at the start input.  Success.
   4614                     break;
   4615                 }
   4616                 // Check whether character just before the current pos is a new-line
   4617                 //   unless we are at the end of input
   4618                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4619                 if ((fp->fInputIdx < fAnchorLimit) &&
   4620                     isLineTerminator(c)) {
   4621                     //  It's a new-line.  ^ is true.  Success.
   4622                     //  TODO:  what should be done with positions between a CR and LF?
   4623                     break;
   4624                 }
   4625                 // Not at the start of a line.  Fail.
   4626                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4627             }
   4628             break;
   4629 
   4630 
   4631         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   4632             {
   4633                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
   4634                 if (fp->fInputIdx <= fAnchorStart) {
   4635                     // We are at the start input.  Success.
   4636                     break;
   4637                 }
   4638                 // Check whether character just before the current pos is a new-line
   4639                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   4640                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4641                 if (c != 0x0a) {
   4642                     // Not at the start of a line.  Back-track out.
   4643                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4644                 }
   4645             }
   4646             break;
   4647 
   4648         case URX_BACKSLASH_B:          // Test for word boundaries
   4649             {
   4650                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
   4651                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   4652                 if (!success) {
   4653                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4654                 }
   4655             }
   4656             break;
   4657 
   4658 
   4659         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   4660             {
   4661                 UBool success = isUWordBoundary(fp->fInputIdx);
   4662                 success ^= (UBool)(opValue != 0);     // flip sense for \B
   4663                 if (!success) {
   4664                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4665                 }
   4666             }
   4667             break;
   4668 
   4669 
   4670         case URX_BACKSLASH_D:            // Test for decimal digit
   4671             {
   4672                 if (fp->fInputIdx >= fActiveLimit) {
   4673                     fHitEnd = TRUE;
   4674                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4675                     break;
   4676                 }
   4677 
   4678                 UChar32 c;
   4679                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4680                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   4681                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   4682                 success ^= (UBool)(opValue != 0);        // flip sense for \D
   4683                 if (!success) {
   4684                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4685                 }
   4686             }
   4687             break;
   4688 
   4689 
   4690         case URX_BACKSLASH_G:          // Test for position at end of previous match
   4691             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   4692                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4693             }
   4694             break;
   4695 
   4696 
   4697         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
   4698             {
   4699                 if (fp->fInputIdx >= fActiveLimit) {
   4700                     fHitEnd = TRUE;
   4701                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4702                     break;
   4703                 }
   4704                 UChar32 c;
   4705                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4706                 int8_t ctype = u_charType(c);
   4707                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
   4708                 success ^= (UBool)(opValue != 0);        // flip sense for \H
   4709                 if (!success) {
   4710                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4711                 }
   4712             }
   4713             break;
   4714 
   4715 
   4716         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
   4717             {
   4718                 if (fp->fInputIdx >= fActiveLimit) {
   4719                     fHitEnd = TRUE;
   4720                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4721                     break;
   4722                 }
   4723                 UChar32 c;
   4724                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4725                 if (isLineTerminator(c)) {
   4726                     if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
   4727                         // Check for CR/LF sequence. Consume both together when found.
   4728                         UChar c2;
   4729                         U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
   4730                         if (c2 != 0x0a) {
   4731                             U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
   4732                         }
   4733                     }
   4734                 } else {
   4735                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4736                 }
   4737             }
   4738             break;
   4739 
   4740 
   4741         case URX_BACKSLASH_V:         // Any single code point line ending.
   4742             {
   4743                 if (fp->fInputIdx >= fActiveLimit) {
   4744                     fHitEnd = TRUE;
   4745                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4746                     break;
   4747                 }
   4748                 UChar32 c;
   4749                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4750                 UBool success = isLineTerminator(c);
   4751                 success ^= (UBool)(opValue != 0);        // flip sense for \V
   4752                 if (!success) {
   4753                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4754                 }
   4755             }
   4756             break;
   4757 
   4758 
   4759 
   4760         case URX_BACKSLASH_X:
   4761         //  Match a Grapheme, as defined by Unicode TR 29.
   4762         //  Differs slightly from Perl, which consumes combining marks independently
   4763         //    of context.
   4764         {
   4765 
   4766             // Fail if at end of input
   4767             if (fp->fInputIdx >= fActiveLimit) {
   4768                 fHitEnd = TRUE;
   4769                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4770                 break;
   4771             }
   4772 
   4773             // Examine (and consume) the current char.
   4774             //   Dispatch into a little state machine, based on the char.
   4775             UChar32  c;
   4776             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4777             UnicodeSet **sets = fPattern->fStaticSets;
   4778             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   4779             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   4780             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4781             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4782             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4783             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4784             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4785             goto GC_Extend;
   4786 
   4787 
   4788 
   4789 GC_L:
   4790             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4791             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4792             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4793             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4794             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4795             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4796             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4797             goto GC_Extend;
   4798 
   4799 GC_V:
   4800             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4801             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4802             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4803             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4804             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4805             goto GC_Extend;
   4806 
   4807 GC_T:
   4808             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4809             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4810             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4811             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4812             goto GC_Extend;
   4813 
   4814 GC_Extend:
   4815             // Combining characters are consumed here
   4816             for (;;) {
   4817                 if (fp->fInputIdx >= fActiveLimit) {
   4818                     break;
   4819                 }
   4820                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4821                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   4822                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   4823                     break;
   4824                 }
   4825             }
   4826             goto GC_Done;
   4827 
   4828 GC_Control:
   4829             // Most control chars stand alone (don't combine with combining chars),
   4830             //   except for that CR/LF sequence is a single grapheme cluster.
   4831             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
   4832                 fp->fInputIdx++;
   4833             }
   4834 
   4835 GC_Done:
   4836             if (fp->fInputIdx >= fActiveLimit) {
   4837                 fHitEnd = TRUE;
   4838             }
   4839             break;
   4840         }
   4841 
   4842 
   4843 
   4844 
   4845         case URX_BACKSLASH_Z:          // Test for end of Input
   4846             if (fp->fInputIdx < fAnchorLimit) {
   4847                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4848             } else {
   4849                 fHitEnd = TRUE;
   4850                 fRequireEnd = TRUE;
   4851             }
   4852             break;
   4853 
   4854 
   4855 
   4856         case URX_STATIC_SETREF:
   4857             {
   4858                 // Test input character against one of the predefined sets
   4859                 //    (Word Characters, for example)
   4860                 // The high bit of the op value is a flag for the match polarity.
   4861                 //    0:   success if input char is in set.
   4862                 //    1:   success if input char is not in set.
   4863                 if (fp->fInputIdx >= fActiveLimit) {
   4864                     fHitEnd = TRUE;
   4865                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4866                     break;
   4867                 }
   4868 
   4869                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   4870                 opValue &= ~URX_NEG_SET;
   4871                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4872 
   4873                 UChar32 c;
   4874                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4875                 if (c < 256) {
   4876                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4877                     if (s8->contains(c)) {
   4878                         success = !success;
   4879                     }
   4880                 } else {
   4881                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4882                     if (s->contains(c)) {
   4883                         success = !success;
   4884                     }
   4885                 }
   4886                 if (!success) {
   4887                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4888                 }
   4889             }
   4890             break;
   4891 
   4892 
   4893         case URX_STAT_SETREF_N:
   4894             {
   4895                 // Test input character for NOT being a member of  one of
   4896                 //    the predefined sets (Word Characters, for example)
   4897                 if (fp->fInputIdx >= fActiveLimit) {
   4898                     fHitEnd = TRUE;
   4899                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4900                     break;
   4901                 }
   4902 
   4903                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4904 
   4905                 UChar32  c;
   4906                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4907                 if (c < 256) {
   4908                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4909                     if (s8->contains(c) == FALSE) {
   4910                         break;
   4911                     }
   4912                 } else {
   4913                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4914                     if (s->contains(c) == FALSE) {
   4915                         break;
   4916                     }
   4917                 }
   4918                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4919             }
   4920             break;
   4921 
   4922 
   4923         case URX_SETREF:
   4924             {
   4925                 if (fp->fInputIdx >= fActiveLimit) {
   4926                     fHitEnd = TRUE;
   4927                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4928                     break;
   4929                 }
   4930 
   4931                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4932 
   4933                 // There is input left.  Pick up one char and test it for set membership.
   4934                 UChar32  c;
   4935                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4936                 if (c<256) {
   4937                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4938                     if (s8->contains(c)) {
   4939                         // The character is in the set.  A Match.
   4940                         break;
   4941                     }
   4942                 } else {
   4943                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   4944                     if (s->contains(c)) {
   4945                         // The character is in the set.  A Match.
   4946                         break;
   4947                     }
   4948                 }
   4949 
   4950                 // the character wasn't in the set.
   4951                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4952             }
   4953             break;
   4954 
   4955 
   4956         case URX_DOTANY:
   4957             {
   4958                 // . matches anything, but stops at end-of-line.
   4959                 if (fp->fInputIdx >= fActiveLimit) {
   4960                     // At end of input.  Match failed.  Backtrack out.
   4961                     fHitEnd = TRUE;
   4962                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4963                     break;
   4964                 }
   4965 
   4966                 // There is input left.  Advance over one char, unless we've hit end-of-line
   4967                 UChar32  c;
   4968                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4969                 if (isLineTerminator(c)) {
   4970                     // End of line in normal mode.   . does not match.
   4971                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4972                     break;
   4973                 }
   4974             }
   4975             break;
   4976 
   4977 
   4978         case URX_DOTANY_ALL:
   4979             {
   4980                 // . in dot-matches-all (including new lines) mode
   4981                 if (fp->fInputIdx >= fActiveLimit) {
   4982                     // At end of input.  Match failed.  Backtrack out.
   4983                     fHitEnd = TRUE;
   4984                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4985                     break;
   4986                 }
   4987 
   4988                 // There is input left.  Advance over one char, except if we are
   4989                 //   at a cr/lf, advance over both of them.
   4990                 UChar32 c;
   4991                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4992                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   4993                     // In the case of a CR/LF, we need to advance over both.
   4994                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4995                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
   4996                     }
   4997                 }
   4998             }
   4999             break;
   5000 
   5001 
   5002         case URX_DOTANY_UNIX:
   5003             {
   5004                 // '.' operator, matches all, but stops at end-of-line.
   5005                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   5006                 if (fp->fInputIdx >= fActiveLimit) {
   5007                     // At end of input.  Match failed.  Backtrack out.
   5008                     fHitEnd = TRUE;
   5009                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5010                     break;
   5011                 }
   5012 
   5013                 // There is input left.  Advance over one char, unless we've hit end-of-line
   5014                 UChar32 c;
   5015                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5016                 if (c == 0x0a) {
   5017                     // End of line in normal mode.   '.' does not match the \n
   5018                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5019                 }
   5020             }
   5021             break;
   5022 
   5023 
   5024         case URX_JMP:
   5025             fp->fPatIdx = opValue;
   5026             break;
   5027 
   5028         case URX_FAIL:
   5029             isMatch = FALSE;
   5030             goto breakFromLoop;
   5031 
   5032         case URX_JMP_SAV:
   5033             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   5034             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   5035             fp->fPatIdx = opValue;                         // Then JMP.
   5036             break;
   5037 
   5038         case URX_JMP_SAV_X:
   5039             // This opcode is used with (x)+, when x can match a zero length string.
   5040             // Same as JMP_SAV, except conditional on the match having made forward progress.
   5041             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   5042             //   data address of the input position at the start of the loop.
   5043             {
   5044                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   5045                 int32_t  stoOp = (int32_t)pat[opValue-1];
   5046                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   5047                 int32_t  frameLoc = URX_VAL(stoOp);
   5048                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   5049                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
   5050                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   5051                 if (prevInputIdx < fp->fInputIdx) {
   5052                     // The match did make progress.  Repeat the loop.
   5053                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   5054                     fp->fPatIdx = opValue;
   5055                     fp->fExtra[frameLoc] = fp->fInputIdx;
   5056                 }
   5057                 // If the input position did not advance, we do nothing here,
   5058                 //   execution will fall out of the loop.
   5059             }
   5060             break;
   5061 
   5062         case URX_CTR_INIT:
   5063             {
   5064                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5065                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   5066 
   5067                 // Pick up the three extra operands that CTR_INIT has, and
   5068                 //    skip the pattern location counter past
   5069                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5070                 fp->fPatIdx += 3;
   5071                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5072                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5073                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5074                 U_ASSERT(minCount>=0);
   5075                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5076                 U_ASSERT(loopLoc>=fp->fPatIdx);
   5077 
   5078                 if (minCount == 0) {
   5079                     fp = StateSave(fp, loopLoc+1, status);
   5080                 }
   5081                 if (maxCount == -1) {
   5082                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
   5083                 } else if (maxCount == 0) {
   5084                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5085                 }
   5086             }
   5087             break;
   5088 
   5089         case URX_CTR_LOOP:
   5090             {
   5091                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5092                 int32_t initOp = (int32_t)pat[opValue];
   5093                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   5094                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5095                 int32_t minCount  = (int32_t)pat[opValue+2];
   5096                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5097                 (*pCounter)++;
   5098                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   5099                     U_ASSERT(*pCounter == maxCount);
   5100                     break;
   5101                 }
   5102                 if (*pCounter >= minCount) {
   5103                     if (maxCount == -1) {
   5104                         // Loop has no hard upper bound.
   5105                         // Check that it is progressing through the input, break if it is not.
   5106                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   5107                         if (fp->fInputIdx == *pLastInputIdx) {
   5108                             break;
   5109                         } else {
   5110                             *pLastInputIdx = fp->fInputIdx;
   5111                         }
   5112                     }
   5113                     fp = StateSave(fp, fp->fPatIdx, status);
   5114                 } else {
   5115                     // Increment time-out counter. (StateSave() does it if count >= minCount)
   5116                     fTickCounter--;
   5117                     if (fTickCounter <= 0) {
   5118                         IncrementTime(status);    // Re-initializes fTickCounter
   5119                     }
   5120                 }
   5121                 fp->fPatIdx = opValue + 4;    // Loop back.
   5122             }
   5123             break;
   5124 
   5125         case URX_CTR_INIT_NG:
   5126             {
   5127                 // Initialize a non-greedy loop
   5128                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5129                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   5130 
   5131                 // Pick up the three extra operands that CTR_INIT_NG has, and
   5132                 //    skip the pattern location counter past
   5133                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5134                 fp->fPatIdx += 3;
   5135                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5136                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5137                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5138                 U_ASSERT(minCount>=0);
   5139                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5140                 U_ASSERT(loopLoc>fp->fPatIdx);
   5141                 if (maxCount == -1) {
   5142                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
   5143                 }
   5144 
   5145                 if (minCount == 0) {
   5146                     if (maxCount != 0) {
   5147                         fp = StateSave(fp, fp->fPatIdx, status);
   5148                     }
   5149                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   5150                 }
   5151             }
   5152             break;
   5153 
   5154         case URX_CTR_LOOP_NG:
   5155             {
   5156                 // Non-greedy {min, max} loops
   5157                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5158                 int32_t initOp = (int32_t)pat[opValue];
   5159                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   5160                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5161                 int32_t minCount  = (int32_t)pat[opValue+2];
   5162                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5163 
   5164                 (*pCounter)++;
   5165                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
   5166                     // The loop has matched the maximum permitted number of times.
   5167                     //   Break out of here with no action.  Matching will
   5168                     //   continue with the following pattern.
   5169                     U_ASSERT(*pCounter == maxCount);
   5170                     break;
   5171                 }
   5172 
   5173                 if (*pCounter < minCount) {
   5174                     // We haven't met the minimum number of matches yet.
   5175                     //   Loop back for another one.
   5176                     fp->fPatIdx = opValue + 4;    // Loop back.
   5177                     fTickCounter--;
   5178                     if (fTickCounter <= 0) {
   5179                         IncrementTime(status);    // Re-initializes fTickCounter
   5180                     }
   5181                 } else {
   5182                     // We do have the minimum number of matches.
   5183 
   5184                     // If there is no upper bound on the loop iterations, check that the input index
   5185                     // is progressing, and stop the loop if it is not.
   5186                     if (maxCount == -1) {
   5187                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   5188                         if (fp->fInputIdx == *pLastInputIdx) {
   5189                             break;
   5190                         }
   5191                         *pLastInputIdx = fp->fInputIdx;
   5192                     }
   5193 
   5194                     // Loop Continuation: we will fall into the pattern following the loop
   5195                     //   (non-greedy, don't execute loop body first), but first do
   5196                     //   a state save to the top of the loop, so that a match failure
   5197                     //   in the following pattern will try another iteration of the loop.
   5198                     fp = StateSave(fp, opValue + 4, status);
   5199                 }
   5200             }
   5201             break;
   5202 
   5203         case URX_STO_SP:
   5204             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5205             fData[opValue] = fStack->size();
   5206             break;
   5207 
   5208         case URX_LD_SP:
   5209             {
   5210                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5211                 int32_t newStackSize = (int32_t)fData[opValue];
   5212                 U_ASSERT(newStackSize <= fStack->size());
   5213                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5214                 if (newFP == (int64_t *)fp) {
   5215                     break;
   5216                 }
   5217                 int32_t i;
   5218                 for (i=0; i<fFrameSize; i++) {
   5219                     newFP[i] = ((int64_t *)fp)[i];
   5220                 }
   5221                 fp = (REStackFrame *)newFP;
   5222                 fStack->setSize(newStackSize);
   5223             }
   5224             break;
   5225 
   5226         case URX_BACKREF:
   5227             {
   5228                 U_ASSERT(opValue < fFrameSize);
   5229                 int64_t groupStartIdx = fp->fExtra[opValue];
   5230                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5231                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5232                 int64_t inputIndex = fp->fInputIdx;
   5233                 if (groupStartIdx < 0) {
   5234                     // This capture group has not participated in the match thus far,
   5235                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5236                     break;
   5237                 }
   5238                 UBool success = TRUE;
   5239                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
   5240                     if (inputIndex >= fActiveLimit) {
   5241                         success = FALSE;
   5242                         fHitEnd = TRUE;
   5243                         break;
   5244                     }
   5245                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
   5246                         success = FALSE;
   5247                         break;
   5248                     }
   5249                 }
   5250                 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
   5251                         inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
   5252                     // Capture group ended with an unpaired lead surrogate.
   5253                     // Back reference is not permitted to match lead only of a surrogatge pair.
   5254                     success = FALSE;
   5255                 }
   5256                 if (success) {
   5257                     fp->fInputIdx = inputIndex;
   5258                 } else {
   5259                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5260                 }
   5261             }
   5262             break;
   5263 
   5264         case URX_BACKREF_I:
   5265             {
   5266                 U_ASSERT(opValue < fFrameSize);
   5267                 int64_t groupStartIdx = fp->fExtra[opValue];
   5268                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5269                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5270                 if (groupStartIdx < 0) {
   5271                     // This capture group has not participated in the match thus far,
   5272                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5273                     break;
   5274                 }
   5275                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
   5276                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
   5277 
   5278                 //   Note: if the capture group match was of an empty string the backref
   5279                 //         match succeeds.  Verified by testing:  Perl matches succeed
   5280                 //         in this case, so we do too.
   5281 
   5282                 UBool success = TRUE;
   5283                 for (;;) {
   5284                     UChar32 captureGroupChar = captureGroupItr.next();
   5285                     if (captureGroupChar == U_SENTINEL) {
   5286                         success = TRUE;
   5287                         break;
   5288                     }
   5289                     UChar32 inputChar = inputItr.next();
   5290                     if (inputChar == U_SENTINEL) {
   5291                         success = FALSE;
   5292                         fHitEnd = TRUE;
   5293                         break;
   5294                     }
   5295                     if (inputChar != captureGroupChar) {
   5296                         success = FALSE;
   5297                         break;
   5298                     }
   5299                 }
   5300 
   5301                 if (success && inputItr.inExpansion()) {
   5302                     // We otained a match by consuming part of a string obtained from
   5303                     // case-folding a single code point of the input text.
   5304                     // This does not count as an overall match.
   5305                     success = FALSE;
   5306                 }
   5307 
   5308                 if (success) {
   5309                     fp->fInputIdx = inputItr.getIndex();
   5310                 } else {
   5311                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5312                 }
   5313             }
   5314             break;
   5315 
   5316         case URX_STO_INP_LOC:
   5317             {
   5318                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   5319                 fp->fExtra[opValue] = fp->fInputIdx;
   5320             }
   5321             break;
   5322 
   5323         case URX_JMPX:
   5324             {
   5325                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5326                 fp->fPatIdx += 1;
   5327                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   5328                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   5329                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
   5330                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   5331                 if (savedInputIdx < fp->fInputIdx) {
   5332                     fp->fPatIdx = opValue;                               // JMP
   5333                 } else {
   5334                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   5335                 }
   5336             }
   5337             break;
   5338 
   5339         case URX_LA_START:
   5340             {
   5341                 // Entering a lookahead block.
   5342                 // Save Stack Ptr, Input Pos.
   5343                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5344                 fData[opValue]   = fStack->size();
   5345                 fData[opValue+1] = fp->fInputIdx;
   5346                 fActiveStart     = fLookStart;          // Set the match region change for
   5347                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   5348             }
   5349             break;
   5350 
   5351         case URX_LA_END:
   5352             {
   5353                 // Leaving a look-ahead block.
   5354                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   5355                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5356                 int32_t stackSize = fStack->size();
   5357                 int32_t newStackSize = (int32_t)fData[opValue];
   5358                 U_ASSERT(stackSize >= newStackSize);
   5359                 if (stackSize > newStackSize) {
   5360                     // Copy the current top frame back to the new (cut back) top frame.
   5361                     //   This makes the capture groups from within the look-ahead
   5362                     //   expression available.
   5363                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5364                     int32_t i;
   5365                     for (i=0; i<fFrameSize; i++) {
   5366                         newFP[i] = ((int64_t *)fp)[i];
   5367                     }
   5368                     fp = (REStackFrame *)newFP;
   5369                     fStack->setSize(newStackSize);
   5370                 }
   5371                 fp->fInputIdx = fData[opValue+1];
   5372 
   5373                 // Restore the active region bounds in the input string; they may have
   5374                 //    been changed because of transparent bounds on a Region.
   5375                 fActiveStart = fRegionStart;
   5376                 fActiveLimit = fRegionLimit;
   5377             }
   5378             break;
   5379 
   5380         case URX_ONECHAR_I:
   5381             if (fp->fInputIdx < fActiveLimit) {
   5382                 UChar32 c;
   5383                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5384                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5385                     break;
   5386                 }
   5387             } else {
   5388                 fHitEnd = TRUE;
   5389             }
   5390             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5391             break;
   5392 
   5393         case URX_STRING_I:
   5394             // Case-insensitive test input against a literal string.
   5395             // Strings require two slots in the compiled pattern, one for the
   5396             //   offset to the string text, and one for the length.
   5397             //   The compiled string has already been case folded.
   5398             {
   5399                 const UChar *patternString = litText + opValue;
   5400 
   5401                 op      = (int32_t)pat[fp->fPatIdx];
   5402                 fp->fPatIdx++;
   5403                 opType  = URX_TYPE(op);
   5404                 opValue = URX_VAL(op);
   5405                 U_ASSERT(opType == URX_STRING_LEN);
   5406                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
   5407 
   5408                 UChar32      cText;
   5409                 UChar32      cPattern;
   5410                 UBool        success = TRUE;
   5411                 int32_t      patternStringIdx  = 0;
   5412                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
   5413                 while (patternStringIdx < patternStringLen) {
   5414                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
   5415                     cText = inputIterator.next();
   5416                     if (cText != cPattern) {
   5417                         success = FALSE;
   5418                         if (cText == U_SENTINEL) {
   5419                             fHitEnd = TRUE;
   5420                         }
   5421                         break;
   5422                     }
   5423                 }
   5424                 if (inputIterator.inExpansion()) {
   5425                     success = FALSE;
   5426                 }
   5427 
   5428                 if (success) {
   5429                     fp->fInputIdx = inputIterator.getIndex();
   5430                 } else {
   5431                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5432                 }
   5433             }
   5434             break;
   5435 
   5436         case URX_LB_START:
   5437             {
   5438                 // Entering a look-behind block.
   5439                 // Save Stack Ptr, Input Pos.
   5440                 //   TODO:  implement transparent bounds.  Ticket #6067
   5441                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5442                 fData[opValue]   = fStack->size();
   5443                 fData[opValue+1] = fp->fInputIdx;
   5444                 // Init the variable containing the start index for attempted matches.
   5445                 fData[opValue+2] = -1;
   5446                 // Save input string length, then reset to pin any matches to end at
   5447                 //   the current position.
   5448                 fData[opValue+3] = fActiveLimit;
   5449                 fActiveLimit     = fp->fInputIdx;
   5450             }
   5451             break;
   5452 
   5453 
   5454         case URX_LB_CONT:
   5455             {
   5456                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   5457                 //    at all possible input starting positions.
   5458 
   5459                 // Fetch the min and max possible match lengths.  They are the operands
   5460                 //   of this op in the pattern.
   5461                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   5462                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   5463                 U_ASSERT(minML <= maxML);
   5464                 U_ASSERT(minML >= 0);
   5465 
   5466                 // Fetch (from data) the last input index where a match was attempted.
   5467                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5468                 int64_t  &lbStartIdx = fData[opValue+2];
   5469                 if (lbStartIdx < 0) {
   5470                     // First time through loop.
   5471                     lbStartIdx = fp->fInputIdx - minML;
   5472                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
   5473                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
   5474                     }
   5475                 } else {
   5476                     // 2nd through nth time through the loop.
   5477                     // Back up start position for match by one.
   5478                     if (lbStartIdx == 0) {
   5479                         lbStartIdx--;
   5480                     } else {
   5481                         U16_BACK_1(inputBuf, 0, lbStartIdx);
   5482                     }
   5483                 }
   5484 
   5485                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
   5486                     // We have tried all potential match starting points without
   5487                     //  getting a match.  Backtrack out, and out of the
   5488                     //   Look Behind altogether.
   5489                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5490                     int64_t restoreInputLen = fData[opValue+3];
   5491                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5492                     U_ASSERT(restoreInputLen <= fInputLength);
   5493                     fActiveLimit = restoreInputLen;
   5494                     break;
   5495                 }
   5496 
   5497                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5498                 //      (successful match will fall off the end of the loop.)
   5499                 fp = StateSave(fp, fp->fPatIdx-3, status);
   5500                 fp->fInputIdx =  lbStartIdx;
   5501             }
   5502             break;
   5503 
   5504         case URX_LB_END:
   5505             // End of a look-behind block, after a successful match.
   5506             {
   5507                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5508                 if (fp->fInputIdx != fActiveLimit) {
   5509                     //  The look-behind expression matched, but the match did not
   5510                     //    extend all the way to the point that we are looking behind from.
   5511                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5512                     //     will retry the match starting at another position or fail
   5513                     //     the look-behind altogether, whichever is appropriate.
   5514                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5515                     break;
   5516                 }
   5517 
   5518                 // Look-behind match is good.  Restore the orignal input string length,
   5519                 //   which had been truncated to pin the end of the lookbehind match to the
   5520                 //   position being looked-behind.
   5521                 int64_t originalInputLen = fData[opValue+3];
   5522                 U_ASSERT(originalInputLen >= fActiveLimit);
   5523                 U_ASSERT(originalInputLen <= fInputLength);
   5524                 fActiveLimit = originalInputLen;
   5525             }
   5526             break;
   5527 
   5528 
   5529         case URX_LBN_CONT:
   5530             {
   5531                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   5532                 //    at all possible input starting positions.
   5533 
   5534                 // Fetch the extra parameters of this op.
   5535                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   5536                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   5537                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   5538                 continueLoc = URX_VAL(continueLoc);
   5539                 U_ASSERT(minML <= maxML);
   5540                 U_ASSERT(minML >= 0);
   5541                 U_ASSERT(continueLoc > fp->fPatIdx);
   5542 
   5543                 // Fetch (from data) the last input index where a match was attempted.
   5544                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5545                 int64_t  &lbStartIdx = fData[opValue+2];
   5546                 if (lbStartIdx < 0) {
   5547                     // First time through loop.
   5548                     lbStartIdx = fp->fInputIdx - minML;
   5549                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
   5550                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
   5551                     }
   5552                 } else {
   5553                     // 2nd through nth time through the loop.
   5554                     // Back up start position for match by one.
   5555                     if (lbStartIdx == 0) {
   5556                         lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
   5557                     } else {
   5558                         U16_BACK_1(inputBuf, 0, lbStartIdx);
   5559                     }
   5560                 }
   5561 
   5562                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
   5563                     // We have tried all potential match starting points without
   5564                     //  getting a match, which means that the negative lookbehind as
   5565                     //  a whole has succeeded.  Jump forward to the continue location
   5566                     int64_t restoreInputLen = fData[opValue+3];
   5567                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5568                     U_ASSERT(restoreInputLen <= fInputLength);
   5569                     fActiveLimit = restoreInputLen;
   5570                     fp->fPatIdx = continueLoc;
   5571                     break;
   5572                 }
   5573 
   5574                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5575                 //      (successful match will cause a FAIL out of the loop altogether.)
   5576                 fp = StateSave(fp, fp->fPatIdx-4, status);
   5577                 fp->fInputIdx =  lbStartIdx;
   5578             }
   5579             break;
   5580 
   5581         case URX_LBN_END:
   5582             // End of a negative look-behind block, after a successful match.
   5583             {
   5584                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5585                 if (fp->fInputIdx != fActiveLimit) {
   5586                     //  The look-behind expression matched, but the match did not
   5587                     //    extend all the way to the point that we are looking behind from.
   5588                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5589                     //     will retry the match starting at another position or succeed
   5590                     //     the look-behind altogether, whichever is appropriate.
   5591                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5592                     break;
   5593                 }
   5594 
   5595                 // Look-behind expression matched, which means look-behind test as
   5596                 //   a whole Fails
   5597 
   5598                 //   Restore the orignal input string length, which had been truncated
   5599                 //   inorder to pin the end of the lookbehind match
   5600                 //   to the position being looked-behind.
   5601                 int64_t originalInputLen = fData[opValue+3];
   5602                 U_ASSERT(originalInputLen >= fActiveLimit);
   5603                 U_ASSERT(originalInputLen <= fInputLength);
   5604                 fActiveLimit = originalInputLen;
   5605 
   5606                 // Restore original stack position, discarding any state saved
   5607                 //   by the successful pattern match.
   5608                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5609                 int32_t newStackSize = (int32_t)fData[opValue];
   5610                 U_ASSERT(fStack->size() > newStackSize);
   5611                 fStack->setSize(newStackSize);
   5612 
   5613                 //  FAIL, which will take control back to someplace
   5614                 //  prior to entering the look-behind test.
   5615                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5616             }
   5617             break;
   5618 
   5619 
   5620         case URX_LOOP_SR_I:
   5621             // Loop Initialization for the optimized implementation of
   5622             //     [some character set]*
   5623             //   This op scans through all matching input.
   5624             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5625             {
   5626                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5627                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5628                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   5629 
   5630                 // Loop through input, until either the input is exhausted or
   5631                 //   we reach a character that is not a member of the set.
   5632                 int32_t ix = (int32_t)fp->fInputIdx;
   5633                 for (;;) {
   5634                     if (ix >= fActiveLimit) {
   5635                         fHitEnd = TRUE;
   5636                         break;
   5637                     }
   5638                     UChar32   c;
   5639                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
   5640                     if (c<256) {
   5641                         if (s8->contains(c) == FALSE) {
   5642                             U16_BACK_1(inputBuf, 0, ix);
   5643                             break;
   5644                         }
   5645                     } else {
   5646                         if (s->contains(c) == FALSE) {
   5647                             U16_BACK_1(inputBuf, 0, ix);
   5648                             break;
   5649                         }
   5650                     }
   5651                 }
   5652 
   5653                 // If there were no matching characters, skip over the loop altogether.
   5654                 //   The loop doesn't run at all, a * op always succeeds.
   5655                 if (ix == fp->fInputIdx) {
   5656                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5657                     break;
   5658                 }
   5659 
   5660                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5661                 //   must follow.  It's operand is the stack location
   5662                 //   that holds the starting input index for the match of this [set]*
   5663                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5664                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5665                 int32_t stackLoc = URX_VAL(loopcOp);
   5666                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5667                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5668                 fp->fInputIdx = ix;
   5669 
   5670                 // Save State to the URX_LOOP_C op that follows this one,
   5671                 //   so that match failures in the following code will return to there.
   5672                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5673                 fp = StateSave(fp, fp->fPatIdx, status);
   5674                 fp->fPatIdx++;
   5675             }
   5676             break;
   5677 
   5678 
   5679         case URX_LOOP_DOT_I:
   5680             // Loop Initialization for the optimized implementation of .*
   5681             //   This op scans through all remaining input.
   5682             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5683             {
   5684                 // Loop through input until the input is exhausted (we reach an end-of-line)
   5685                 // In DOTALL mode, we can just go straight to the end of the input.
   5686                 int32_t ix;
   5687                 if ((opValue & 1) == 1) {
   5688                     // Dot-matches-All mode.  Jump straight to the end of the string.
   5689                     ix = (int32_t)fActiveLimit;
   5690                     fHitEnd = TRUE;
   5691                 } else {
   5692                     // NOT DOT ALL mode.  Line endings do not match '.'
   5693                     // Scan forward until a line ending or end of input.
   5694                     ix = (int32_t)fp->fInputIdx;
   5695                     for (;;) {
   5696                         if (ix >= fActiveLimit) {
   5697                             fHitEnd = TRUE;
   5698                             break;
   5699                         }
   5700                         UChar32   c;
   5701                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
   5702                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   5703                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   5704                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   5705                                    isLineTerminator(c))) {
   5706                                 //  char is a line ending.  Put the input pos back to the
   5707                                 //    line ending char, and exit the scanning loop.
   5708                                 U16_BACK_1(inputBuf, 0, ix);
   5709                                 break;
   5710                             }
   5711                         }
   5712                     }
   5713                 }
   5714 
   5715                 // If there were no matching characters, skip over the loop altogether.
   5716                 //   The loop doesn't run at all, a * op always succeeds.
   5717                 if (ix == fp->fInputIdx) {
   5718                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5719                     break;
   5720                 }
   5721 
   5722                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5723                 //   must follow.  It's operand is the stack location
   5724                 //   that holds the starting input index for the match of this .*
   5725                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5726                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5727                 int32_t stackLoc = URX_VAL(loopcOp);
   5728                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5729                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5730                 fp->fInputIdx = ix;
   5731 
   5732                 // Save State to the URX_LOOP_C op that follows this one,
   5733                 //   so that match failures in the following code will return to there.
   5734                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5735                 fp = StateSave(fp, fp->fPatIdx, status);
   5736                 fp->fPatIdx++;
   5737             }
   5738             break;
   5739 
   5740 
   5741         case URX_LOOP_C:
   5742             {
   5743                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   5744                 backSearchIndex = (int32_t)fp->fExtra[opValue];
   5745                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   5746                 if (backSearchIndex == fp->fInputIdx) {
   5747                     // We've backed up the input idx to the point that the loop started.
   5748                     // The loop is done.  Leave here without saving state.
   5749                     //  Subsequent failures won't come back here.
   5750                     break;
   5751                 }
   5752                 // Set up for the next iteration of the loop, with input index
   5753                 //   backed up by one from the last time through,
   5754                 //   and a state save to this instruction in case the following code fails again.
   5755                 //   (We're going backwards because this loop emulates stack unwinding, not
   5756                 //    the initial scan forward.)
   5757                 U_ASSERT(fp->fInputIdx > 0);
   5758                 UChar32 prevC;
   5759                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
   5760 
   5761                 if (prevC == 0x0a &&
   5762                     fp->fInputIdx > backSearchIndex &&
   5763                     inputBuf[fp->fInputIdx-1] == 0x0d) {
   5764                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   5765                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   5766                         // .*, stepping back over CRLF pair.
   5767                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   5768                     }
   5769                 }
   5770 
   5771 
   5772                 fp = StateSave(fp, fp->fPatIdx-1, status);
   5773             }
   5774             break;
   5775 
   5776 
   5777 
   5778         default:
   5779             // Trouble.  The compiled pattern contains an entry with an
   5780             //           unrecognized type tag.
   5781             U_ASSERT(FALSE);
   5782         }
   5783 
   5784         if (U_FAILURE(status)) {
   5785             isMatch = FALSE;
   5786             break;
   5787         }
   5788     }
   5789 
   5790 breakFromLoop:
   5791     fMatch = isMatch;
   5792     if (isMatch) {
   5793         fLastMatchEnd = fMatchEnd;
   5794         fMatchStart   = startIdx;
   5795         fMatchEnd     = fp->fInputIdx;
   5796     }
   5797 
   5798 #ifdef REGEX_RUN_DEBUG
   5799     if (fTraceDebug) {
   5800         if (isMatch) {
   5801             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
   5802         } else {
   5803             printf("No match\n\n");
   5804         }
   5805     }
   5806 #endif
   5807 
   5808     fFrame = fp;                // The active stack frame when the engine stopped.
   5809                                 //   Contains the capture group results that we need to
   5810                                 //    access later.
   5811 
   5812     return;
   5813 }
   5814 
   5815 
   5816 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
   5817 
   5818 U_NAMESPACE_END
   5819 
   5820 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   5821