Home | History | Annotate | Download | only in i18n
      1 /*
      2 **************************************************************************
      3 *   Copyright (C) 2002-2010 International Business Machines Corporation  *
      4 *   and others. All rights reserved.                                     *
      5 **************************************************************************
      6 */
      7 //
      8 //  file:  rematch.cpp
      9 //
     10 //         Contains the implementation of class RegexMatcher,
     11 //         which is one of the main API classes for the ICU regular expression package.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     16 
     17 #include "unicode/regex.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/rbbi.h"
     22 #include "uassert.h"
     23 #include "cmemory.h"
     24 #include "uvector.h"
     25 #include "uvectr32.h"
     26 #include "uvectr64.h"
     27 #include "regeximp.h"
     28 #include "regexst.h"
     29 #include "regextxt.h"
     30 #include "ucase.h"
     31 
     32 // #include <malloc.h>        // Needed for heapcheck testing
     33 
     34 
     35 // Find progress callback
     36 // ----------------------
     37 // Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
     38 //
     39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
     40     (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
     41 
     42 
     43 // Smart Backtracking
     44 // ------------------
     45 // When a failure would go back to a LOOP_C instruction,
     46 // strings, characters, and setrefs scan backwards for a valid start
     47 // character themselves, pop the stack, and save state, emulating the
     48 // LOOP_C's effect but assured that the next character of input is a
     49 // possible matching character.
     50 //
     51 // Good idea in theory; unfortunately it only helps out a few specific
     52 // cases and slows the engine down a little in the rest.
     53 
     54 //#define REGEX_SMART_BACKTRACKING 1
     55 
     56 U_NAMESPACE_BEGIN
     57 
     58 // Default limit for the size of the back track stack, to avoid system
     59 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
     60 // This value puts ICU's limits higher than most other regexp implementations,
     61 //    which use recursion rather than the heap, and take more storage per
     62 //    backtrack point.
     63 //
     64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
     65 
     66 // Time limit counter constant.
     67 //   Time limits for expression evaluation are in terms of quanta of work by
     68 //   the engine, each of which is 10,000 state saves.
     69 //   This constant determines that state saves per tick number.
     70 static const int32_t TIMER_INITIAL_VALUE = 10000;
     71 
     72 //-----------------------------------------------------------------------------
     73 //
     74 //   Constructor and Destructor
     75 //
     76 //-----------------------------------------------------------------------------
     77 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
     78     fDeferredStatus = U_ZERO_ERROR;
     79     init(fDeferredStatus);
     80     if (U_FAILURE(fDeferredStatus)) {
     81         return;
     82     }
     83     if (pat==NULL) {
     84         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
     85         return;
     86     }
     87     fPattern = pat;
     88     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
     89 }
     90 
     91 
     92 
     93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
     94                            uint32_t flags, UErrorCode &status) {
     95     init(status);
     96     if (U_FAILURE(status)) {
     97         return;
     98     }
     99     UParseError    pe;
    100     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    101     fPattern           = fPatternOwned;
    102 
    103     UText inputText = UTEXT_INITIALIZER;
    104     utext_openConstUnicodeString(&inputText, &input, &status);
    105     init2(&inputText, status);
    106     utext_close(&inputText);
    107 
    108     fInputUniStrMaybeMutable = TRUE;
    109 }
    110 
    111 
    112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
    113                            uint32_t flags, UErrorCode &status) {
    114     init(status);
    115     if (U_FAILURE(status)) {
    116         return;
    117     }
    118     UParseError    pe;
    119     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    120     if (U_FAILURE(status)) {
    121         return;
    122     }
    123 
    124     fPattern           = fPatternOwned;
    125     init2(input, status);
    126 }
    127 
    128 
    129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
    130                            uint32_t flags, UErrorCode &status) {
    131     init(status);
    132     if (U_FAILURE(status)) {
    133         return;
    134     }
    135     UParseError    pe;
    136     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    137     if (U_FAILURE(status)) {
    138         return;
    139     }
    140     fPattern           = fPatternOwned;
    141     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    142 }
    143 
    144 RegexMatcher::RegexMatcher(UText *regexp,
    145                            uint32_t flags, UErrorCode &status) {
    146     init(status);
    147     if (U_FAILURE(status)) {
    148         return;
    149     }
    150     UParseError    pe;
    151     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    152         if (U_FAILURE(status)) {
    153         return;
    154     }
    155 
    156     fPattern           = fPatternOwned;
    157     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    158 }
    159 
    160 
    161 
    162 
    163 RegexMatcher::~RegexMatcher() {
    164     delete fStack;
    165     if (fData != fSmallData) {
    166         uprv_free(fData);
    167         fData = NULL;
    168     }
    169     if (fPatternOwned) {
    170         delete fPatternOwned;
    171         fPatternOwned = NULL;
    172         fPattern = NULL;
    173     }
    174 
    175     if (fInput) {
    176         delete fInput;
    177     }
    178     if (fInputText) {
    179         utext_close(fInputText);
    180     }
    181     if (fAltInputText) {
    182         utext_close(fAltInputText);
    183     }
    184 
    185     #if UCONFIG_NO_BREAK_ITERATION==0
    186     delete fWordBreakItr;
    187     #endif
    188 }
    189 
    190 //
    191 //   init()   common initialization for use by all constructors.
    192 //            Initialize all fields, get the object into a consistent state.
    193 //            This must be done even when the initial status shows an error,
    194 //            so that the object is initialized sufficiently well for the destructor
    195 //            to run safely.
    196 //
    197 void RegexMatcher::init(UErrorCode &status) {
    198     fPattern           = NULL;
    199     fPatternOwned      = NULL;
    200     fFrameSize         = 0;
    201     fRegionStart       = 0;
    202     fRegionLimit       = 0;
    203     fAnchorStart       = 0;
    204     fAnchorLimit       = 0;
    205     fLookStart         = 0;
    206     fLookLimit         = 0;
    207     fActiveStart       = 0;
    208     fActiveLimit       = 0;
    209     fTransparentBounds = FALSE;
    210     fAnchoringBounds   = TRUE;
    211     fMatch             = FALSE;
    212     fMatchStart        = 0;
    213     fMatchEnd          = 0;
    214     fLastMatchEnd      = -1;
    215     fAppendPosition    = 0;
    216     fHitEnd            = FALSE;
    217     fRequireEnd        = FALSE;
    218     fStack             = NULL;
    219     fFrame             = NULL;
    220     fTimeLimit         = 0;
    221     fTime              = 0;
    222     fTickCounter       = 0;
    223     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    224     fCallbackFn        = NULL;
    225     fCallbackContext   = NULL;
    226     fFindProgressCallbackFn      = NULL;
    227     fFindProgressCallbackContext = NULL;
    228     fTraceDebug        = FALSE;
    229     fDeferredStatus    = status;
    230     fData              = fSmallData;
    231     fWordBreakItr      = NULL;
    232 
    233     fStack             = new UVector64(status);
    234     fInputText         = NULL;
    235     fAltInputText      = NULL;
    236     fInput             = NULL;
    237     fInputLength       = 0;
    238     fInputUniStrMaybeMutable = FALSE;
    239 
    240     if (U_FAILURE(status)) {
    241         fDeferredStatus = status;
    242     }
    243 }
    244 
    245 //
    246 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
    247 //            This handles the common setup to be done after the Pattern is available.
    248 //
    249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
    250     if (U_FAILURE(status)) {
    251         fDeferredStatus = status;
    252         return;
    253     }
    254 
    255     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
    256         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
    257         if (fData == NULL) {
    258             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    259             return;
    260         }
    261     }
    262 
    263     reset(input);
    264     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
    265     if (U_FAILURE(status)) {
    266         fDeferredStatus = status;
    267         return;
    268     }
    269 }
    270 
    271 
    272 static const UChar BACKSLASH  = 0x5c;
    273 static const UChar DOLLARSIGN = 0x24;
    274 //--------------------------------------------------------------------------------
    275 //
    276 //    appendReplacement
    277 //
    278 //--------------------------------------------------------------------------------
    279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
    280                                               const UnicodeString &replacement,
    281                                               UErrorCode &status) {
    282     UText replacementText = UTEXT_INITIALIZER;
    283 
    284     utext_openConstUnicodeString(&replacementText, &replacement, &status);
    285     if (U_SUCCESS(status)) {
    286         UText resultText = UTEXT_INITIALIZER;
    287         utext_openUnicodeString(&resultText, &dest, &status);
    288 
    289         if (U_SUCCESS(status)) {
    290             appendReplacement(&resultText, &replacementText, status);
    291             utext_close(&resultText);
    292         }
    293         utext_close(&replacementText);
    294     }
    295 
    296     return *this;
    297 }
    298 
    299 //
    300 //    appendReplacement, UText mode
    301 //
    302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
    303                                               UText *replacement,
    304                                               UErrorCode &status) {
    305     if (U_FAILURE(status)) {
    306         return *this;
    307     }
    308     if (U_FAILURE(fDeferredStatus)) {
    309         status = fDeferredStatus;
    310         return *this;
    311     }
    312     if (fMatch == FALSE) {
    313         status = U_REGEX_INVALID_STATE;
    314         return *this;
    315     }
    316 
    317     // Copy input string from the end of previous match to start of current match
    318     int64_t  destLen = utext_nativeLength(dest);
    319     if (fMatchStart > fAppendPosition) {
    320         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    321             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    322                                      (int32_t)(fMatchStart-fAppendPosition), &status);
    323         } else {
    324             int32_t len16;
    325             if (UTEXT_USES_U16(fInputText)) {
    326                 len16 = (int32_t)(fMatchStart-fAppendPosition);
    327             } else {
    328                 UErrorCode lengthStatus = U_ZERO_ERROR;
    329                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
    330             }
    331             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
    332             if (inputChars == NULL) {
    333                 status = U_MEMORY_ALLOCATION_ERROR;
    334                 return *this;
    335             }
    336             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
    337             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    338             uprv_free(inputChars);
    339         }
    340     }
    341     fAppendPosition = fMatchEnd;
    342 
    343 
    344     // scan the replacement text, looking for substitutions ($n) and \escapes.
    345     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
    346     //         move entire ranges not containing substitutions.
    347     UTEXT_SETNATIVEINDEX(replacement, 0);
    348     UChar32 c = UTEXT_NEXT32(replacement);
    349     while (c != U_SENTINEL) {
    350         if (c == BACKSLASH) {
    351             // Backslash Escape.  Copy the following char out without further checks.
    352             //                    Note:  Surrogate pairs don't need any special handling
    353             //                           The second half wont be a '$' or a '\', and
    354             //                           will move to the dest normally on the next
    355             //                           loop iteration.
    356             c = UTEXT_CURRENT32(replacement);
    357             if (c == U_SENTINEL) {
    358                 break;
    359             }
    360 
    361             if (c==0x55/*U*/ || c==0x75/*u*/) {
    362                 // We have a \udddd or \Udddddddd escape sequence.
    363                 int32_t offset = 0;
    364                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
    365                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
    366                 if (escapedChar != (UChar32)0xFFFFFFFF) {
    367                     if (U_IS_BMP(escapedChar)) {
    368                         UChar c16 = (UChar)escapedChar;
    369                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    370                     } else {
    371                         UChar surrogate[2];
    372                         surrogate[0] = U16_LEAD(escapedChar);
    373                         surrogate[1] = U16_TRAIL(escapedChar);
    374                         if (U_SUCCESS(status)) {
    375                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    376                         }
    377                     }
    378                     // TODO:  Report errors for mal-formed \u escapes?
    379                     //        As this is, the original sequence is output, which may be OK.
    380                     if (context.lastOffset == offset) {
    381                         UTEXT_PREVIOUS32(replacement);
    382                     } else if (context.lastOffset != offset-1) {
    383                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
    384                     }
    385                 }
    386             } else {
    387                 UTEXT_NEXT32(replacement);
    388                 // Plain backslash escape.  Just put out the escaped character.
    389                 if (U_IS_BMP(c)) {
    390                     UChar c16 = (UChar)c;
    391                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    392                 } else {
    393                     UChar surrogate[2];
    394                     surrogate[0] = U16_LEAD(c);
    395                     surrogate[1] = U16_TRAIL(c);
    396                     if (U_SUCCESS(status)) {
    397                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    398                     }
    399                 }
    400             }
    401         } else if (c != DOLLARSIGN) {
    402             // Normal char, not a $.  Copy it out without further checks.
    403             if (U_IS_BMP(c)) {
    404                 UChar c16 = (UChar)c;
    405                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    406             } else {
    407                 UChar surrogate[2];
    408                 surrogate[0] = U16_LEAD(c);
    409                 surrogate[1] = U16_TRAIL(c);
    410                 if (U_SUCCESS(status)) {
    411                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    412                 }
    413             }
    414         } else {
    415             // We've got a $.  Pick up a capture group number if one follows.
    416             // Consume at most the number of digits necessary for the largest capture
    417             // number that is valid for this pattern.
    418 
    419             int32_t numDigits = 0;
    420             int32_t groupNum  = 0;
    421             UChar32 digitC;
    422             for (;;) {
    423                 digitC = UTEXT_CURRENT32(replacement);
    424                 if (digitC == U_SENTINEL) {
    425                     break;
    426                 }
    427                 if (u_isdigit(digitC) == FALSE) {
    428                     break;
    429                 }
    430                 UTEXT_NEXT32(replacement);
    431                 groupNum=groupNum*10 + u_charDigitValue(digitC);
    432                 numDigits++;
    433                 if (numDigits >= fPattern->fMaxCaptureDigits) {
    434                     break;
    435                 }
    436             }
    437 
    438 
    439             if (numDigits == 0) {
    440                 // The $ didn't introduce a group number at all.
    441                 // Treat it as just part of the substitution text.
    442                 UChar c16 = DOLLARSIGN;
    443                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    444             } else {
    445                 // Finally, append the capture group data to the destination.
    446                 destLen += appendGroup(groupNum, dest, status);
    447                 if (U_FAILURE(status)) {
    448                     // Can fail if group number is out of range.
    449                     break;
    450                 }
    451             }
    452         }
    453 
    454         if (U_FAILURE(status)) {
    455             break;
    456         } else {
    457             c = UTEXT_NEXT32(replacement);
    458         }
    459     }
    460 
    461     return *this;
    462 }
    463 
    464 
    465 
    466 //--------------------------------------------------------------------------------
    467 //
    468 //    appendTail     Intended to be used in conjunction with appendReplacement()
    469 //                   To the destination string, append everything following
    470 //                   the last match position from the input string.
    471 //
    472 //                   Note:  Match ranges do not affect appendTail or appendReplacement
    473 //
    474 //--------------------------------------------------------------------------------
    475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    476     UErrorCode status = U_ZERO_ERROR;
    477     UText resultText = UTEXT_INITIALIZER;
    478     utext_openUnicodeString(&resultText, &dest, &status);
    479 
    480     if (U_SUCCESS(status)) {
    481         appendTail(&resultText, status);
    482         utext_close(&resultText);
    483     }
    484 
    485     return dest;
    486 }
    487 
    488 //
    489 //   appendTail, UText mode
    490 //
    491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
    492     UBool bailOut = FALSE;
    493     if (U_FAILURE(status)) {
    494         bailOut = TRUE;
    495     }
    496     if (U_FAILURE(fDeferredStatus)) {
    497         status = fDeferredStatus;
    498         bailOut = TRUE;
    499     }
    500 
    501     if (bailOut) {
    502         //  dest must not be NULL
    503         if (dest) {
    504             utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
    505             return dest;
    506         }
    507     }
    508 
    509     if (fInputLength > fAppendPosition) {
    510         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    511             int64_t destLen = utext_nativeLength(dest);
    512             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    513                           (int32_t)(fInputLength-fAppendPosition), &status);
    514         } else {
    515             int32_t len16;
    516             if (UTEXT_USES_U16(fInputText)) {
    517                 len16 = (int32_t)(fInputLength-fAppendPosition);
    518             } else {
    519                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
    520                 status = U_ZERO_ERROR; // buffer overflow
    521             }
    522 
    523             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
    524             if (inputChars == NULL) {
    525                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    526             } else {
    527                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
    528                 int64_t destLen = utext_nativeLength(dest);
    529                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    530                 uprv_free(inputChars);
    531             }
    532         }
    533     }
    534     return dest;
    535 }
    536 
    537 
    538 
    539 //--------------------------------------------------------------------------------
    540 //
    541 //   end
    542 //
    543 //--------------------------------------------------------------------------------
    544 int32_t RegexMatcher::end(UErrorCode &err) const {
    545     return end(0, err);
    546 }
    547 
    548 int64_t RegexMatcher::end64(UErrorCode &err) const {
    549     return end64(0, err);
    550 }
    551 
    552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
    553     if (U_FAILURE(err)) {
    554         return -1;
    555     }
    556     if (fMatch == FALSE) {
    557         err = U_REGEX_INVALID_STATE;
    558         return -1;
    559     }
    560     if (group < 0 || group > fPattern->fGroupMap->size()) {
    561         err = U_INDEX_OUTOFBOUNDS_ERROR;
    562         return -1;
    563     }
    564     int64_t e = -1;
    565     if (group == 0) {
    566         e = fMatchEnd;
    567     } else {
    568         // Get the position within the stack frame of the variables for
    569         //    this capture group.
    570         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
    571         U_ASSERT(groupOffset < fPattern->fFrameSize);
    572         U_ASSERT(groupOffset >= 0);
    573         e = fFrame->fExtra[groupOffset + 1];
    574     }
    575 
    576         return e;
    577 }
    578 
    579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    580     return (int32_t)end64(group, err);
    581 }
    582 
    583 
    584 //--------------------------------------------------------------------------------
    585 //
    586 //   find()
    587 //
    588 //--------------------------------------------------------------------------------
    589 UBool RegexMatcher::find() {
    590     // Start at the position of the last match end.  (Will be zero if the
    591     //   matcher has been reset.)
    592     //
    593     if (U_FAILURE(fDeferredStatus)) {
    594         return FALSE;
    595     }
    596 
    597     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    598         return findUsingChunk();
    599     }
    600 
    601     int64_t startPos = fMatchEnd;
    602     if (startPos==0) {
    603         startPos = fActiveStart;
    604     }
    605 
    606     if (fMatch) {
    607         // Save the position of any previous successful match.
    608         fLastMatchEnd = fMatchEnd;
    609 
    610         if (fMatchStart == fMatchEnd) {
    611             // Previous match had zero length.  Move start position up one position
    612             //  to avoid sending find() into a loop on zero-length matches.
    613             if (startPos >= fActiveLimit) {
    614                 fMatch = FALSE;
    615                 fHitEnd = TRUE;
    616                 return FALSE;
    617             }
    618             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    619             UTEXT_NEXT32(fInputText);
    620             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    621         }
    622     } else {
    623         if (fLastMatchEnd >= 0) {
    624             // A previous find() failed to match.  Don't try again.
    625             //   (without this test, a pattern with a zero-length match
    626             //    could match again at the end of an input string.)
    627             fHitEnd = TRUE;
    628             return FALSE;
    629         }
    630     }
    631 
    632 
    633     // Compute the position in the input string beyond which a match can not begin, because
    634     //   the minimum length match would extend past the end of the input.
    635     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    636     //          Be aware of possible overflows if making changes here.
    637     int64_t testStartLimit;
    638     if (UTEXT_USES_U16(fInputText)) {
    639         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
    640         if (startPos > testStartLimit) {
    641             fMatch = FALSE;
    642             fHitEnd = TRUE;
    643             return FALSE;
    644         }
    645     } else {
    646         // For now, let the matcher discover that it can't match on its own
    647         // We don't know how long the match len is in native characters
    648         testStartLimit = fActiveLimit;
    649     }
    650 
    651     UChar32  c;
    652     U_ASSERT(startPos >= 0);
    653 
    654     switch (fPattern->fStartType) {
    655     case START_NO_INFO:
    656         // No optimization was found.
    657         //  Try a match at each input position.
    658         for (;;) {
    659             MatchAt(startPos, FALSE, fDeferredStatus);
    660             if (U_FAILURE(fDeferredStatus)) {
    661                 return FALSE;
    662             }
    663             if (fMatch) {
    664                 return TRUE;
    665             }
    666             if (startPos >= testStartLimit) {
    667                 fHitEnd = TRUE;
    668                 return FALSE;
    669             }
    670             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    671             UTEXT_NEXT32(fInputText);
    672             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    673             // Note that it's perfectly OK for a pattern to have a zero-length
    674             //   match at the end of a string, so we must make sure that the loop
    675             //   runs with startPos == testStartLimit the last time through.
    676             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    677                 return FALSE;
    678         }
    679         U_ASSERT(FALSE);
    680 
    681     case START_START:
    682         // Matches are only possible at the start of the input string
    683         //   (pattern begins with ^ or \A)
    684         if (startPos > fActiveStart) {
    685             fMatch = FALSE;
    686             return FALSE;
    687         }
    688         MatchAt(startPos, FALSE, fDeferredStatus);
    689         if (U_FAILURE(fDeferredStatus)) {
    690             return FALSE;
    691         }
    692         return fMatch;
    693 
    694 
    695     case START_SET:
    696         {
    697             // Match may start on any char from a pre-computed set.
    698             U_ASSERT(fPattern->fMinMatchLen > 0);
    699             int64_t pos;
    700             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    701             for (;;) {
    702                 c = UTEXT_NEXT32(fInputText);
    703                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    704                 // c will be -1 (U_SENTINEL) at end of text, in which case we
    705                 // skip this next block (so we don't have a negative array index)
    706                 // and handle end of text in the following block.
    707                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    708                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
    709                     MatchAt(startPos, FALSE, fDeferredStatus);
    710                     if (U_FAILURE(fDeferredStatus)) {
    711                         return FALSE;
    712                     }
    713                     if (fMatch) {
    714                         return TRUE;
    715                     }
    716                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    717                 }
    718                 if (startPos >= testStartLimit) {
    719                     fMatch = FALSE;
    720                     fHitEnd = TRUE;
    721                     return FALSE;
    722                 }
    723                 startPos = pos;
    724 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    725                     return FALSE;
    726             }
    727         }
    728         U_ASSERT(FALSE);
    729 
    730     case START_STRING:
    731     case START_CHAR:
    732         {
    733             // Match starts on exactly one char.
    734             U_ASSERT(fPattern->fMinMatchLen > 0);
    735             UChar32 theChar = fPattern->fInitialChar;
    736             int64_t pos;
    737             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    738             for (;;) {
    739                 c = UTEXT_NEXT32(fInputText);
    740                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    741                 if (c == theChar) {
    742                     MatchAt(startPos, FALSE, fDeferredStatus);
    743                     if (U_FAILURE(fDeferredStatus)) {
    744                         return FALSE;
    745                     }
    746                     if (fMatch) {
    747                         return TRUE;
    748                     }
    749                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    750                 }
    751                 if (startPos >= testStartLimit) {
    752                     fMatch = FALSE;
    753                     fHitEnd = TRUE;
    754                     return FALSE;
    755                 }
    756                 startPos = pos;
    757 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    758                     return FALSE;
    759            }
    760         }
    761         U_ASSERT(FALSE);
    762 
    763     case START_LINE:
    764         {
    765             UChar32  c;
    766             if (startPos == fAnchorStart) {
    767                 MatchAt(startPos, FALSE, fDeferredStatus);
    768                 if (U_FAILURE(fDeferredStatus)) {
    769                     return FALSE;
    770                 }
    771                 if (fMatch) {
    772                     return TRUE;
    773                 }
    774                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    775                 c = UTEXT_NEXT32(fInputText);
    776                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    777             } else {
    778                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    779                 c = UTEXT_PREVIOUS32(fInputText);
    780                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    781             }
    782 
    783             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
    784                 for (;;) {
    785                     if (c == 0x0a) {
    786                             MatchAt(startPos, FALSE, fDeferredStatus);
    787                             if (U_FAILURE(fDeferredStatus)) {
    788                                 return FALSE;
    789                             }
    790                             if (fMatch) {
    791                                 return TRUE;
    792                             }
    793                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    794                     }
    795                     if (startPos >= testStartLimit) {
    796                         fMatch = FALSE;
    797                         fHitEnd = TRUE;
    798                         return FALSE;
    799                     }
    800                     c = UTEXT_NEXT32(fInputText);
    801                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    802                     // Note that it's perfectly OK for a pattern to have a zero-length
    803                     //   match at the end of a string, so we must make sure that the loop
    804                     //   runs with startPos == testStartLimit the last time through.
    805 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    806                         return FALSE;
    807                 }
    808             } else {
    809                 for (;;) {
    810                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
    811                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
    812                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
    813                                 UTEXT_NEXT32(fInputText);
    814                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    815                             }
    816                             MatchAt(startPos, FALSE, fDeferredStatus);
    817                             if (U_FAILURE(fDeferredStatus)) {
    818                                 return FALSE;
    819                             }
    820                             if (fMatch) {
    821                                 return TRUE;
    822                             }
    823                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    824                     }
    825                     if (startPos >= testStartLimit) {
    826                         fMatch = FALSE;
    827                         fHitEnd = TRUE;
    828                         return FALSE;
    829                     }
    830                     c = UTEXT_NEXT32(fInputText);
    831                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    832                     // Note that it's perfectly OK for a pattern to have a zero-length
    833                     //   match at the end of a string, so we must make sure that the loop
    834                     //   runs with startPos == testStartLimit the last time through.
    835 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    836                         return FALSE;
    837                 }
    838             }
    839         }
    840 
    841     default:
    842         U_ASSERT(FALSE);
    843     }
    844 
    845     U_ASSERT(FALSE);
    846     return FALSE;
    847 }
    848 
    849 
    850 
    851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
    852     if (U_FAILURE(status)) {
    853         return FALSE;
    854     }
    855     if (U_FAILURE(fDeferredStatus)) {
    856         status = fDeferredStatus;
    857         return FALSE;
    858     }
    859     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
    860                                           //        This will reset the region to be the full input length.
    861     if (start < 0) {
    862         status = U_INDEX_OUTOFBOUNDS_ERROR;
    863         return FALSE;
    864     }
    865 
    866     int64_t nativeStart = start;
    867     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
    868         status = U_INDEX_OUTOFBOUNDS_ERROR;
    869         return FALSE;
    870     }
    871     fMatchEnd = nativeStart;
    872     return find();
    873 }
    874 
    875 
    876 //--------------------------------------------------------------------------------
    877 //
    878 //   findUsingChunk() -- like find(), but with the advance knowledge that the
    879 //                       entire string is available in the UText's chunk buffer.
    880 //
    881 //--------------------------------------------------------------------------------
    882 UBool RegexMatcher::findUsingChunk() {
    883     // Start at the position of the last match end.  (Will be zero if the
    884     //   matcher has been reset.
    885     //
    886 
    887     int32_t startPos = (int32_t)fMatchEnd;
    888     if (startPos==0) {
    889         startPos = (int32_t)fActiveStart;
    890     }
    891 
    892     const UChar *inputBuf = fInputText->chunkContents;
    893 
    894     if (fMatch) {
    895         // Save the position of any previous successful match.
    896         fLastMatchEnd = fMatchEnd;
    897 
    898         if (fMatchStart == fMatchEnd) {
    899             // Previous match had zero length.  Move start position up one position
    900             //  to avoid sending find() into a loop on zero-length matches.
    901             if (startPos >= fActiveLimit) {
    902                 fMatch = FALSE;
    903                 fHitEnd = TRUE;
    904                 return FALSE;
    905             }
    906             U16_FWD_1(inputBuf, startPos, fInputLength);
    907         }
    908     } else {
    909         if (fLastMatchEnd >= 0) {
    910             // A previous find() failed to match.  Don't try again.
    911             //   (without this test, a pattern with a zero-length match
    912             //    could match again at the end of an input string.)
    913             fHitEnd = TRUE;
    914             return FALSE;
    915         }
    916     }
    917 
    918 
    919     // Compute the position in the input string beyond which a match can not begin, because
    920     //   the minimum length match would extend past the end of the input.
    921     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    922     //          Be aware of possible overflows if making changes here.
    923     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
    924     if (startPos > testLen) {
    925         fMatch = FALSE;
    926         fHitEnd = TRUE;
    927         return FALSE;
    928     }
    929 
    930     UChar32  c;
    931     U_ASSERT(startPos >= 0);
    932 
    933     switch (fPattern->fStartType) {
    934     case START_NO_INFO:
    935         // No optimization was found.
    936         //  Try a match at each input position.
    937         for (;;) {
    938             MatchChunkAt(startPos, FALSE, fDeferredStatus);
    939             if (U_FAILURE(fDeferredStatus)) {
    940                 return FALSE;
    941             }
    942             if (fMatch) {
    943                 return TRUE;
    944             }
    945             if (startPos >= testLen) {
    946                 fHitEnd = TRUE;
    947                 return FALSE;
    948             }
    949             U16_FWD_1(inputBuf, startPos, fActiveLimit);
    950             // Note that it's perfectly OK for a pattern to have a zero-length
    951             //   match at the end of a string, so we must make sure that the loop
    952             //   runs with startPos == testLen the last time through.
    953             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    954                 return FALSE;
    955         }
    956         U_ASSERT(FALSE);
    957 
    958     case START_START:
    959         // Matches are only possible at the start of the input string
    960         //   (pattern begins with ^ or \A)
    961         if (startPos > fActiveStart) {
    962             fMatch = FALSE;
    963             return FALSE;
    964         }
    965         MatchChunkAt(startPos, FALSE, fDeferredStatus);
    966         if (U_FAILURE(fDeferredStatus)) {
    967             return FALSE;
    968         }
    969         return fMatch;
    970 
    971 
    972     case START_SET:
    973     {
    974         // Match may start on any char from a pre-computed set.
    975         U_ASSERT(fPattern->fMinMatchLen > 0);
    976         for (;;) {
    977             int32_t pos = startPos;
    978             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    979             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    980                 (c>=256 && fPattern->fInitialChars->contains(c))) {
    981                 MatchChunkAt(pos, FALSE, fDeferredStatus);
    982                 if (U_FAILURE(fDeferredStatus)) {
    983                     return FALSE;
    984                 }
    985                 if (fMatch) {
    986                     return TRUE;
    987                 }
    988             }
    989             if (pos >= testLen) {
    990                 fMatch = FALSE;
    991                 fHitEnd = TRUE;
    992                 return FALSE;
    993             }
    994             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
    995                 return FALSE;
    996         }
    997     }
    998         U_ASSERT(FALSE);
    999 
   1000     case START_STRING:
   1001     case START_CHAR:
   1002     {
   1003         // Match starts on exactly one char.
   1004         U_ASSERT(fPattern->fMinMatchLen > 0);
   1005         UChar32 theChar = fPattern->fInitialChar;
   1006         for (;;) {
   1007             int32_t pos = startPos;
   1008             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
   1009             if (c == theChar) {
   1010                 MatchChunkAt(pos, FALSE, fDeferredStatus);
   1011                 if (U_FAILURE(fDeferredStatus)) {
   1012                     return FALSE;
   1013                 }
   1014                 if (fMatch) {
   1015                     return TRUE;
   1016                 }
   1017             }
   1018             if (pos >= testLen) {
   1019                 fMatch = FALSE;
   1020                 fHitEnd = TRUE;
   1021                 return FALSE;
   1022             }
   1023             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1024                 return FALSE;
   1025         }
   1026     }
   1027         U_ASSERT(FALSE);
   1028 
   1029     case START_LINE:
   1030     {
   1031         UChar32  c;
   1032         if (startPos == fAnchorStart) {
   1033             MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1034             if (U_FAILURE(fDeferredStatus)) {
   1035                 return FALSE;
   1036             }
   1037             if (fMatch) {
   1038                 return TRUE;
   1039             }
   1040             U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1041         }
   1042 
   1043         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
   1044             for (;;) {
   1045                 c = inputBuf[startPos-1];
   1046                 if (c == 0x0a) {
   1047                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1048                     if (U_FAILURE(fDeferredStatus)) {
   1049                         return FALSE;
   1050                     }
   1051                     if (fMatch) {
   1052                         return TRUE;
   1053                     }
   1054                 }
   1055                 if (startPos >= testLen) {
   1056                     fMatch = FALSE;
   1057                     fHitEnd = TRUE;
   1058                     return FALSE;
   1059                 }
   1060                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1061                 // Note that it's perfectly OK for a pattern to have a zero-length
   1062                 //   match at the end of a string, so we must make sure that the loop
   1063                 //   runs with startPos == testLen the last time through.
   1064 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1065                     return FALSE;
   1066             }
   1067         } else {
   1068             for (;;) {
   1069                 c = inputBuf[startPos-1];
   1070                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   1071                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
   1072                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
   1073                         startPos++;
   1074                     }
   1075                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1076                     if (U_FAILURE(fDeferredStatus)) {
   1077                         return FALSE;
   1078                     }
   1079                     if (fMatch) {
   1080                         return TRUE;
   1081                     }
   1082                 }
   1083                 if (startPos >= testLen) {
   1084                     fMatch = FALSE;
   1085                     fHitEnd = TRUE;
   1086                     return FALSE;
   1087                 }
   1088                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1089                 // Note that it's perfectly OK for a pattern to have a zero-length
   1090                 //   match at the end of a string, so we must make sure that the loop
   1091                 //   runs with startPos == testLen the last time through.
   1092 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
   1093                     return FALSE;
   1094             }
   1095         }
   1096     }
   1097 
   1098     default:
   1099         U_ASSERT(FALSE);
   1100     }
   1101 
   1102     U_ASSERT(FALSE);
   1103     return FALSE;
   1104 }
   1105 
   1106 
   1107 
   1108 //--------------------------------------------------------------------------------
   1109 //
   1110 //  group()
   1111 //
   1112 //--------------------------------------------------------------------------------
   1113 UnicodeString RegexMatcher::group(UErrorCode &status) const {
   1114     return group(0, status);
   1115 }
   1116 
   1117 //  Return immutable shallow clone
   1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
   1119     return group(0, dest, group_len, status);
   1120 }
   1121 
   1122 //  Return immutable shallow clone
   1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
   1124     group_len = 0;
   1125     UBool bailOut = FALSE;
   1126     if (U_FAILURE(status)) {
   1127         return dest;
   1128     }
   1129     if (U_FAILURE(fDeferredStatus)) {
   1130         status = fDeferredStatus;
   1131         bailOut = TRUE;
   1132     }
   1133     if (fMatch == FALSE) {
   1134         status = U_REGEX_INVALID_STATE;
   1135         bailOut = TRUE;
   1136     }
   1137     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1138         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1139         bailOut = TRUE;
   1140     }
   1141 
   1142     if (bailOut) {
   1143         return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
   1144     }
   1145 
   1146     int64_t s, e;
   1147     if (groupNum == 0) {
   1148         s = fMatchStart;
   1149         e = fMatchEnd;
   1150     } else {
   1151         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1152         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1153         U_ASSERT(groupOffset >= 0);
   1154         s = fFrame->fExtra[groupOffset];
   1155         e = fFrame->fExtra[groupOffset+1];
   1156     }
   1157 
   1158     if (s < 0) {
   1159         // A capture group wasn't part of the match
   1160         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1161     }
   1162     U_ASSERT(s <= e);
   1163     group_len = e - s;
   1164 
   1165     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
   1166     if (dest)
   1167         UTEXT_SETNATIVEINDEX(dest, s);
   1168     return dest;
   1169 }
   1170 
   1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
   1172     UnicodeString result;
   1173     if (U_FAILURE(status)) {
   1174         return result;
   1175     }
   1176     UText resultText = UTEXT_INITIALIZER;
   1177     utext_openUnicodeString(&resultText, &result, &status);
   1178     group(groupNum, &resultText, status);
   1179     utext_close(&resultText);
   1180     return result;
   1181 }
   1182 
   1183 
   1184 //  Return deep (mutable) clone
   1185 //		Technology Preview (as an API), but note that the UnicodeString API is implemented
   1186 //		using this function.
   1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1188     UBool bailOut = FALSE;
   1189     if (U_FAILURE(status)) {
   1190         return dest;
   1191     }
   1192     if (U_FAILURE(fDeferredStatus)) {
   1193         status = fDeferredStatus;
   1194         bailOut = TRUE;
   1195     }
   1196 
   1197     if (fMatch == FALSE) {
   1198         status = U_REGEX_INVALID_STATE;
   1199         bailOut = TRUE;
   1200     }
   1201     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1202         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1203         bailOut = TRUE;
   1204     }
   1205 
   1206     if (bailOut) {
   1207         if (dest) {
   1208             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1209             return dest;
   1210         } else {
   1211             return utext_openUChars(NULL, NULL, 0, &status);
   1212         }
   1213     }
   1214 
   1215     int64_t s, e;
   1216     if (groupNum == 0) {
   1217         s = fMatchStart;
   1218         e = fMatchEnd;
   1219     } else {
   1220         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1221         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1222         U_ASSERT(groupOffset >= 0);
   1223         s = fFrame->fExtra[groupOffset];
   1224         e = fFrame->fExtra[groupOffset+1];
   1225     }
   1226 
   1227     if (s < 0) {
   1228         // A capture group wasn't part of the match
   1229         if (dest) {
   1230             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1231             return dest;
   1232         } else {
   1233             return utext_openUChars(NULL, NULL, 0, &status);
   1234         }
   1235     }
   1236     U_ASSERT(s <= e);
   1237 
   1238     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1239         U_ASSERT(e <= fInputLength);
   1240         if (dest) {
   1241             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1242         } else {
   1243             UText groupText = UTEXT_INITIALIZER;
   1244             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
   1245             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1246             utext_close(&groupText);
   1247         }
   1248     } else {
   1249         int32_t len16;
   1250         if (UTEXT_USES_U16(fInputText)) {
   1251             len16 = (int32_t)(e-s);
   1252         } else {
   1253             UErrorCode lengthStatus = U_ZERO_ERROR;
   1254             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1255         }
   1256         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1257         if (groupChars == NULL) {
   1258             status = U_MEMORY_ALLOCATION_ERROR;
   1259             return dest;
   1260         }
   1261         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1262 
   1263         if (dest) {
   1264             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
   1265         } else {
   1266             UText groupText = UTEXT_INITIALIZER;
   1267             utext_openUChars(&groupText, groupChars, len16, &status);
   1268             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1269             utext_close(&groupText);
   1270         }
   1271 
   1272         uprv_free(groupChars);
   1273     }
   1274     return dest;
   1275 }
   1276 
   1277 //--------------------------------------------------------------------------------
   1278 //
   1279 //  appendGroup() -- currently internal only, appends a group to a UText rather
   1280 //                   than replacing its contents
   1281 //
   1282 //--------------------------------------------------------------------------------
   1283 
   1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1285     if (U_FAILURE(status)) {
   1286         return 0;
   1287     }
   1288     if (U_FAILURE(fDeferredStatus)) {
   1289         status = fDeferredStatus;
   1290         return 0;
   1291     }
   1292     int64_t destLen = utext_nativeLength(dest);
   1293 
   1294     if (fMatch == FALSE) {
   1295         status = U_REGEX_INVALID_STATE;
   1296         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1297     }
   1298     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1299         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1300         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1301     }
   1302 
   1303     int64_t s, e;
   1304     if (groupNum == 0) {
   1305         s = fMatchStart;
   1306         e = fMatchEnd;
   1307     } else {
   1308         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1309         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1310         U_ASSERT(groupOffset >= 0);
   1311         s = fFrame->fExtra[groupOffset];
   1312         e = fFrame->fExtra[groupOffset+1];
   1313     }
   1314 
   1315     if (s < 0) {
   1316         // A capture group wasn't part of the match
   1317         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1318     }
   1319     U_ASSERT(s <= e);
   1320 
   1321     int64_t deltaLen;
   1322     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1323         U_ASSERT(e <= fInputLength);
   1324         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1325     } else {
   1326         int32_t len16;
   1327         if (UTEXT_USES_U16(fInputText)) {
   1328             len16 = (int32_t)(e-s);
   1329         } else {
   1330             UErrorCode lengthStatus = U_ZERO_ERROR;
   1331             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1332         }
   1333         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1334         if (groupChars == NULL) {
   1335             status = U_MEMORY_ALLOCATION_ERROR;
   1336             return 0;
   1337         }
   1338         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1339 
   1340         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
   1341         uprv_free(groupChars);
   1342     }
   1343     return deltaLen;
   1344 }
   1345 
   1346 
   1347 
   1348 //--------------------------------------------------------------------------------
   1349 //
   1350 //  groupCount()
   1351 //
   1352 //--------------------------------------------------------------------------------
   1353 int32_t RegexMatcher::groupCount() const {
   1354     return fPattern->fGroupMap->size();
   1355 }
   1356 
   1357 
   1358 
   1359 //--------------------------------------------------------------------------------
   1360 //
   1361 //  hasAnchoringBounds()
   1362 //
   1363 //--------------------------------------------------------------------------------
   1364 UBool RegexMatcher::hasAnchoringBounds() const {
   1365     return fAnchoringBounds;
   1366 }
   1367 
   1368 
   1369 //--------------------------------------------------------------------------------
   1370 //
   1371 //  hasTransparentBounds()
   1372 //
   1373 //--------------------------------------------------------------------------------
   1374 UBool RegexMatcher::hasTransparentBounds() const {
   1375     return fTransparentBounds;
   1376 }
   1377 
   1378 
   1379 
   1380 //--------------------------------------------------------------------------------
   1381 //
   1382 //  hitEnd()
   1383 //
   1384 //--------------------------------------------------------------------------------
   1385 UBool RegexMatcher::hitEnd() const {
   1386     return fHitEnd;
   1387 }
   1388 
   1389 
   1390 //--------------------------------------------------------------------------------
   1391 //
   1392 //  input()
   1393 //
   1394 //--------------------------------------------------------------------------------
   1395 const UnicodeString &RegexMatcher::input() const {
   1396     if (!fInput) {
   1397         UErrorCode status = U_ZERO_ERROR;
   1398         int32_t len16;
   1399         if (UTEXT_USES_U16(fInputText)) {
   1400             len16 = (int32_t)fInputLength;
   1401         } else {
   1402             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
   1403             status = U_ZERO_ERROR; // overflow, length status
   1404         }
   1405         UnicodeString *result = new UnicodeString(len16, 0, 0);
   1406 
   1407         UChar *inputChars = result->getBuffer(len16);
   1408         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
   1409         result->releaseBuffer(len16);
   1410 
   1411         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
   1412     }
   1413 
   1414     return *fInput;
   1415 }
   1416 
   1417 //--------------------------------------------------------------------------------
   1418 //
   1419 //  inputText()
   1420 //
   1421 //--------------------------------------------------------------------------------
   1422 UText *RegexMatcher::inputText() const {
   1423     return fInputText;
   1424 }
   1425 
   1426 
   1427 //--------------------------------------------------------------------------------
   1428 //
   1429 //  getInput() -- like inputText(), but makes a clone or copies into another UText
   1430 //
   1431 //--------------------------------------------------------------------------------
   1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
   1433     UBool bailOut = FALSE;
   1434     if (U_FAILURE(status)) {
   1435         return dest;
   1436     }
   1437     if (U_FAILURE(fDeferredStatus)) {
   1438         status = fDeferredStatus;
   1439         bailOut = TRUE;
   1440     }
   1441 
   1442     if (bailOut) {
   1443         if (dest) {
   1444             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1445             return dest;
   1446         } else {
   1447             return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1448         }
   1449     }
   1450 
   1451     if (dest) {
   1452         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1453             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
   1454         } else {
   1455             int32_t input16Len;
   1456             if (UTEXT_USES_U16(fInputText)) {
   1457                 input16Len = (int32_t)fInputLength;
   1458             } else {
   1459                 UErrorCode lengthStatus = U_ZERO_ERROR;
   1460                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
   1461             }
   1462             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
   1463             if (inputChars == NULL) {
   1464                 return dest;
   1465             }
   1466 
   1467             status = U_ZERO_ERROR;
   1468             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
   1469             status = U_ZERO_ERROR;
   1470             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
   1471 
   1472             uprv_free(inputChars);
   1473         }
   1474         return dest;
   1475     } else {
   1476         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1477     }
   1478 }
   1479 
   1480 
   1481 static UBool compat_SyncMutableUTextContents(UText *ut);
   1482 static UBool compat_SyncMutableUTextContents(UText *ut) {
   1483     UBool retVal = FALSE;
   1484 
   1485     //  In the following test, we're really only interested in whether the UText should switch
   1486     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
   1487     //  will still point to the correct data.
   1488     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
   1489         UnicodeString *us=(UnicodeString *)ut->context;
   1490 
   1491         // Update to the latest length.
   1492         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
   1493         int32_t newLength = us->length();
   1494 
   1495         // Update the chunk description.
   1496         // The buffer may have switched between stack- and heap-based.
   1497         ut->chunkContents    = us->getBuffer();
   1498         ut->chunkLength      = newLength;
   1499         ut->chunkNativeLimit = newLength;
   1500         ut->nativeIndexingLimit = newLength;
   1501         retVal = TRUE;
   1502     }
   1503 
   1504     return retVal;
   1505 }
   1506 
   1507 //--------------------------------------------------------------------------------
   1508 //
   1509 //  lookingAt()
   1510 //
   1511 //--------------------------------------------------------------------------------
   1512 UBool RegexMatcher::lookingAt(UErrorCode &status) {
   1513     if (U_FAILURE(status)) {
   1514         return FALSE;
   1515     }
   1516     if (U_FAILURE(fDeferredStatus)) {
   1517         status = fDeferredStatus;
   1518         return FALSE;
   1519     }
   1520 
   1521     if (fInputUniStrMaybeMutable) {
   1522         if (compat_SyncMutableUTextContents(fInputText)) {
   1523         fInputLength = utext_nativeLength(fInputText);
   1524         reset();
   1525         }
   1526     }
   1527     else {
   1528         resetPreserveRegion();
   1529     }
   1530     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1531         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
   1532     } else {
   1533         MatchAt(fActiveStart, FALSE, status);
   1534     }
   1535     return fMatch;
   1536 }
   1537 
   1538 
   1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
   1540     if (U_FAILURE(status)) {
   1541         return FALSE;
   1542     }
   1543     if (U_FAILURE(fDeferredStatus)) {
   1544         status = fDeferredStatus;
   1545         return FALSE;
   1546     }
   1547     reset();
   1548 
   1549     if (start < 0) {
   1550         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1551         return FALSE;
   1552     }
   1553 
   1554     if (fInputUniStrMaybeMutable) {
   1555         if (compat_SyncMutableUTextContents(fInputText)) {
   1556         fInputLength = utext_nativeLength(fInputText);
   1557         reset();
   1558         }
   1559     }
   1560 
   1561     int64_t nativeStart;
   1562     nativeStart = start;
   1563     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1564         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1565         return FALSE;
   1566     }
   1567 
   1568     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1569         MatchChunkAt((int32_t)nativeStart, FALSE, status);
   1570     } else {
   1571         MatchAt(nativeStart, FALSE, status);
   1572     }
   1573     return fMatch;
   1574 }
   1575 
   1576 
   1577 
   1578 //--------------------------------------------------------------------------------
   1579 //
   1580 //  matches()
   1581 //
   1582 //--------------------------------------------------------------------------------
   1583 UBool RegexMatcher::matches(UErrorCode &status) {
   1584     if (U_FAILURE(status)) {
   1585         return FALSE;
   1586     }
   1587     if (U_FAILURE(fDeferredStatus)) {
   1588         status = fDeferredStatus;
   1589         return FALSE;
   1590     }
   1591 
   1592     if (fInputUniStrMaybeMutable) {
   1593         if (compat_SyncMutableUTextContents(fInputText)) {
   1594         fInputLength = utext_nativeLength(fInputText);
   1595         reset();
   1596         }
   1597     }
   1598     else {
   1599         resetPreserveRegion();
   1600     }
   1601 
   1602     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1603         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
   1604     } else {
   1605         MatchAt(fActiveStart, TRUE, status);
   1606     }
   1607     return fMatch;
   1608 }
   1609 
   1610 
   1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
   1612     if (U_FAILURE(status)) {
   1613         return FALSE;
   1614     }
   1615     if (U_FAILURE(fDeferredStatus)) {
   1616         status = fDeferredStatus;
   1617         return FALSE;
   1618     }
   1619     reset();
   1620 
   1621     if (start < 0) {
   1622         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1623         return FALSE;
   1624     }
   1625 
   1626     if (fInputUniStrMaybeMutable) {
   1627         if (compat_SyncMutableUTextContents(fInputText)) {
   1628         fInputLength = utext_nativeLength(fInputText);
   1629         reset();
   1630         }
   1631     }
   1632 
   1633     int64_t nativeStart;
   1634     nativeStart = start;
   1635     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1636         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1637         return FALSE;
   1638     }
   1639 
   1640     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1641         MatchChunkAt((int32_t)nativeStart, TRUE, status);
   1642     } else {
   1643         MatchAt(nativeStart, TRUE, status);
   1644     }
   1645     return fMatch;
   1646 }
   1647 
   1648 
   1649 
   1650 //--------------------------------------------------------------------------------
   1651 //
   1652 //    pattern
   1653 //
   1654 //--------------------------------------------------------------------------------
   1655 const RegexPattern &RegexMatcher::pattern() const {
   1656     return *fPattern;
   1657 }
   1658 
   1659 
   1660 
   1661 //--------------------------------------------------------------------------------
   1662 //
   1663 //    region
   1664 //
   1665 //--------------------------------------------------------------------------------
   1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
   1667     if (U_FAILURE(status)) {
   1668         return *this;
   1669     }
   1670 
   1671     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
   1672         status = U_ILLEGAL_ARGUMENT_ERROR;
   1673     }
   1674 
   1675     int64_t nativeStart = regionStart;
   1676     int64_t nativeLimit = regionLimit;
   1677     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
   1678       status = U_ILLEGAL_ARGUMENT_ERROR;
   1679     }
   1680 
   1681     if (startIndex == -1)
   1682       this->reset();
   1683     else
   1684       resetPreserveRegion();
   1685 
   1686     fRegionStart = nativeStart;
   1687     fRegionLimit = nativeLimit;
   1688     fActiveStart = nativeStart;
   1689     fActiveLimit = nativeLimit;
   1690 
   1691     if (startIndex != -1) {
   1692       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
   1693           status = U_INDEX_OUTOFBOUNDS_ERROR;
   1694       }
   1695       fMatchEnd = startIndex;
   1696     }
   1697 
   1698     if (!fTransparentBounds) {
   1699         fLookStart = nativeStart;
   1700         fLookLimit = nativeLimit;
   1701     }
   1702     if (fAnchoringBounds) {
   1703         fAnchorStart = nativeStart;
   1704         fAnchorLimit = nativeLimit;
   1705     }
   1706     return *this;
   1707 }
   1708 
   1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
   1710   return region(start, limit, -1, status);
   1711 }
   1712 
   1713 //--------------------------------------------------------------------------------
   1714 //
   1715 //    regionEnd
   1716 //
   1717 //--------------------------------------------------------------------------------
   1718 int32_t RegexMatcher::regionEnd() const {
   1719     return (int32_t)fRegionLimit;
   1720 }
   1721 
   1722 int64_t RegexMatcher::regionEnd64() const {
   1723     return fRegionLimit;
   1724 }
   1725 
   1726 //--------------------------------------------------------------------------------
   1727 //
   1728 //    regionStart
   1729 //
   1730 //--------------------------------------------------------------------------------
   1731 int32_t RegexMatcher::regionStart() const {
   1732     return (int32_t)fRegionStart;
   1733 }
   1734 
   1735 int64_t RegexMatcher::regionStart64() const {
   1736     return fRegionStart;
   1737 }
   1738 
   1739 
   1740 //--------------------------------------------------------------------------------
   1741 //
   1742 //    replaceAll
   1743 //
   1744 //--------------------------------------------------------------------------------
   1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
   1746     UText replacementText = UTEXT_INITIALIZER;
   1747     UText resultText = UTEXT_INITIALIZER;
   1748     UnicodeString resultString;
   1749     if (U_FAILURE(status)) {
   1750         return resultString;
   1751     }
   1752 
   1753     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1754     utext_openUnicodeString(&resultText, &resultString, &status);
   1755 
   1756     replaceAll(&replacementText, &resultText, status);
   1757 
   1758     utext_close(&resultText);
   1759     utext_close(&replacementText);
   1760 
   1761     return resultString;
   1762 }
   1763 
   1764 
   1765 //
   1766 //    replaceAll, UText mode
   1767 //
   1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
   1769     if (U_FAILURE(status)) {
   1770         return dest;
   1771     }
   1772     if (U_FAILURE(fDeferredStatus)) {
   1773         status = fDeferredStatus;
   1774         return dest;
   1775     }
   1776 
   1777     if (dest == NULL) {
   1778         UnicodeString emptyString;
   1779         UText empty = UTEXT_INITIALIZER;
   1780 
   1781         utext_openUnicodeString(&empty, &emptyString, &status);
   1782         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1783         utext_close(&empty);
   1784     }
   1785 
   1786     if (U_SUCCESS(status)) {
   1787         reset();
   1788         while (find()) {
   1789             appendReplacement(dest, replacement, status);
   1790             if (U_FAILURE(status)) {
   1791                 break;
   1792             }
   1793         }
   1794         appendTail(dest, status);
   1795     }
   1796 
   1797     return dest;
   1798 }
   1799 
   1800 
   1801 //--------------------------------------------------------------------------------
   1802 //
   1803 //    replaceFirst
   1804 //
   1805 //--------------------------------------------------------------------------------
   1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
   1807     UText replacementText = UTEXT_INITIALIZER;
   1808     UText resultText = UTEXT_INITIALIZER;
   1809     UnicodeString resultString;
   1810 
   1811     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1812     utext_openUnicodeString(&resultText, &resultString, &status);
   1813 
   1814     replaceFirst(&replacementText, &resultText, status);
   1815 
   1816     utext_close(&resultText);
   1817     utext_close(&replacementText);
   1818 
   1819     return resultString;
   1820 }
   1821 
   1822 //
   1823 //    replaceFirst, UText mode
   1824 //
   1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
   1826     if (U_FAILURE(status)) {
   1827         return dest;
   1828     }
   1829     if (U_FAILURE(fDeferredStatus)) {
   1830         status = fDeferredStatus;
   1831         return dest;
   1832     }
   1833 
   1834     reset();
   1835     if (!find()) {
   1836         return getInput(dest, status);
   1837     }
   1838 
   1839     if (dest == NULL) {
   1840         UnicodeString emptyString;
   1841         UText empty = UTEXT_INITIALIZER;
   1842 
   1843         utext_openUnicodeString(&empty, &emptyString, &status);
   1844         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1845         utext_close(&empty);
   1846     }
   1847 
   1848     appendReplacement(dest, replacement, status);
   1849     appendTail(dest, status);
   1850 
   1851     return dest;
   1852 }
   1853 
   1854 
   1855 //--------------------------------------------------------------------------------
   1856 //
   1857 //     requireEnd
   1858 //
   1859 //--------------------------------------------------------------------------------
   1860 UBool RegexMatcher::requireEnd() const {
   1861     return fRequireEnd;
   1862 }
   1863 
   1864 
   1865 //--------------------------------------------------------------------------------
   1866 //
   1867 //     reset
   1868 //
   1869 //--------------------------------------------------------------------------------
   1870 RegexMatcher &RegexMatcher::reset() {
   1871     fRegionStart    = 0;
   1872     fRegionLimit    = fInputLength;
   1873     fActiveStart    = 0;
   1874     fActiveLimit    = fInputLength;
   1875     fAnchorStart    = 0;
   1876     fAnchorLimit    = fInputLength;
   1877     fLookStart      = 0;
   1878     fLookLimit      = fInputLength;
   1879     resetPreserveRegion();
   1880     return *this;
   1881 }
   1882 
   1883 
   1884 
   1885 void RegexMatcher::resetPreserveRegion() {
   1886     fMatchStart     = 0;
   1887     fMatchEnd       = 0;
   1888     fLastMatchEnd   = -1;
   1889     fAppendPosition = 0;
   1890     fMatch          = FALSE;
   1891     fHitEnd         = FALSE;
   1892     fRequireEnd     = FALSE;
   1893     fTime           = 0;
   1894     fTickCounter    = TIMER_INITIAL_VALUE;
   1895     //resetStack(); // more expensive than it looks...
   1896 }
   1897 
   1898 
   1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
   1900     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
   1901     if (fPattern->fNeedsAltInput) {
   1902         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1903     }
   1904     fInputLength = utext_nativeLength(fInputText);
   1905 
   1906     reset();
   1907     delete fInput;
   1908     fInput = NULL;
   1909 
   1910     //  Do the following for any UnicodeString.
   1911     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
   1912     fInputUniStrMaybeMutable = TRUE;
   1913 
   1914     if (fWordBreakItr != NULL) {
   1915 #if UCONFIG_NO_BREAK_ITERATION==0
   1916         UErrorCode status = U_ZERO_ERROR;
   1917         fWordBreakItr->setText(fInputText, status);
   1918 #endif
   1919     }
   1920     return *this;
   1921 }
   1922 
   1923 
   1924 RegexMatcher &RegexMatcher::reset(UText *input) {
   1925     if (fInputText != input) {
   1926         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
   1927         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1928         fInputLength = utext_nativeLength(fInputText);
   1929 
   1930         delete fInput;
   1931         fInput = NULL;
   1932 
   1933         if (fWordBreakItr != NULL) {
   1934 #if UCONFIG_NO_BREAK_ITERATION==0
   1935             UErrorCode status = U_ZERO_ERROR;
   1936             fWordBreakItr->setText(input, status);
   1937 #endif
   1938         }
   1939     }
   1940     reset();
   1941     fInputUniStrMaybeMutable = FALSE;
   1942 
   1943     return *this;
   1944 }
   1945 
   1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
   1947     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
   1948     return *this;
   1949 }*/
   1950 
   1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
   1952     if (U_FAILURE(status)) {
   1953         return *this;
   1954     }
   1955     reset();       // Reset also resets the region to be the entire string.
   1956 
   1957     if (position < 0 || position > fActiveLimit) {
   1958         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1959         return *this;
   1960     }
   1961     fMatchEnd = position;
   1962     return *this;
   1963 }
   1964 
   1965 
   1966 // BEGIN android-added
   1967 // Removed this function after Android upgrad to ICU4.8.
   1968 //--------------------------------------------------------------------------------
   1969 //
   1970 //    refresh
   1971 //
   1972 //--------------------------------------------------------------------------------
   1973 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
   1974     if (U_FAILURE(status)) {
   1975         return *this;
   1976     }
   1977     if (input == NULL) {
   1978         status = U_ILLEGAL_ARGUMENT_ERROR;
   1979         return *this;
   1980     }
   1981     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
   1982         status = U_ILLEGAL_ARGUMENT_ERROR;
   1983         return *this;
   1984     }
   1985     int64_t  pos = utext_getNativeIndex(fInputText);
   1986     //  Shallow read-only clone of the new UText into the existing input UText
   1987     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
   1988     if (U_FAILURE(status)) {
   1989         return *this;
   1990     }
   1991     utext_setNativeIndex(fInputText, pos);
   1992 
   1993     if (fAltInputText != NULL) {
   1994         pos = utext_getNativeIndex(fAltInputText);
   1995         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
   1996         if (U_FAILURE(status)) {
   1997             return *this;
   1998         }
   1999         utext_setNativeIndex(fAltInputText, pos);
   2000     }
   2001     return *this;
   2002 }
   2003 // END android-added
   2004 
   2005 
   2006 //--------------------------------------------------------------------------------
   2007 //
   2008 //    setTrace
   2009 //
   2010 //--------------------------------------------------------------------------------
   2011 void RegexMatcher::setTrace(UBool state) {
   2012     fTraceDebug = state;
   2013 }
   2014 
   2015 
   2016 
   2017 //---------------------------------------------------------------------
   2018 //
   2019 //   split
   2020 //
   2021 //---------------------------------------------------------------------
   2022 int32_t  RegexMatcher::split(const UnicodeString &input,
   2023         UnicodeString    dest[],
   2024         int32_t          destCapacity,
   2025         UErrorCode      &status)
   2026 {
   2027     UText inputText = UTEXT_INITIALIZER;
   2028     utext_openConstUnicodeString(&inputText, &input, &status);
   2029     if (U_FAILURE(status)) {
   2030         return 0;
   2031     }
   2032 
   2033     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
   2034     if (destText == NULL) {
   2035         status = U_MEMORY_ALLOCATION_ERROR;
   2036         return 0;
   2037     }
   2038     int32_t i;
   2039     for (i = 0; i < destCapacity; i++) {
   2040         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
   2041     }
   2042 
   2043     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
   2044 
   2045     for (i = 0; i < destCapacity; i++) {
   2046         utext_close(destText[i]);
   2047     }
   2048 
   2049     uprv_free(destText);
   2050     utext_close(&inputText);
   2051     return fieldCount;
   2052 }
   2053 
   2054 //
   2055 //   split, UText mode
   2056 //
   2057 int32_t  RegexMatcher::split(UText *input,
   2058         UText           *dest[],
   2059         int32_t          destCapacity,
   2060         UErrorCode      &status)
   2061 {
   2062     //
   2063     // Check arguements for validity
   2064     //
   2065     if (U_FAILURE(status)) {
   2066         return 0;
   2067     };
   2068 
   2069     if (destCapacity < 1) {
   2070         status = U_ILLEGAL_ARGUMENT_ERROR;
   2071         return 0;
   2072     }
   2073 
   2074     //
   2075     // Reset for the input text
   2076     //
   2077     reset(input);
   2078     int64_t   nextOutputStringStart = 0;
   2079     if (fActiveLimit == 0) {
   2080         return 0;
   2081     }
   2082 
   2083     //
   2084     // Loop through the input text, searching for the delimiter pattern
   2085     //
   2086     int32_t i;
   2087     int32_t numCaptureGroups = fPattern->fGroupMap->size();
   2088     for (i=0; ; i++) {
   2089         if (i>=destCapacity-1) {
   2090             // There is one or zero output string left.
   2091             // Fill the last output string with whatever is left from the input, then exit the loop.
   2092             //  ( i will be == destCapacity if we filled the output array while processing
   2093             //    capture groups of the delimiter expression, in which case we will discard the
   2094             //    last capture group saved in favor of the unprocessed remainder of the
   2095             //    input string.)
   2096             i = destCapacity-1;
   2097             if (fActiveLimit > nextOutputStringStart) {
   2098                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2099                     if (dest[i]) {
   2100                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2101                                       input->chunkContents+nextOutputStringStart,
   2102                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2103                     } else {
   2104                         UText remainingText = UTEXT_INITIALIZER;
   2105                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2106                                          fActiveLimit-nextOutputStringStart, &status);
   2107                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2108                         utext_close(&remainingText);
   2109                     }
   2110                 } else {
   2111                     UErrorCode lengthStatus = U_ZERO_ERROR;
   2112                     int32_t remaining16Length =
   2113                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2114                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2115                     if (remainingChars == NULL) {
   2116                         status = U_MEMORY_ALLOCATION_ERROR;
   2117                         break;
   2118                     }
   2119 
   2120                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2121                     if (dest[i]) {
   2122                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2123                     } else {
   2124                         UText remainingText = UTEXT_INITIALIZER;
   2125                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2126                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2127                         utext_close(&remainingText);
   2128                     }
   2129 
   2130                     uprv_free(remainingChars);
   2131                 }
   2132             }
   2133             break;
   2134         }
   2135         if (find()) {
   2136             // We found another delimiter.  Move everything from where we started looking
   2137             //  up until the start of the delimiter into the next output string.
   2138             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2139                 if (dest[i]) {
   2140                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2141                                   input->chunkContents+nextOutputStringStart,
   2142                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
   2143                 } else {
   2144                     UText remainingText = UTEXT_INITIALIZER;
   2145                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2146                                       fMatchStart-nextOutputStringStart, &status);
   2147                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2148                     utext_close(&remainingText);
   2149                 }
   2150             } else {
   2151                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2152                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
   2153                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2154                 if (remainingChars == NULL) {
   2155                     status = U_MEMORY_ALLOCATION_ERROR;
   2156                     break;
   2157                 }
   2158                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
   2159                 if (dest[i]) {
   2160                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2161                 } else {
   2162                     UText remainingText = UTEXT_INITIALIZER;
   2163                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2164                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2165                     utext_close(&remainingText);
   2166                 }
   2167 
   2168                 uprv_free(remainingChars);
   2169             }
   2170             nextOutputStringStart = fMatchEnd;
   2171 
   2172             // If the delimiter pattern has capturing parentheses, the captured
   2173             //  text goes out into the next n destination strings.
   2174             int32_t groupNum;
   2175             UBool lastGroupWasNullUText = FALSE;
   2176             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   2177                 if (i==destCapacity-1) {
   2178                     break;
   2179                 }
   2180                 i++;
   2181                 lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
   2182                 dest[i] = group(groupNum, dest[i], status);
   2183             }
   2184 
   2185             if (nextOutputStringStart == fActiveLimit) {
   2186                 // The delimiter was at the end of the string.  We're done.
   2187                 break;
   2188             } else if (i == destCapacity-1) {
   2189                 // We're out of capture groups, and the rest of the string is more important
   2190                 if (lastGroupWasNullUText) {
   2191                     utext_close(dest[i]);
   2192                     dest[i] = NULL;
   2193                 }
   2194             }
   2195 
   2196         }
   2197         else
   2198         {
   2199             // We ran off the end of the input while looking for the next delimiter.
   2200             // All the remaining text goes into the current output string.
   2201             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2202                 if (dest[i]) {
   2203                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2204                                   input->chunkContents+nextOutputStringStart,
   2205                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2206                 } else {
   2207                     UText remainingText = UTEXT_INITIALIZER;
   2208                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2209                                      fActiveLimit-nextOutputStringStart, &status);
   2210                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2211                     utext_close(&remainingText);
   2212                 }
   2213             } else {
   2214                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2215                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2216                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2217                 if (remainingChars == NULL) {
   2218                     status = U_MEMORY_ALLOCATION_ERROR;
   2219                     break;
   2220                 }
   2221 
   2222                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2223                 if (dest[i]) {
   2224                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2225                 } else {
   2226                     UText remainingText = UTEXT_INITIALIZER;
   2227                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2228                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2229                     utext_close(&remainingText);
   2230                 }
   2231 
   2232                 uprv_free(remainingChars);
   2233             }
   2234             break;
   2235         }
   2236         if (U_FAILURE(status)) {
   2237             break;
   2238         }
   2239     }   // end of for loop
   2240     return i+1;
   2241 }
   2242 
   2243 
   2244 //--------------------------------------------------------------------------------
   2245 //
   2246 //     start
   2247 //
   2248 //--------------------------------------------------------------------------------
   2249 int32_t RegexMatcher::start(UErrorCode &status) const {
   2250     return start(0, status);
   2251 }
   2252 
   2253 int64_t RegexMatcher::start64(UErrorCode &status) const {
   2254     return start64(0, status);
   2255 }
   2256 
   2257 //--------------------------------------------------------------------------------
   2258 //
   2259 //     start(int32_t group, UErrorCode &status)
   2260 //
   2261 //--------------------------------------------------------------------------------
   2262 
   2263 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
   2264     if (U_FAILURE(status)) {
   2265         return -1;
   2266     }
   2267     if (U_FAILURE(fDeferredStatus)) {
   2268         status = fDeferredStatus;
   2269         return -1;
   2270     }
   2271     if (fMatch == FALSE) {
   2272         status = U_REGEX_INVALID_STATE;
   2273         return -1;
   2274     }
   2275     if (group < 0 || group > fPattern->fGroupMap->size()) {
   2276         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2277         return -1;
   2278     }
   2279     int64_t s;
   2280     if (group == 0) {
   2281         s = fMatchStart;
   2282     } else {
   2283         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   2284         U_ASSERT(groupOffset < fPattern->fFrameSize);
   2285         U_ASSERT(groupOffset >= 0);
   2286         s = fFrame->fExtra[groupOffset];
   2287     }
   2288 
   2289     return s;
   2290 }
   2291 
   2292 
   2293 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
   2294     return (int32_t)start64(group, status);
   2295 }
   2296 
   2297 //--------------------------------------------------------------------------------
   2298 //
   2299 //     useAnchoringBounds
   2300 //
   2301 //--------------------------------------------------------------------------------
   2302 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
   2303     fAnchoringBounds = b;
   2304     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
   2305     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
   2306     return *this;
   2307 }
   2308 
   2309 
   2310 //--------------------------------------------------------------------------------
   2311 //
   2312 //     useTransparentBounds
   2313 //
   2314 //--------------------------------------------------------------------------------
   2315 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
   2316     fTransparentBounds = b;
   2317     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
   2318     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
   2319     return *this;
   2320 }
   2321 
   2322 //--------------------------------------------------------------------------------
   2323 //
   2324 //     setTimeLimit
   2325 //
   2326 //--------------------------------------------------------------------------------
   2327 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
   2328     if (U_FAILURE(status)) {
   2329         return;
   2330     }
   2331     if (U_FAILURE(fDeferredStatus)) {
   2332         status = fDeferredStatus;
   2333         return;
   2334     }
   2335     if (limit < 0) {
   2336         status = U_ILLEGAL_ARGUMENT_ERROR;
   2337         return;
   2338     }
   2339     fTimeLimit = limit;
   2340 }
   2341 
   2342 
   2343 //--------------------------------------------------------------------------------
   2344 //
   2345 //     getTimeLimit
   2346 //
   2347 //--------------------------------------------------------------------------------
   2348 int32_t RegexMatcher::getTimeLimit() const {
   2349     return fTimeLimit;
   2350 }
   2351 
   2352 
   2353 //--------------------------------------------------------------------------------
   2354 //
   2355 //     setStackLimit
   2356 //
   2357 //--------------------------------------------------------------------------------
   2358 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
   2359     if (U_FAILURE(status)) {
   2360         return;
   2361     }
   2362     if (U_FAILURE(fDeferredStatus)) {
   2363         status = fDeferredStatus;
   2364         return;
   2365     }
   2366     if (limit < 0) {
   2367         status = U_ILLEGAL_ARGUMENT_ERROR;
   2368         return;
   2369     }
   2370 
   2371     // Reset the matcher.  This is needed here in case there is a current match
   2372     //    whose final stack frame (containing the match results, pointed to by fFrame)
   2373     //    would be lost by resizing to a smaller stack size.
   2374     reset();
   2375 
   2376     if (limit == 0) {
   2377         // Unlimited stack expansion
   2378         fStack->setMaxCapacity(0);
   2379     } else {
   2380         // Change the units of the limit  from bytes to ints, and bump the size up
   2381         //   to be big enough to hold at least one stack frame for the pattern,
   2382         //   if it isn't there already.
   2383         int32_t adjustedLimit = limit / sizeof(int32_t);
   2384         if (adjustedLimit < fPattern->fFrameSize) {
   2385             adjustedLimit = fPattern->fFrameSize;
   2386         }
   2387         fStack->setMaxCapacity(adjustedLimit);
   2388     }
   2389     fStackLimit = limit;
   2390 }
   2391 
   2392 
   2393 //--------------------------------------------------------------------------------
   2394 //
   2395 //     getStackLimit
   2396 //
   2397 //--------------------------------------------------------------------------------
   2398 int32_t RegexMatcher::getStackLimit() const {
   2399     return fStackLimit;
   2400 }
   2401 
   2402 
   2403 //--------------------------------------------------------------------------------
   2404 //
   2405 //     setMatchCallback
   2406 //
   2407 //--------------------------------------------------------------------------------
   2408 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
   2409                                     const void              *context,
   2410                                     UErrorCode              &status) {
   2411     if (U_FAILURE(status)) {
   2412         return;
   2413     }
   2414     fCallbackFn = callback;
   2415     fCallbackContext = context;
   2416 }
   2417 
   2418 
   2419 //--------------------------------------------------------------------------------
   2420 //
   2421 //     getMatchCallback
   2422 //
   2423 //--------------------------------------------------------------------------------
   2424 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
   2425                                   const void              *&context,
   2426                                   UErrorCode              &status) {
   2427     if (U_FAILURE(status)) {
   2428        return;
   2429     }
   2430     callback = fCallbackFn;
   2431     context  = fCallbackContext;
   2432 }
   2433 
   2434 
   2435 //--------------------------------------------------------------------------------
   2436 //
   2437 //     setMatchCallback
   2438 //
   2439 //--------------------------------------------------------------------------------
   2440 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
   2441                                                 const void                      *context,
   2442                                                 UErrorCode                      &status) {
   2443     if (U_FAILURE(status)) {
   2444         return;
   2445     }
   2446     fFindProgressCallbackFn = callback;
   2447     fFindProgressCallbackContext = context;
   2448 }
   2449 
   2450 
   2451 //--------------------------------------------------------------------------------
   2452 //
   2453 //     getMatchCallback
   2454 //
   2455 //--------------------------------------------------------------------------------
   2456 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
   2457                                                 const void                    *&context,
   2458                                                 UErrorCode                    &status) {
   2459     if (U_FAILURE(status)) {
   2460        return;
   2461     }
   2462     callback = fFindProgressCallbackFn;
   2463     context  = fFindProgressCallbackContext;
   2464 }
   2465 
   2466 
   2467 //================================================================================
   2468 //
   2469 //    Code following this point in this file is the internal
   2470 //    Match Engine Implementation.
   2471 //
   2472 //================================================================================
   2473 
   2474 
   2475 //--------------------------------------------------------------------------------
   2476 //
   2477 //   resetStack
   2478 //           Discard any previous contents of the state save stack, and initialize a
   2479 //           new stack frame to all -1.  The -1s are needed for capture group limits,
   2480 //           where they indicate that a group has not yet matched anything.
   2481 //--------------------------------------------------------------------------------
   2482 REStackFrame *RegexMatcher::resetStack() {
   2483     // Discard any previous contents of the state save stack, and initialize a
   2484     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
   2485     //  where they indicate that a group has not yet matched anything.
   2486     fStack->removeAllElements();
   2487 
   2488     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
   2489     int32_t i;
   2490     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
   2491         iFrame->fExtra[i] = -1;
   2492     }
   2493     return iFrame;
   2494 }
   2495 
   2496 
   2497 
   2498 //--------------------------------------------------------------------------------
   2499 //
   2500 //   isWordBoundary
   2501 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
   2502 //                     For us,
   2503 //                       If the current char is a combining mark,
   2504 //                          \b is FALSE.
   2505 //                       Else Scan backwards to the first non-combining char.
   2506 //                            We are at a boundary if the this char and the original chars are
   2507 //                               opposite in membership in \w set
   2508 //
   2509 //          parameters:   pos   - the current position in the input buffer
   2510 //
   2511 //              TODO:  double-check edge cases at region boundaries.
   2512 //
   2513 //--------------------------------------------------------------------------------
   2514 UBool RegexMatcher::isWordBoundary(int64_t pos) {
   2515     UBool isBoundary = FALSE;
   2516     UBool cIsWord    = FALSE;
   2517 
   2518     if (pos >= fLookLimit) {
   2519         fHitEnd = TRUE;
   2520     } else {
   2521         // Determine whether char c at current position is a member of the word set of chars.
   2522         // If we're off the end of the string, behave as though we're not at a word char.
   2523         UTEXT_SETNATIVEINDEX(fInputText, pos);
   2524         UChar32  c = UTEXT_CURRENT32(fInputText);
   2525         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2526             // Current char is a combining one.  Not a boundary.
   2527             return FALSE;
   2528         }
   2529         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2530     }
   2531 
   2532     // Back up until we come to a non-combining char, determine whether
   2533     //  that char is a word char.
   2534     UBool prevCIsWord = FALSE;
   2535     for (;;) {
   2536         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
   2537             break;
   2538         }
   2539         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
   2540         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2541               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2542             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2543             break;
   2544         }
   2545     }
   2546     isBoundary = cIsWord ^ prevCIsWord;
   2547     return isBoundary;
   2548 }
   2549 
   2550 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
   2551     UBool isBoundary = FALSE;
   2552     UBool cIsWord    = FALSE;
   2553 
   2554     const UChar *inputBuf = fInputText->chunkContents;
   2555 
   2556     if (pos >= fLookLimit) {
   2557         fHitEnd = TRUE;
   2558     } else {
   2559         // Determine whether char c at current position is a member of the word set of chars.
   2560         // If we're off the end of the string, behave as though we're not at a word char.
   2561         UChar32 c;
   2562         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
   2563         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2564             // Current char is a combining one.  Not a boundary.
   2565             return FALSE;
   2566         }
   2567         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2568     }
   2569 
   2570     // Back up until we come to a non-combining char, determine whether
   2571     //  that char is a word char.
   2572     UBool prevCIsWord = FALSE;
   2573     for (;;) {
   2574         if (pos <= fLookStart) {
   2575             break;
   2576         }
   2577         UChar32 prevChar;
   2578         U16_PREV(inputBuf, fLookStart, pos, prevChar);
   2579         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2580               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2581             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2582             break;
   2583         }
   2584     }
   2585     isBoundary = cIsWord ^ prevCIsWord;
   2586     return isBoundary;
   2587 }
   2588 
   2589 //--------------------------------------------------------------------------------
   2590 //
   2591 //   isUWordBoundary
   2592 //
   2593 //         Test for a word boundary using RBBI word break.
   2594 //
   2595 //          parameters:   pos   - the current position in the input buffer
   2596 //
   2597 //--------------------------------------------------------------------------------
   2598 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
   2599     UBool       returnVal = FALSE;
   2600 #if UCONFIG_NO_BREAK_ITERATION==0
   2601 
   2602     // If we haven't yet created a break iterator for this matcher, do it now.
   2603     if (fWordBreakItr == NULL) {
   2604         fWordBreakItr =
   2605             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
   2606         if (U_FAILURE(fDeferredStatus)) {
   2607             return FALSE;
   2608         }
   2609         fWordBreakItr->setText(fInputText, fDeferredStatus);
   2610     }
   2611 
   2612     if (pos >= fLookLimit) {
   2613         fHitEnd = TRUE;
   2614         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
   2615                             //    words are not boundaries.  All non-word chars stand by themselves,
   2616                             //    with word boundaries on both sides.
   2617     } else {
   2618         if (!UTEXT_USES_U16(fInputText)) {
   2619             // !!!: Would like a better way to do this!
   2620             UErrorCode status = U_ZERO_ERROR;
   2621             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
   2622         }
   2623         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
   2624     }
   2625 #endif
   2626     return   returnVal;
   2627 }
   2628 
   2629 //--------------------------------------------------------------------------------
   2630 //
   2631 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
   2632 //                     saves. Increment the "time" counter, and call the
   2633 //                     user callback function if there is one installed.
   2634 //
   2635 //                     If the match operation needs to be aborted, either for a time-out
   2636 //                     or because the user callback asked for it, just set an error status.
   2637 //                     The engine will pick that up and stop in its outer loop.
   2638 //
   2639 //--------------------------------------------------------------------------------
   2640 void RegexMatcher::IncrementTime(UErrorCode &status) {
   2641     fTickCounter = TIMER_INITIAL_VALUE;
   2642     fTime++;
   2643     if (fCallbackFn != NULL) {
   2644         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
   2645             status = U_REGEX_STOPPED_BY_CALLER;
   2646             return;
   2647         }
   2648     }
   2649     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
   2650         status = U_REGEX_TIME_OUT;
   2651     }
   2652 }
   2653 
   2654 //--------------------------------------------------------------------------------
   2655 //
   2656 //   ReportFindProgress     This function is called once for each advance in the target
   2657 //                          string from the find() function, and calls the user progress callback
   2658 //                          function if there is one installed.
   2659 //
   2660 //                          NOTE:
   2661 //
   2662 //                          If the match operation needs to be aborted because the user
   2663 //                          callback asked for it, just set an error status.
   2664 //                          The engine will pick that up and stop in its outer loop.
   2665 //
   2666 //--------------------------------------------------------------------------------
   2667 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
   2668     if (fFindProgressCallbackFn != NULL) {
   2669         if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
   2670             status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
   2671             return FALSE;
   2672         }
   2673     }
   2674     return TRUE;
   2675 }
   2676 
   2677 //--------------------------------------------------------------------------------
   2678 //
   2679 //   StateSave
   2680 //       Make a new stack frame, initialized as a copy of the current stack frame.
   2681 //       Set the pattern index in the original stack frame from the operand value
   2682 //       in the opcode.  Execution of the engine continues with the state in
   2683 //       the newly created stack frame
   2684 //
   2685 //       Note that reserveBlock() may grow the stack, resulting in the
   2686 //       whole thing being relocated in memory.
   2687 //
   2688 //    Parameters:
   2689 //       fp           The top frame pointer when called.  At return, a new
   2690 //                    fame will be present
   2691 //       savePatIdx   An index into the compiled pattern.  Goes into the original
   2692 //                    (not new) frame.  If execution ever back-tracks out of the
   2693 //                    new frame, this will be where we continue from in the pattern.
   2694 //    Return
   2695 //                    The new frame pointer.
   2696 //
   2697 //--------------------------------------------------------------------------------
   2698 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
   2699     // push storage for a new frame.
   2700     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
   2701     if (newFP == NULL) {
   2702         // Failure on attempted stack expansion.
   2703         //   Stack function set some other error code, change it to a more
   2704         //   specific one for regular expressions.
   2705         status = U_REGEX_STACK_OVERFLOW;
   2706         // We need to return a writable stack frame, so just return the
   2707         //    previous frame.  The match operation will stop quickly
   2708         //    because of the error status, after which the frame will never
   2709         //    be looked at again.
   2710         return fp;
   2711     }
   2712     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
   2713 
   2714     // New stack frame = copy of old top frame.
   2715     int64_t *source = (int64_t *)fp;
   2716     int64_t *dest   = newFP;
   2717     for (;;) {
   2718         *dest++ = *source++;
   2719         if (source == newFP) {
   2720             break;
   2721         }
   2722     }
   2723 
   2724     fTickCounter--;
   2725     if (fTickCounter <= 0) {
   2726        IncrementTime(status);    // Re-initializes fTickCounter
   2727     }
   2728     fp->fPatIdx = savePatIdx;
   2729     return (REStackFrame *)newFP;
   2730 }
   2731 
   2732 
   2733 //--------------------------------------------------------------------------------
   2734 //
   2735 //   MatchAt      This is the actual matching engine.
   2736 //
   2737 //                  startIdx:    begin matching a this index.
   2738 //                  toEnd:       if true, match must extend to end of the input region
   2739 //
   2740 //--------------------------------------------------------------------------------
   2741 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
   2742     UBool       isMatch  = FALSE;      // True if the we have a match.
   2743 
   2744     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
   2745 
   2746     int32_t     op;                    // Operation from the compiled pattern, split into
   2747     int32_t     opType;                //    the opcode
   2748     int32_t     opValue;               //    and the operand value.
   2749 
   2750     #ifdef REGEX_RUN_DEBUG
   2751     if (fTraceDebug)
   2752     {
   2753         printf("MatchAt(startIdx=%ld)\n", startIdx);
   2754         printf("Original Pattern: ");
   2755         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   2756         while (c != U_SENTINEL) {
   2757             if (c<32 || c>256) {
   2758                 c = '.';
   2759             }
   2760             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   2761 
   2762             c = UTEXT_NEXT32(fPattern->fPattern);
   2763         }
   2764         printf("\n");
   2765         printf("Input String: ");
   2766         c = utext_next32From(fInputText, 0);
   2767         while (c != U_SENTINEL) {
   2768             if (c<32 || c>256) {
   2769                 c = '.';
   2770             }
   2771             printf("%c", c);
   2772 
   2773             c = UTEXT_NEXT32(fInputText);
   2774         }
   2775         printf("\n");
   2776         printf("\n");
   2777     }
   2778     #endif
   2779 
   2780     if (U_FAILURE(status)) {
   2781         return;
   2782     }
   2783 
   2784     //  Cache frequently referenced items from the compiled pattern
   2785     //
   2786     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   2787 
   2788     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   2789     UVector             *sets          = fPattern->fSets;
   2790 
   2791     fFrameSize = fPattern->fFrameSize;
   2792     REStackFrame        *fp            = resetStack();
   2793 
   2794     fp->fPatIdx   = 0;
   2795     fp->fInputIdx = startIdx;
   2796 
   2797     // Zero out the pattern's static data
   2798     int32_t i;
   2799     for (i = 0; i<fPattern->fDataSize; i++) {
   2800         fData[i] = 0;
   2801     }
   2802 
   2803     //
   2804     //  Main loop for interpreting the compiled pattern.
   2805     //  One iteration of the loop per pattern operation performed.
   2806     //
   2807     for (;;) {
   2808 #if 0
   2809         if (_heapchk() != _HEAPOK) {
   2810             fprintf(stderr, "Heap Trouble\n");
   2811         }
   2812 #endif
   2813 
   2814         op      = (int32_t)pat[fp->fPatIdx];
   2815         opType  = URX_TYPE(op);
   2816         opValue = URX_VAL(op);
   2817         #ifdef REGEX_RUN_DEBUG
   2818         if (fTraceDebug) {
   2819             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2820             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
   2821                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   2822             fPattern->dumpOp(fp->fPatIdx);
   2823         }
   2824         #endif
   2825         fp->fPatIdx++;
   2826 
   2827         switch (opType) {
   2828 
   2829 
   2830         case URX_NOP:
   2831             break;
   2832 
   2833 
   2834         case URX_BACKTRACK:
   2835             // Force a backtrack.  In some circumstances, the pattern compiler
   2836             //   will notice that the pattern can't possibly match anything, and will
   2837             //   emit one of these at that point.
   2838             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2839             break;
   2840 
   2841 
   2842         case URX_ONECHAR:
   2843             if (fp->fInputIdx < fActiveLimit) {
   2844                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2845                 UChar32 c = UTEXT_NEXT32(fInputText);
   2846                 if (c == opValue) {
   2847                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2848                     break;
   2849                 }
   2850             } else {
   2851                 fHitEnd = TRUE;
   2852             }
   2853 
   2854             #ifdef REGEX_SMART_BACKTRACKING
   2855             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   2856                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   2857                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   2858                     UBool success = FALSE;
   2859                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
   2860                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
   2861                         if (c == opValue) {
   2862                             success = TRUE;
   2863                             break;
   2864                         } else if (c == U_SENTINEL) {
   2865                             break;
   2866                         }
   2867                         c = UTEXT_PREVIOUS32(fInputText);
   2868                     }
   2869                     if (success) {
   2870                         fHitEnd = FALSE;
   2871                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2872                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2873                         if (fp->fInputIdx > backSearchIndex) {
   2874                             fp = StateSave(fp, fp->fPatIdx, status);
   2875                         }
   2876                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   2877                         break;
   2878                     }
   2879                 }
   2880             }
   2881             #endif
   2882 
   2883             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2884             break;
   2885 
   2886 
   2887         case URX_STRING:
   2888             {
   2889                 // Test input against a literal string.
   2890                 // Strings require two slots in the compiled pattern, one for the
   2891                 //   offset to the string text, and one for the length.
   2892                 int32_t   stringStartIdx = opValue;
   2893                 int32_t   stringLen;
   2894 
   2895                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   2896                 fp->fPatIdx++;
   2897                 opType    = URX_TYPE(op);
   2898                 stringLen = URX_VAL(op);
   2899                 U_ASSERT(opType == URX_STRING_LEN);
   2900                 U_ASSERT(stringLen >= 2);
   2901 
   2902                 const UChar *patternChars = litText+stringStartIdx;
   2903                 const UChar *patternEnd = patternChars+stringLen;
   2904 
   2905                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2906                 UChar32 c;
   2907                 UBool success = TRUE;
   2908 
   2909                 while (patternChars < patternEnd && success) {
   2910                     c = UTEXT_NEXT32(fInputText);
   2911 
   2912                     if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
   2913                         if (U_IS_BMP(c)) {
   2914                             success = (*patternChars == c);
   2915                             patternChars += 1;
   2916                         } else if (patternChars+1 < patternEnd) {
   2917                             success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   2918                             patternChars += 2;
   2919                         }
   2920                     } else {
   2921                         success = FALSE;
   2922                         fHitEnd = TRUE;          //   TODO:  See ticket 6074
   2923                     }
   2924                 }
   2925 
   2926                 if (success) {
   2927                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2928                 } else {
   2929                     #ifdef REGEX_SMART_BACKTRACKING
   2930                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   2931                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   2932                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   2933                             // Reset to last start point
   2934                             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2935                             patternChars = litText+stringStartIdx;
   2936 
   2937                             // Search backwards for a possible start
   2938                             do {
   2939                                 c = UTEXT_PREVIOUS32(fInputText);
   2940                                 if (c == U_SENTINEL) {
   2941                                     break;
   2942                                 } else if ((U_IS_BMP(c) && *patternChars == c) ||
   2943                                     (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   2944                                     success = TRUE;
   2945                                     break;
   2946                                 }
   2947                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   2948 
   2949                             // And try again
   2950                             if (success) {
   2951                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2952                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2953                                 if (fp->fInputIdx > backSearchIndex) {
   2954                                     fp = StateSave(fp, fp->fPatIdx, status);
   2955                                 }
   2956                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   2957                                 break;
   2958                             }
   2959                         }
   2960                     }
   2961                     #endif
   2962                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2963                 }
   2964             }
   2965             break;
   2966 
   2967 
   2968         case URX_STATE_SAVE:
   2969             fp = StateSave(fp, opValue, status);
   2970             break;
   2971 
   2972 
   2973         case URX_END:
   2974             // The match loop will exit via this path on a successful match,
   2975             //   when we reach the end of the pattern.
   2976             if (toEnd && fp->fInputIdx != fActiveLimit) {
   2977                 // The pattern matched, but not to the end of input.  Try some more.
   2978                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2979                 break;
   2980             }
   2981             isMatch = TRUE;
   2982             goto  breakFromLoop;
   2983 
   2984         // Start and End Capture stack frame variables are laid out out like this:
   2985             //  fp->fExtra[opValue]  - The start of a completed capture group
   2986             //             opValue+1 - The end   of a completed capture group
   2987             //             opValue+2 - the start of a capture group whose end
   2988             //                          has not yet been reached (and might not ever be).
   2989         case URX_START_CAPTURE:
   2990             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2991             fp->fExtra[opValue+2] = fp->fInputIdx;
   2992             break;
   2993 
   2994 
   2995         case URX_END_CAPTURE:
   2996             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2997             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   2998             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   2999             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   3000             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   3001             break;
   3002 
   3003 
   3004         case URX_DOLLAR:                   //  $, test for End of line
   3005                                            //     or for position before new line at end of input
   3006             {
   3007                 if (fp->fInputIdx >= fAnchorLimit) {
   3008                     // We really are at the end of input.  Success.
   3009                     fHitEnd = TRUE;
   3010                     fRequireEnd = TRUE;
   3011                     break;
   3012                 }
   3013 
   3014                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3015 
   3016                 // If we are positioned just before a new-line that is located at the
   3017                 //   end of input, succeed.
   3018                 UChar32 c = UTEXT_NEXT32(fInputText);
   3019                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   3020                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   3021                         // If not in the middle of a CR/LF sequence
   3022                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
   3023                             // At new-line at end of input. Success
   3024                             fHitEnd = TRUE;
   3025                             fRequireEnd = TRUE;
   3026 
   3027                             break;
   3028                         }
   3029                     }
   3030                 } else {
   3031                     UChar32 nextC = UTEXT_NEXT32(fInputText);
   3032                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   3033                         fHitEnd = TRUE;
   3034                         fRequireEnd = TRUE;
   3035                         break;                         // At CR/LF at end of input.  Success
   3036                     }
   3037                 }
   3038 
   3039                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3040             }
   3041             break;
   3042 
   3043 
   3044          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   3045             if (fp->fInputIdx >= fAnchorLimit) {
   3046                 // Off the end of input.  Success.
   3047                 fHitEnd = TRUE;
   3048                 fRequireEnd = TRUE;
   3049                 break;
   3050             } else {
   3051                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3052                 UChar32 c = UTEXT_NEXT32(fInputText);
   3053                 // Either at the last character of input, or off the end.
   3054                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
   3055                     fHitEnd = TRUE;
   3056                     fRequireEnd = TRUE;
   3057                     break;
   3058                 }
   3059             }
   3060 
   3061             // Not at end of input.  Back-track out.
   3062             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3063             break;
   3064 
   3065 
   3066          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   3067              {
   3068                  if (fp->fInputIdx >= fAnchorLimit) {
   3069                      // We really are at the end of input.  Success.
   3070                      fHitEnd = TRUE;
   3071                      fRequireEnd = TRUE;
   3072                      break;
   3073                  }
   3074                  // If we are positioned just before a new-line, succeed.
   3075                  // It makes no difference where the new-line is within the input.
   3076                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3077                  UChar32 c = UTEXT_CURRENT32(fInputText);
   3078                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   3079                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   3080                      //  In multi-line mode, hitting a new-line just before the end of input does not
   3081                      //   set the hitEnd or requireEnd flags
   3082                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
   3083                         break;
   3084                      }
   3085                  }
   3086                  // not at a new line.  Fail.
   3087                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3088              }
   3089              break;
   3090 
   3091 
   3092          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   3093              {
   3094                  if (fp->fInputIdx >= fAnchorLimit) {
   3095                      // We really are at the end of input.  Success.
   3096                      fHitEnd = TRUE;
   3097                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   3098                      break;               //   adding a new-line would not lose the match.
   3099                  }
   3100                  // If we are not positioned just before a new-line, the test fails; backtrack out.
   3101                  // It makes no difference where the new-line is within the input.
   3102                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3103                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
   3104                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3105                  }
   3106              }
   3107              break;
   3108 
   3109 
   3110        case URX_CARET:                    //  ^, test for start of line
   3111             if (fp->fInputIdx != fAnchorStart) {
   3112                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3113             }
   3114             break;
   3115 
   3116 
   3117        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   3118            {
   3119                if (fp->fInputIdx == fAnchorStart) {
   3120                    // We are at the start input.  Success.
   3121                    break;
   3122                }
   3123                // Check whether character just before the current pos is a new-line
   3124                //   unless we are at the end of input
   3125                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3126                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3127                if ((fp->fInputIdx < fAnchorLimit) &&
   3128                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3129                    //  It's a new-line.  ^ is true.  Success.
   3130                    //  TODO:  what should be done with positions between a CR and LF?
   3131                    break;
   3132                }
   3133                // Not at the start of a line.  Fail.
   3134                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3135            }
   3136            break;
   3137 
   3138 
   3139        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   3140            {
   3141                U_ASSERT(fp->fInputIdx >= fAnchorStart);
   3142                if (fp->fInputIdx <= fAnchorStart) {
   3143                    // We are at the start input.  Success.
   3144                    break;
   3145                }
   3146                // Check whether character just before the current pos is a new-line
   3147                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   3148                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3149                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3150                if (c != 0x0a) {
   3151                    // Not at the start of a line.  Back-track out.
   3152                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3153                }
   3154            }
   3155            break;
   3156 
   3157         case URX_BACKSLASH_B:          // Test for word boundaries
   3158             {
   3159                 UBool success = isWordBoundary(fp->fInputIdx);
   3160                 success ^= (opValue != 0);     // flip sense for \B
   3161                 if (!success) {
   3162                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3163                 }
   3164             }
   3165             break;
   3166 
   3167 
   3168         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   3169             {
   3170                 UBool success = isUWordBoundary(fp->fInputIdx);
   3171                 success ^= (opValue != 0);     // flip sense for \B
   3172                 if (!success) {
   3173                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3174                 }
   3175             }
   3176             break;
   3177 
   3178 
   3179         case URX_BACKSLASH_D:            // Test for decimal digit
   3180             {
   3181                 if (fp->fInputIdx >= fActiveLimit) {
   3182                     fHitEnd = TRUE;
   3183                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3184                     break;
   3185                 }
   3186 
   3187                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3188 
   3189                 UChar32 c = UTEXT_NEXT32(fInputText);
   3190                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   3191                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   3192                 success ^= (opValue != 0);        // flip sense for \D
   3193                 if (success) {
   3194                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3195                 } else {
   3196                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3197                 }
   3198             }
   3199             break;
   3200 
   3201 
   3202         case URX_BACKSLASH_G:          // Test for position at end of previous match
   3203             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   3204                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3205             }
   3206             break;
   3207 
   3208 
   3209         case URX_BACKSLASH_X:
   3210             //  Match a Grapheme, as defined by Unicode TR 29.
   3211             //  Differs slightly from Perl, which consumes combining marks independently
   3212             //    of context.
   3213             {
   3214 
   3215                 // Fail if at end of input
   3216                 if (fp->fInputIdx >= fActiveLimit) {
   3217                     fHitEnd = TRUE;
   3218                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3219                     break;
   3220                 }
   3221 
   3222                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3223 
   3224                 // Examine (and consume) the current char.
   3225                 //   Dispatch into a little state machine, based on the char.
   3226                 UChar32  c;
   3227                 c = UTEXT_NEXT32(fInputText);
   3228                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3229                 UnicodeSet **sets = fPattern->fStaticSets;
   3230                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   3231                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   3232                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3233                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3234                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3235                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3236                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3237                 goto GC_Extend;
   3238 
   3239 
   3240 
   3241 GC_L:
   3242                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3243                 c = UTEXT_NEXT32(fInputText);
   3244                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3245                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3246                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3247                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3248                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3249                 UTEXT_PREVIOUS32(fInputText);
   3250                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3251                 goto GC_Extend;
   3252 
   3253 GC_V:
   3254                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3255                 c = UTEXT_NEXT32(fInputText);
   3256                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3257                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3258                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3259                 UTEXT_PREVIOUS32(fInputText);
   3260                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3261                 goto GC_Extend;
   3262 
   3263 GC_T:
   3264                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3265                 c = UTEXT_NEXT32(fInputText);
   3266                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3267                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3268                 UTEXT_PREVIOUS32(fInputText);
   3269                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3270                 goto GC_Extend;
   3271 
   3272 GC_Extend:
   3273                 // Combining characters are consumed here
   3274                 for (;;) {
   3275                     if (fp->fInputIdx >= fActiveLimit) {
   3276                         break;
   3277                     }
   3278                     c = UTEXT_CURRENT32(fInputText);
   3279                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   3280                         break;
   3281                     }
   3282                     UTEXT_NEXT32(fInputText);
   3283                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3284                 }
   3285                 goto GC_Done;
   3286 
   3287 GC_Control:
   3288                 // Most control chars stand alone (don't combine with combining chars),
   3289                 //   except for that CR/LF sequence is a single grapheme cluster.
   3290                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
   3291                     c = UTEXT_NEXT32(fInputText);
   3292                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3293                 }
   3294 
   3295 GC_Done:
   3296                 if (fp->fInputIdx >= fActiveLimit) {
   3297                     fHitEnd = TRUE;
   3298                 }
   3299                 break;
   3300             }
   3301 
   3302 
   3303 
   3304 
   3305         case URX_BACKSLASH_Z:          // Test for end of Input
   3306             if (fp->fInputIdx < fAnchorLimit) {
   3307                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3308             } else {
   3309                 fHitEnd = TRUE;
   3310                 fRequireEnd = TRUE;
   3311             }
   3312             break;
   3313 
   3314 
   3315 
   3316         case URX_STATIC_SETREF:
   3317             {
   3318                 // Test input character against one of the predefined sets
   3319                 //    (Word Characters, for example)
   3320                 // The high bit of the op value is a flag for the match polarity.
   3321                 //    0:   success if input char is in set.
   3322                 //    1:   success if input char is not in set.
   3323                 if (fp->fInputIdx >= fActiveLimit) {
   3324                     fHitEnd = TRUE;
   3325                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3326                     break;
   3327                 }
   3328 
   3329                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   3330                 opValue &= ~URX_NEG_SET;
   3331                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3332 
   3333                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3334                 UChar32 c = UTEXT_NEXT32(fInputText);
   3335                 if (c < 256) {
   3336                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3337                     if (s8->contains(c)) {
   3338                         success = !success;
   3339                     }
   3340                 } else {
   3341                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3342                     if (s->contains(c)) {
   3343                         success = !success;
   3344                     }
   3345                 }
   3346                 if (success) {
   3347                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3348                 } else {
   3349                     // the character wasn't in the set.
   3350                     #ifdef REGEX_SMART_BACKTRACKING
   3351                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3352                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3353                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3354                             // Try to find it, backwards
   3355                             UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3356                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
   3357                             do {
   3358                                 c = UTEXT_PREVIOUS32(fInputText);
   3359                                 if (c == U_SENTINEL) {
   3360                                     break;
   3361                                 } else if (c < 256) {
   3362                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3363                                     if (s8->contains(c)) {
   3364                                         success = !success;
   3365                                     }
   3366                                 } else {
   3367                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3368                                     if (s->contains(c)) {
   3369                                         success = !success;
   3370                                     }
   3371                                 }
   3372                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
   3373 
   3374                             if (success && c != U_SENTINEL) {
   3375                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3376                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3377                                 if (fp->fInputIdx > backSearchIndex) {
   3378                                     fp = StateSave(fp, fp->fPatIdx, status);
   3379                                 }
   3380                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3381                                 break;
   3382                             }
   3383                         }
   3384                     }
   3385                     #endif
   3386                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3387                 }
   3388             }
   3389             break;
   3390 
   3391 
   3392         case URX_STAT_SETREF_N:
   3393             {
   3394                 // Test input character for NOT being a member of  one of
   3395                 //    the predefined sets (Word Characters, for example)
   3396                 if (fp->fInputIdx >= fActiveLimit) {
   3397                     fHitEnd = TRUE;
   3398                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3399                     break;
   3400                 }
   3401 
   3402                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3403 
   3404                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3405 
   3406                 UChar32 c = UTEXT_NEXT32(fInputText);
   3407                 if (c < 256) {
   3408                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3409                     if (s8->contains(c) == FALSE) {
   3410                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3411                         break;
   3412                     }
   3413                 } else {
   3414                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3415                     if (s->contains(c) == FALSE) {
   3416                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3417                         break;
   3418                     }
   3419                 }
   3420                 // the character wasn't in the set.
   3421                 #ifdef REGEX_SMART_BACKTRACKING
   3422                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3423                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3424                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3425                         // Try to find it, backwards
   3426                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3427                         UBool success = FALSE;
   3428                         do {
   3429                             c = UTEXT_PREVIOUS32(fInputText);
   3430                             if (c == U_SENTINEL) {
   3431                                 break;
   3432                             } else if (c < 256) {
   3433                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3434                                 if (s8->contains(c) == FALSE) {
   3435                                     success = TRUE;
   3436                                     break;
   3437                                 }
   3438                             } else {
   3439                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3440                                 if (s->contains(c) == FALSE) {
   3441                                     success = TRUE;
   3442                                     break;
   3443                                 }
   3444                             }
   3445                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   3446 
   3447                         if (success) {
   3448                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3449                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3450                             if (fp->fInputIdx > backSearchIndex) {
   3451                                 fp = StateSave(fp, fp->fPatIdx, status);
   3452                             }
   3453                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3454                             break;
   3455                         }
   3456                     }
   3457                 }
   3458                 #endif
   3459                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3460             }
   3461             break;
   3462 
   3463 
   3464         case URX_SETREF:
   3465             if (fp->fInputIdx >= fActiveLimit) {
   3466                 fHitEnd = TRUE;
   3467                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3468                 break;
   3469             } else {
   3470                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3471 
   3472                 // There is input left.  Pick up one char and test it for set membership.
   3473                 UChar32 c = UTEXT_NEXT32(fInputText);
   3474                 U_ASSERT(opValue > 0 && opValue < sets->size());
   3475                 if (c<256) {
   3476                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3477                     if (s8->contains(c)) {
   3478                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3479                         break;
   3480                     }
   3481                 } else {
   3482                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3483                     if (s->contains(c)) {
   3484                         // The character is in the set.  A Match.
   3485                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3486                         break;
   3487                     }
   3488                 }
   3489 
   3490                 // the character wasn't in the set.
   3491                 #ifdef REGEX_SMART_BACKTRACKING
   3492                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3493                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3494                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3495                         // Try to find it, backwards
   3496                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3497                         UBool success = FALSE;
   3498                         do {
   3499                             c = UTEXT_PREVIOUS32(fInputText);
   3500                             if (c == U_SENTINEL) {
   3501                                 break;
   3502                             } else if (c < 256) {
   3503                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3504                                 if (s8->contains(c)) {
   3505                                     success = TRUE;
   3506                                     break;
   3507                                 }
   3508                             } else {
   3509                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3510                                 if (s->contains(c)) {
   3511                                     success = TRUE;
   3512                                     break;
   3513                                 }
   3514                             }
   3515                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   3516 
   3517                         if (success) {
   3518                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3519                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3520                             if (fp->fInputIdx > backSearchIndex) {
   3521                                 fp = StateSave(fp, fp->fPatIdx, status);
   3522                             }
   3523                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3524                             break;
   3525                         }
   3526                     }
   3527                 }
   3528                 #endif
   3529                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3530             }
   3531             break;
   3532 
   3533 
   3534         case URX_DOTANY:
   3535             {
   3536                 // . matches anything, but stops at end-of-line.
   3537                 if (fp->fInputIdx >= fActiveLimit) {
   3538                     // At end of input.  Match failed.  Backtrack out.
   3539                     fHitEnd = TRUE;
   3540                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3541                     break;
   3542                 }
   3543 
   3544                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3545 
   3546                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3547                 UChar32 c = UTEXT_NEXT32(fInputText);
   3548                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   3549                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3550                     // End of line in normal mode.   . does not match.
   3551                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3552                     break;
   3553                 }
   3554                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3555             }
   3556             break;
   3557 
   3558 
   3559         case URX_DOTANY_ALL:
   3560             {
   3561                 // ., in dot-matches-all (including new lines) mode
   3562                 if (fp->fInputIdx >= fActiveLimit) {
   3563                     // At end of input.  Match failed.  Backtrack out.
   3564                     fHitEnd = TRUE;
   3565                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3566                     break;
   3567                 }
   3568 
   3569                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3570 
   3571                 // There is input left.  Advance over one char, except if we are
   3572                 //   at a cr/lf, advance over both of them.
   3573                 UChar32 c;
   3574                 c = UTEXT_NEXT32(fInputText);
   3575                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3576                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   3577                     // In the case of a CR/LF, we need to advance over both.
   3578                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
   3579                     if (nextc == 0x0a) {
   3580                         UTEXT_NEXT32(fInputText);
   3581                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3582                     }
   3583                 }
   3584             }
   3585             break;
   3586 
   3587 
   3588         case URX_DOTANY_UNIX:
   3589             {
   3590                 // '.' operator, matches all, but stops at end-of-line.
   3591                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   3592                 if (fp->fInputIdx >= fActiveLimit) {
   3593                     // At end of input.  Match failed.  Backtrack out.
   3594                     fHitEnd = TRUE;
   3595                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3596                     break;
   3597                 }
   3598 
   3599                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3600 
   3601                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3602                 UChar32 c = UTEXT_NEXT32(fInputText);
   3603                 if (c == 0x0a) {
   3604                     // End of line in normal mode.   '.' does not match the \n
   3605                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3606                 } else {
   3607                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3608                 }
   3609             }
   3610             break;
   3611 
   3612 
   3613         case URX_JMP:
   3614             fp->fPatIdx = opValue;
   3615             break;
   3616 
   3617         case URX_FAIL:
   3618             isMatch = FALSE;
   3619             goto breakFromLoop;
   3620 
   3621         case URX_JMP_SAV:
   3622             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   3623             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   3624             fp->fPatIdx = opValue;                         // Then JMP.
   3625             break;
   3626 
   3627         case URX_JMP_SAV_X:
   3628             // This opcode is used with (x)+, when x can match a zero length string.
   3629             // Same as JMP_SAV, except conditional on the match having made forward progress.
   3630             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   3631             //   data address of the input position at the start of the loop.
   3632             {
   3633                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   3634                 int32_t  stoOp = (int32_t)pat[opValue-1];
   3635                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   3636                 int32_t  frameLoc = URX_VAL(stoOp);
   3637                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   3638                 int64_t prevInputIdx = fp->fExtra[frameLoc];
   3639                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   3640                 if (prevInputIdx < fp->fInputIdx) {
   3641                     // The match did make progress.  Repeat the loop.
   3642                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   3643                     fp->fPatIdx = opValue;
   3644                     fp->fExtra[frameLoc] = fp->fInputIdx;
   3645                 }
   3646                 // If the input position did not advance, we do nothing here,
   3647                 //   execution will fall out of the loop.
   3648             }
   3649             break;
   3650 
   3651         case URX_CTR_INIT:
   3652             {
   3653                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3654                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   3655 
   3656                 // Pick up the three extra operands that CTR_INIT has, and
   3657                 //    skip the pattern location counter past
   3658                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3659                 fp->fPatIdx += 3;
   3660                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3661                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3662                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3663                 U_ASSERT(minCount>=0);
   3664                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3665                 U_ASSERT(loopLoc>fp->fPatIdx);
   3666 
   3667                 if (minCount == 0) {
   3668                     fp = StateSave(fp, loopLoc+1, status);
   3669                 }
   3670                 if (maxCount == 0) {
   3671                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3672                 }
   3673             }
   3674             break;
   3675 
   3676         case URX_CTR_LOOP:
   3677             {
   3678                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3679                 int32_t initOp = (int32_t)pat[opValue];
   3680                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   3681                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3682                 int32_t minCount  = (int32_t)pat[opValue+2];
   3683                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3684                 // Increment the counter.  Note: we DIDN'T worry about counter
   3685                 //   overflow, since the data comes from UnicodeStrings, which
   3686                 //   stores its length in an int32_t. Do we have to think about
   3687                 //   this now that we're using UText? Probably not, since the length
   3688                 //   in UChar32s is still an int32_t.
   3689                 (*pCounter)++;
   3690                 U_ASSERT(*pCounter > 0);
   3691                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   3692                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   3693                     break;
   3694                 }
   3695                 if (*pCounter >= minCount) {
   3696                     fp = StateSave(fp, fp->fPatIdx, status);
   3697                 }
   3698                 fp->fPatIdx = opValue + 4;    // Loop back.
   3699             }
   3700             break;
   3701 
   3702         case URX_CTR_INIT_NG:
   3703             {
   3704                 // Initialize a non-greedy loop
   3705                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3706                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   3707 
   3708                 // Pick up the three extra operands that CTR_INIT has, and
   3709                 //    skip the pattern location counter past
   3710                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3711                 fp->fPatIdx += 3;
   3712                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3713                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3714                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3715                 U_ASSERT(minCount>=0);
   3716                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3717                 U_ASSERT(loopLoc>fp->fPatIdx);
   3718 
   3719                 if (minCount == 0) {
   3720                     if (maxCount != 0) {
   3721                         fp = StateSave(fp, fp->fPatIdx, status);
   3722                     }
   3723                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   3724                 }
   3725             }
   3726             break;
   3727 
   3728         case URX_CTR_LOOP_NG:
   3729             {
   3730                 // Non-greedy {min, max} loops
   3731                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3732                 int32_t initOp = (int32_t)pat[opValue];
   3733                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   3734                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3735                 int32_t minCount  = (int32_t)pat[opValue+2];
   3736                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3737                 // Increment the counter.  Note: we DIDN'T worry about counter
   3738                 //   overflow, since the data comes from UnicodeStrings, which
   3739                 //   stores its length in an int32_t. Do we have to think about
   3740                 //   this now that we're using UText? Probably not, since the length
   3741                 //   in UChar32s is still an int32_t.
   3742                 (*pCounter)++;
   3743                 U_ASSERT(*pCounter > 0);
   3744 
   3745                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   3746                     // The loop has matched the maximum permitted number of times.
   3747                     //   Break out of here with no action.  Matching will
   3748                     //   continue with the following pattern.
   3749                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   3750                     break;
   3751                 }
   3752 
   3753                 if (*pCounter < minCount) {
   3754                     // We haven't met the minimum number of matches yet.
   3755                     //   Loop back for another one.
   3756                     fp->fPatIdx = opValue + 4;    // Loop back.
   3757                 } else {
   3758                     // We do have the minimum number of matches.
   3759                     //   Fall into the following pattern, but first do
   3760                     //   a state save to the top of the loop, so that a failure
   3761                     //   in the following pattern will try another iteration of the loop.
   3762                     fp = StateSave(fp, opValue + 4, status);
   3763                 }
   3764             }
   3765             break;
   3766 
   3767         case URX_STO_SP:
   3768             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3769             fData[opValue] = fStack->size();
   3770             break;
   3771 
   3772         case URX_LD_SP:
   3773             {
   3774                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3775                 int32_t newStackSize = (int32_t)fData[opValue];
   3776                 U_ASSERT(newStackSize <= fStack->size());
   3777                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3778                 if (newFP == (int64_t *)fp) {
   3779                     break;
   3780                 }
   3781                 int32_t i;
   3782                 for (i=0; i<fFrameSize; i++) {
   3783                     newFP[i] = ((int64_t *)fp)[i];
   3784                 }
   3785                 fp = (REStackFrame *)newFP;
   3786                 fStack->setSize(newStackSize);
   3787             }
   3788             break;
   3789 
   3790         case URX_BACKREF:
   3791         case URX_BACKREF_I:
   3792             {
   3793                 U_ASSERT(opValue < fFrameSize);
   3794                 int64_t groupStartIdx = fp->fExtra[opValue];
   3795                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3796                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3797                 if (groupStartIdx < 0) {
   3798                     // This capture group has not participated in the match thus far,
   3799                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3800                 }
   3801 
   3802                 if (groupEndIdx == groupStartIdx) {
   3803                     //   The capture group match was of an empty string.
   3804                     //   Verified by testing:  Perl matches succeed in this case, so
   3805                     //   we do too.
   3806                     break;
   3807                 }
   3808 
   3809                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
   3810                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3811 
   3812                 UBool haveMatch = (opType == URX_BACKREF ?
   3813                     (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
   3814                     (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
   3815                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3816 
   3817                 if (fp->fInputIdx > fActiveLimit) {
   3818                     fHitEnd = TRUE;
   3819                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3820                 } else if (!haveMatch) {
   3821                     if (fp->fInputIdx == fActiveLimit) {
   3822                         fHitEnd = TRUE;
   3823                     }
   3824                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3825                 }
   3826             }
   3827             break;
   3828 
   3829         case URX_STO_INP_LOC:
   3830             {
   3831                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   3832                 fp->fExtra[opValue] = fp->fInputIdx;
   3833             }
   3834             break;
   3835 
   3836         case URX_JMPX:
   3837             {
   3838                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3839                 fp->fPatIdx += 1;
   3840                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   3841                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   3842                 int64_t savedInputIdx = fp->fExtra[dataLoc];
   3843                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   3844                 if (savedInputIdx < fp->fInputIdx) {
   3845                     fp->fPatIdx = opValue;                               // JMP
   3846                 } else {
   3847                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   3848                 }
   3849             }
   3850             break;
   3851 
   3852         case URX_LA_START:
   3853             {
   3854                 // Entering a lookahead block.
   3855                 // Save Stack Ptr, Input Pos.
   3856                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3857                 fData[opValue]   = fStack->size();
   3858                 fData[opValue+1] = fp->fInputIdx;
   3859                 fActiveStart     = fLookStart;          // Set the match region change for
   3860                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   3861             }
   3862             break;
   3863 
   3864         case URX_LA_END:
   3865             {
   3866                 // Leaving a look-ahead block.
   3867                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   3868                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3869                 int32_t stackSize = fStack->size();
   3870                 int32_t newStackSize =(int32_t)fData[opValue];
   3871                 U_ASSERT(stackSize >= newStackSize);
   3872                 if (stackSize > newStackSize) {
   3873                     // Copy the current top frame back to the new (cut back) top frame.
   3874                     //   This makes the capture groups from within the look-ahead
   3875                     //   expression available.
   3876                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3877                     int32_t i;
   3878                     for (i=0; i<fFrameSize; i++) {
   3879                         newFP[i] = ((int64_t *)fp)[i];
   3880                     }
   3881                     fp = (REStackFrame *)newFP;
   3882                     fStack->setSize(newStackSize);
   3883                 }
   3884                 fp->fInputIdx = fData[opValue+1];
   3885 
   3886                 // Restore the active region bounds in the input string; they may have
   3887                 //    been changed because of transparent bounds on a Region.
   3888                 fActiveStart = fRegionStart;
   3889                 fActiveLimit = fRegionLimit;
   3890             }
   3891             break;
   3892 
   3893         case URX_ONECHAR_I:
   3894             if (fp->fInputIdx < fActiveLimit) {
   3895                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3896 
   3897                 UChar32 c = UTEXT_NEXT32(fInputText);
   3898                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3899                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3900                     break;
   3901                 }
   3902             } else {
   3903                 fHitEnd = TRUE;
   3904             }
   3905 
   3906             #ifdef REGEX_SMART_BACKTRACKING
   3907             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3908                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3909                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3910                     UBool success = FALSE;
   3911                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
   3912                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
   3913                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3914                             success = TRUE;
   3915                             break;
   3916                         } else if (c == U_SENTINEL) {
   3917                             break;
   3918                         }
   3919                         c = UTEXT_PREVIOUS32(fInputText);
   3920                     }
   3921                     if (success) {
   3922                         fHitEnd = FALSE;
   3923                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3924                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3925                         if (fp->fInputIdx > backSearchIndex) {
   3926                             fp = StateSave(fp, fp->fPatIdx, status);
   3927                         }
   3928                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3929                         break;
   3930                     }
   3931                 }
   3932             }
   3933             #endif
   3934 
   3935             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3936             break;
   3937 
   3938         case URX_STRING_I:
   3939             {
   3940                 // Test input against a literal string.
   3941                 // Strings require two slots in the compiled pattern, one for the
   3942                 //   offset to the string text, and one for the length.
   3943                 const UCaseProps *csp = ucase_getSingleton();
   3944                 {
   3945                     int32_t stringStartIdx, stringLen;
   3946                     stringStartIdx = opValue;
   3947 
   3948                     op      = (int32_t)pat[fp->fPatIdx];
   3949                     fp->fPatIdx++;
   3950                     opType  = URX_TYPE(op);
   3951                     opValue = URX_VAL(op);
   3952                     U_ASSERT(opType == URX_STRING_LEN);
   3953                     stringLen = opValue;
   3954 
   3955                     const UChar *patternChars = litText+stringStartIdx;
   3956                     const UChar *patternEnd = patternChars+stringLen;
   3957 
   3958                     const UChar *foldChars = NULL;
   3959                     int32_t foldOffset, foldLength;
   3960                     UChar32 c;
   3961 
   3962                     foldOffset = foldLength = 0;
   3963                     UBool success = TRUE;
   3964 
   3965                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3966                     while (patternChars < patternEnd && success) {
   3967                         if(foldOffset < foldLength) {
   3968                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   3969                         } else {
   3970                             c = UTEXT_NEXT32(fInputText);
   3971                             if (c != U_SENTINEL) {
   3972                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   3973                                 if(foldLength >= 0) {
   3974                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   3975                                         foldOffset = 0;
   3976                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   3977                                     } else {
   3978                                         c = foldLength;
   3979                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
   3980                                     }
   3981                                 }
   3982                             }
   3983 
   3984                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3985                         }
   3986 
   3987                         success = FALSE;
   3988                         if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
   3989                             if (U_IS_BMP(c)) {
   3990                                 success = (*patternChars == c);
   3991                                 patternChars += 1;
   3992                             } else if (patternChars+1 < patternEnd) {
   3993                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   3994                                 patternChars += 2;
   3995                             }
   3996                         } else {
   3997                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
   3998                         }
   3999                     }
   4000 
   4001                     if (!success) {
   4002                         #ifdef REGEX_SMART_BACKTRACKING
   4003                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   4004                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4005                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4006                                 // Reset to last start point
   4007                                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4008                                 patternChars = litText+stringStartIdx;
   4009 
   4010                                 // Search backwards for a possible start
   4011                                 do {
   4012                                     c = UTEXT_PREVIOUS32(fInputText);
   4013                                     if (c == U_SENTINEL) {
   4014                                         break;
   4015                                     } else {
   4016                                         foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   4017                                         if(foldLength >= 0) {
   4018                                             if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   4019                                                 foldOffset = 0;
   4020                                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   4021                                             } else {
   4022                                                 c = foldLength;
   4023                                                 foldLength = foldOffset; // to avoid reading chars from the folding buffer
   4024                                             }
   4025                                         }
   4026 
   4027                                         if ((U_IS_BMP(c) && *patternChars == c) ||
   4028                                                (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   4029                                             success = TRUE;
   4030                                             break;
   4031                                         }
   4032                                     }
   4033                                 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   4034 
   4035                                 // And try again
   4036                                 if (success) {
   4037                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4038                                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4039                                     if (fp->fInputIdx > backSearchIndex) {
   4040                                         fp = StateSave(fp, fp->fPatIdx, status);
   4041                                     }
   4042                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4043                                     break;
   4044                                 }
   4045                             }
   4046                         }
   4047                         #endif
   4048                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4049                     }
   4050                 }
   4051             }
   4052             break;
   4053 
   4054         case URX_LB_START:
   4055             {
   4056                 // Entering a look-behind block.
   4057                 // Save Stack Ptr, Input Pos.
   4058                 //   TODO:  implement transparent bounds.  Ticket #6067
   4059                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4060                 fData[opValue]   = fStack->size();
   4061                 fData[opValue+1] = fp->fInputIdx;
   4062                 // Init the variable containing the start index for attempted matches.
   4063                 fData[opValue+2] = -1;
   4064                 // Save input string length, then reset to pin any matches to end at
   4065                 //   the current position.
   4066                 fData[opValue+3] = fActiveLimit;
   4067                 fActiveLimit     = fp->fInputIdx;
   4068             }
   4069             break;
   4070 
   4071 
   4072         case URX_LB_CONT:
   4073             {
   4074                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   4075                 //    at all possible input starting positions.
   4076 
   4077                 // Fetch the min and max possible match lengths.  They are the operands
   4078                 //   of this op in the pattern.
   4079                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   4080                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   4081                 U_ASSERT(minML <= maxML);
   4082                 U_ASSERT(minML >= 0);
   4083 
   4084                 // Fetch (from data) the last input index where a match was attempted.
   4085                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4086                 int64_t  *lbStartIdx = &fData[opValue+2];
   4087                 if (*lbStartIdx < 0) {
   4088                     // First time through loop.
   4089                     *lbStartIdx = fp->fInputIdx - minML;
   4090                 } else {
   4091                     // 2nd through nth time through the loop.
   4092                     // Back up start position for match by one.
   4093                     if (*lbStartIdx == 0) {
   4094                         (*lbStartIdx)--;
   4095                     } else {
   4096                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   4097                         UTEXT_PREVIOUS32(fInputText);
   4098                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4099                     }
   4100                 }
   4101 
   4102                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   4103                     // We have tried all potential match starting points without
   4104                     //  getting a match.  Backtrack out, and out of the
   4105                     //   Look Behind altogether.
   4106                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4107                     int64_t restoreInputLen = fData[opValue+3];
   4108                     U_ASSERT(restoreInputLen >= fActiveLimit);
   4109                     U_ASSERT(restoreInputLen <= fInputLength);
   4110                     fActiveLimit = restoreInputLen;
   4111                     break;
   4112                 }
   4113 
   4114                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   4115                 //      (successful match will fall off the end of the loop.)
   4116                 fp = StateSave(fp, fp->fPatIdx-3, status);
   4117                 fp->fInputIdx = *lbStartIdx;
   4118             }
   4119             break;
   4120 
   4121         case URX_LB_END:
   4122             // End of a look-behind block, after a successful match.
   4123             {
   4124                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4125                 if (fp->fInputIdx != fActiveLimit) {
   4126                     //  The look-behind expression matched, but the match did not
   4127                     //    extend all the way to the point that we are looking behind from.
   4128                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4129                     //     will retry the match starting at another position or fail
   4130                     //     the look-behind altogether, whichever is appropriate.
   4131                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4132                     break;
   4133                 }
   4134 
   4135                 // Look-behind match is good.  Restore the orignal input string length,
   4136                 //   which had been truncated to pin the end of the lookbehind match to the
   4137                 //   position being looked-behind.
   4138                 int64_t originalInputLen = fData[opValue+3];
   4139                 U_ASSERT(originalInputLen >= fActiveLimit);
   4140                 U_ASSERT(originalInputLen <= fInputLength);
   4141                 fActiveLimit = originalInputLen;
   4142             }
   4143             break;
   4144 
   4145 
   4146         case URX_LBN_CONT:
   4147             {
   4148                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   4149                 //    at all possible input starting positions.
   4150 
   4151                 // Fetch the extra parameters of this op.
   4152                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   4153                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   4154                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   4155                         continueLoc = URX_VAL(continueLoc);
   4156                 U_ASSERT(minML <= maxML);
   4157                 U_ASSERT(minML >= 0);
   4158                 U_ASSERT(continueLoc > fp->fPatIdx);
   4159 
   4160                 // Fetch (from data) the last input index where a match was attempted.
   4161                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4162                 int64_t  *lbStartIdx = &fData[opValue+2];
   4163                 if (*lbStartIdx < 0) {
   4164                     // First time through loop.
   4165                     *lbStartIdx = fp->fInputIdx - minML;
   4166                 } else {
   4167                     // 2nd through nth time through the loop.
   4168                     // Back up start position for match by one.
   4169                     if (*lbStartIdx == 0) {
   4170                         (*lbStartIdx)--;
   4171                     } else {
   4172                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   4173                         UTEXT_PREVIOUS32(fInputText);
   4174                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4175                     }
   4176                 }
   4177 
   4178                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   4179                     // We have tried all potential match starting points without
   4180                     //  getting a match, which means that the negative lookbehind as
   4181                     //  a whole has succeeded.  Jump forward to the continue location
   4182                     int64_t restoreInputLen = fData[opValue+3];
   4183                     U_ASSERT(restoreInputLen >= fActiveLimit);
   4184                     U_ASSERT(restoreInputLen <= fInputLength);
   4185                     fActiveLimit = restoreInputLen;
   4186                     fp->fPatIdx = continueLoc;
   4187                     break;
   4188                 }
   4189 
   4190                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   4191                 //      (successful match will cause a FAIL out of the loop altogether.)
   4192                 fp = StateSave(fp, fp->fPatIdx-4, status);
   4193                 fp->fInputIdx = *lbStartIdx;
   4194             }
   4195             break;
   4196 
   4197         case URX_LBN_END:
   4198             // End of a negative look-behind block, after a successful match.
   4199             {
   4200                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4201                 if (fp->fInputIdx != fActiveLimit) {
   4202                     //  The look-behind expression matched, but the match did not
   4203                     //    extend all the way to the point that we are looking behind from.
   4204                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4205                     //     will retry the match starting at another position or succeed
   4206                     //     the look-behind altogether, whichever is appropriate.
   4207                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4208                     break;
   4209                 }
   4210 
   4211                 // Look-behind expression matched, which means look-behind test as
   4212                 //   a whole Fails
   4213 
   4214                 //   Restore the orignal input string length, which had been truncated
   4215                 //   inorder to pin the end of the lookbehind match
   4216                 //   to the position being looked-behind.
   4217                 int64_t originalInputLen = fData[opValue+3];
   4218                 U_ASSERT(originalInputLen >= fActiveLimit);
   4219                 U_ASSERT(originalInputLen <= fInputLength);
   4220                 fActiveLimit = originalInputLen;
   4221 
   4222                 // Restore original stack position, discarding any state saved
   4223                 //   by the successful pattern match.
   4224                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4225                 int32_t newStackSize = (int32_t)fData[opValue];
   4226                 U_ASSERT(fStack->size() > newStackSize);
   4227                 fStack->setSize(newStackSize);
   4228 
   4229                 //  FAIL, which will take control back to someplace
   4230                 //  prior to entering the look-behind test.
   4231                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4232             }
   4233             break;
   4234 
   4235 
   4236         case URX_LOOP_SR_I:
   4237             // Loop Initialization for the optimized implementation of
   4238             //     [some character set]*
   4239             //   This op scans through all matching input.
   4240             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4241             {
   4242                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4243                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4244                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   4245 
   4246                 // Loop through input, until either the input is exhausted or
   4247                 //   we reach a character that is not a member of the set.
   4248                 int64_t ix = fp->fInputIdx;
   4249                 UTEXT_SETNATIVEINDEX(fInputText, ix);
   4250                 for (;;) {
   4251                     if (ix >= fActiveLimit) {
   4252                         fHitEnd = TRUE;
   4253                         break;
   4254                     }
   4255                     UChar32 c = UTEXT_NEXT32(fInputText);
   4256                     if (c<256) {
   4257                         if (s8->contains(c) == FALSE) {
   4258                             break;
   4259                         }
   4260                     } else {
   4261                         if (s->contains(c) == FALSE) {
   4262                             break;
   4263                         }
   4264                     }
   4265                     ix = UTEXT_GETNATIVEINDEX(fInputText);
   4266                 }
   4267 
   4268                 // If there were no matching characters, skip over the loop altogether.
   4269                 //   The loop doesn't run at all, a * op always succeeds.
   4270                 if (ix == fp->fInputIdx) {
   4271                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4272                     break;
   4273                 }
   4274 
   4275                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4276                 //   must follow.  It's operand is the stack location
   4277                 //   that holds the starting input index for the match of this [set]*
   4278                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4279                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4280                 int32_t stackLoc = URX_VAL(loopcOp);
   4281                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4282                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4283                 #ifdef REGEX_SMART_BACKTRACKING
   4284                 backSearchIndex = fp->fInputIdx;
   4285                 #endif
   4286                 fp->fInputIdx = ix;
   4287 
   4288                 // Save State to the URX_LOOP_C op that follows this one,
   4289                 //   so that match failures in the following code will return to there.
   4290                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4291                 fp = StateSave(fp, fp->fPatIdx, status);
   4292                 fp->fPatIdx++;
   4293             }
   4294             break;
   4295 
   4296 
   4297         case URX_LOOP_DOT_I:
   4298             // Loop Initialization for the optimized implementation of .*
   4299             //   This op scans through all remaining input.
   4300             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4301             {
   4302                 // Loop through input until the input is exhausted (we reach an end-of-line)
   4303                 // In DOTALL mode, we can just go straight to the end of the input.
   4304                 int64_t ix;
   4305                 if ((opValue & 1) == 1) {
   4306                     // Dot-matches-All mode.  Jump straight to the end of the string.
   4307                     ix = fActiveLimit;
   4308                     fHitEnd = TRUE;
   4309                 } else {
   4310                     // NOT DOT ALL mode.  Line endings do not match '.'
   4311                     // Scan forward until a line ending or end of input.
   4312                     ix = fp->fInputIdx;
   4313                     UTEXT_SETNATIVEINDEX(fInputText, ix);
   4314                     for (;;) {
   4315                         if (ix >= fActiveLimit) {
   4316                             fHitEnd = TRUE;
   4317                             break;
   4318                         }
   4319                         UChar32 c = UTEXT_NEXT32(fInputText);
   4320                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   4321                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   4322                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   4323                                     (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
   4324                                 //  char is a line ending.  Exit the scanning loop.
   4325                                 break;
   4326                             }
   4327                         }
   4328                         ix = UTEXT_GETNATIVEINDEX(fInputText);
   4329                     }
   4330                 }
   4331 
   4332                 // If there were no matching characters, skip over the loop altogether.
   4333                 //   The loop doesn't run at all, a * op always succeeds.
   4334                 if (ix == fp->fInputIdx) {
   4335                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4336                     break;
   4337                 }
   4338 
   4339                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4340                 //   must follow.  It's operand is the stack location
   4341                 //   that holds the starting input index for the match of this .*
   4342                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4343                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4344                 int32_t stackLoc = URX_VAL(loopcOp);
   4345                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4346                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4347                 #ifdef REGEX_SMART_BACKTRACKING
   4348                 backSearchIndex = fp->fInputIdx;
   4349                 #endif
   4350                 fp->fInputIdx = ix;
   4351 
   4352                 // Save State to the URX_LOOP_C op that follows this one,
   4353                 //   so that match failures in the following code will return to there.
   4354                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4355                 fp = StateSave(fp, fp->fPatIdx, status);
   4356                 fp->fPatIdx++;
   4357             }
   4358             break;
   4359 
   4360 
   4361         case URX_LOOP_C:
   4362             {
   4363                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   4364                 backSearchIndex = fp->fExtra[opValue];
   4365                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   4366                 if (backSearchIndex == fp->fInputIdx) {
   4367                     // We've backed up the input idx to the point that the loop started.
   4368                     // The loop is done.  Leave here without saving state.
   4369                     //  Subsequent failures won't come back here.
   4370                     break;
   4371                 }
   4372                 // Set up for the next iteration of the loop, with input index
   4373                 //   backed up by one from the last time through,
   4374                 //   and a state save to this instruction in case the following code fails again.
   4375                 //   (We're going backwards because this loop emulates stack unwinding, not
   4376                 //    the initial scan forward.)
   4377                 U_ASSERT(fp->fInputIdx > 0);
   4378                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4379                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
   4380                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4381 
   4382                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
   4383                 if (prevC == 0x0a &&
   4384                     fp->fInputIdx > backSearchIndex &&
   4385                     twoPrevC == 0x0d) {
   4386                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   4387                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   4388                         // .*, stepping back over CRLF pair.
   4389                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4390                     }
   4391                 }
   4392 
   4393 
   4394                 fp = StateSave(fp, fp->fPatIdx-1, status);
   4395             }
   4396             break;
   4397 
   4398 
   4399 
   4400         default:
   4401             // Trouble.  The compiled pattern contains an entry with an
   4402             //           unrecognized type tag.
   4403             U_ASSERT(FALSE);
   4404         }
   4405 
   4406         if (U_FAILURE(status)) {
   4407             isMatch = FALSE;
   4408             break;
   4409         }
   4410     }
   4411 
   4412 breakFromLoop:
   4413     fMatch = isMatch;
   4414     if (isMatch) {
   4415         fLastMatchEnd = fMatchEnd;
   4416         fMatchStart   = startIdx;
   4417         fMatchEnd     = fp->fInputIdx;
   4418         if (fTraceDebug) {
   4419             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
   4420         }
   4421     }
   4422     else
   4423     {
   4424         if (fTraceDebug) {
   4425             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   4426         }
   4427     }
   4428 
   4429     fFrame = fp;                // The active stack frame when the engine stopped.
   4430                                 //   Contains the capture group results that we need to
   4431                                 //    access later.
   4432     return;
   4433 }
   4434 
   4435 
   4436 //--------------------------------------------------------------------------------
   4437 //
   4438 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
   4439 //                  assumption that the entire string is available in the UText's
   4440 //                  chunk buffer. For now, that means we can use int32_t indexes,
   4441 //                  except for anything that needs to be saved (like group starts
   4442 //                  and ends).
   4443 //
   4444 //                  startIdx:    begin matching a this index.
   4445 //                  toEnd:       if true, match must extend to end of the input region
   4446 //
   4447 //--------------------------------------------------------------------------------
   4448 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
   4449     UBool       isMatch  = FALSE;      // True if the we have a match.
   4450 
   4451     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
   4452 
   4453     int32_t     op;                    // Operation from the compiled pattern, split into
   4454     int32_t     opType;                //    the opcode
   4455     int32_t     opValue;               //    and the operand value.
   4456 
   4457 #ifdef REGEX_RUN_DEBUG
   4458     if (fTraceDebug)
   4459     {
   4460         printf("MatchAt(startIdx=%ld)\n", startIdx);
   4461         printf("Original Pattern: ");
   4462         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   4463         while (c != U_SENTINEL) {
   4464             if (c<32 || c>256) {
   4465                 c = '.';
   4466             }
   4467             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   4468 
   4469             c = UTEXT_NEXT32(fPattern->fPattern);
   4470         }
   4471         printf("\n");
   4472         printf("Input String: ");
   4473         c = utext_next32From(fInputText, 0);
   4474         while (c != U_SENTINEL) {
   4475             if (c<32 || c>256) {
   4476                 c = '.';
   4477             }
   4478             printf("%c", c);
   4479 
   4480             c = UTEXT_NEXT32(fInputText);
   4481         }
   4482         printf("\n");
   4483         printf("\n");
   4484     }
   4485 #endif
   4486 
   4487     if (U_FAILURE(status)) {
   4488         return;
   4489     }
   4490 
   4491     //  Cache frequently referenced items from the compiled pattern
   4492     //
   4493     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   4494 
   4495     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   4496     UVector             *sets          = fPattern->fSets;
   4497 
   4498     const UChar         *inputBuf      = fInputText->chunkContents;
   4499 
   4500     fFrameSize = fPattern->fFrameSize;
   4501     REStackFrame        *fp            = resetStack();
   4502 
   4503     fp->fPatIdx   = 0;
   4504     fp->fInputIdx = startIdx;
   4505 
   4506     // Zero out the pattern's static data
   4507     int32_t i;
   4508     for (i = 0; i<fPattern->fDataSize; i++) {
   4509         fData[i] = 0;
   4510     }
   4511 
   4512     //
   4513     //  Main loop for interpreting the compiled pattern.
   4514     //  One iteration of the loop per pattern operation performed.
   4515     //
   4516     for (;;) {
   4517 #if 0
   4518         if (_heapchk() != _HEAPOK) {
   4519             fprintf(stderr, "Heap Trouble\n");
   4520         }
   4521 #endif
   4522 
   4523         op      = (int32_t)pat[fp->fPatIdx];
   4524         opType  = URX_TYPE(op);
   4525         opValue = URX_VAL(op);
   4526 #ifdef REGEX_RUN_DEBUG
   4527         if (fTraceDebug) {
   4528             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4529             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
   4530                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   4531             fPattern->dumpOp(fp->fPatIdx);
   4532         }
   4533 #endif
   4534         fp->fPatIdx++;
   4535 
   4536         switch (opType) {
   4537 
   4538 
   4539         case URX_NOP:
   4540             break;
   4541 
   4542 
   4543         case URX_BACKTRACK:
   4544             // Force a backtrack.  In some circumstances, the pattern compiler
   4545             //   will notice that the pattern can't possibly match anything, and will
   4546             //   emit one of these at that point.
   4547             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4548             break;
   4549 
   4550 
   4551         case URX_ONECHAR:
   4552             if (fp->fInputIdx < fActiveLimit) {
   4553                 UChar32 c;
   4554                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4555                 if (c == opValue) {
   4556                     break;
   4557                 }
   4558             } else {
   4559                 fHitEnd = TRUE;
   4560             }
   4561 
   4562             #ifdef REGEX_SMART_BACKTRACKING
   4563             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   4564                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4565                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4566                     int64_t reverseIndex = fp->fInputIdx;
   4567                     UChar32 c;
   4568                     do {
   4569                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   4570                         if (c == opValue) {
   4571                             break;
   4572                         }
   4573                     } while (reverseIndex > backSearchIndex);
   4574                     if (c == opValue) {
   4575                         fHitEnd = FALSE;
   4576                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4577                         fp->fInputIdx = reverseIndex;
   4578                         if (fp->fInputIdx > backSearchIndex) {
   4579                             fp = StateSave(fp, fp->fPatIdx, status);
   4580                         }
   4581                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4582                         break;
   4583                     }
   4584                 }
   4585             }
   4586             #endif
   4587 
   4588             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4589             break;
   4590 
   4591 
   4592         case URX_STRING:
   4593             {
   4594                 // Test input against a literal string.
   4595                 // Strings require two slots in the compiled pattern, one for the
   4596                 //   offset to the string text, and one for the length.
   4597                 int32_t   stringStartIdx = opValue;
   4598                 int32_t   stringLen;
   4599 
   4600                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   4601                 fp->fPatIdx++;
   4602                 opType    = URX_TYPE(op);
   4603                 stringLen = URX_VAL(op);
   4604                 U_ASSERT(opType == URX_STRING_LEN);
   4605                 U_ASSERT(stringLen >= 2);
   4606 
   4607                 if (fp->fInputIdx + stringLen > fActiveLimit) {
   4608                     // No match.  String is longer than the remaining input text.
   4609                     fHitEnd = TRUE;          //   TODO:  See ticket 6074
   4610                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4611                     break;
   4612                 }
   4613 
   4614                 const UChar * pInp = inputBuf + fp->fInputIdx;
   4615                 const UChar * pPat = litText+stringStartIdx;
   4616                 const UChar * pEnd = pInp + stringLen;
   4617                 UBool success = FALSE;
   4618                 for(;;) {
   4619                     if (*pInp == *pPat) {
   4620                         pInp++;
   4621                         pPat++;
   4622                         if (pInp == pEnd) {
   4623                             // Successful Match.
   4624                             success = TRUE;
   4625                             break;
   4626                         }
   4627                     } else {
   4628                         // Match failed.
   4629                         break;
   4630                     }
   4631                 }
   4632 
   4633                 if (success) {
   4634                     fp->fInputIdx += stringLen;
   4635                 } else {
   4636                     #ifdef REGEX_SMART_BACKTRACKING
   4637                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   4638                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4639                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4640                             // Reset to last start point
   4641                             int64_t reverseIndex = fp->fInputIdx;
   4642                             UChar32 c;
   4643                             pPat = litText+stringStartIdx;
   4644 
   4645                             // Search backwards for a possible start
   4646                             do {
   4647                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   4648                                 if ((U_IS_BMP(c) && *pPat == c) ||
   4649                                     (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
   4650                                     success = TRUE;
   4651                                     break;
   4652                                 }
   4653                             } while (reverseIndex > backSearchIndex);
   4654 
   4655                             // And try again
   4656                             if (success) {
   4657                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4658                                 fp->fInputIdx = reverseIndex;
   4659                                 if (fp->fInputIdx > backSearchIndex) {
   4660                                     fp = StateSave(fp, fp->fPatIdx, status);
   4661                                 }
   4662                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4663                                 break;
   4664                             }
   4665                         }
   4666                     }
   4667                     #endif
   4668                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4669                 }
   4670             }
   4671             break;
   4672 
   4673 
   4674         case URX_STATE_SAVE:
   4675             fp = StateSave(fp, opValue, status);
   4676             break;
   4677 
   4678 
   4679         case URX_END:
   4680             // The match loop will exit via this path on a successful match,
   4681             //   when we reach the end of the pattern.
   4682             if (toEnd && fp->fInputIdx != fActiveLimit) {
   4683                 // The pattern matched, but not to the end of input.  Try some more.
   4684                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4685                 break;
   4686             }
   4687             isMatch = TRUE;
   4688             goto  breakFromLoop;
   4689 
   4690             // Start and End Capture stack frame variables are laid out out like this:
   4691             //  fp->fExtra[opValue]  - The start of a completed capture group
   4692             //             opValue+1 - The end   of a completed capture group
   4693             //             opValue+2 - the start of a capture group whose end
   4694             //                          has not yet been reached (and might not ever be).
   4695         case URX_START_CAPTURE:
   4696             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4697             fp->fExtra[opValue+2] = fp->fInputIdx;
   4698             break;
   4699 
   4700 
   4701         case URX_END_CAPTURE:
   4702             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4703             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   4704             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   4705             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   4706             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   4707             break;
   4708 
   4709 
   4710         case URX_DOLLAR:                   //  $, test for End of line
   4711             //     or for position before new line at end of input
   4712             if (fp->fInputIdx < fAnchorLimit-2) {
   4713                 // We are no where near the end of input.  Fail.
   4714                 //   This is the common case.  Keep it first.
   4715                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4716                 break;
   4717             }
   4718             if (fp->fInputIdx >= fAnchorLimit) {
   4719                 // We really are at the end of input.  Success.
   4720                 fHitEnd = TRUE;
   4721                 fRequireEnd = TRUE;
   4722                 break;
   4723             }
   4724 
   4725             // If we are positioned just before a new-line that is located at the
   4726             //   end of input, succeed.
   4727             if (fp->fInputIdx == fAnchorLimit-1) {
   4728                 UChar32 c;
   4729                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
   4730 
   4731                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   4732                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4733                         // At new-line at end of input. Success
   4734                         fHitEnd = TRUE;
   4735                         fRequireEnd = TRUE;
   4736                         break;
   4737                     }
   4738                 }
   4739             } else if (fp->fInputIdx == fAnchorLimit-2 &&
   4740                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
   4741                     fHitEnd = TRUE;
   4742                     fRequireEnd = TRUE;
   4743                     break;                         // At CR/LF at end of input.  Success
   4744             }
   4745 
   4746             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4747 
   4748             break;
   4749 
   4750 
   4751         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   4752             if (fp->fInputIdx >= fAnchorLimit-1) {
   4753                 // Either at the last character of input, or off the end.
   4754                 if (fp->fInputIdx == fAnchorLimit-1) {
   4755                     // At last char of input.  Success if it's a new line.
   4756                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4757                         fHitEnd = TRUE;
   4758                         fRequireEnd = TRUE;
   4759                         break;
   4760                     }
   4761                 } else {
   4762                     // Off the end of input.  Success.
   4763                     fHitEnd = TRUE;
   4764                     fRequireEnd = TRUE;
   4765                     break;
   4766                 }
   4767             }
   4768 
   4769             // Not at end of input.  Back-track out.
   4770             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4771             break;
   4772 
   4773 
   4774         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   4775             {
   4776                 if (fp->fInputIdx >= fAnchorLimit) {
   4777                     // We really are at the end of input.  Success.
   4778                     fHitEnd = TRUE;
   4779                     fRequireEnd = TRUE;
   4780                     break;
   4781                 }
   4782                 // If we are positioned just before a new-line, succeed.
   4783                 // It makes no difference where the new-line is within the input.
   4784                 UChar32 c = inputBuf[fp->fInputIdx];
   4785                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   4786                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   4787                     //  In multi-line mode, hitting a new-line just before the end of input does not
   4788                     //   set the hitEnd or requireEnd flags
   4789                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4790                         break;
   4791                     }
   4792                 }
   4793                 // not at a new line.  Fail.
   4794                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4795             }
   4796             break;
   4797 
   4798 
   4799         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   4800             {
   4801                 if (fp->fInputIdx >= fAnchorLimit) {
   4802                     // We really are at the end of input.  Success.
   4803                     fHitEnd = TRUE;
   4804                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   4805                     break;               //   adding a new-line would not lose the match.
   4806                 }
   4807                 // If we are not positioned just before a new-line, the test fails; backtrack out.
   4808                 // It makes no difference where the new-line is within the input.
   4809                 if (inputBuf[fp->fInputIdx] != 0x0a) {
   4810                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4811                 }
   4812             }
   4813             break;
   4814 
   4815 
   4816         case URX_CARET:                    //  ^, test for start of line
   4817             if (fp->fInputIdx != fAnchorStart) {
   4818                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4819             }
   4820             break;
   4821 
   4822 
   4823         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   4824             {
   4825                 if (fp->fInputIdx == fAnchorStart) {
   4826                     // We are at the start input.  Success.
   4827                     break;
   4828                 }
   4829                 // Check whether character just before the current pos is a new-line
   4830                 //   unless we are at the end of input
   4831                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4832                 if ((fp->fInputIdx < fAnchorLimit) &&
   4833                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   4834                     //  It's a new-line.  ^ is true.  Success.
   4835                     //  TODO:  what should be done with positions between a CR and LF?
   4836                     break;
   4837                 }
   4838                 // Not at the start of a line.  Fail.
   4839                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4840             }
   4841             break;
   4842 
   4843 
   4844         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   4845             {
   4846                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
   4847                 if (fp->fInputIdx <= fAnchorStart) {
   4848                     // We are at the start input.  Success.
   4849                     break;
   4850                 }
   4851                 // Check whether character just before the current pos is a new-line
   4852                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   4853                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4854                 if (c != 0x0a) {
   4855                     // Not at the start of a line.  Back-track out.
   4856                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4857                 }
   4858             }
   4859             break;
   4860 
   4861         case URX_BACKSLASH_B:          // Test for word boundaries
   4862             {
   4863                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
   4864                 success ^= (opValue != 0);     // flip sense for \B
   4865                 if (!success) {
   4866                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4867                 }
   4868             }
   4869             break;
   4870 
   4871 
   4872         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   4873             {
   4874                 UBool success = isUWordBoundary(fp->fInputIdx);
   4875                 success ^= (opValue != 0);     // flip sense for \B
   4876                 if (!success) {
   4877                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4878                 }
   4879             }
   4880             break;
   4881 
   4882 
   4883         case URX_BACKSLASH_D:            // Test for decimal digit
   4884             {
   4885                 if (fp->fInputIdx >= fActiveLimit) {
   4886                     fHitEnd = TRUE;
   4887                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4888                     break;
   4889                 }
   4890 
   4891                 UChar32 c;
   4892                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4893                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   4894                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   4895                 success ^= (opValue != 0);        // flip sense for \D
   4896                 if (!success) {
   4897                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4898                 }
   4899             }
   4900             break;
   4901 
   4902 
   4903         case URX_BACKSLASH_G:          // Test for position at end of previous match
   4904             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
   4905                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4906             }
   4907             break;
   4908 
   4909 
   4910         case URX_BACKSLASH_X:
   4911         //  Match a Grapheme, as defined by Unicode TR 29.
   4912         //  Differs slightly from Perl, which consumes combining marks independently
   4913         //    of context.
   4914         {
   4915 
   4916             // Fail if at end of input
   4917             if (fp->fInputIdx >= fActiveLimit) {
   4918                 fHitEnd = TRUE;
   4919                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4920                 break;
   4921             }
   4922 
   4923             // Examine (and consume) the current char.
   4924             //   Dispatch into a little state machine, based on the char.
   4925             UChar32  c;
   4926             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4927             UnicodeSet **sets = fPattern->fStaticSets;
   4928             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   4929             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   4930             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4931             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4932             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4933             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4934             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4935             goto GC_Extend;
   4936 
   4937 
   4938 
   4939 GC_L:
   4940             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4941             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4942             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4943             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4944             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4945             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4946             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4947             goto GC_Extend;
   4948 
   4949 GC_V:
   4950             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4951             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4952             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4953             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4954             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4955             goto GC_Extend;
   4956 
   4957 GC_T:
   4958             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4959             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4960             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4961             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4962             goto GC_Extend;
   4963 
   4964 GC_Extend:
   4965             // Combining characters are consumed here
   4966             for (;;) {
   4967                 if (fp->fInputIdx >= fActiveLimit) {
   4968                     break;
   4969                 }
   4970                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4971                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   4972                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   4973                     break;
   4974                 }
   4975             }
   4976             goto GC_Done;
   4977 
   4978 GC_Control:
   4979             // Most control chars stand alone (don't combine with combining chars),
   4980             //   except for that CR/LF sequence is a single grapheme cluster.
   4981             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
   4982                 fp->fInputIdx++;
   4983             }
   4984 
   4985 GC_Done:
   4986             if (fp->fInputIdx >= fActiveLimit) {
   4987                 fHitEnd = TRUE;
   4988             }
   4989             break;
   4990         }
   4991 
   4992 
   4993 
   4994 
   4995         case URX_BACKSLASH_Z:          // Test for end of Input
   4996             if (fp->fInputIdx < fAnchorLimit) {
   4997                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4998             } else {
   4999                 fHitEnd = TRUE;
   5000                 fRequireEnd = TRUE;
   5001             }
   5002             break;
   5003 
   5004 
   5005 
   5006         case URX_STATIC_SETREF:
   5007             {
   5008                 // Test input character against one of the predefined sets
   5009                 //    (Word Characters, for example)
   5010                 // The high bit of the op value is a flag for the match polarity.
   5011                 //    0:   success if input char is in set.
   5012                 //    1:   success if input char is not in set.
   5013                 if (fp->fInputIdx >= fActiveLimit) {
   5014                     fHitEnd = TRUE;
   5015                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5016                     break;
   5017                 }
   5018 
   5019                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   5020                 opValue &= ~URX_NEG_SET;
   5021                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   5022 
   5023                 UChar32 c;
   5024                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5025                 if (c < 256) {
   5026                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   5027                     if (s8->contains(c)) {
   5028                         success = !success;
   5029                     }
   5030                 } else {
   5031                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5032                     if (s->contains(c)) {
   5033                         success = !success;
   5034                     }
   5035                 }
   5036                 if (!success) {
   5037                     #ifdef REGEX_SMART_BACKTRACKING
   5038                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5039                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5040                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5041                             // Try to find it, backwards
   5042                             int64_t reverseIndex = fp->fInputIdx;
   5043                             U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   5044                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
   5045                             do {
   5046                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5047                                 if (c < 256) {
   5048                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   5049                                     if (s8->contains(c)) {
   5050                                         success = !success;
   5051                                     }
   5052                                 } else {
   5053                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5054                                     if (s->contains(c)) {
   5055                                         success = !success;
   5056                                     }
   5057                                 }
   5058                             } while (reverseIndex > backSearchIndex && !success);
   5059 
   5060                             if (success) {
   5061                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5062                                 fp->fInputIdx = reverseIndex;
   5063                                 if (fp->fInputIdx > backSearchIndex) {
   5064                                     fp = StateSave(fp, fp->fPatIdx, status);
   5065                                 }
   5066                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5067                                 break;
   5068                             }
   5069                         }
   5070                     }
   5071                     #endif
   5072                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5073                 }
   5074             }
   5075             break;
   5076 
   5077 
   5078         case URX_STAT_SETREF_N:
   5079             {
   5080                 // Test input character for NOT being a member of  one of
   5081                 //    the predefined sets (Word Characters, for example)
   5082                 if (fp->fInputIdx >= fActiveLimit) {
   5083                     fHitEnd = TRUE;
   5084                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5085                     break;
   5086                 }
   5087 
   5088                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   5089 
   5090                 UChar32  c;
   5091                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5092                 if (c < 256) {
   5093                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   5094                     if (s8->contains(c) == FALSE) {
   5095                         break;
   5096                     }
   5097                 } else {
   5098                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5099                     if (s->contains(c) == FALSE) {
   5100                         break;
   5101                     }
   5102                 }
   5103 
   5104                 #ifdef REGEX_SMART_BACKTRACKING
   5105                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5106                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5107                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5108                         // Try to find it, backwards
   5109                         int64_t reverseIndex = fp->fInputIdx;
   5110                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   5111                         UBool success = FALSE;
   5112                         do {
   5113                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5114                             if (c < 256) {
   5115                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   5116                                 if (s8->contains(c) == FALSE) {
   5117                                     success = TRUE;
   5118                                     break;
   5119                                 }
   5120                             } else {
   5121                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5122                                 if (s->contains(c) == FALSE) {
   5123                                     success = TRUE;
   5124                                     break;
   5125                                 }
   5126                             }
   5127                         } while (reverseIndex > backSearchIndex);
   5128 
   5129                         if (success) {
   5130                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5131                             fp->fInputIdx = reverseIndex;
   5132                             if (fp->fInputIdx > backSearchIndex) {
   5133                                 fp = StateSave(fp, fp->fPatIdx, status);
   5134                             }
   5135                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5136                             break;
   5137                         }
   5138                     }
   5139                 }
   5140                 #endif
   5141                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5142             }
   5143             break;
   5144 
   5145 
   5146         case URX_SETREF:
   5147             {
   5148                 if (fp->fInputIdx >= fActiveLimit) {
   5149                     fHitEnd = TRUE;
   5150                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5151                     break;
   5152                 }
   5153 
   5154                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5155 
   5156                 // There is input left.  Pick up one char and test it for set membership.
   5157                 UChar32  c;
   5158                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5159                 if (c<256) {
   5160                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5161                     if (s8->contains(c)) {
   5162                         // The character is in the set.  A Match.
   5163                         break;
   5164                     }
   5165                 } else {
   5166                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   5167                     if (s->contains(c)) {
   5168                         // The character is in the set.  A Match.
   5169                         break;
   5170                     }
   5171                 }
   5172 
   5173                 // the character wasn't in the set.
   5174                 #ifdef REGEX_SMART_BACKTRACKING
   5175                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5176                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5177                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5178                         // Try to find it, backwards
   5179                         int64_t reverseIndex = fp->fInputIdx;
   5180                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   5181                         UBool success = FALSE;
   5182                         do {
   5183                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5184                             if (c < 256) {
   5185                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5186                                 if (s8->contains(c)) {
   5187                                     success = TRUE;
   5188                                     break;
   5189                                 }
   5190                             } else {
   5191                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   5192                                 if (s->contains(c)) {
   5193                                     success = TRUE;
   5194                                     break;
   5195                                 }
   5196                             }
   5197                         } while (reverseIndex > backSearchIndex);
   5198 
   5199                         if (success) {
   5200                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5201                             fp->fInputIdx = reverseIndex;
   5202                             if (fp->fInputIdx > reverseIndex) {
   5203                                 fp = StateSave(fp, fp->fPatIdx, status);
   5204                             }
   5205                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5206                             break;
   5207                         }
   5208                     }
   5209                 }
   5210                 #endif
   5211                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5212             }
   5213             break;
   5214 
   5215 
   5216         case URX_DOTANY:
   5217             {
   5218                 // . matches anything, but stops at end-of-line.
   5219                 if (fp->fInputIdx >= fActiveLimit) {
   5220                     // At end of input.  Match failed.  Backtrack out.
   5221                     fHitEnd = TRUE;
   5222                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5223                     break;
   5224                 }
   5225 
   5226                 // There is input left.  Advance over one char, unless we've hit end-of-line
   5227                 UChar32  c;
   5228                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5229                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   5230                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   5231                     // End of line in normal mode.   . does not match.
   5232                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5233                     break;
   5234                 }
   5235             }
   5236             break;
   5237 
   5238 
   5239         case URX_DOTANY_ALL:
   5240             {
   5241                 // . in dot-matches-all (including new lines) mode
   5242                 if (fp->fInputIdx >= fActiveLimit) {
   5243                     // At end of input.  Match failed.  Backtrack out.
   5244                     fHitEnd = TRUE;
   5245                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5246                     break;
   5247                 }
   5248 
   5249                 // There is input left.  Advance over one char, except if we are
   5250                 //   at a cr/lf, advance over both of them.
   5251                 UChar32 c;
   5252                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5253                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   5254                     // In the case of a CR/LF, we need to advance over both.
   5255                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   5256                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
   5257                     }
   5258                 }
   5259             }
   5260             break;
   5261 
   5262 
   5263         case URX_DOTANY_UNIX:
   5264             {
   5265                 // '.' operator, matches all, but stops at end-of-line.
   5266                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   5267                 if (fp->fInputIdx >= fActiveLimit) {
   5268                     // At end of input.  Match failed.  Backtrack out.
   5269                     fHitEnd = TRUE;
   5270                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5271                     break;
   5272                 }
   5273 
   5274                 // There is input left.  Advance over one char, unless we've hit end-of-line
   5275                 UChar32 c;
   5276                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5277                 if (c == 0x0a) {
   5278                     // End of line in normal mode.   '.' does not match the \n
   5279                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5280                 }
   5281             }
   5282             break;
   5283 
   5284 
   5285         case URX_JMP:
   5286             fp->fPatIdx = opValue;
   5287             break;
   5288 
   5289         case URX_FAIL:
   5290             isMatch = FALSE;
   5291             goto breakFromLoop;
   5292 
   5293         case URX_JMP_SAV:
   5294             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   5295             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   5296             fp->fPatIdx = opValue;                         // Then JMP.
   5297             break;
   5298 
   5299         case URX_JMP_SAV_X:
   5300             // This opcode is used with (x)+, when x can match a zero length string.
   5301             // Same as JMP_SAV, except conditional on the match having made forward progress.
   5302             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   5303             //   data address of the input position at the start of the loop.
   5304             {
   5305                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   5306                 int32_t  stoOp = (int32_t)pat[opValue-1];
   5307                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   5308                 int32_t  frameLoc = URX_VAL(stoOp);
   5309                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   5310                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
   5311                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   5312                 if (prevInputIdx < fp->fInputIdx) {
   5313                     // The match did make progress.  Repeat the loop.
   5314                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   5315                     fp->fPatIdx = opValue;
   5316                     fp->fExtra[frameLoc] = fp->fInputIdx;
   5317                 }
   5318                 // If the input position did not advance, we do nothing here,
   5319                 //   execution will fall out of the loop.
   5320             }
   5321             break;
   5322 
   5323         case URX_CTR_INIT:
   5324             {
   5325                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5326                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   5327 
   5328                 // Pick up the three extra operands that CTR_INIT has, and
   5329                 //    skip the pattern location counter past
   5330                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5331                 fp->fPatIdx += 3;
   5332                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5333                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5334                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5335                 U_ASSERT(minCount>=0);
   5336                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5337                 U_ASSERT(loopLoc>fp->fPatIdx);
   5338 
   5339                 if (minCount == 0) {
   5340                     fp = StateSave(fp, loopLoc+1, status);
   5341                 }
   5342                 if (maxCount == 0) {
   5343                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5344                 }
   5345             }
   5346             break;
   5347 
   5348         case URX_CTR_LOOP:
   5349             {
   5350                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5351                 int32_t initOp = (int32_t)pat[opValue];
   5352                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   5353                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5354                 int32_t minCount  = (int32_t)pat[opValue+2];
   5355                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5356                 // Increment the counter.  Note: we DIDN'T worry about counter
   5357                 //   overflow, since the data comes from UnicodeStrings, which
   5358                 //   stores its length in an int32_t. Do we have to think about
   5359                 //   this now that we're using UText? Probably not, since the length
   5360                 //   in UChar32s is still an int32_t.
   5361                 (*pCounter)++;
   5362                 U_ASSERT(*pCounter > 0);
   5363                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   5364                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   5365                     break;
   5366                 }
   5367                 if (*pCounter >= minCount) {
   5368                     fp = StateSave(fp, fp->fPatIdx, status);
   5369                 }
   5370                 fp->fPatIdx = opValue + 4;    // Loop back.
   5371             }
   5372             break;
   5373 
   5374         case URX_CTR_INIT_NG:
   5375             {
   5376                 // Initialize a non-greedy loop
   5377                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5378                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   5379 
   5380                 // Pick up the three extra operands that CTR_INIT has, and
   5381                 //    skip the pattern location counter past
   5382                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5383                 fp->fPatIdx += 3;
   5384                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5385                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5386                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5387                 U_ASSERT(minCount>=0);
   5388                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5389                 U_ASSERT(loopLoc>fp->fPatIdx);
   5390 
   5391                 if (minCount == 0) {
   5392                     if (maxCount != 0) {
   5393                         fp = StateSave(fp, fp->fPatIdx, status);
   5394                     }
   5395                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   5396                 }
   5397             }
   5398             break;
   5399 
   5400         case URX_CTR_LOOP_NG:
   5401             {
   5402                 // Non-greedy {min, max} loops
   5403                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5404                 int32_t initOp = (int32_t)pat[opValue];
   5405                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   5406                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5407                 int32_t minCount  = (int32_t)pat[opValue+2];
   5408                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5409                 // Increment the counter.  Note: we DIDN'T worry about counter
   5410                 //   overflow, since the data comes from UnicodeStrings, which
   5411                 //   stores its length in an int32_t. Do we have to think about
   5412                 //   this now that we're using UText? Probably not, since the length
   5413                 //   in UChar32s is still an int32_t.
   5414                 (*pCounter)++;
   5415                 U_ASSERT(*pCounter > 0);
   5416 
   5417                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   5418                     // The loop has matched the maximum permitted number of times.
   5419                     //   Break out of here with no action.  Matching will
   5420                     //   continue with the following pattern.
   5421                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   5422                     break;
   5423                 }
   5424 
   5425                 if (*pCounter < minCount) {
   5426                     // We haven't met the minimum number of matches yet.
   5427                     //   Loop back for another one.
   5428                     fp->fPatIdx = opValue + 4;    // Loop back.
   5429                 } else {
   5430                     // We do have the minimum number of matches.
   5431                     //   Fall into the following pattern, but first do
   5432                     //   a state save to the top of the loop, so that a failure
   5433                     //   in the following pattern will try another iteration of the loop.
   5434                     fp = StateSave(fp, opValue + 4, status);
   5435                 }
   5436             }
   5437             break;
   5438 
   5439         case URX_STO_SP:
   5440             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5441             fData[opValue] = fStack->size();
   5442             break;
   5443 
   5444         case URX_LD_SP:
   5445             {
   5446                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5447                 int32_t newStackSize = (int32_t)fData[opValue];
   5448                 U_ASSERT(newStackSize <= fStack->size());
   5449                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5450                 if (newFP == (int64_t *)fp) {
   5451                     break;
   5452                 }
   5453                 int32_t i;
   5454                 for (i=0; i<fFrameSize; i++) {
   5455                     newFP[i] = ((int64_t *)fp)[i];
   5456                 }
   5457                 fp = (REStackFrame *)newFP;
   5458                 fStack->setSize(newStackSize);
   5459             }
   5460             break;
   5461 
   5462         case URX_BACKREF:
   5463         case URX_BACKREF_I:
   5464             {
   5465                 U_ASSERT(opValue < fFrameSize);
   5466                 int64_t groupStartIdx = fp->fExtra[opValue];
   5467                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5468                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5469                 int64_t len = groupEndIdx-groupStartIdx;
   5470                 if (groupStartIdx < 0) {
   5471                     // This capture group has not participated in the match thus far,
   5472                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5473                 }
   5474 
   5475                 if (len == 0) {
   5476                         //   The capture group match was of an empty string.
   5477                         //   Verified by testing:  Perl matches succeed in this case, so
   5478                         //   we do too.
   5479                         break;
   5480                     }
   5481 
   5482                 UBool  haveMatch = FALSE;
   5483                 if (fp->fInputIdx + len <= fActiveLimit) {
   5484                     if (opType == URX_BACKREF) {
   5485                         if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
   5486                             haveMatch = TRUE;
   5487                         }
   5488                     } else {
   5489                         if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
   5490                                   (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
   5491                             haveMatch = TRUE;
   5492                         }
   5493                     }
   5494                 } else {
   5495                     // TODO: probably need to do a partial string comparison, and only
   5496                     //       set HitEnd if the available input matched.  Ticket #6074
   5497                     fHitEnd = TRUE;
   5498                 }
   5499                 if (haveMatch) {
   5500                     fp->fInputIdx += len;     // Match.  Advance current input position.
   5501                 } else {
   5502                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5503                 }
   5504             }
   5505             break;
   5506 
   5507         case URX_STO_INP_LOC:
   5508             {
   5509                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   5510                 fp->fExtra[opValue] = fp->fInputIdx;
   5511             }
   5512             break;
   5513 
   5514         case URX_JMPX:
   5515             {
   5516                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5517                 fp->fPatIdx += 1;
   5518                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   5519                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   5520                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
   5521                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   5522                 if (savedInputIdx < fp->fInputIdx) {
   5523                     fp->fPatIdx = opValue;                               // JMP
   5524                 } else {
   5525                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   5526                 }
   5527             }
   5528             break;
   5529 
   5530         case URX_LA_START:
   5531             {
   5532                 // Entering a lookahead block.
   5533                 // Save Stack Ptr, Input Pos.
   5534                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5535                 fData[opValue]   = fStack->size();
   5536                 fData[opValue+1] = fp->fInputIdx;
   5537                 fActiveStart     = fLookStart;          // Set the match region change for
   5538                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   5539             }
   5540             break;
   5541 
   5542         case URX_LA_END:
   5543             {
   5544                 // Leaving a look-ahead block.
   5545                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   5546                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5547                 int32_t stackSize = fStack->size();
   5548                 int32_t newStackSize = (int32_t)fData[opValue];
   5549                 U_ASSERT(stackSize >= newStackSize);
   5550                 if (stackSize > newStackSize) {
   5551                     // Copy the current top frame back to the new (cut back) top frame.
   5552                     //   This makes the capture groups from within the look-ahead
   5553                     //   expression available.
   5554                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5555                     int32_t i;
   5556                     for (i=0; i<fFrameSize; i++) {
   5557                         newFP[i] = ((int64_t *)fp)[i];
   5558                     }
   5559                     fp = (REStackFrame *)newFP;
   5560                     fStack->setSize(newStackSize);
   5561                 }
   5562                 fp->fInputIdx = fData[opValue+1];
   5563 
   5564                 // Restore the active region bounds in the input string; they may have
   5565                 //    been changed because of transparent bounds on a Region.
   5566                 fActiveStart = fRegionStart;
   5567                 fActiveLimit = fRegionLimit;
   5568             }
   5569             break;
   5570 
   5571         case URX_ONECHAR_I:
   5572             if (fp->fInputIdx < fActiveLimit) {
   5573                 UChar32 c;
   5574                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5575                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5576                     break;
   5577                 }
   5578             } else {
   5579                 fHitEnd = TRUE;
   5580             }
   5581 
   5582             #ifdef REGEX_SMART_BACKTRACKING
   5583             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5584                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5585                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5586                     UBool success = FALSE;
   5587                     int64_t reverseIndex = fp->fInputIdx;
   5588                     UChar32 c;
   5589                     while (reverseIndex > backSearchIndex) {
   5590                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5591                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5592                             success = TRUE;
   5593                             break;
   5594                         } else if (c == U_SENTINEL) {
   5595                             break;
   5596                         }
   5597                     }
   5598                     if (success) {
   5599                         fHitEnd = FALSE;
   5600                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5601                         fp->fInputIdx = reverseIndex;
   5602                         if (fp->fInputIdx > backSearchIndex) {
   5603                             fp = StateSave(fp, fp->fPatIdx, status);
   5604                         }
   5605                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5606                         break;
   5607                     }
   5608                 }
   5609             }
   5610             #endif
   5611 
   5612             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5613             break;
   5614 
   5615         case URX_STRING_I:
   5616             {
   5617                 // Test input against a literal string.
   5618                 // Strings require two slots in the compiled pattern, one for the
   5619                 //   offset to the string text, and one for the length.
   5620                 const UCaseProps *csp = ucase_getSingleton();
   5621                 {
   5622                     int32_t stringStartIdx, stringLen;
   5623                     stringStartIdx = opValue;
   5624 
   5625                     op      = (int32_t)pat[fp->fPatIdx];
   5626                     fp->fPatIdx++;
   5627                     opType  = URX_TYPE(op);
   5628                     opValue = URX_VAL(op);
   5629                     U_ASSERT(opType == URX_STRING_LEN);
   5630                     stringLen = opValue;
   5631 
   5632                     const UChar *patternChars = litText+stringStartIdx;
   5633                     const UChar *patternEnd = patternChars+stringLen;
   5634 
   5635                     const UChar *foldChars = NULL;
   5636                     int32_t foldOffset, foldLength;
   5637                     UChar32 c;
   5638                     // BEGIN android-changed
   5639                     // For ICU ticket#8824
   5640                     UBool c_is_valid = FALSE;
   5641 
   5642                     #ifdef REGEX_SMART_BACKTRACKING
   5643                     int32_t originalInputIdx = fp->fInputIdx;
   5644                     #endif
   5645                     UBool success = TRUE;
   5646 
   5647                     foldOffset = foldLength = 0;
   5648 
   5649                     while (patternChars < patternEnd && success) {
   5650                         if (fp->fInputIdx < fActiveLimit) {  // don't read past end of string
   5651                             if(foldOffset < foldLength) {
   5652                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5653                                 c_is_valid = TRUE;
   5654                             } else {
   5655                                 // test pre-condition of U16_NEXT: i < length
   5656                                 U_ASSERT(fp->fInputIdx < fActiveLimit);
   5657                                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5658                                 c_is_valid = TRUE;
   5659                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   5660                                 if(foldLength >= 0) {
   5661                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   5662                                         foldOffset = 0;
   5663                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5664                                     } else {
   5665                                         c = foldLength;
   5666                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
   5667                                     }
   5668                                 }
   5669                             }
   5670                         } else {
   5671                             c_is_valid = FALSE;
   5672                         }
   5673 
   5674                         if (fp->fInputIdx <= fActiveLimit && c_is_valid) {
   5675                             if (U_IS_BMP(c)) {
   5676                                 success = (*patternChars == c);
   5677                                 patternChars += 1;
   5678                             } else if (patternChars+1 < patternEnd) {
   5679                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   5680                                 patternChars += 2;
   5681                             }
   5682                         } else {
   5683                             success = FALSE;
   5684                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
   5685                         }
   5686                     }
   5687                     // END android-changed
   5688 
   5689                     if (!success) {
   5690                         #ifdef REGEX_SMART_BACKTRACKING
   5691                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   5692                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5693                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5694                                 // Reset to last start point
   5695                                 int64_t reverseIndex = originalInputIdx;
   5696                                 patternChars = litText+stringStartIdx;
   5697 
   5698                                 // Search backwards for a possible start
   5699                                 do {
   5700                                     U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5701                                     foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   5702                                     if(foldLength >= 0) {
   5703                                         if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   5704                                             foldOffset = 0;
   5705                                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5706                                         } else {
   5707                                             c = foldLength;
   5708                                             foldLength = foldOffset; // to avoid reading chars from the folding buffer
   5709                                         }
   5710                                     }
   5711 
   5712                                     if ((U_IS_BMP(c) && *patternChars == c) ||
   5713                                            (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   5714                                         success = TRUE;
   5715                                         break;
   5716                                     }
   5717                                 } while (reverseIndex > backSearchIndex);
   5718 
   5719                                 // And try again
   5720                                 if (success) {
   5721                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5722                                     fp->fInputIdx = reverseIndex;
   5723                                     if (fp->fInputIdx > backSearchIndex) {
   5724                                         fp = StateSave(fp, fp->fPatIdx, status);
   5725                                     }
   5726                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5727                                     break;
   5728                                 }
   5729                             }
   5730                         }
   5731                         #endif
   5732                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5733                     }
   5734                 }
   5735             }
   5736             break;
   5737 
   5738         case URX_LB_START:
   5739             {
   5740                 // Entering a look-behind block.
   5741                 // Save Stack Ptr, Input Pos.
   5742                 //   TODO:  implement transparent bounds.  Ticket #6067
   5743                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5744                 fData[opValue]   = fStack->size();
   5745                 fData[opValue+1] = fp->fInputIdx;
   5746                 // Init the variable containing the start index for attempted matches.
   5747                 fData[opValue+2] = -1;
   5748                 // Save input string length, then reset to pin any matches to end at
   5749                 //   the current position.
   5750                 fData[opValue+3] = fActiveLimit;
   5751                 fActiveLimit     = fp->fInputIdx;
   5752             }
   5753             break;
   5754 
   5755 
   5756         case URX_LB_CONT:
   5757             {
   5758                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   5759                 //    at all possible input starting positions.
   5760 
   5761                 // Fetch the min and max possible match lengths.  They are the operands
   5762                 //   of this op in the pattern.
   5763                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   5764                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   5765                 U_ASSERT(minML <= maxML);
   5766                 U_ASSERT(minML >= 0);
   5767 
   5768                 // Fetch (from data) the last input index where a match was attempted.
   5769                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5770                 int64_t  *lbStartIdx = &fData[opValue+2];
   5771                 if (*lbStartIdx < 0) {
   5772                     // First time through loop.
   5773                     *lbStartIdx = fp->fInputIdx - minML;
   5774                 } else {
   5775                     // 2nd through nth time through the loop.
   5776                     // Back up start position for match by one.
   5777                     if (*lbStartIdx == 0) {
   5778                         (*lbStartIdx)--;
   5779                     } else {
   5780                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5781                     }
   5782                 }
   5783 
   5784                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5785                     // We have tried all potential match starting points without
   5786                     //  getting a match.  Backtrack out, and out of the
   5787                     //   Look Behind altogether.
   5788                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5789                     int64_t restoreInputLen = fData[opValue+3];
   5790                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5791                     U_ASSERT(restoreInputLen <= fInputLength);
   5792                     fActiveLimit = restoreInputLen;
   5793                     break;
   5794                 }
   5795 
   5796                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5797                 //      (successful match will fall off the end of the loop.)
   5798                 fp = StateSave(fp, fp->fPatIdx-3, status);
   5799                 fp->fInputIdx =  *lbStartIdx;
   5800             }
   5801             break;
   5802 
   5803         case URX_LB_END:
   5804             // End of a look-behind block, after a successful match.
   5805             {
   5806                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5807                 if (fp->fInputIdx != fActiveLimit) {
   5808                     //  The look-behind expression matched, but the match did not
   5809                     //    extend all the way to the point that we are looking behind from.
   5810                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5811                     //     will retry the match starting at another position or fail
   5812                     //     the look-behind altogether, whichever is appropriate.
   5813                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5814                     break;
   5815                 }
   5816 
   5817                 // Look-behind match is good.  Restore the orignal input string length,
   5818                 //   which had been truncated to pin the end of the lookbehind match to the
   5819                 //   position being looked-behind.
   5820                 int64_t originalInputLen = fData[opValue+3];
   5821                 U_ASSERT(originalInputLen >= fActiveLimit);
   5822                 U_ASSERT(originalInputLen <= fInputLength);
   5823                 fActiveLimit = originalInputLen;
   5824             }
   5825             break;
   5826 
   5827 
   5828         case URX_LBN_CONT:
   5829             {
   5830                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   5831                 //    at all possible input starting positions.
   5832 
   5833                 // Fetch the extra parameters of this op.
   5834                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   5835                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   5836                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   5837                 continueLoc = URX_VAL(continueLoc);
   5838                 U_ASSERT(minML <= maxML);
   5839                 U_ASSERT(minML >= 0);
   5840                 U_ASSERT(continueLoc > fp->fPatIdx);
   5841 
   5842                 // Fetch (from data) the last input index where a match was attempted.
   5843                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5844                 int64_t  *lbStartIdx = &fData[opValue+2];
   5845                 if (*lbStartIdx < 0) {
   5846                     // First time through loop.
   5847                     *lbStartIdx = fp->fInputIdx - minML;
   5848                 } else {
   5849                     // 2nd through nth time through the loop.
   5850                     // Back up start position for match by one.
   5851                     if (*lbStartIdx == 0) {
   5852                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
   5853                     } else {
   5854                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5855                     }
   5856                 }
   5857 
   5858                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5859                     // We have tried all potential match starting points without
   5860                     //  getting a match, which means that the negative lookbehind as
   5861                     //  a whole has succeeded.  Jump forward to the continue location
   5862                     int64_t restoreInputLen = fData[opValue+3];
   5863                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5864                     U_ASSERT(restoreInputLen <= fInputLength);
   5865                     fActiveLimit = restoreInputLen;
   5866                     fp->fPatIdx = continueLoc;
   5867                     break;
   5868                 }
   5869 
   5870                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5871                 //      (successful match will cause a FAIL out of the loop altogether.)
   5872                 fp = StateSave(fp, fp->fPatIdx-4, status);
   5873                 fp->fInputIdx =  *lbStartIdx;
   5874             }
   5875             break;
   5876 
   5877         case URX_LBN_END:
   5878             // End of a negative look-behind block, after a successful match.
   5879             {
   5880                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5881                 if (fp->fInputIdx != fActiveLimit) {
   5882                     //  The look-behind expression matched, but the match did not
   5883                     //    extend all the way to the point that we are looking behind from.
   5884                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5885                     //     will retry the match starting at another position or succeed
   5886                     //     the look-behind altogether, whichever is appropriate.
   5887                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5888                     break;
   5889                 }
   5890 
   5891                 // Look-behind expression matched, which means look-behind test as
   5892                 //   a whole Fails
   5893 
   5894                 //   Restore the orignal input string length, which had been truncated
   5895                 //   inorder to pin the end of the lookbehind match
   5896                 //   to the position being looked-behind.
   5897                 int64_t originalInputLen = fData[opValue+3];
   5898                 U_ASSERT(originalInputLen >= fActiveLimit);
   5899                 U_ASSERT(originalInputLen <= fInputLength);
   5900                 fActiveLimit = originalInputLen;
   5901 
   5902                 // Restore original stack position, discarding any state saved
   5903                 //   by the successful pattern match.
   5904                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5905                 int32_t newStackSize = (int32_t)fData[opValue];
   5906                 U_ASSERT(fStack->size() > newStackSize);
   5907                 fStack->setSize(newStackSize);
   5908 
   5909                 //  FAIL, which will take control back to someplace
   5910                 //  prior to entering the look-behind test.
   5911                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5912             }
   5913             break;
   5914 
   5915 
   5916         case URX_LOOP_SR_I:
   5917             // Loop Initialization for the optimized implementation of
   5918             //     [some character set]*
   5919             //   This op scans through all matching input.
   5920             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5921             {
   5922                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5923                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5924                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   5925 
   5926                 // Loop through input, until either the input is exhausted or
   5927                 //   we reach a character that is not a member of the set.
   5928                 int32_t ix = (int32_t)fp->fInputIdx;
   5929                 for (;;) {
   5930                     if (ix >= fActiveLimit) {
   5931                         fHitEnd = TRUE;
   5932                         break;
   5933                     }
   5934                     UChar32   c;
   5935                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
   5936                     if (c<256) {
   5937                         if (s8->contains(c) == FALSE) {
   5938                             U16_BACK_1(inputBuf, 0, ix);
   5939                             break;
   5940                         }
   5941                     } else {
   5942                         if (s->contains(c) == FALSE) {
   5943                             U16_BACK_1(inputBuf, 0, ix);
   5944                             break;
   5945                         }
   5946                     }
   5947                 }
   5948 
   5949                 // If there were no matching characters, skip over the loop altogether.
   5950                 //   The loop doesn't run at all, a * op always succeeds.
   5951                 if (ix == fp->fInputIdx) {
   5952                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5953                     break;
   5954                 }
   5955 
   5956                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5957                 //   must follow.  It's operand is the stack location
   5958                 //   that holds the starting input index for the match of this [set]*
   5959                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5960                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5961                 int32_t stackLoc = URX_VAL(loopcOp);
   5962                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5963                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5964                 #ifdef REGEX_SMART_BACKTRACKING
   5965                 backSearchIndex = fp->fInputIdx;
   5966                 #endif
   5967                 fp->fInputIdx = ix;
   5968 
   5969                 // Save State to the URX_LOOP_C op that follows this one,
   5970                 //   so that match failures in the following code will return to there.
   5971                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5972                 fp = StateSave(fp, fp->fPatIdx, status);
   5973                 fp->fPatIdx++;
   5974             }
   5975             break;
   5976 
   5977 
   5978         case URX_LOOP_DOT_I:
   5979             // Loop Initialization for the optimized implementation of .*
   5980             //   This op scans through all remaining input.
   5981             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5982             {
   5983                 // Loop through input until the input is exhausted (we reach an end-of-line)
   5984                 // In DOTALL mode, we can just go straight to the end of the input.
   5985                 int32_t ix;
   5986                 if ((opValue & 1) == 1) {
   5987                     // Dot-matches-All mode.  Jump straight to the end of the string.
   5988                     ix = (int32_t)fActiveLimit;
   5989                     fHitEnd = TRUE;
   5990                 } else {
   5991                     // NOT DOT ALL mode.  Line endings do not match '.'
   5992                     // Scan forward until a line ending or end of input.
   5993                     ix = (int32_t)fp->fInputIdx;
   5994                     for (;;) {
   5995                         if (ix >= fActiveLimit) {
   5996                             fHitEnd = TRUE;
   5997                             break;
   5998                         }
   5999                         UChar32   c;
   6000                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
   6001                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   6002                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   6003                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   6004                                    ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
   6005                                 //  char is a line ending.  Put the input pos back to the
   6006                                 //    line ending char, and exit the scanning loop.
   6007                                 U16_BACK_1(inputBuf, 0, ix);
   6008                                 break;
   6009                             }
   6010                         }
   6011                     }
   6012                 }
   6013 
   6014                 // If there were no matching characters, skip over the loop altogether.
   6015                 //   The loop doesn't run at all, a * op always succeeds.
   6016                 if (ix == fp->fInputIdx) {
   6017                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   6018                     break;
   6019                 }
   6020 
   6021                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   6022                 //   must follow.  It's operand is the stack location
   6023                 //   that holds the starting input index for the match of this .*
   6024                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   6025                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   6026                 int32_t stackLoc = URX_VAL(loopcOp);
   6027                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   6028                 fp->fExtra[stackLoc] = fp->fInputIdx;
   6029                 #ifdef REGEX_SMART_BACKTRACKING
   6030                 backSearchIndex = fp->fInputIdx;
   6031                 #endif
   6032                 fp->fInputIdx = ix;
   6033 
   6034                 // Save State to the URX_LOOP_C op that follows this one,
   6035                 //   so that match failures in the following code will return to there.
   6036                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   6037                 fp = StateSave(fp, fp->fPatIdx, status);
   6038                 fp->fPatIdx++;
   6039             }
   6040             break;
   6041 
   6042 
   6043         case URX_LOOP_C:
   6044             {
   6045                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   6046                 backSearchIndex = (int32_t)fp->fExtra[opValue];
   6047                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   6048                 if (backSearchIndex == fp->fInputIdx) {
   6049                     // We've backed up the input idx to the point that the loop started.
   6050                     // The loop is done.  Leave here without saving state.
   6051                     //  Subsequent failures won't come back here.
   6052                     break;
   6053                 }
   6054                 // Set up for the next iteration of the loop, with input index
   6055                 //   backed up by one from the last time through,
   6056                 //   and a state save to this instruction in case the following code fails again.
   6057                 //   (We're going backwards because this loop emulates stack unwinding, not
   6058                 //    the initial scan forward.)
   6059                 U_ASSERT(fp->fInputIdx > 0);
   6060                 UChar32 prevC;
   6061                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
   6062 
   6063                 if (prevC == 0x0a &&
   6064                     fp->fInputIdx > backSearchIndex &&
   6065                     inputBuf[fp->fInputIdx-1] == 0x0d) {
   6066                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   6067                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   6068                         // .*, stepping back over CRLF pair.
   6069                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   6070                     }
   6071                 }
   6072 
   6073 
   6074                 fp = StateSave(fp, fp->fPatIdx-1, status);
   6075             }
   6076             break;
   6077 
   6078 
   6079 
   6080         default:
   6081             // Trouble.  The compiled pattern contains an entry with an
   6082             //           unrecognized type tag.
   6083             U_ASSERT(FALSE);
   6084         }
   6085 
   6086         if (U_FAILURE(status)) {
   6087             isMatch = FALSE;
   6088             break;
   6089         }
   6090     }
   6091 
   6092 breakFromLoop:
   6093     fMatch = isMatch;
   6094     if (isMatch) {
   6095         fLastMatchEnd = fMatchEnd;
   6096         fMatchStart   = startIdx;
   6097         fMatchEnd     = fp->fInputIdx;
   6098         if (fTraceDebug) {
   6099             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
   6100         }
   6101     }
   6102     else
   6103     {
   6104         if (fTraceDebug) {
   6105             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   6106         }
   6107     }
   6108 
   6109     fFrame = fp;                // The active stack frame when the engine stopped.
   6110     //   Contains the capture group results that we need to
   6111     //    access later.
   6112 
   6113     return;
   6114 }
   6115 
   6116 
   6117 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
   6118 
   6119 U_NAMESPACE_END
   6120 
   6121 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   6122 
   6123