Home | History | Annotate | Download | only in i18n
      1 /*
      2 **************************************************************************
      3 *   Copyright (C) 2002-2010 International Business Machines Corporation  *
      4 *   and others. All rights reserved.                                     *
      5 **************************************************************************
      6 */
      7 //
      8 //  file:  rematch.cpp
      9 //
     10 //         Contains the implementation of class RegexMatcher,
     11 //         which is one of the main API classes for the ICU regular expression package.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     16 
     17 #include "unicode/regex.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/rbbi.h"
     22 #include "uassert.h"
     23 #include "cmemory.h"
     24 #include "uvector.h"
     25 #include "uvectr32.h"
     26 #include "uvectr64.h"
     27 #include "regeximp.h"
     28 #include "regexst.h"
     29 #include "regextxt.h"
     30 #include "ucase.h"
     31 
     32 // #include <malloc.h>        // Needed for heapcheck testing
     33 
     34 
     35 
     36 // Smart Backtracking
     37 // ------------------
     38 // When a failure would go back to a LOOP_C instruction,
     39 // strings, characters, and setrefs scan backwards for a valid start
     40 // character themselves, pop the stack, and save state, emulating the
     41 // LOOP_C's effect but assured that the next character of input is a
     42 // possible matching character.
     43 //
     44 // Good idea in theory; unfortunately it only helps out a few specific
     45 // cases and slows the engine down a little in the rest.
     46 
     47 //#define REGEX_SMART_BACKTRACKING 1
     48 
     49 U_NAMESPACE_BEGIN
     50 
     51 // Default limit for the size of the back track stack, to avoid system
     52 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
     53 // This value puts ICU's limits higher than most other regexp implementations,
     54 //    which use recursion rather than the heap, and take more storage per
     55 //    backtrack point.
     56 //
     57 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
     58 
     59 // Time limit counter constant.
     60 //   Time limits for expression evaluation are in terms of quanta of work by
     61 //   the engine, each of which is 10,000 state saves.
     62 //   This constant determines that state saves per tick number.
     63 static const int32_t TIMER_INITIAL_VALUE = 10000;
     64 
     65 //-----------------------------------------------------------------------------
     66 //
     67 //   Constructor and Destructor
     68 //
     69 //-----------------------------------------------------------------------------
     70 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
     71     fDeferredStatus = U_ZERO_ERROR;
     72     init(fDeferredStatus);
     73     if (U_FAILURE(fDeferredStatus)) {
     74         return;
     75     }
     76     if (pat==NULL) {
     77         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
     78         return;
     79     }
     80     fPattern = pat;
     81     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
     82 }
     83 
     84 
     85 
     86 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
     87                            uint32_t flags, UErrorCode &status) {
     88     init(status);
     89     if (U_FAILURE(status)) {
     90         return;
     91     }
     92     UParseError    pe;
     93     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
     94     fPattern           = fPatternOwned;
     95 
     96     UText inputText = UTEXT_INITIALIZER;
     97     utext_openConstUnicodeString(&inputText, &input, &status);
     98     init2(&inputText, status);
     99     utext_close(&inputText);
    100 
    101     fInputUniStrMaybeMutable = TRUE;
    102 }
    103 
    104 
    105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
    106                            uint32_t flags, UErrorCode &status) {
    107     init(status);
    108     if (U_FAILURE(status)) {
    109         return;
    110     }
    111     UParseError    pe;
    112     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    113     if (U_FAILURE(status)) {
    114         return;
    115     }
    116 
    117     fPattern           = fPatternOwned;
    118     init2(input, status);
    119 }
    120 
    121 
    122 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
    123                            uint32_t flags, UErrorCode &status) {
    124     init(status);
    125     if (U_FAILURE(status)) {
    126         return;
    127     }
    128     UParseError    pe;
    129     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    130     if (U_FAILURE(status)) {
    131         return;
    132     }
    133     fPattern           = fPatternOwned;
    134     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    135 }
    136 
    137 RegexMatcher::RegexMatcher(UText *regexp,
    138                            uint32_t flags, UErrorCode &status) {
    139     init(status);
    140     if (U_FAILURE(status)) {
    141         return;
    142     }
    143     UParseError    pe;
    144     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    145         if (U_FAILURE(status)) {
    146         return;
    147     }
    148 
    149     fPattern           = fPatternOwned;
    150     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    151 }
    152 
    153 
    154 
    155 
    156 RegexMatcher::~RegexMatcher() {
    157     delete fStack;
    158     if (fData != fSmallData) {
    159         uprv_free(fData);
    160         fData = NULL;
    161     }
    162     if (fPatternOwned) {
    163         delete fPatternOwned;
    164         fPatternOwned = NULL;
    165         fPattern = NULL;
    166     }
    167 
    168     if (fInput) {
    169         delete fInput;
    170     }
    171     if (fInputText) {
    172         utext_close(fInputText);
    173     }
    174     if (fAltInputText) {
    175         utext_close(fAltInputText);
    176     }
    177 
    178     #if UCONFIG_NO_BREAK_ITERATION==0
    179     delete fWordBreakItr;
    180     #endif
    181 }
    182 
    183 //
    184 //   init()   common initialization for use by all constructors.
    185 //            Initialize all fields, get the object into a consistent state.
    186 //            This must be done even when the initial status shows an error,
    187 //            so that the object is initialized sufficiently well for the destructor
    188 //            to run safely.
    189 //
    190 void RegexMatcher::init(UErrorCode &status) {
    191     fPattern           = NULL;
    192     fPatternOwned      = NULL;
    193     fFrameSize         = 0;
    194     fRegionStart       = 0;
    195     fRegionLimit       = 0;
    196     fAnchorStart       = 0;
    197     fAnchorLimit       = 0;
    198     fLookStart         = 0;
    199     fLookLimit         = 0;
    200     fActiveStart       = 0;
    201     fActiveLimit       = 0;
    202     fTransparentBounds = FALSE;
    203     fAnchoringBounds   = TRUE;
    204     fMatch             = FALSE;
    205     fMatchStart        = 0;
    206     fMatchEnd          = 0;
    207     fLastMatchEnd      = -1;
    208     fAppendPosition    = 0;
    209     fHitEnd            = FALSE;
    210     fRequireEnd        = FALSE;
    211     fStack             = NULL;
    212     fFrame             = NULL;
    213     fTimeLimit         = 0;
    214     fTime              = 0;
    215     fTickCounter       = 0;
    216     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    217     fCallbackFn        = NULL;
    218     fCallbackContext   = NULL;
    219     fTraceDebug        = FALSE;
    220     fDeferredStatus    = status;
    221     fData              = fSmallData;
    222     fWordBreakItr      = NULL;
    223 
    224     fStack             = new UVector64(status);
    225     fInputText         = NULL;
    226     fAltInputText      = NULL;
    227     fInput             = NULL;
    228     fInputLength       = 0;
    229     fInputUniStrMaybeMutable = FALSE;
    230 
    231     if (U_FAILURE(status)) {
    232         fDeferredStatus = status;
    233     }
    234 }
    235 
    236 //
    237 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
    238 //            This handles the common setup to be done after the Pattern is available.
    239 //
    240 void RegexMatcher::init2(UText *input, UErrorCode &status) {
    241     if (U_FAILURE(status)) {
    242         fDeferredStatus = status;
    243         return;
    244     }
    245 
    246     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
    247         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
    248         if (fData == NULL) {
    249             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    250             return;
    251         }
    252     }
    253 
    254     reset(input);
    255     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
    256     if (U_FAILURE(status)) {
    257         fDeferredStatus = status;
    258         return;
    259     }
    260 }
    261 
    262 
    263 static const UChar BACKSLASH  = 0x5c;
    264 static const UChar DOLLARSIGN = 0x24;
    265 //--------------------------------------------------------------------------------
    266 //
    267 //    appendReplacement
    268 //
    269 //--------------------------------------------------------------------------------
    270 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
    271                                               const UnicodeString &replacement,
    272                                               UErrorCode &status) {
    273     UText replacementText = UTEXT_INITIALIZER;
    274 
    275     utext_openConstUnicodeString(&replacementText, &replacement, &status);
    276     if (U_SUCCESS(status)) {
    277         UText resultText = UTEXT_INITIALIZER;
    278         utext_openUnicodeString(&resultText, &dest, &status);
    279 
    280         if (U_SUCCESS(status)) {
    281             appendReplacement(&resultText, &replacementText, status);
    282             utext_close(&resultText);
    283         }
    284         utext_close(&replacementText);
    285     }
    286 
    287     return *this;
    288 }
    289 
    290 //
    291 //    appendReplacement, UText mode
    292 //
    293 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
    294                                               UText *replacement,
    295                                               UErrorCode &status) {
    296     if (U_FAILURE(status)) {
    297         return *this;
    298     }
    299     if (U_FAILURE(fDeferredStatus)) {
    300         status = fDeferredStatus;
    301         return *this;
    302     }
    303     if (fMatch == FALSE) {
    304         status = U_REGEX_INVALID_STATE;
    305         return *this;
    306     }
    307 
    308     // Copy input string from the end of previous match to start of current match
    309     int64_t  destLen = utext_nativeLength(dest);
    310     if (fMatchStart > fAppendPosition) {
    311         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    312             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, (int32_t)(fMatchStart-fAppendPosition), &status);
    313         } else {
    314             int32_t len16;
    315             if (UTEXT_USES_U16(fInputText)) {
    316                 len16 = (int32_t)(fMatchStart-fAppendPosition);
    317             } else {
    318                 UErrorCode lengthStatus = U_ZERO_ERROR;
    319                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
    320             }
    321             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
    322             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
    323             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    324             uprv_free(inputChars);
    325         }
    326     }
    327     fAppendPosition = fMatchEnd;
    328 
    329 
    330     // scan the replacement text, looking for substitutions ($n) and \escapes.
    331     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
    332     //         move entire ranges not containing substitutions.
    333     UTEXT_SETNATIVEINDEX(replacement, 0);
    334     UChar32 c = UTEXT_NEXT32(replacement);
    335     while (c != U_SENTINEL) {
    336         if (c == BACKSLASH) {
    337             // Backslash Escape.  Copy the following char out without further checks.
    338             //                    Note:  Surrogate pairs don't need any special handling
    339             //                           The second half wont be a '$' or a '\', and
    340             //                           will move to the dest normally on the next
    341             //                           loop iteration.
    342             c = UTEXT_CURRENT32(replacement);
    343             if (c == U_SENTINEL) {
    344                 break;
    345             }
    346 
    347             if (c==0x55/*U*/ || c==0x75/*u*/) {
    348                 // We have a \udddd or \Udddddddd escape sequence.
    349                 int32_t offset = 0;
    350                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
    351                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
    352                 if (escapedChar != (UChar32)0xFFFFFFFF) {
    353                     if (U_IS_BMP(escapedChar)) {
    354                         UChar c16 = (UChar)escapedChar;
    355                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    356                     } else {
    357                         UChar surrogate[2];
    358                         surrogate[0] = U16_LEAD(escapedChar);
    359                         surrogate[1] = U16_TRAIL(escapedChar);
    360                         if (U_SUCCESS(status)) {
    361                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    362                         }
    363                     }
    364                     // TODO:  Report errors for mal-formed \u escapes?
    365                     //        As this is, the original sequence is output, which may be OK.
    366                     if (context.lastOffset == offset) {
    367                         UTEXT_PREVIOUS32(replacement);
    368                     } else if (context.lastOffset != offset-1) {
    369                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
    370                     }
    371                 }
    372             } else {
    373                 UTEXT_NEXT32(replacement);
    374                 // Plain backslash escape.  Just put out the escaped character.
    375                 if (U_IS_BMP(c)) {
    376                     UChar c16 = (UChar)c;
    377                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    378                 } else {
    379                     UChar surrogate[2];
    380                     surrogate[0] = U16_LEAD(c);
    381                     surrogate[1] = U16_TRAIL(c);
    382                     if (U_SUCCESS(status)) {
    383                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    384                     }
    385                 }
    386             }
    387         } else if (c != DOLLARSIGN) {
    388             // Normal char, not a $.  Copy it out without further checks.
    389             if (U_IS_BMP(c)) {
    390                 UChar c16 = (UChar)c;
    391                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    392             } else {
    393                 UChar surrogate[2];
    394                 surrogate[0] = U16_LEAD(c);
    395                 surrogate[1] = U16_TRAIL(c);
    396                 if (U_SUCCESS(status)) {
    397                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    398                 }
    399             }
    400         } else {
    401             // We've got a $.  Pick up a capture group number if one follows.
    402             // Consume at most the number of digits necessary for the largest capture
    403             // number that is valid for this pattern.
    404 
    405             int32_t numDigits = 0;
    406             int32_t groupNum  = 0;
    407             UChar32 digitC;
    408             for (;;) {
    409                 digitC = UTEXT_CURRENT32(replacement);
    410                 if (digitC == U_SENTINEL) {
    411                     break;
    412                 }
    413                 if (u_isdigit(digitC) == FALSE) {
    414                     break;
    415                 }
    416                 UTEXT_NEXT32(replacement);
    417                 groupNum=groupNum*10 + u_charDigitValue(digitC);
    418                 numDigits++;
    419                 if (numDigits >= fPattern->fMaxCaptureDigits) {
    420                     break;
    421                 }
    422             }
    423 
    424 
    425             if (numDigits == 0) {
    426                 // The $ didn't introduce a group number at all.
    427                 // Treat it as just part of the substitution text.
    428                 UChar c16 = DOLLARSIGN;
    429                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    430             } else {
    431                 // Finally, append the capture group data to the destination.
    432                 destLen += appendGroup(groupNum, dest, status);
    433                 if (U_FAILURE(status)) {
    434                     // Can fail if group number is out of range.
    435                     break;
    436                 }
    437             }
    438         }
    439 
    440         if (U_FAILURE(status)) {
    441             break;
    442         } else {
    443             c = UTEXT_NEXT32(replacement);
    444         }
    445     }
    446 
    447     return *this;
    448 }
    449 
    450 
    451 
    452 //--------------------------------------------------------------------------------
    453 //
    454 //    appendTail     Intended to be used in conjunction with appendReplacement()
    455 //                   To the destination string, append everything following
    456 //                   the last match position from the input string.
    457 //
    458 //                   Note:  Match ranges do not affect appendTail or appendReplacement
    459 //
    460 //--------------------------------------------------------------------------------
    461 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    462     UErrorCode status = U_ZERO_ERROR;
    463     UText resultText = UTEXT_INITIALIZER;
    464     utext_openUnicodeString(&resultText, &dest, &status);
    465 
    466     if (U_SUCCESS(status)) {
    467         appendTail(&resultText);
    468         utext_close(&resultText);
    469     }
    470 
    471     return dest;
    472 }
    473 
    474 //
    475 //   appendTail, UText mode
    476 //
    477 UText *RegexMatcher::appendTail(UText *dest) {
    478     if (fInputLength > fAppendPosition) {
    479         UErrorCode status = U_ZERO_ERROR;
    480         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    481             int64_t destLen = utext_nativeLength(dest);
    482             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, (int32_t)(fInputLength-fAppendPosition), &status);
    483         } else {
    484             int32_t len16;
    485             if (UTEXT_USES_U16(fInputText)) {
    486                 len16 = (int32_t)(fInputLength-fAppendPosition);
    487             } else {
    488                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
    489                 status = U_ZERO_ERROR; // buffer overflow
    490             }
    491 
    492             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
    493             utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
    494 
    495             int64_t destLen = utext_nativeLength(dest);
    496             utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    497 
    498             uprv_free(inputChars);
    499         }
    500     }
    501     return dest;
    502 }
    503 
    504 
    505 
    506 //--------------------------------------------------------------------------------
    507 //
    508 //   end
    509 //
    510 //--------------------------------------------------------------------------------
    511 int32_t RegexMatcher::end(UErrorCode &err) const {
    512     return end(0, err);
    513 }
    514 
    515 
    516 
    517 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    518     if (U_FAILURE(err)) {
    519         return -1;
    520     }
    521     if (fMatch == FALSE) {
    522         err = U_REGEX_INVALID_STATE;
    523         return -1;
    524     }
    525     if (group < 0 || group > fPattern->fGroupMap->size()) {
    526         err = U_INDEX_OUTOFBOUNDS_ERROR;
    527         return -1;
    528     }
    529     int64_t e = -1;
    530     if (group == 0) {
    531         e = fMatchEnd;
    532     } else {
    533         // Get the position within the stack frame of the variables for
    534         //    this capture group.
    535         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
    536         U_ASSERT(groupOffset < fPattern->fFrameSize);
    537         U_ASSERT(groupOffset >= 0);
    538         e = fFrame->fExtra[groupOffset + 1];
    539     }
    540 
    541     if (e == -1 || UTEXT_USES_U16(fInputText)) {
    542         return (int32_t)e;
    543     } else {
    544         // !!!: Would like a better way to do this!
    545         UErrorCode status = U_ZERO_ERROR;
    546         return utext_extract(fInputText, 0, e, NULL, 0, &status);
    547     }
    548 }
    549 
    550 
    551 
    552 //--------------------------------------------------------------------------------
    553 //
    554 //   find()
    555 //
    556 //--------------------------------------------------------------------------------
    557 UBool RegexMatcher::find() {
    558     // Start at the position of the last match end.  (Will be zero if the
    559     //   matcher has been reset.)
    560     //
    561     if (U_FAILURE(fDeferredStatus)) {
    562         return FALSE;
    563     }
    564 
    565     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    566         return findUsingChunk();
    567     }
    568 
    569     int64_t startPos = fMatchEnd;
    570     if (startPos==0) {
    571         startPos = fActiveStart;
    572     }
    573 
    574     if (fMatch) {
    575         // Save the position of any previous successful match.
    576         fLastMatchEnd = fMatchEnd;
    577 
    578         if (fMatchStart == fMatchEnd) {
    579             // Previous match had zero length.  Move start position up one position
    580             //  to avoid sending find() into a loop on zero-length matches.
    581             if (startPos >= fActiveLimit) {
    582                 fMatch = FALSE;
    583                 fHitEnd = TRUE;
    584                 return FALSE;
    585             }
    586             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    587             UTEXT_NEXT32(fInputText);
    588             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    589         }
    590     } else {
    591         if (fLastMatchEnd >= 0) {
    592             // A previous find() failed to match.  Don't try again.
    593             //   (without this test, a pattern with a zero-length match
    594             //    could match again at the end of an input string.)
    595             fHitEnd = TRUE;
    596             return FALSE;
    597         }
    598     }
    599 
    600 
    601     // Compute the position in the input string beyond which a match can not begin, because
    602     //   the minimum length match would extend past the end of the input.
    603     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    604     //          Be aware of possible overflows if making changes here.
    605     int64_t testStartLimit;
    606     if (UTEXT_USES_U16(fInputText)) {
    607         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
    608         if (startPos > testStartLimit) {
    609             fMatch = FALSE;
    610             fHitEnd = TRUE;
    611             return FALSE;
    612         }
    613     } else {
    614         // For now, let the matcher discover that it can't match on its own
    615         // We don't know how long the match len is in native characters
    616         testStartLimit = fActiveLimit;
    617     }
    618 
    619     UChar32  c;
    620     U_ASSERT(startPos >= 0);
    621 
    622     switch (fPattern->fStartType) {
    623     case START_NO_INFO:
    624         // No optimization was found.
    625         //  Try a match at each input position.
    626         for (;;) {
    627             MatchAt(startPos, FALSE, fDeferredStatus);
    628             if (U_FAILURE(fDeferredStatus)) {
    629                 return FALSE;
    630             }
    631             if (fMatch) {
    632                 return TRUE;
    633             }
    634             if (startPos >= testStartLimit) {
    635                 fHitEnd = TRUE;
    636                 return FALSE;
    637             }
    638             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    639             UTEXT_NEXT32(fInputText);
    640             startPos = UTEXT_GETNATIVEINDEX(fInputText);
    641             // Note that it's perfectly OK for a pattern to have a zero-length
    642             //   match at the end of a string, so we must make sure that the loop
    643             //   runs with startPos == testStartLimit the last time through.
    644         }
    645         U_ASSERT(FALSE);
    646 
    647     case START_START:
    648         // Matches are only possible at the start of the input string
    649         //   (pattern begins with ^ or \A)
    650         if (startPos > fActiveStart) {
    651             fMatch = FALSE;
    652             return FALSE;
    653         }
    654         MatchAt(startPos, FALSE, fDeferredStatus);
    655         if (U_FAILURE(fDeferredStatus)) {
    656             return FALSE;
    657         }
    658         return fMatch;
    659 
    660 
    661     case START_SET:
    662         {
    663             // Match may start on any char from a pre-computed set.
    664             U_ASSERT(fPattern->fMinMatchLen > 0);
    665             int64_t pos;
    666             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    667             for (;;) {
    668                 c = UTEXT_NEXT32(fInputText);
    669                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    670                 // c will be -1 (U_SENTINEL) at end of text, in which case we
    671                 // skip this next block (so we don't have a negative array index)
    672                 // and handle end of text in the following block.
    673                 if (c >= 0 && (c<256 && fPattern->fInitialChars8->contains(c) ||
    674                     c>=256 && fPattern->fInitialChars->contains(c))) {
    675                     MatchAt(startPos, FALSE, fDeferredStatus);
    676                     if (U_FAILURE(fDeferredStatus)) {
    677                         return FALSE;
    678                     }
    679                     if (fMatch) {
    680                         return TRUE;
    681                     }
    682                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    683                 }
    684                 if (startPos >= testStartLimit) {
    685                     fMatch = FALSE;
    686                     fHitEnd = TRUE;
    687                     return FALSE;
    688                 }
    689                 startPos = pos;
    690             }
    691         }
    692         U_ASSERT(FALSE);
    693 
    694     case START_STRING:
    695     case START_CHAR:
    696         {
    697             // Match starts on exactly one char.
    698             U_ASSERT(fPattern->fMinMatchLen > 0);
    699             UChar32 theChar = fPattern->fInitialChar;
    700             int64_t pos;
    701             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    702             for (;;) {
    703                 c = UTEXT_NEXT32(fInputText);
    704                 pos = UTEXT_GETNATIVEINDEX(fInputText);
    705                 if (c == theChar) {
    706                     MatchAt(startPos, FALSE, fDeferredStatus);
    707                     if (U_FAILURE(fDeferredStatus)) {
    708                         return FALSE;
    709                     }
    710                     if (fMatch) {
    711                         return TRUE;
    712                     }
    713                     UTEXT_SETNATIVEINDEX(fInputText, pos);
    714                 }
    715                 if (startPos >= testStartLimit) {
    716                     fMatch = FALSE;
    717                     fHitEnd = TRUE;
    718                     return FALSE;
    719                 }
    720                 startPos = pos;
    721            }
    722         }
    723         U_ASSERT(FALSE);
    724 
    725     case START_LINE:
    726         {
    727             UChar32  c;
    728             if (startPos == fAnchorStart) {
    729                 MatchAt(startPos, FALSE, fDeferredStatus);
    730                 if (U_FAILURE(fDeferredStatus)) {
    731                     return FALSE;
    732                 }
    733                 if (fMatch) {
    734                     return TRUE;
    735                 }
    736                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    737                 c = UTEXT_NEXT32(fInputText);
    738                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    739             } else {
    740                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    741                 c = UTEXT_PREVIOUS32(fInputText);
    742                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
    743             }
    744 
    745             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
    746                 for (;;) {
    747                     if (c == 0x0a) {
    748                             MatchAt(startPos, FALSE, fDeferredStatus);
    749                             if (U_FAILURE(fDeferredStatus)) {
    750                                 return FALSE;
    751                             }
    752                             if (fMatch) {
    753                                 return TRUE;
    754                             }
    755                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    756                     }
    757                     if (startPos >= testStartLimit) {
    758                         fMatch = FALSE;
    759                         fHitEnd = TRUE;
    760                         return FALSE;
    761                     }
    762                     c = UTEXT_NEXT32(fInputText);
    763                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    764                     // Note that it's perfectly OK for a pattern to have a zero-length
    765                     //   match at the end of a string, so we must make sure that the loop
    766                     //   runs with startPos == testStartLimit the last time through.
    767                 }
    768             } else {
    769                 for (;;) {
    770                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
    771                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
    772                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
    773                                 UTEXT_NEXT32(fInputText);
    774                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
    775                             }
    776                             MatchAt(startPos, FALSE, fDeferredStatus);
    777                             if (U_FAILURE(fDeferredStatus)) {
    778                                 return FALSE;
    779                             }
    780                             if (fMatch) {
    781                                 return TRUE;
    782                             }
    783                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
    784                     }
    785                     if (startPos >= testStartLimit) {
    786                         fMatch = FALSE;
    787                         fHitEnd = TRUE;
    788                         return FALSE;
    789                     }
    790                     c = UTEXT_NEXT32(fInputText);
    791                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
    792                     // Note that it's perfectly OK for a pattern to have a zero-length
    793                     //   match at the end of a string, so we must make sure that the loop
    794                     //   runs with startPos == testStartLimit the last time through.
    795                 }
    796             }
    797         }
    798 
    799     default:
    800         U_ASSERT(FALSE);
    801     }
    802 
    803     U_ASSERT(FALSE);
    804     return FALSE;
    805 }
    806 
    807 
    808 
    809 UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
    810     if (U_FAILURE(status)) {
    811         return FALSE;
    812     }
    813     if (U_FAILURE(fDeferredStatus)) {
    814         status = fDeferredStatus;
    815         return FALSE;
    816     }
    817     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
    818                                           //        This will reset the region to be the full input length.
    819     if (start < 0) {
    820         status = U_INDEX_OUTOFBOUNDS_ERROR;
    821         return FALSE;
    822     }
    823 
    824     UBool couldFindStart = TRUE;
    825     int64_t nativeStart;
    826     if (UTEXT_USES_U16(fInputText)) {
    827         nativeStart = start;
    828     } else {
    829         UTEXT_SETNATIVEINDEX(fInputText, 0);
    830         int32_t i = 0;
    831         while (i < start) {
    832             UChar32 c = UTEXT_NEXT32(fInputText);
    833             if (c != U_SENTINEL) {
    834                 i += U16_LENGTH(c);
    835             } else {
    836                 couldFindStart = FALSE;
    837                 break;
    838             }
    839         }
    840         nativeStart = UTEXT_GETNATIVEINDEX(fInputText);
    841     }
    842     if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) {
    843         status = U_INDEX_OUTOFBOUNDS_ERROR;
    844         return FALSE;
    845     }
    846     fMatchEnd = nativeStart;
    847     return find();
    848 }
    849 
    850 
    851 //--------------------------------------------------------------------------------
    852 //
    853 //   findUsingChunk() -- like find(), but with the advance knowledge that the
    854 //                       entire string is available in the UText's chunk buffer.
    855 //
    856 //--------------------------------------------------------------------------------
    857 UBool RegexMatcher::findUsingChunk() {
    858     // Start at the position of the last match end.  (Will be zero if the
    859     //   matcher has been reset.
    860     //
    861 
    862     int32_t startPos = (int32_t)fMatchEnd;
    863     if (startPos==0) {
    864         startPos = (int32_t)fActiveStart;
    865     }
    866 
    867     const UChar *inputBuf = fInputText->chunkContents;
    868 
    869     if (fMatch) {
    870         // Save the position of any previous successful match.
    871         fLastMatchEnd = fMatchEnd;
    872 
    873         if (fMatchStart == fMatchEnd) {
    874             // Previous match had zero length.  Move start position up one position
    875             //  to avoid sending find() into a loop on zero-length matches.
    876             if (startPos >= fActiveLimit) {
    877                 fMatch = FALSE;
    878                 fHitEnd = TRUE;
    879                 return FALSE;
    880             }
    881             U16_FWD_1(inputBuf, startPos, fInputLength);
    882         }
    883     } else {
    884         if (fLastMatchEnd >= 0) {
    885             // A previous find() failed to match.  Don't try again.
    886             //   (without this test, a pattern with a zero-length match
    887             //    could match again at the end of an input string.)
    888             fHitEnd = TRUE;
    889             return FALSE;
    890         }
    891     }
    892 
    893 
    894     // Compute the position in the input string beyond which a match can not begin, because
    895     //   the minimum length match would extend past the end of the input.
    896     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    897     //          Be aware of possible overflows if making changes here.
    898     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
    899     if (startPos > testLen) {
    900         fMatch = FALSE;
    901         fHitEnd = TRUE;
    902         return FALSE;
    903     }
    904 
    905     UChar32  c;
    906     U_ASSERT(startPos >= 0);
    907 
    908     switch (fPattern->fStartType) {
    909     case START_NO_INFO:
    910         // No optimization was found.
    911         //  Try a match at each input position.
    912         for (;;) {
    913             MatchChunkAt(startPos, FALSE, fDeferredStatus);
    914             if (U_FAILURE(fDeferredStatus)) {
    915                 return FALSE;
    916             }
    917             if (fMatch) {
    918                 return TRUE;
    919             }
    920             if (startPos >= testLen) {
    921                 fHitEnd = TRUE;
    922                 return FALSE;
    923             }
    924             U16_FWD_1(inputBuf, startPos, fActiveLimit);
    925             // Note that it's perfectly OK for a pattern to have a zero-length
    926             //   match at the end of a string, so we must make sure that the loop
    927             //   runs with startPos == testLen the last time through.
    928         }
    929         U_ASSERT(FALSE);
    930 
    931     case START_START:
    932         // Matches are only possible at the start of the input string
    933         //   (pattern begins with ^ or \A)
    934         if (startPos > fActiveStart) {
    935             fMatch = FALSE;
    936             return FALSE;
    937         }
    938         MatchChunkAt(startPos, FALSE, fDeferredStatus);
    939         if (U_FAILURE(fDeferredStatus)) {
    940             return FALSE;
    941         }
    942         return fMatch;
    943 
    944 
    945     case START_SET:
    946     {
    947         // Match may start on any char from a pre-computed set.
    948         U_ASSERT(fPattern->fMinMatchLen > 0);
    949         for (;;) {
    950             int32_t pos = startPos;
    951             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    952             if (c<256 && fPattern->fInitialChars8->contains(c) ||
    953                 c>=256 && fPattern->fInitialChars->contains(c)) {
    954                 MatchChunkAt(pos, FALSE, fDeferredStatus);
    955                 if (U_FAILURE(fDeferredStatus)) {
    956                     return FALSE;
    957                 }
    958                 if (fMatch) {
    959                     return TRUE;
    960                 }
    961             }
    962             if (pos >= testLen) {
    963                 fMatch = FALSE;
    964                 fHitEnd = TRUE;
    965                 return FALSE;
    966             }
    967         }
    968     }
    969         U_ASSERT(FALSE);
    970 
    971     case START_STRING:
    972     case START_CHAR:
    973     {
    974         // Match starts on exactly one char.
    975         U_ASSERT(fPattern->fMinMatchLen > 0);
    976         UChar32 theChar = fPattern->fInitialChar;
    977         for (;;) {
    978             int32_t pos = startPos;
    979             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
    980             if (c == theChar) {
    981                 MatchChunkAt(pos, FALSE, fDeferredStatus);
    982                 if (U_FAILURE(fDeferredStatus)) {
    983                     return FALSE;
    984                 }
    985                 if (fMatch) {
    986                     return TRUE;
    987                 }
    988             }
    989             if (pos >= testLen) {
    990                 fMatch = FALSE;
    991                 fHitEnd = TRUE;
    992                 return FALSE;
    993             }
    994         }
    995     }
    996         U_ASSERT(FALSE);
    997 
    998     case START_LINE:
    999     {
   1000         UChar32  c;
   1001         if (startPos == fAnchorStart) {
   1002             MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1003             if (U_FAILURE(fDeferredStatus)) {
   1004                 return FALSE;
   1005             }
   1006             if (fMatch) {
   1007                 return TRUE;
   1008             }
   1009             U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1010         }
   1011 
   1012         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
   1013             for (;;) {
   1014                 c = inputBuf[startPos-1];
   1015                 if (c == 0x0a) {
   1016                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1017                     if (U_FAILURE(fDeferredStatus)) {
   1018                         return FALSE;
   1019                     }
   1020                     if (fMatch) {
   1021                         return TRUE;
   1022                     }
   1023                 }
   1024                 if (startPos >= testLen) {
   1025                     fMatch = FALSE;
   1026                     fHitEnd = TRUE;
   1027                     return FALSE;
   1028                 }
   1029                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1030                 // Note that it's perfectly OK for a pattern to have a zero-length
   1031                 //   match at the end of a string, so we must make sure that the loop
   1032                 //   runs with startPos == testLen the last time through.
   1033             }
   1034         } else {
   1035             for (;;) {
   1036                 c = inputBuf[startPos-1];
   1037                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   1038                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
   1039                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
   1040                         startPos++;
   1041                     }
   1042                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
   1043                     if (U_FAILURE(fDeferredStatus)) {
   1044                         return FALSE;
   1045                     }
   1046                     if (fMatch) {
   1047                         return TRUE;
   1048                     }
   1049                 }
   1050                 if (startPos >= testLen) {
   1051                     fMatch = FALSE;
   1052                     fHitEnd = TRUE;
   1053                     return FALSE;
   1054                 }
   1055                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
   1056                 // Note that it's perfectly OK for a pattern to have a zero-length
   1057                 //   match at the end of a string, so we must make sure that the loop
   1058                 //   runs with startPos == testLen the last time through.
   1059             }
   1060         }
   1061     }
   1062 
   1063     default:
   1064         U_ASSERT(FALSE);
   1065     }
   1066 
   1067     U_ASSERT(FALSE);
   1068     return FALSE;
   1069 }
   1070 
   1071 
   1072 
   1073 //--------------------------------------------------------------------------------
   1074 //
   1075 //  group()
   1076 //
   1077 //--------------------------------------------------------------------------------
   1078 UnicodeString RegexMatcher::group(UErrorCode &status) const {
   1079     return group(0, status);
   1080 }
   1081 
   1082 UText *RegexMatcher::group(UText *dest, MatcherDestIsUTextFlag /*flag*/, UErrorCode &status) const {
   1083     return group(0, dest, status);
   1084 }
   1085 
   1086 
   1087 
   1088 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
   1089     UnicodeString result;
   1090     UText resultText = UTEXT_INITIALIZER;
   1091     utext_openUnicodeString(&resultText, &result, &status);
   1092     group(groupNum, &resultText, status);
   1093     utext_close(&resultText);
   1094     return result;
   1095 }
   1096 
   1097 
   1098 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1099     UBool bailOut = FALSE;
   1100     if (U_FAILURE(status)) {
   1101         bailOut = TRUE;
   1102     }
   1103     if (U_FAILURE(fDeferredStatus)) {
   1104         status = fDeferredStatus;
   1105         bailOut = TRUE;
   1106     }
   1107 
   1108     if (fMatch == FALSE) {
   1109         status = U_REGEX_INVALID_STATE;
   1110         bailOut = TRUE;
   1111     }
   1112     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1113         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1114         bailOut = TRUE;
   1115     }
   1116 
   1117     if (bailOut) {
   1118         if (dest) {
   1119             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1120             return dest;
   1121         } else {
   1122             return utext_openUChars(NULL, NULL, 0, &status);
   1123         }
   1124     }
   1125 
   1126     int64_t s, e;
   1127     if (groupNum == 0) {
   1128         s = fMatchStart;
   1129         e = fMatchEnd;
   1130     } else {
   1131         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1132         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1133         U_ASSERT(groupOffset >= 0);
   1134         s = fFrame->fExtra[groupOffset];
   1135         e = fFrame->fExtra[groupOffset+1];
   1136     }
   1137 
   1138     if (s < 0) {
   1139         // A capture group wasn't part of the match
   1140         if (dest) {
   1141             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
   1142             return dest;
   1143         } else {
   1144             return utext_openUChars(NULL, NULL, 0, &status);
   1145         }
   1146     }
   1147     U_ASSERT(s <= e);
   1148 
   1149     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1150         U_ASSERT(e <= fInputLength);
   1151         if (dest) {
   1152             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1153         } else {
   1154             UText groupText = UTEXT_INITIALIZER;
   1155             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
   1156             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1157             utext_close(&groupText);
   1158         }
   1159     } else {
   1160         int32_t len16;
   1161         if (UTEXT_USES_U16(fInputText)) {
   1162             len16 = (int32_t)(e-s);
   1163         } else {
   1164             UErrorCode lengthStatus = U_ZERO_ERROR;
   1165             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1166         }
   1167         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1168         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1169 
   1170         if (dest) {
   1171             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
   1172         } else {
   1173             UText groupText = UTEXT_INITIALIZER;
   1174             utext_openUChars(&groupText, groupChars, len16, &status);
   1175             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
   1176             utext_close(&groupText);
   1177         }
   1178 
   1179         uprv_free(groupChars);
   1180     }
   1181     return dest;
   1182 }
   1183 
   1184 //--------------------------------------------------------------------------------
   1185 //
   1186 //  appendGroup() -- currently internal only, appends a group to a UText rather
   1187 //                   than replacing its contents
   1188 //
   1189 //--------------------------------------------------------------------------------
   1190 
   1191 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1192     int64_t destLen = utext_nativeLength(dest);
   1193 
   1194     if (U_FAILURE(status)) {
   1195         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1196     }
   1197     if (U_FAILURE(fDeferredStatus)) {
   1198         status = fDeferredStatus;
   1199         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1200     }
   1201 
   1202     if (fMatch == FALSE) {
   1203         status = U_REGEX_INVALID_STATE;
   1204         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1205     }
   1206     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1207         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1208         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1209     }
   1210 
   1211     int64_t s, e;
   1212     if (groupNum == 0) {
   1213         s = fMatchStart;
   1214         e = fMatchEnd;
   1215     } else {
   1216         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1217         U_ASSERT(groupOffset < fPattern->fFrameSize);
   1218         U_ASSERT(groupOffset >= 0);
   1219         s = fFrame->fExtra[groupOffset];
   1220         e = fFrame->fExtra[groupOffset+1];
   1221     }
   1222 
   1223     if (s < 0) {
   1224         // A capture group wasn't part of the match
   1225         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
   1226     }
   1227     U_ASSERT(s <= e);
   1228 
   1229     int64_t deltaLen;
   1230     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1231         U_ASSERT(e <= fInputLength);
   1232         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
   1233     } else {
   1234         int32_t len16;
   1235         if (UTEXT_USES_U16(fInputText)) {
   1236             len16 = (int32_t)(e-s);
   1237         } else {
   1238             UErrorCode lengthStatus = U_ZERO_ERROR;
   1239             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
   1240         }
   1241         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
   1242         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1243 
   1244         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
   1245         uprv_free(groupChars);
   1246     }
   1247     return deltaLen;
   1248 }
   1249 
   1250 
   1251 
   1252 //--------------------------------------------------------------------------------
   1253 //
   1254 //  groupCount()
   1255 //
   1256 //--------------------------------------------------------------------------------
   1257 int32_t RegexMatcher::groupCount() const {
   1258     return fPattern->fGroupMap->size();
   1259 }
   1260 
   1261 
   1262 
   1263 //--------------------------------------------------------------------------------
   1264 //
   1265 //  hasAnchoringBounds()
   1266 //
   1267 //--------------------------------------------------------------------------------
   1268 UBool RegexMatcher::hasAnchoringBounds() const {
   1269     return fAnchoringBounds;
   1270 }
   1271 
   1272 
   1273 //--------------------------------------------------------------------------------
   1274 //
   1275 //  hasTransparentBounds()
   1276 //
   1277 //--------------------------------------------------------------------------------
   1278 UBool RegexMatcher::hasTransparentBounds() const {
   1279     return fTransparentBounds;
   1280 }
   1281 
   1282 
   1283 
   1284 //--------------------------------------------------------------------------------
   1285 //
   1286 //  hitEnd()
   1287 //
   1288 //--------------------------------------------------------------------------------
   1289 UBool RegexMatcher::hitEnd() const {
   1290     return fHitEnd;
   1291 }
   1292 
   1293 
   1294 //--------------------------------------------------------------------------------
   1295 //
   1296 //  input()
   1297 //
   1298 //--------------------------------------------------------------------------------
   1299 const UnicodeString &RegexMatcher::input() const {
   1300     if (!fInput) {
   1301         UErrorCode status = U_ZERO_ERROR;
   1302         int32_t len16;
   1303         if (UTEXT_USES_U16(fInputText)) {
   1304             len16 = (int32_t)fInputLength;
   1305         } else {
   1306             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
   1307             status = U_ZERO_ERROR; // overflow, length status
   1308         }
   1309         UnicodeString *result = new UnicodeString(len16, 0, 0);
   1310 
   1311         UChar *inputChars = result->getBuffer(len16);
   1312         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
   1313         result->releaseBuffer(len16);
   1314 
   1315         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
   1316     }
   1317 
   1318     return *fInput;
   1319 }
   1320 
   1321 //--------------------------------------------------------------------------------
   1322 //
   1323 //  inputText()
   1324 //
   1325 //--------------------------------------------------------------------------------
   1326 UText *RegexMatcher::inputText() const {
   1327     return fInputText;
   1328 }
   1329 
   1330 
   1331 //--------------------------------------------------------------------------------
   1332 //
   1333 //  getInput() -- like inputText(), but makes a clone or copies into another UText
   1334 //
   1335 //--------------------------------------------------------------------------------
   1336 UText *RegexMatcher::getInput (UText *dest) const {
   1337     UErrorCode status = U_ZERO_ERROR; // ignored
   1338     if (dest) {
   1339         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1340             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
   1341         } else {
   1342             int32_t input16Len;
   1343             if (UTEXT_USES_U16(fInputText)) {
   1344                 input16Len = (int32_t)fInputLength;
   1345             } else {
   1346                 UErrorCode lengthStatus = U_ZERO_ERROR;
   1347                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
   1348             }
   1349             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
   1350 
   1351             status = U_ZERO_ERROR;
   1352             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
   1353             status = U_ZERO_ERROR;
   1354             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
   1355 
   1356             uprv_free(inputChars);
   1357         }
   1358         return dest;
   1359     } else {
   1360         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
   1361     }
   1362 }
   1363 
   1364 
   1365 static UBool compat_SyncMutableUTextContents(UText *ut);
   1366 static UBool compat_SyncMutableUTextContents(UText *ut) {
   1367     UBool retVal = FALSE;
   1368 
   1369     //  In the following test, we're really only interested in whether the UText should switch
   1370     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
   1371     //  will still point to the correct data.
   1372     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
   1373         UnicodeString *us=(UnicodeString *)ut->context;
   1374 
   1375         // Update to the latest length.
   1376         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
   1377         int32_t newLength = us->length();
   1378 
   1379         // Update the chunk description.
   1380         // The buffer may have switched between stack- and heap-based.
   1381         ut->chunkContents    = us->getBuffer();
   1382         ut->chunkLength      = newLength;
   1383         ut->chunkNativeLimit = newLength;
   1384         ut->nativeIndexingLimit = newLength;
   1385         retVal = TRUE;
   1386     }
   1387 
   1388     return retVal;
   1389 }
   1390 
   1391 //--------------------------------------------------------------------------------
   1392 //
   1393 //  lookingAt()
   1394 //
   1395 //--------------------------------------------------------------------------------
   1396 UBool RegexMatcher::lookingAt(UErrorCode &status) {
   1397     if (U_FAILURE(status)) {
   1398         return FALSE;
   1399     }
   1400     if (U_FAILURE(fDeferredStatus)) {
   1401         status = fDeferredStatus;
   1402         return FALSE;
   1403     }
   1404 
   1405     if (fInputUniStrMaybeMutable) {
   1406         if (compat_SyncMutableUTextContents(fInputText)) {
   1407         fInputLength = utext_nativeLength(fInputText);
   1408         reset();
   1409         }
   1410     }
   1411     else {
   1412         resetPreserveRegion();
   1413     }
   1414     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1415         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
   1416     } else {
   1417         MatchAt(fActiveStart, FALSE, status);
   1418     }
   1419     return fMatch;
   1420 }
   1421 
   1422 
   1423 UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) {
   1424     if (U_FAILURE(status)) {
   1425         return FALSE;
   1426     }
   1427     if (U_FAILURE(fDeferredStatus)) {
   1428         status = fDeferredStatus;
   1429         return FALSE;
   1430     }
   1431     reset();
   1432 
   1433     if (start < 0) {
   1434         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1435         return FALSE;
   1436     }
   1437 
   1438     if (fInputUniStrMaybeMutable) {
   1439         if (compat_SyncMutableUTextContents(fInputText)) {
   1440         fInputLength = utext_nativeLength(fInputText);
   1441         reset();
   1442         }
   1443     }
   1444 
   1445     int64_t nativeStart;
   1446     UBool couldFindStart = TRUE;
   1447     if (UTEXT_USES_U16(fInputText)) {
   1448         nativeStart = start;
   1449     } else {
   1450         UTEXT_SETNATIVEINDEX(fInputText, 0);
   1451         int32_t i = 0;
   1452         while (i < start) {
   1453             UChar32 c = UTEXT_NEXT32(fInputText);
   1454             if (c != U_SENTINEL) {
   1455                 i += U16_LENGTH(c);
   1456             } else {
   1457                 couldFindStart = FALSE;
   1458                 break;
   1459             }
   1460         }
   1461         nativeStart = UTEXT_GETNATIVEINDEX(fInputText);
   1462     }
   1463     if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1464         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1465         return FALSE;
   1466     }
   1467 
   1468     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1469         MatchChunkAt((int32_t)nativeStart, FALSE, status);
   1470     } else {
   1471         MatchAt(nativeStart, FALSE, status);
   1472     }
   1473     return fMatch;
   1474 }
   1475 
   1476 
   1477 
   1478 //--------------------------------------------------------------------------------
   1479 //
   1480 //  matches()
   1481 //
   1482 //--------------------------------------------------------------------------------
   1483 UBool RegexMatcher::matches(UErrorCode &status) {
   1484     if (U_FAILURE(status)) {
   1485         return FALSE;
   1486     }
   1487     if (U_FAILURE(fDeferredStatus)) {
   1488         status = fDeferredStatus;
   1489         return FALSE;
   1490     }
   1491 
   1492     if (fInputUniStrMaybeMutable) {
   1493         if (compat_SyncMutableUTextContents(fInputText)) {
   1494         fInputLength = utext_nativeLength(fInputText);
   1495         reset();
   1496         }
   1497     }
   1498     else {
   1499         resetPreserveRegion();
   1500     }
   1501 
   1502     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1503         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
   1504     } else {
   1505         MatchAt(fActiveStart, TRUE, status);
   1506     }
   1507     return fMatch;
   1508 }
   1509 
   1510 
   1511 UBool RegexMatcher::matches(int32_t start, UErrorCode &status) {
   1512     if (U_FAILURE(status)) {
   1513         return FALSE;
   1514     }
   1515     if (U_FAILURE(fDeferredStatus)) {
   1516         status = fDeferredStatus;
   1517         return FALSE;
   1518     }
   1519     reset();
   1520 
   1521     if (start < 0) {
   1522         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1523         return FALSE;
   1524     }
   1525 
   1526     if (fInputUniStrMaybeMutable) {
   1527         if (compat_SyncMutableUTextContents(fInputText)) {
   1528         fInputLength = utext_nativeLength(fInputText);
   1529         reset();
   1530         }
   1531     }
   1532 
   1533     int64_t nativeStart;
   1534     UBool couldFindStart = TRUE;
   1535     if (UTEXT_USES_U16(fInputText)) {
   1536         nativeStart = start;
   1537     } else {
   1538         UTEXT_SETNATIVEINDEX(fInputText, 0);
   1539         int32_t i = 0;
   1540         while (i < start) {
   1541             UChar32 c = UTEXT_NEXT32(fInputText);
   1542             if (c != U_SENTINEL) {
   1543                 i += U16_LENGTH(c);
   1544             } else {
   1545                 couldFindStart = FALSE;
   1546                 break;
   1547             }
   1548         }
   1549         nativeStart = UTEXT_GETNATIVEINDEX(fInputText);
   1550     }
   1551     if (!couldFindStart || nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1552         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1553         return FALSE;
   1554     }
   1555 
   1556     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1557         MatchChunkAt((int32_t)nativeStart, TRUE, status);
   1558     } else {
   1559         MatchAt(nativeStart, TRUE, status);
   1560     }
   1561     return fMatch;
   1562 }
   1563 
   1564 
   1565 
   1566 //--------------------------------------------------------------------------------
   1567 //
   1568 //    pattern
   1569 //
   1570 //--------------------------------------------------------------------------------
   1571 const RegexPattern &RegexMatcher::pattern() const {
   1572     return *fPattern;
   1573 }
   1574 
   1575 
   1576 
   1577 //--------------------------------------------------------------------------------
   1578 //
   1579 //    region
   1580 //
   1581 //--------------------------------------------------------------------------------
   1582 RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &status) {
   1583     if (U_FAILURE(status)) {
   1584         return *this;
   1585     }
   1586 
   1587     if (start>limit || start<0 || limit<0) {
   1588         status = U_ILLEGAL_ARGUMENT_ERROR;
   1589     }
   1590 
   1591     int64_t nativeStart;
   1592     int32_t i = 0;
   1593     UBool couldFindStart = TRUE;
   1594     if (UTEXT_USES_U16(fInputText)) {
   1595         nativeStart = start;
   1596         couldFindStart = (nativeStart <= fInputLength);
   1597     } else {
   1598         UTEXT_SETNATIVEINDEX(fInputText, 0);
   1599         while (i < start) {
   1600             UChar32 c = UTEXT_NEXT32(fInputText);
   1601             if (c != U_SENTINEL) {
   1602                 i += U16_LENGTH(c);
   1603             } else {
   1604                 couldFindStart = FALSE;
   1605                 break;
   1606             }
   1607         }
   1608         nativeStart = UTEXT_GETNATIVEINDEX(fInputText);
   1609     }
   1610     int64_t nativeLimit = nativeStart;
   1611 
   1612     if (!couldFindStart) {
   1613         status = U_ILLEGAL_ARGUMENT_ERROR;
   1614     } else {
   1615         UBool couldFindLimit = TRUE;
   1616         if (UTEXT_USES_U16(fInputText)) {
   1617             nativeLimit = limit;
   1618             couldFindLimit = (nativeLimit <= fInputLength);
   1619         } else {
   1620             while (i < limit) {
   1621                 UChar32 c = UTEXT_NEXT32(fInputText);
   1622                 if (c != U_SENTINEL) {
   1623                     i += U16_LENGTH(c);
   1624                 } else {
   1625                     couldFindLimit = FALSE;
   1626                     break;
   1627                 }
   1628             }
   1629             nativeLimit = UTEXT_GETNATIVEINDEX(fInputText);
   1630         }
   1631         if (!couldFindLimit) {
   1632             status = U_ILLEGAL_ARGUMENT_ERROR;
   1633         }
   1634     }
   1635 
   1636     this->reset();
   1637     fRegionStart = nativeStart;
   1638     fRegionLimit = nativeLimit;
   1639     fActiveStart = nativeStart;
   1640     fActiveLimit = nativeLimit;
   1641     if (!fTransparentBounds) {
   1642         fLookStart = nativeStart;
   1643         fLookLimit = nativeLimit;
   1644     }
   1645     if (fAnchoringBounds) {
   1646         fAnchorStart = nativeStart;
   1647         fAnchorLimit = nativeLimit;
   1648     }
   1649     return *this;
   1650 }
   1651 
   1652 
   1653 
   1654 //--------------------------------------------------------------------------------
   1655 //
   1656 //    regionEnd
   1657 //
   1658 //--------------------------------------------------------------------------------
   1659 int32_t RegexMatcher::regionEnd() const {
   1660     if (UTEXT_USES_U16(fInputText)) {
   1661         return (int32_t)fRegionLimit;
   1662     } else {
   1663         // !!!: Would like a better way to do this!
   1664         UErrorCode status = U_ZERO_ERROR;
   1665         return utext_extract(fInputText, 0, fRegionLimit, NULL, 0, &status);
   1666     }
   1667 }
   1668 
   1669 
   1670 //--------------------------------------------------------------------------------
   1671 //
   1672 //    regionStart
   1673 //
   1674 //--------------------------------------------------------------------------------
   1675 int32_t RegexMatcher::regionStart() const {
   1676     if (UTEXT_USES_U16(fInputText)) {
   1677         return (int32_t)fRegionStart;
   1678     } else {
   1679         // !!!: Would like a better way to do this!
   1680         UErrorCode status = U_ZERO_ERROR;
   1681         return utext_extract(fInputText, 0, fRegionStart, NULL, 0, &status);
   1682     }
   1683 }
   1684 
   1685 
   1686 //--------------------------------------------------------------------------------
   1687 //
   1688 //    replaceAll
   1689 //
   1690 //--------------------------------------------------------------------------------
   1691 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
   1692     UText replacementText = UTEXT_INITIALIZER;
   1693     UText resultText = UTEXT_INITIALIZER;
   1694     UnicodeString resultString;
   1695 
   1696     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1697     utext_openUnicodeString(&resultText, &resultString, &status);
   1698 
   1699     replaceAll(&replacementText, &resultText, status);
   1700 
   1701     utext_close(&resultText);
   1702     utext_close(&replacementText);
   1703 
   1704     return resultString;
   1705 }
   1706 
   1707 
   1708 //
   1709 //    replaceAll, UText mode
   1710 //
   1711 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
   1712     if (U_FAILURE(status)) {
   1713         return getInput(dest);
   1714     }
   1715     if (U_FAILURE(fDeferredStatus)) {
   1716         status = fDeferredStatus;
   1717         return getInput(dest);
   1718     }
   1719 
   1720     if (dest == NULL) {
   1721         UnicodeString emptyString;
   1722         UText empty = UTEXT_INITIALIZER;
   1723 
   1724         utext_openUnicodeString(&empty, &emptyString, &status);
   1725         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1726         utext_close(&empty);
   1727     }
   1728 
   1729     if (U_SUCCESS(status)) {
   1730         reset();
   1731         while (find()) {
   1732             appendReplacement(dest, replacement, status);
   1733             if (U_FAILURE(status)) {
   1734                 break;
   1735             }
   1736         }
   1737         appendTail(dest);
   1738     }
   1739 
   1740     return dest;
   1741 }
   1742 
   1743 
   1744 //--------------------------------------------------------------------------------
   1745 //
   1746 //    replaceFirst
   1747 //
   1748 //--------------------------------------------------------------------------------
   1749 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
   1750     UText replacementText = UTEXT_INITIALIZER;
   1751     UText resultText = UTEXT_INITIALIZER;
   1752     UnicodeString resultString;
   1753 
   1754     utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1755     utext_openUnicodeString(&resultText, &resultString, &status);
   1756 
   1757     replaceFirst(&replacementText, &resultText, status);
   1758 
   1759     utext_close(&resultText);
   1760     utext_close(&replacementText);
   1761 
   1762     return resultString;
   1763 }
   1764 
   1765 //
   1766 //    replaceFirst, UText mode
   1767 //
   1768 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
   1769     if (U_FAILURE(status)) {
   1770         return getInput(dest);
   1771     }
   1772     if (U_FAILURE(fDeferredStatus)) {
   1773         status = fDeferredStatus;
   1774         return getInput(dest);
   1775     }
   1776 
   1777     reset();
   1778     if (!find()) {
   1779         return getInput(dest);
   1780     }
   1781 
   1782     if (dest == NULL) {
   1783         UnicodeString emptyString;
   1784         UText empty = UTEXT_INITIALIZER;
   1785 
   1786         utext_openUnicodeString(&empty, &emptyString, &status);
   1787         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
   1788         utext_close(&empty);
   1789     }
   1790 
   1791     appendReplacement(dest, replacement, status);
   1792     appendTail(dest);
   1793 
   1794     return dest;
   1795 }
   1796 
   1797 
   1798 //--------------------------------------------------------------------------------
   1799 //
   1800 //     requireEnd
   1801 //
   1802 //--------------------------------------------------------------------------------
   1803 UBool RegexMatcher::requireEnd() const {
   1804     return fRequireEnd;
   1805 }
   1806 
   1807 
   1808 //--------------------------------------------------------------------------------
   1809 //
   1810 //     reset
   1811 //
   1812 //--------------------------------------------------------------------------------
   1813 RegexMatcher &RegexMatcher::reset() {
   1814     fRegionStart    = 0;
   1815     fRegionLimit    = fInputLength;
   1816     fActiveStart    = 0;
   1817     fActiveLimit    = fInputLength;
   1818     fAnchorStart    = 0;
   1819     fAnchorLimit    = fInputLength;
   1820     fLookStart      = 0;
   1821     fLookLimit      = fInputLength;
   1822     resetPreserveRegion();
   1823     return *this;
   1824 }
   1825 
   1826 
   1827 
   1828 void RegexMatcher::resetPreserveRegion() {
   1829     fMatchStart     = 0;
   1830     fMatchEnd       = 0;
   1831     fLastMatchEnd   = -1;
   1832     fAppendPosition = 0;
   1833     fMatch          = FALSE;
   1834     fHitEnd         = FALSE;
   1835     fRequireEnd     = FALSE;
   1836     fTime           = 0;
   1837     fTickCounter    = TIMER_INITIAL_VALUE;
   1838     //resetStack(); // more expensive than it looks...
   1839 }
   1840 
   1841 
   1842 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
   1843     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
   1844     if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1845     fInputLength = utext_nativeLength(fInputText);
   1846 
   1847     reset();
   1848     delete fInput;
   1849     fInput = NULL;
   1850 
   1851     //  Do the following for any UnicodeString.
   1852     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
   1853     fInputUniStrMaybeMutable = TRUE;
   1854 
   1855     if (fWordBreakItr != NULL) {
   1856 #if UCONFIG_NO_BREAK_ITERATION==0
   1857         UErrorCode status = U_ZERO_ERROR;
   1858         fWordBreakItr->setText(fInputText, status);
   1859 #endif
   1860     }
   1861     return *this;
   1862 }
   1863 
   1864 
   1865 RegexMatcher &RegexMatcher::reset(UText *input) {
   1866     if (fInputText != input) {
   1867         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
   1868         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
   1869         fInputLength = utext_nativeLength(fInputText);
   1870 
   1871         delete fInput;
   1872         fInput = NULL;
   1873 
   1874         if (fWordBreakItr != NULL) {
   1875 #if UCONFIG_NO_BREAK_ITERATION==0
   1876             UErrorCode status = U_ZERO_ERROR;
   1877             fWordBreakItr->setText(input, status);
   1878 #endif
   1879         }
   1880     }
   1881     reset();
   1882     fInputUniStrMaybeMutable = FALSE;
   1883 
   1884     return *this;
   1885 }
   1886 
   1887 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
   1888     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
   1889     return *this;
   1890 }*/
   1891 
   1892 
   1893 RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
   1894     if (U_FAILURE(status)) {
   1895         return *this;
   1896     }
   1897     reset();       // Reset also resets the region to be the entire string.
   1898 
   1899     if (position < 0) {
   1900         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1901         return *this;
   1902     }
   1903 
   1904     int64_t nativePos;
   1905     UBool couldFindStart = TRUE;
   1906     if (UTEXT_USES_U16(fInputText)) {
   1907         nativePos = position;
   1908     } else {
   1909         UTEXT_SETNATIVEINDEX(fInputText, 0);
   1910         int32_t i = 0;
   1911         while (i < position) {
   1912             UChar32 c = UTEXT_NEXT32(fInputText);
   1913             if (c != U_SENTINEL) {
   1914                 i += U16_LENGTH(c);
   1915             } else {
   1916                 couldFindStart = FALSE;
   1917                 break;
   1918             }
   1919         }
   1920         nativePos = UTEXT_GETNATIVEINDEX(fInputText);
   1921     }
   1922     if (!couldFindStart || nativePos < fActiveStart || nativePos >= fActiveLimit) {
   1923         status = U_INDEX_OUTOFBOUNDS_ERROR;
   1924         return *this;
   1925     }
   1926     fMatchEnd = nativePos;
   1927     return *this;
   1928 }
   1929 
   1930 // BEGIN android-added
   1931 // Removed this function after Android upgrad to ICU4.6.
   1932 //--------------------------------------------------------------------------------
   1933 //
   1934 //    refresh
   1935 //
   1936 //--------------------------------------------------------------------------------
   1937 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
   1938     if (U_FAILURE(status)) {
   1939         return *this;
   1940     }
   1941     if (input == NULL) {
   1942         status = U_ILLEGAL_ARGUMENT_ERROR;
   1943         return *this;
   1944     }
   1945     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
   1946         status = U_ILLEGAL_ARGUMENT_ERROR;
   1947         return *this;
   1948     }
   1949     int64_t  pos = utext_getNativeIndex(fInputText);
   1950     //  Shallow read-only clone of the new UText into the existing input UText
   1951     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
   1952     if (U_FAILURE(status)) {
   1953         return *this;
   1954     }
   1955     utext_setNativeIndex(fInputText, pos);
   1956 
   1957     if (fAltInputText != NULL) {
   1958         pos = utext_getNativeIndex(fAltInputText);
   1959         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
   1960         if (U_FAILURE(status)) {
   1961             return *this;
   1962         }
   1963         utext_setNativeIndex(fAltInputText, pos);
   1964     }
   1965     return *this;
   1966 }
   1967 // END android-added
   1968 
   1969 
   1970 
   1971 //--------------------------------------------------------------------------------
   1972 //
   1973 //    setTrace
   1974 //
   1975 //--------------------------------------------------------------------------------
   1976 void RegexMatcher::setTrace(UBool state) {
   1977     fTraceDebug = state;
   1978 }
   1979 
   1980 
   1981 
   1982 //---------------------------------------------------------------------
   1983 //
   1984 //   split
   1985 //
   1986 //---------------------------------------------------------------------
   1987 int32_t  RegexMatcher::split(const UnicodeString &input,
   1988         UnicodeString    dest[],
   1989         int32_t          destCapacity,
   1990         UErrorCode      &status)
   1991 {
   1992     UText inputText = UTEXT_INITIALIZER;
   1993     utext_openConstUnicodeString(&inputText, &input, &status);
   1994 
   1995 	UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
   1996     int32_t i;
   1997     for (i = 0; i < destCapacity; i++) {
   1998         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
   1999     }
   2000 
   2001     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
   2002 
   2003     for (i = 0; i < destCapacity; i++) {
   2004         utext_close(destText[i]);
   2005     }
   2006 
   2007 	uprv_free(destText);
   2008     utext_close(&inputText);
   2009     return fieldCount;
   2010 }
   2011 
   2012 //
   2013 //   split, UText mode
   2014 //
   2015 int32_t  RegexMatcher::split(UText *input,
   2016         UText           *dest[],
   2017         int32_t          destCapacity,
   2018         UErrorCode      &status)
   2019 {
   2020     //
   2021     // Check arguements for validity
   2022     //
   2023     if (U_FAILURE(status)) {
   2024         return 0;
   2025     };
   2026 
   2027     if (destCapacity < 1) {
   2028         status = U_ILLEGAL_ARGUMENT_ERROR;
   2029         return 0;
   2030     }
   2031 
   2032     //
   2033     // Reset for the input text
   2034     //
   2035     reset(input);
   2036     int64_t   nextOutputStringStart = 0;
   2037     if (fActiveLimit == 0) {
   2038         return 0;
   2039     }
   2040 
   2041     //
   2042     // Loop through the input text, searching for the delimiter pattern
   2043     //
   2044     int32_t i;
   2045     int32_t numCaptureGroups = fPattern->fGroupMap->size();
   2046     for (i=0; ; i++) {
   2047         if (i>=destCapacity-1) {
   2048             // There is one or zero output string left.
   2049             // Fill the last output string with whatever is left from the input, then exit the loop.
   2050             //  ( i will be == destCapacity if we filled the output array while processing
   2051             //    capture groups of the delimiter expression, in which case we will discard the
   2052             //    last capture group saved in favor of the unprocessed remainder of the
   2053             //    input string.)
   2054             i = destCapacity-1;
   2055             if (fActiveLimit > nextOutputStringStart) {
   2056                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2057                     if (dest[i]) {
   2058                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2059                     } else {
   2060                         UText remainingText = UTEXT_INITIALIZER;
   2061                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fActiveLimit-nextOutputStringStart, &status);
   2062                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2063                         utext_close(&remainingText);
   2064                     }
   2065                 } else {
   2066                     UErrorCode lengthStatus = U_ZERO_ERROR;
   2067                     int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2068                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2069 
   2070                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2071                     if (dest[i]) {
   2072                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2073                     } else {
   2074                         UText remainingText = UTEXT_INITIALIZER;
   2075                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2076                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2077                         utext_close(&remainingText);
   2078                     }
   2079 
   2080                     uprv_free(remainingChars);
   2081                 }
   2082             }
   2083             break;
   2084         }
   2085         if (find()) {
   2086             // We found another delimiter.  Move everything from where we started looking
   2087             //  up until the start of the delimiter into the next output string.
   2088             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2089                 if (dest[i]) {
   2090                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, (int32_t)(fMatchStart-nextOutputStringStart), &status);
   2091                 } else {
   2092                     UText remainingText = UTEXT_INITIALIZER;
   2093                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fMatchStart-nextOutputStringStart, &status);
   2094                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2095                     utext_close(&remainingText);
   2096                 }
   2097             } else {
   2098                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2099                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
   2100                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2101 
   2102                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
   2103                 if (dest[i]) {
   2104                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2105                 } else {
   2106                     UText remainingText = UTEXT_INITIALIZER;
   2107                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2108                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2109                     utext_close(&remainingText);
   2110                 }
   2111 
   2112                 uprv_free(remainingChars);
   2113             }
   2114             nextOutputStringStart = fMatchEnd;
   2115 
   2116             // If the delimiter pattern has capturing parentheses, the captured
   2117             //  text goes out into the next n destination strings.
   2118             int32_t groupNum;
   2119             UBool lastGroupWasNullUText = FALSE;
   2120             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   2121                 if (i==destCapacity-1) {
   2122                     break;
   2123                 }
   2124                 i++;
   2125                 lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
   2126                 dest[i] = group(groupNum, dest[i], status);
   2127             }
   2128 
   2129             if (nextOutputStringStart == fActiveLimit) {
   2130                 // The delimiter was at the end of the string.  We're done.
   2131                 break;
   2132             } else if (i == destCapacity-1) {
   2133                 // We're out of capture groups, and the rest of the string is more important
   2134                 if (lastGroupWasNullUText) {
   2135                     utext_close(dest[i]);
   2136                     dest[i] = NULL;
   2137                 }
   2138             }
   2139 
   2140         }
   2141         else
   2142         {
   2143             // We ran off the end of the input while looking for the next delimiter.
   2144             // All the remaining text goes into the current output string.
   2145             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2146                 if (dest[i]) {
   2147                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, (int32_t)(fActiveLimit-nextOutputStringStart), &status);
   2148                 } else {
   2149                     UText remainingText = UTEXT_INITIALIZER;
   2150                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fActiveLimit-nextOutputStringStart, &status);
   2151                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2152                     utext_close(&remainingText);
   2153                 }
   2154             } else {
   2155                 UErrorCode lengthStatus = U_ZERO_ERROR;
   2156                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
   2157                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
   2158 
   2159                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2160                 if (dest[i]) {
   2161                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2162                 } else {
   2163                     UText remainingText = UTEXT_INITIALIZER;
   2164                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2165                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
   2166                     utext_close(&remainingText);
   2167                 }
   2168 
   2169                 uprv_free(remainingChars);
   2170             }
   2171             break;
   2172         }
   2173     }
   2174     return i+1;
   2175 }
   2176 
   2177 
   2178 //--------------------------------------------------------------------------------
   2179 //
   2180 //     start
   2181 //
   2182 //--------------------------------------------------------------------------------
   2183 int32_t RegexMatcher::start(UErrorCode &status) const {
   2184     return start(0, status);
   2185 }
   2186 
   2187 
   2188 
   2189 
   2190 //--------------------------------------------------------------------------------
   2191 //
   2192 //     start(int32_t group, UErrorCode &status)
   2193 //
   2194 //--------------------------------------------------------------------------------
   2195 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
   2196     if (U_FAILURE(status)) {
   2197         return -1;
   2198     }
   2199     if (U_FAILURE(fDeferredStatus)) {
   2200         status = fDeferredStatus;
   2201         return -1;
   2202     }
   2203     if (fMatch == FALSE) {
   2204         status = U_REGEX_INVALID_STATE;
   2205         return -1;
   2206     }
   2207     if (group < 0 || group > fPattern->fGroupMap->size()) {
   2208         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2209         return -1;
   2210     }
   2211     int64_t s;
   2212     if (group == 0) {
   2213         s = fMatchStart;
   2214     } else {
   2215         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   2216         U_ASSERT(groupOffset < fPattern->fFrameSize);
   2217         U_ASSERT(groupOffset >= 0);
   2218         s = fFrame->fExtra[groupOffset];
   2219     }
   2220 
   2221     if (s == -1 || UTEXT_USES_U16(fInputText)) {
   2222         return (int32_t)s;
   2223     } else {
   2224         // !!!: Would like a better way to do this!
   2225         UErrorCode status = U_ZERO_ERROR;
   2226         return utext_extract(fInputText, 0, s, NULL, 0, &status);
   2227     }
   2228 }
   2229 
   2230 
   2231 
   2232 //--------------------------------------------------------------------------------
   2233 //
   2234 //     useAnchoringBounds
   2235 //
   2236 //--------------------------------------------------------------------------------
   2237 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
   2238     fAnchoringBounds = b;
   2239     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
   2240     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
   2241     return *this;
   2242 }
   2243 
   2244 
   2245 //--------------------------------------------------------------------------------
   2246 //
   2247 //     useTransparentBounds
   2248 //
   2249 //--------------------------------------------------------------------------------
   2250 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
   2251     fTransparentBounds = b;
   2252     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
   2253     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
   2254     return *this;
   2255 }
   2256 
   2257 //--------------------------------------------------------------------------------
   2258 //
   2259 //     setTimeLimit
   2260 //
   2261 //--------------------------------------------------------------------------------
   2262 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
   2263     if (U_FAILURE(status)) {
   2264         return;
   2265     }
   2266     if (U_FAILURE(fDeferredStatus)) {
   2267         status = fDeferredStatus;
   2268         return;
   2269     }
   2270     if (limit < 0) {
   2271         status = U_ILLEGAL_ARGUMENT_ERROR;
   2272         return;
   2273     }
   2274     fTimeLimit = limit;
   2275 }
   2276 
   2277 
   2278 //--------------------------------------------------------------------------------
   2279 //
   2280 //     getTimeLimit
   2281 //
   2282 //--------------------------------------------------------------------------------
   2283 int32_t RegexMatcher::getTimeLimit() const {
   2284     return fTimeLimit;
   2285 }
   2286 
   2287 
   2288 //--------------------------------------------------------------------------------
   2289 //
   2290 //     setStackLimit
   2291 //
   2292 //--------------------------------------------------------------------------------
   2293 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
   2294     if (U_FAILURE(status)) {
   2295         return;
   2296     }
   2297     if (U_FAILURE(fDeferredStatus)) {
   2298         status = fDeferredStatus;
   2299         return;
   2300     }
   2301     if (limit < 0) {
   2302         status = U_ILLEGAL_ARGUMENT_ERROR;
   2303         return;
   2304     }
   2305 
   2306     // Reset the matcher.  This is needed here in case there is a current match
   2307     //    whose final stack frame (containing the match results, pointed to by fFrame)
   2308     //    would be lost by resizing to a smaller stack size.
   2309     reset();
   2310 
   2311     if (limit == 0) {
   2312         // Unlimited stack expansion
   2313         fStack->setMaxCapacity(0);
   2314     } else {
   2315         // Change the units of the limit  from bytes to ints, and bump the size up
   2316         //   to be big enough to hold at least one stack frame for the pattern,
   2317         //   if it isn't there already.
   2318         int32_t adjustedLimit = limit / sizeof(int32_t);
   2319         if (adjustedLimit < fPattern->fFrameSize) {
   2320             adjustedLimit = fPattern->fFrameSize;
   2321         }
   2322         fStack->setMaxCapacity(adjustedLimit);
   2323     }
   2324     fStackLimit = limit;
   2325 }
   2326 
   2327 
   2328 //--------------------------------------------------------------------------------
   2329 //
   2330 //     getStackLimit
   2331 //
   2332 //--------------------------------------------------------------------------------
   2333 int32_t RegexMatcher::getStackLimit() const {
   2334     return fStackLimit;
   2335 }
   2336 
   2337 
   2338 //--------------------------------------------------------------------------------
   2339 //
   2340 //     setMatchCallback
   2341 //
   2342 //--------------------------------------------------------------------------------
   2343 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
   2344                                     const void              *context,
   2345                                     UErrorCode              &status) {
   2346     if (U_FAILURE(status)) {
   2347         return;
   2348     }
   2349     fCallbackFn = callback;
   2350     fCallbackContext = context;
   2351 }
   2352 
   2353 
   2354 //--------------------------------------------------------------------------------
   2355 //
   2356 //     getMatchCallback
   2357 //
   2358 //--------------------------------------------------------------------------------
   2359 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
   2360                                   const void              *&context,
   2361                                   UErrorCode              &status) {
   2362     if (U_FAILURE(status)) {
   2363        return;
   2364     }
   2365     callback = fCallbackFn;
   2366     context  = fCallbackContext;
   2367 }
   2368 
   2369 
   2370 //================================================================================
   2371 //
   2372 //    Code following this point in this file is the internal
   2373 //    Match Engine Implementation.
   2374 //
   2375 //================================================================================
   2376 
   2377 
   2378 //--------------------------------------------------------------------------------
   2379 //
   2380 //   resetStack
   2381 //           Discard any previous contents of the state save stack, and initialize a
   2382 //           new stack frame to all -1.  The -1s are needed for capture group limits,
   2383 //           where they indicate that a group has not yet matched anything.
   2384 //--------------------------------------------------------------------------------
   2385 REStackFrame *RegexMatcher::resetStack() {
   2386     // Discard any previous contents of the state save stack, and initialize a
   2387     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
   2388     //  where they indicate that a group has not yet matched anything.
   2389     fStack->removeAllElements();
   2390 
   2391     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
   2392     int32_t i;
   2393     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
   2394         iFrame->fExtra[i] = -1;
   2395     }
   2396     return iFrame;
   2397 }
   2398 
   2399 
   2400 
   2401 //--------------------------------------------------------------------------------
   2402 //
   2403 //   isWordBoundary
   2404 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
   2405 //                     For us,
   2406 //                       If the current char is a combining mark,
   2407 //                          \b is FALSE.
   2408 //                       Else Scan backwards to the first non-combining char.
   2409 //                            We are at a boundary if the this char and the original chars are
   2410 //                               opposite in membership in \w set
   2411 //
   2412 //          parameters:   pos   - the current position in the input buffer
   2413 //
   2414 //              TODO:  double-check edge cases at region boundaries.
   2415 //
   2416 //--------------------------------------------------------------------------------
   2417 UBool RegexMatcher::isWordBoundary(int64_t pos) {
   2418     UBool isBoundary = FALSE;
   2419     UBool cIsWord    = FALSE;
   2420 
   2421     if (pos >= fLookLimit) {
   2422         fHitEnd = TRUE;
   2423     } else {
   2424         // Determine whether char c at current position is a member of the word set of chars.
   2425         // If we're off the end of the string, behave as though we're not at a word char.
   2426         UTEXT_SETNATIVEINDEX(fInputText, pos);
   2427         UChar32  c = UTEXT_CURRENT32(fInputText);
   2428         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2429             // Current char is a combining one.  Not a boundary.
   2430             return FALSE;
   2431         }
   2432         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2433     }
   2434 
   2435     // Back up until we come to a non-combining char, determine whether
   2436     //  that char is a word char.
   2437     UBool prevCIsWord = FALSE;
   2438     for (;;) {
   2439         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
   2440             break;
   2441         }
   2442         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
   2443         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2444               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2445             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2446             break;
   2447         }
   2448     }
   2449     isBoundary = cIsWord ^ prevCIsWord;
   2450     return isBoundary;
   2451 }
   2452 
   2453 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
   2454     UBool isBoundary = FALSE;
   2455     UBool cIsWord    = FALSE;
   2456 
   2457     const UChar *inputBuf = fInputText->chunkContents;
   2458 
   2459     if (pos >= fLookLimit) {
   2460         fHitEnd = TRUE;
   2461     } else {
   2462         // Determine whether char c at current position is a member of the word set of chars.
   2463         // If we're off the end of the string, behave as though we're not at a word char.
   2464         UChar32 c;
   2465         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
   2466         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2467             // Current char is a combining one.  Not a boundary.
   2468             return FALSE;
   2469         }
   2470         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
   2471     }
   2472 
   2473     // Back up until we come to a non-combining char, determine whether
   2474     //  that char is a word char.
   2475     UBool prevCIsWord = FALSE;
   2476     for (;;) {
   2477         if (pos <= fLookStart) {
   2478             break;
   2479         }
   2480         UChar32 prevChar;
   2481         U16_PREV(inputBuf, fLookStart, pos, prevChar);
   2482         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2483               || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2484             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
   2485             break;
   2486         }
   2487     }
   2488     isBoundary = cIsWord ^ prevCIsWord;
   2489     return isBoundary;
   2490 }
   2491 
   2492 //--------------------------------------------------------------------------------
   2493 //
   2494 //   isUWordBoundary
   2495 //
   2496 //         Test for a word boundary using RBBI word break.
   2497 //
   2498 //          parameters:   pos   - the current position in the input buffer
   2499 //
   2500 //--------------------------------------------------------------------------------
   2501 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
   2502     UBool       returnVal = FALSE;
   2503 #if UCONFIG_NO_BREAK_ITERATION==0
   2504 
   2505     // If we haven't yet created a break iterator for this matcher, do it now.
   2506     if (fWordBreakItr == NULL) {
   2507         fWordBreakItr =
   2508             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
   2509         if (U_FAILURE(fDeferredStatus)) {
   2510             return FALSE;
   2511         }
   2512         fWordBreakItr->setText(fInputText, fDeferredStatus);
   2513     }
   2514 
   2515     if (pos >= fLookLimit) {
   2516         fHitEnd = TRUE;
   2517         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
   2518                             //    words are not boundaries.  All non-word chars stand by themselves,
   2519                             //    with word boundaries on both sides.
   2520     } else {
   2521         if (!UTEXT_USES_U16(fInputText)) {
   2522             // !!!: Would like a better way to do this!
   2523             UErrorCode status = U_ZERO_ERROR;
   2524             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
   2525         }
   2526         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
   2527     }
   2528 #endif
   2529     return   returnVal;
   2530 }
   2531 
   2532 //--------------------------------------------------------------------------------
   2533 //
   2534 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
   2535 //                     saves. Increment the "time" counter, and call the
   2536 //                     user callback function if there is one installed.
   2537 //
   2538 //                     If the match operation needs to be aborted, either for a time-out
   2539 //                     or because the user callback asked for it, just set an error status.
   2540 //                     The engine will pick that up and stop in its outer loop.
   2541 //
   2542 //--------------------------------------------------------------------------------
   2543 void RegexMatcher::IncrementTime(UErrorCode &status) {
   2544     fTickCounter = TIMER_INITIAL_VALUE;
   2545     fTime++;
   2546     if (fCallbackFn != NULL) {
   2547         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
   2548             status = U_REGEX_STOPPED_BY_CALLER;
   2549             return;
   2550         }
   2551     }
   2552     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
   2553         status = U_REGEX_TIME_OUT;
   2554     }
   2555 }
   2556 
   2557 //--------------------------------------------------------------------------------
   2558 //
   2559 //   StateSave
   2560 //       Make a new stack frame, initialized as a copy of the current stack frame.
   2561 //       Set the pattern index in the original stack frame from the operand value
   2562 //       in the opcode.  Execution of the engine continues with the state in
   2563 //       the newly created stack frame
   2564 //
   2565 //       Note that reserveBlock() may grow the stack, resulting in the
   2566 //       whole thing being relocated in memory.
   2567 //
   2568 //    Parameters:
   2569 //       fp           The top frame pointer when called.  At return, a new
   2570 //                    fame will be present
   2571 //       savePatIdx   An index into the compiled pattern.  Goes into the original
   2572 //                    (not new) frame.  If execution ever back-tracks out of the
   2573 //                    new frame, this will be where we continue from in the pattern.
   2574 //    Return
   2575 //                    The new frame pointer.
   2576 //
   2577 //--------------------------------------------------------------------------------
   2578 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
   2579     // push storage for a new frame.
   2580     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
   2581     if (newFP == NULL) {
   2582         // Failure on attempted stack expansion.
   2583         //   Stack function set some other error code, change it to a more
   2584         //   specific one for regular expressions.
   2585         status = U_REGEX_STACK_OVERFLOW;
   2586         // We need to return a writable stack frame, so just return the
   2587         //    previous frame.  The match operation will stop quickly
   2588         //    because of the error status, after which the frame will never
   2589         //    be looked at again.
   2590         return fp;
   2591     }
   2592     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
   2593 
   2594     // New stack frame = copy of old top frame.
   2595     int64_t *source = (int64_t *)fp;
   2596     int64_t *dest   = newFP;
   2597     for (;;) {
   2598         *dest++ = *source++;
   2599         if (source == newFP) {
   2600             break;
   2601         }
   2602     }
   2603 
   2604     fTickCounter--;
   2605     if (fTickCounter <= 0) {
   2606        IncrementTime(status);    // Re-initializes fTickCounter
   2607     }
   2608     fp->fPatIdx = savePatIdx;
   2609     return (REStackFrame *)newFP;
   2610 }
   2611 
   2612 
   2613 //--------------------------------------------------------------------------------
   2614 //
   2615 //   MatchAt      This is the actual matching engine.
   2616 //
   2617 //                  startIdx:    begin matching a this index.
   2618 //                  toEnd:       if true, match must extend to end of the input region
   2619 //
   2620 //--------------------------------------------------------------------------------
   2621 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
   2622     UBool       isMatch  = FALSE;      // True if the we have a match.
   2623 
   2624     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
   2625 
   2626     int32_t     op;                    // Operation from the compiled pattern, split into
   2627     int32_t     opType;                //    the opcode
   2628     int32_t     opValue;               //    and the operand value.
   2629 
   2630     #ifdef REGEX_RUN_DEBUG
   2631     if (fTraceDebug)
   2632     {
   2633         printf("MatchAt(startIdx=%ld)\n", startIdx);
   2634         printf("Original Pattern: ");
   2635         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   2636         while (c != U_SENTINEL) {
   2637             if (c<32 || c>256) {
   2638                 c = '.';
   2639             }
   2640             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   2641 
   2642             c = UTEXT_NEXT32(fPattern->fPattern);
   2643         }
   2644         printf("\n");
   2645         printf("Input String: ");
   2646         c = utext_next32From(fInputText, 0);
   2647         while (c != U_SENTINEL) {
   2648             if (c<32 || c>256) {
   2649                 c = '.';
   2650             }
   2651             printf("%c", c);
   2652 
   2653             c = UTEXT_NEXT32(fInputText);
   2654         }
   2655         printf("\n");
   2656         printf("\n");
   2657     }
   2658     #endif
   2659 
   2660     if (U_FAILURE(status)) {
   2661         return;
   2662     }
   2663 
   2664     //  Cache frequently referenced items from the compiled pattern
   2665     //
   2666     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   2667 
   2668     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   2669     UVector             *sets          = fPattern->fSets;
   2670 
   2671     fFrameSize = fPattern->fFrameSize;
   2672     REStackFrame        *fp            = resetStack();
   2673 
   2674     fp->fPatIdx   = 0;
   2675     fp->fInputIdx = startIdx;
   2676 
   2677     // Zero out the pattern's static data
   2678     int32_t i;
   2679     for (i = 0; i<fPattern->fDataSize; i++) {
   2680         fData[i] = 0;
   2681     }
   2682 
   2683     //
   2684     //  Main loop for interpreting the compiled pattern.
   2685     //  One iteration of the loop per pattern operation performed.
   2686     //
   2687     for (;;) {
   2688 #if 0
   2689         if (_heapchk() != _HEAPOK) {
   2690             fprintf(stderr, "Heap Trouble\n");
   2691         }
   2692 #endif
   2693 
   2694         op      = (int32_t)pat[fp->fPatIdx];
   2695         opType  = URX_TYPE(op);
   2696         opValue = URX_VAL(op);
   2697         #ifdef REGEX_RUN_DEBUG
   2698         if (fTraceDebug) {
   2699             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2700             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
   2701                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   2702             fPattern->dumpOp(fp->fPatIdx);
   2703         }
   2704         #endif
   2705         fp->fPatIdx++;
   2706 
   2707         switch (opType) {
   2708 
   2709 
   2710         case URX_NOP:
   2711             break;
   2712 
   2713 
   2714         case URX_BACKTRACK:
   2715             // Force a backtrack.  In some circumstances, the pattern compiler
   2716             //   will notice that the pattern can't possibly match anything, and will
   2717             //   emit one of these at that point.
   2718             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2719             break;
   2720 
   2721 
   2722         case URX_ONECHAR:
   2723             if (fp->fInputIdx < fActiveLimit) {
   2724                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2725                 UChar32 c = UTEXT_NEXT32(fInputText);
   2726                 if (c == opValue) {
   2727                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2728                     break;
   2729                 }
   2730             } else {
   2731                 fHitEnd = TRUE;
   2732             }
   2733 
   2734             #ifdef REGEX_SMART_BACKTRACKING
   2735             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   2736                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   2737                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   2738                     UBool success = FALSE;
   2739                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
   2740                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
   2741                         if (c == opValue) {
   2742                             success = TRUE;
   2743                             break;
   2744                         } else if (c == U_SENTINEL) {
   2745                             break;
   2746                         }
   2747                         c = UTEXT_PREVIOUS32(fInputText);
   2748                     }
   2749                     if (success) {
   2750                         fHitEnd = FALSE;
   2751                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2752                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2753                         if (fp->fInputIdx > backSearchIndex) {
   2754                             fp = StateSave(fp, fp->fPatIdx, status);
   2755                         }
   2756                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   2757                         break;
   2758                     }
   2759                 }
   2760             }
   2761             #endif
   2762 
   2763             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2764             break;
   2765 
   2766 
   2767         case URX_STRING:
   2768             {
   2769                 // Test input against a literal string.
   2770                 // Strings require two slots in the compiled pattern, one for the
   2771                 //   offset to the string text, and one for the length.
   2772                 int32_t   stringStartIdx = opValue;
   2773                 int32_t   stringLen;
   2774 
   2775                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   2776                 fp->fPatIdx++;
   2777                 opType    = URX_TYPE(op);
   2778                 stringLen = URX_VAL(op);
   2779                 U_ASSERT(opType == URX_STRING_LEN);
   2780                 U_ASSERT(stringLen >= 2);
   2781 
   2782                 const UChar *patternChars = litText+stringStartIdx;
   2783                 const UChar *patternEnd = patternChars+stringLen;
   2784 
   2785                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2786                 UChar32 c;
   2787                 UBool success = TRUE;
   2788 
   2789                 while (patternChars < patternEnd && success) {
   2790                     c = UTEXT_NEXT32(fInputText);
   2791 
   2792                     if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
   2793                         if (U_IS_BMP(c)) {
   2794                             success = (*patternChars == c);
   2795                             patternChars += 1;
   2796                         } else if (patternChars+1 < patternEnd) {
   2797                             success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   2798                             patternChars += 2;
   2799                         }
   2800                     } else {
   2801                         success = FALSE;
   2802                         fHitEnd = TRUE;          //   TODO:  See ticket 6074
   2803                     }
   2804                 }
   2805 
   2806                 if (success) {
   2807                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2808                 } else {
   2809                     #ifdef REGEX_SMART_BACKTRACKING
   2810                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   2811                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   2812                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   2813                             // Reset to last start point
   2814                             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2815                             patternChars = litText+stringStartIdx;
   2816 
   2817                             // Search backwards for a possible start
   2818                             do {
   2819                                 c = UTEXT_PREVIOUS32(fInputText);
   2820                                 if (c == U_SENTINEL) {
   2821                                     break;
   2822                                 } else if ((U_IS_BMP(c) && *patternChars == c) ||
   2823                                     (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   2824                                     success = TRUE;
   2825                                     break;
   2826                                 }
   2827                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   2828 
   2829                             // And try again
   2830                             if (success) {
   2831                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2832                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2833                                 if (fp->fInputIdx > backSearchIndex) {
   2834                                     fp = StateSave(fp, fp->fPatIdx, status);
   2835                                 }
   2836                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   2837                                 break;
   2838                             }
   2839                         }
   2840                     }
   2841                     #endif
   2842                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2843                 }
   2844             }
   2845             break;
   2846 
   2847 
   2848         case URX_STATE_SAVE:
   2849             fp = StateSave(fp, opValue, status);
   2850             break;
   2851 
   2852 
   2853         case URX_END:
   2854             // The match loop will exit via this path on a successful match,
   2855             //   when we reach the end of the pattern.
   2856             if (toEnd && fp->fInputIdx != fActiveLimit) {
   2857                 // The pattern matched, but not to the end of input.  Try some more.
   2858                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2859                 break;
   2860             }
   2861             isMatch = TRUE;
   2862             goto  breakFromLoop;
   2863 
   2864         // Start and End Capture stack frame variables are laid out out like this:
   2865             //  fp->fExtra[opValue]  - The start of a completed capture group
   2866             //             opValue+1 - The end   of a completed capture group
   2867             //             opValue+2 - the start of a capture group whose end
   2868             //                          has not yet been reached (and might not ever be).
   2869         case URX_START_CAPTURE:
   2870             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2871             fp->fExtra[opValue+2] = fp->fInputIdx;
   2872             break;
   2873 
   2874 
   2875         case URX_END_CAPTURE:
   2876             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2877             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   2878             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   2879             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   2880             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   2881             break;
   2882 
   2883 
   2884         case URX_DOLLAR:                   //  $, test for End of line
   2885                                            //     or for position before new line at end of input
   2886             {
   2887                 if (fp->fInputIdx >= fAnchorLimit) {
   2888                     // We really are at the end of input.  Success.
   2889                     fHitEnd = TRUE;
   2890                     fRequireEnd = TRUE;
   2891                     break;
   2892                 }
   2893 
   2894                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2895 
   2896                 // If we are positioned just before a new-line that is located at the
   2897                 //   end of input, succeed.
   2898                 UChar32 c = UTEXT_NEXT32(fInputText);
   2899                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2900                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   2901                         // If not in the middle of a CR/LF sequence
   2902                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
   2903                             // At new-line at end of input. Success
   2904                             fHitEnd = TRUE;
   2905                             fRequireEnd = TRUE;
   2906 
   2907                             break;
   2908                         }
   2909                     }
   2910                 } else {
   2911                     UChar32 nextC = UTEXT_NEXT32(fInputText);
   2912                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2913                         fHitEnd = TRUE;
   2914                         fRequireEnd = TRUE;
   2915                         break;                         // At CR/LF at end of input.  Success
   2916                     }
   2917                 }
   2918 
   2919                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2920             }
   2921             break;
   2922 
   2923 
   2924          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   2925             if (fp->fInputIdx >= fAnchorLimit) {
   2926                 // Off the end of input.  Success.
   2927                 fHitEnd = TRUE;
   2928                 fRequireEnd = TRUE;
   2929                 break;
   2930             } else {
   2931                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2932                 UChar32 c = UTEXT_NEXT32(fInputText);
   2933                 // Either at the last character of input, or off the end.
   2934                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
   2935                     fHitEnd = TRUE;
   2936                     fRequireEnd = TRUE;
   2937                     break;
   2938                 }
   2939             }
   2940 
   2941             // Not at end of input.  Back-track out.
   2942             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2943             break;
   2944 
   2945 
   2946          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   2947              {
   2948                  if (fp->fInputIdx >= fAnchorLimit) {
   2949                      // We really are at the end of input.  Success.
   2950                      fHitEnd = TRUE;
   2951                      fRequireEnd = TRUE;
   2952                      break;
   2953                  }
   2954                  // If we are positioned just before a new-line, succeed.
   2955                  // It makes no difference where the new-line is within the input.
   2956                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2957                  UChar32 c = UTEXT_CURRENT32(fInputText);
   2958                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   2959                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   2960                      //  In multi-line mode, hitting a new-line just before the end of input does not
   2961                      //   set the hitEnd or requireEnd flags
   2962                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
   2963                         break;
   2964                      }
   2965                  }
   2966                  // not at a new line.  Fail.
   2967                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2968              }
   2969              break;
   2970 
   2971 
   2972          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   2973              {
   2974                  if (fp->fInputIdx >= fAnchorLimit) {
   2975                      // We really are at the end of input.  Success.
   2976                      fHitEnd = TRUE;
   2977                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   2978                      break;               //   adding a new-line would not lose the match.
   2979                  }
   2980                  // If we are not positioned just before a new-line, the test fails; backtrack out.
   2981                  // It makes no difference where the new-line is within the input.
   2982                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2983                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
   2984                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2985                  }
   2986              }
   2987              break;
   2988 
   2989 
   2990        case URX_CARET:                    //  ^, test for start of line
   2991             if (fp->fInputIdx != fAnchorStart) {
   2992                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   2993             }
   2994             break;
   2995 
   2996 
   2997        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   2998            {
   2999                if (fp->fInputIdx == fAnchorStart) {
   3000                    // We are at the start input.  Success.
   3001                    break;
   3002                }
   3003                // Check whether character just before the current pos is a new-line
   3004                //   unless we are at the end of input
   3005                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3006                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3007                if ((fp->fInputIdx < fAnchorLimit) &&
   3008                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3009                    //  It's a new-line.  ^ is true.  Success.
   3010                    //  TODO:  what should be done with positions between a CR and LF?
   3011                    break;
   3012                }
   3013                // Not at the start of a line.  Fail.
   3014                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3015            }
   3016            break;
   3017 
   3018 
   3019        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   3020            {
   3021                U_ASSERT(fp->fInputIdx >= fAnchorStart);
   3022                if (fp->fInputIdx <= fAnchorStart) {
   3023                    // We are at the start input.  Success.
   3024                    break;
   3025                }
   3026                // Check whether character just before the current pos is a new-line
   3027                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   3028                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3029                UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3030                if (c != 0x0a) {
   3031                    // Not at the start of a line.  Back-track out.
   3032                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3033                }
   3034            }
   3035            break;
   3036 
   3037         case URX_BACKSLASH_B:          // Test for word boundaries
   3038             {
   3039                 UBool success = isWordBoundary(fp->fInputIdx);
   3040                 success ^= (opValue != 0);     // flip sense for \B
   3041                 if (!success) {
   3042                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3043                 }
   3044             }
   3045             break;
   3046 
   3047 
   3048         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   3049             {
   3050                 UBool success = isUWordBoundary(fp->fInputIdx);
   3051                 success ^= (opValue != 0);     // flip sense for \B
   3052                 if (!success) {
   3053                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3054                 }
   3055             }
   3056             break;
   3057 
   3058 
   3059         case URX_BACKSLASH_D:            // Test for decimal digit
   3060             {
   3061                 if (fp->fInputIdx >= fActiveLimit) {
   3062                     fHitEnd = TRUE;
   3063                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3064                     break;
   3065                 }
   3066 
   3067                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3068 
   3069                 UChar32 c = UTEXT_NEXT32(fInputText);
   3070                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   3071                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   3072                 success ^= (opValue != 0);        // flip sense for \D
   3073                 if (success) {
   3074                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3075                 } else {
   3076                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3077                 }
   3078             }
   3079             break;
   3080 
   3081 
   3082         case URX_BACKSLASH_G:          // Test for position at end of previous match
   3083             if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) {
   3084                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3085             }
   3086             break;
   3087 
   3088 
   3089         case URX_BACKSLASH_X:
   3090             //  Match a Grapheme, as defined by Unicode TR 29.
   3091             //  Differs slightly from Perl, which consumes combining marks independently
   3092             //    of context.
   3093             {
   3094 
   3095                 // Fail if at end of input
   3096                 if (fp->fInputIdx >= fActiveLimit) {
   3097                     fHitEnd = TRUE;
   3098                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3099                     break;
   3100                 }
   3101 
   3102                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3103 
   3104                 // Examine (and consume) the current char.
   3105                 //   Dispatch into a little state machine, based on the char.
   3106                 UChar32  c;
   3107                 c = UTEXT_NEXT32(fInputText);
   3108                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3109                 UnicodeSet **sets = fPattern->fStaticSets;
   3110                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   3111                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   3112                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3113                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3114                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3115                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3116                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3117                 goto GC_Extend;
   3118 
   3119 
   3120 
   3121 GC_L:
   3122                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3123                 c = UTEXT_NEXT32(fInputText);
   3124                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3125                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
   3126                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   3127                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   3128                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3129                 UTEXT_PREVIOUS32(fInputText);
   3130                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3131                 goto GC_Extend;
   3132 
   3133 GC_V:
   3134                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3135                 c = UTEXT_NEXT32(fInputText);
   3136                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3137                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
   3138                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3139                 UTEXT_PREVIOUS32(fInputText);
   3140                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3141                 goto GC_Extend;
   3142 
   3143 GC_T:
   3144                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   3145                 c = UTEXT_NEXT32(fInputText);
   3146                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3147                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
   3148                 UTEXT_PREVIOUS32(fInputText);
   3149                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3150                 goto GC_Extend;
   3151 
   3152 GC_Extend:
   3153                 // Combining characters are consumed here
   3154                 for (;;) {
   3155                     if (fp->fInputIdx >= fActiveLimit) {
   3156                         break;
   3157                     }
   3158                     c = UTEXT_CURRENT32(fInputText);
   3159                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   3160                         break;
   3161                     }
   3162                     UTEXT_NEXT32(fInputText);
   3163                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3164                 }
   3165                 goto GC_Done;
   3166 
   3167 GC_Control:
   3168                 // Most control chars stand alone (don't combine with combining chars),
   3169                 //   except for that CR/LF sequence is a single grapheme cluster.
   3170                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
   3171                     c = UTEXT_NEXT32(fInputText);
   3172                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3173                 }
   3174 
   3175 GC_Done:
   3176                 if (fp->fInputIdx >= fActiveLimit) {
   3177                     fHitEnd = TRUE;
   3178                 }
   3179                 break;
   3180             }
   3181 
   3182 
   3183 
   3184 
   3185         case URX_BACKSLASH_Z:          // Test for end of Input
   3186             if (fp->fInputIdx < fAnchorLimit) {
   3187                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3188             } else {
   3189                 fHitEnd = TRUE;
   3190                 fRequireEnd = TRUE;
   3191             }
   3192             break;
   3193 
   3194 
   3195 
   3196         case URX_STATIC_SETREF:
   3197             {
   3198                 // Test input character against one of the predefined sets
   3199                 //    (Word Characters, for example)
   3200                 // The high bit of the op value is a flag for the match polarity.
   3201                 //    0:   success if input char is in set.
   3202                 //    1:   success if input char is not in set.
   3203                 if (fp->fInputIdx >= fActiveLimit) {
   3204                     fHitEnd = TRUE;
   3205                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3206                     break;
   3207                 }
   3208 
   3209                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   3210                 opValue &= ~URX_NEG_SET;
   3211                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3212 
   3213                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3214                 UChar32 c = UTEXT_NEXT32(fInputText);
   3215                 if (c < 256) {
   3216                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3217                     if (s8->contains(c)) {
   3218                         success = !success;
   3219                     }
   3220                 } else {
   3221                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3222                     if (s->contains(c)) {
   3223                         success = !success;
   3224                     }
   3225                 }
   3226                 if (success) {
   3227                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3228                 } else {
   3229                     // the character wasn't in the set.
   3230                     #ifdef REGEX_SMART_BACKTRACKING
   3231                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3232                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3233                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3234                             // Try to find it, backwards
   3235                             UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3236                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
   3237                             do {
   3238                                 c = UTEXT_PREVIOUS32(fInputText);
   3239                                 if (c == U_SENTINEL) {
   3240                                     break;
   3241                                 } else if (c < 256) {
   3242                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3243                                     if (s8->contains(c)) {
   3244                                         success = !success;
   3245                                     }
   3246                                 } else {
   3247                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3248                                     if (s->contains(c)) {
   3249                                         success = !success;
   3250                                     }
   3251                                 }
   3252                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
   3253 
   3254                             if (success && c != U_SENTINEL) {
   3255                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3256                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3257                                 if (fp->fInputIdx > backSearchIndex) {
   3258                                     fp = StateSave(fp, fp->fPatIdx, status);
   3259                                 }
   3260                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3261                                 break;
   3262                             }
   3263                         }
   3264                     }
   3265                     #endif
   3266                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3267                 }
   3268             }
   3269             break;
   3270 
   3271 
   3272         case URX_STAT_SETREF_N:
   3273             {
   3274                 // Test input character for NOT being a member of  one of
   3275                 //    the predefined sets (Word Characters, for example)
   3276                 if (fp->fInputIdx >= fActiveLimit) {
   3277                     fHitEnd = TRUE;
   3278                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3279                     break;
   3280                 }
   3281 
   3282                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3283 
   3284                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3285 
   3286                 UChar32 c = UTEXT_NEXT32(fInputText);
   3287                 if (c < 256) {
   3288                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3289                     if (s8->contains(c) == FALSE) {
   3290                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3291                         break;
   3292                     }
   3293                 } else {
   3294                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3295                     if (s->contains(c) == FALSE) {
   3296                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3297                         break;
   3298                     }
   3299                 }
   3300                 // the character wasn't in the set.
   3301                 #ifdef REGEX_SMART_BACKTRACKING
   3302                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3303                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3304                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3305                         // Try to find it, backwards
   3306                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3307                         UBool success = FALSE;
   3308                         do {
   3309                             c = UTEXT_PREVIOUS32(fInputText);
   3310                             if (c == U_SENTINEL) {
   3311                                 break;
   3312                             } else if (c < 256) {
   3313                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   3314                                 if (s8->contains(c) == FALSE) {
   3315                                     success = TRUE;
   3316                                     break;
   3317                                 }
   3318                             } else {
   3319                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
   3320                                 if (s->contains(c) == FALSE) {
   3321                                     success = TRUE;
   3322                                     break;
   3323                                 }
   3324                             }
   3325                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   3326 
   3327                         if (success) {
   3328                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3329                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3330                             if (fp->fInputIdx > backSearchIndex) {
   3331                                 fp = StateSave(fp, fp->fPatIdx, status);
   3332                             }
   3333                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3334                             break;
   3335                         }
   3336                     }
   3337                 }
   3338                 #endif
   3339                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3340             }
   3341             break;
   3342 
   3343 
   3344         case URX_SETREF:
   3345             if (fp->fInputIdx >= fActiveLimit) {
   3346                 fHitEnd = TRUE;
   3347                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3348                 break;
   3349             } else {
   3350                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3351 
   3352                 // There is input left.  Pick up one char and test it for set membership.
   3353                 UChar32 c = UTEXT_NEXT32(fInputText);
   3354                 U_ASSERT(opValue > 0 && opValue < sets->size());
   3355                 if (c<256) {
   3356                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3357                     if (s8->contains(c)) {
   3358                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3359                         break;
   3360                     }
   3361                 } else {
   3362                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3363                     if (s->contains(c)) {
   3364                         // The character is in the set.  A Match.
   3365                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3366                         break;
   3367                     }
   3368                 }
   3369 
   3370                 // the character wasn't in the set.
   3371                 #ifdef REGEX_SMART_BACKTRACKING
   3372                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3373                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3374                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3375                         // Try to find it, backwards
   3376                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
   3377                         UBool success = FALSE;
   3378                         do {
   3379                             c = UTEXT_PREVIOUS32(fInputText);
   3380                             if (c == U_SENTINEL) {
   3381                                 break;
   3382                             } else if (c < 256) {
   3383                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3384                                 if (s8->contains(c)) {
   3385                                     success = TRUE;
   3386                                     break;
   3387                                 }
   3388                             } else {
   3389                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   3390                                 if (s->contains(c)) {
   3391                                     success = TRUE;
   3392                                     break;
   3393                                 }
   3394                             }
   3395                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   3396 
   3397                         if (success) {
   3398                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3399                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3400                             if (fp->fInputIdx > backSearchIndex) {
   3401                                 fp = StateSave(fp, fp->fPatIdx, status);
   3402                             }
   3403                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3404                             break;
   3405                         }
   3406                     }
   3407                 }
   3408                 #endif
   3409                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3410             }
   3411             break;
   3412 
   3413 
   3414         case URX_DOTANY:
   3415             {
   3416                 // . matches anything, but stops at end-of-line.
   3417                 if (fp->fInputIdx >= fActiveLimit) {
   3418                     // At end of input.  Match failed.  Backtrack out.
   3419                     fHitEnd = TRUE;
   3420                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3421                     break;
   3422                 }
   3423 
   3424                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3425 
   3426                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3427                 UChar32 c = UTEXT_NEXT32(fInputText);
   3428                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   3429                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   3430                     // End of line in normal mode.   . does not match.
   3431                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3432                     break;
   3433                 }
   3434                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3435             }
   3436             break;
   3437 
   3438 
   3439         case URX_DOTANY_ALL:
   3440             {
   3441                 // ., in dot-matches-all (including new lines) mode
   3442                 if (fp->fInputIdx >= fActiveLimit) {
   3443                     // At end of input.  Match failed.  Backtrack out.
   3444                     fHitEnd = TRUE;
   3445                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3446                     break;
   3447                 }
   3448 
   3449                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3450 
   3451                 // There is input left.  Advance over one char, except if we are
   3452                 //   at a cr/lf, advance over both of them.
   3453                 UChar32 c;
   3454                 c = UTEXT_NEXT32(fInputText);
   3455                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3456                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   3457                     // In the case of a CR/LF, we need to advance over both.
   3458                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
   3459                     if (nextc == 0x0a) {
   3460                         UTEXT_NEXT32(fInputText);
   3461                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3462                     }
   3463                 }
   3464             }
   3465             break;
   3466 
   3467 
   3468         case URX_DOTANY_UNIX:
   3469             {
   3470                 // '.' operator, matches all, but stops at end-of-line.
   3471                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   3472                 if (fp->fInputIdx >= fActiveLimit) {
   3473                     // At end of input.  Match failed.  Backtrack out.
   3474                     fHitEnd = TRUE;
   3475                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3476                     break;
   3477                 }
   3478 
   3479                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3480 
   3481                 // There is input left.  Advance over one char, unless we've hit end-of-line
   3482                 UChar32 c = UTEXT_NEXT32(fInputText);
   3483                 if (c == 0x0a) {
   3484                     // End of line in normal mode.   '.' does not match the \n
   3485                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3486                 } else {
   3487                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3488                 }
   3489             }
   3490             break;
   3491 
   3492 
   3493         case URX_JMP:
   3494             fp->fPatIdx = opValue;
   3495             break;
   3496 
   3497         case URX_FAIL:
   3498             isMatch = FALSE;
   3499             goto breakFromLoop;
   3500 
   3501         case URX_JMP_SAV:
   3502             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   3503             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   3504             fp->fPatIdx = opValue;                         // Then JMP.
   3505             break;
   3506 
   3507         case URX_JMP_SAV_X:
   3508             // This opcode is used with (x)+, when x can match a zero length string.
   3509             // Same as JMP_SAV, except conditional on the match having made forward progress.
   3510             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   3511             //   data address of the input position at the start of the loop.
   3512             {
   3513                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   3514                 int32_t  stoOp = (int32_t)pat[opValue-1];
   3515                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   3516                 int32_t  frameLoc = URX_VAL(stoOp);
   3517                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   3518                 int64_t prevInputIdx = fp->fExtra[frameLoc];
   3519                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   3520                 if (prevInputIdx < fp->fInputIdx) {
   3521                     // The match did make progress.  Repeat the loop.
   3522                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   3523                     fp->fPatIdx = opValue;
   3524                     fp->fExtra[frameLoc] = fp->fInputIdx;
   3525                 }
   3526                 // If the input position did not advance, we do nothing here,
   3527                 //   execution will fall out of the loop.
   3528             }
   3529             break;
   3530 
   3531         case URX_CTR_INIT:
   3532             {
   3533                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3534                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   3535 
   3536                 // Pick up the three extra operands that CTR_INIT has, and
   3537                 //    skip the pattern location counter past
   3538                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3539                 fp->fPatIdx += 3;
   3540                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3541                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3542                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3543                 U_ASSERT(minCount>=0);
   3544                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3545                 U_ASSERT(loopLoc>fp->fPatIdx);
   3546 
   3547                 if (minCount == 0) {
   3548                     fp = StateSave(fp, loopLoc+1, status);
   3549                 }
   3550                 if (maxCount == 0) {
   3551                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3552                 }
   3553             }
   3554             break;
   3555 
   3556         case URX_CTR_LOOP:
   3557             {
   3558                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3559                 int32_t initOp = (int32_t)pat[opValue];
   3560                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   3561                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3562                 int32_t minCount  = (int32_t)pat[opValue+2];
   3563                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3564                 // Increment the counter.  Note: we DIDN'T worry about counter
   3565                 //   overflow, since the data comes from UnicodeStrings, which
   3566                 //   stores its length in an int32_t. Do we have to think about
   3567                 //   this now that we're using UText? Probably not, since the length
   3568                 //   in UChar32s is still an int32_t.
   3569                 (*pCounter)++;
   3570                 U_ASSERT(*pCounter > 0);
   3571                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   3572                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   3573                     break;
   3574                 }
   3575                 if (*pCounter >= minCount) {
   3576                     fp = StateSave(fp, fp->fPatIdx, status);
   3577                 }
   3578                 fp->fPatIdx = opValue + 4;    // Loop back.
   3579             }
   3580             break;
   3581 
   3582         case URX_CTR_INIT_NG:
   3583             {
   3584                 // Initialize a non-greedy loop
   3585                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3586                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   3587 
   3588                 // Pick up the three extra operands that CTR_INIT has, and
   3589                 //    skip the pattern location counter past
   3590                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3591                 fp->fPatIdx += 3;
   3592                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3593                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   3594                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   3595                 U_ASSERT(minCount>=0);
   3596                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   3597                 U_ASSERT(loopLoc>fp->fPatIdx);
   3598 
   3599                 if (minCount == 0) {
   3600                     if (maxCount != 0) {
   3601                         fp = StateSave(fp, fp->fPatIdx, status);
   3602                     }
   3603                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   3604                 }
   3605             }
   3606             break;
   3607 
   3608         case URX_CTR_LOOP_NG:
   3609             {
   3610                 // Non-greedy {min, max} loops
   3611                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3612                 int32_t initOp = (int32_t)pat[opValue];
   3613                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   3614                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3615                 int32_t minCount  = (int32_t)pat[opValue+2];
   3616                 int32_t maxCount  = (int32_t)pat[opValue+3];
   3617                 // Increment the counter.  Note: we DIDN'T worry about counter
   3618                 //   overflow, since the data comes from UnicodeStrings, which
   3619                 //   stores its length in an int32_t. Do we have to think about
   3620                 //   this now that we're using UText? Probably not, since the length
   3621                 //   in UChar32s is still an int32_t.
   3622                 (*pCounter)++;
   3623                 U_ASSERT(*pCounter > 0);
   3624 
   3625                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   3626                     // The loop has matched the maximum permitted number of times.
   3627                     //   Break out of here with no action.  Matching will
   3628                     //   continue with the following pattern.
   3629                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   3630                     break;
   3631                 }
   3632 
   3633                 if (*pCounter < minCount) {
   3634                     // We haven't met the minimum number of matches yet.
   3635                     //   Loop back for another one.
   3636                     fp->fPatIdx = opValue + 4;    // Loop back.
   3637                 } else {
   3638                     // We do have the minimum number of matches.
   3639                     //   Fall into the following pattern, but first do
   3640                     //   a state save to the top of the loop, so that a failure
   3641                     //   in the following pattern will try another iteration of the loop.
   3642                     fp = StateSave(fp, opValue + 4, status);
   3643                 }
   3644             }
   3645             break;
   3646 
   3647         case URX_STO_SP:
   3648             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3649             fData[opValue] = fStack->size();
   3650             break;
   3651 
   3652         case URX_LD_SP:
   3653             {
   3654                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3655                 int32_t newStackSize = (int32_t)fData[opValue];
   3656                 U_ASSERT(newStackSize <= fStack->size());
   3657                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3658                 if (newFP == (int64_t *)fp) {
   3659                     break;
   3660                 }
   3661                 int32_t i;
   3662                 for (i=0; i<fFrameSize; i++) {
   3663                     newFP[i] = ((int64_t *)fp)[i];
   3664                 }
   3665                 fp = (REStackFrame *)newFP;
   3666                 fStack->setSize(newStackSize);
   3667             }
   3668             break;
   3669 
   3670         case URX_BACKREF:
   3671         case URX_BACKREF_I:
   3672             {
   3673                 U_ASSERT(opValue < fFrameSize);
   3674                 int64_t groupStartIdx = fp->fExtra[opValue];
   3675                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3676                 U_ASSERT(groupStartIdx <= groupEndIdx);
   3677                 if (groupStartIdx < 0) {
   3678                     // This capture group has not participated in the match thus far,
   3679                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3680                 }
   3681 
   3682                 if (groupEndIdx == groupStartIdx) {
   3683                     //   The capture group match was of an empty string.
   3684                     //   Verified by testing:  Perl matches succeed in this case, so
   3685                     //   we do too.
   3686                     break;
   3687                 }
   3688 
   3689                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
   3690                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3691 
   3692                 UBool haveMatch = (opType == URX_BACKREF ?
   3693                     (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
   3694                     (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
   3695                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3696 
   3697                 if (fp->fInputIdx > fActiveLimit) {
   3698                     fHitEnd = TRUE;
   3699                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3700                 } else if (!haveMatch) {
   3701                     if (fp->fInputIdx == fActiveLimit) {
   3702                         fHitEnd = TRUE;
   3703                     }
   3704                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   3705                 }
   3706             }
   3707             break;
   3708 
   3709         case URX_STO_INP_LOC:
   3710             {
   3711                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   3712                 fp->fExtra[opValue] = fp->fInputIdx;
   3713             }
   3714             break;
   3715 
   3716         case URX_JMPX:
   3717             {
   3718                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   3719                 fp->fPatIdx += 1;
   3720                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   3721                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   3722                 int64_t savedInputIdx = fp->fExtra[dataLoc];
   3723                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   3724                 if (savedInputIdx < fp->fInputIdx) {
   3725                     fp->fPatIdx = opValue;                               // JMP
   3726                 } else {
   3727                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   3728                 }
   3729             }
   3730             break;
   3731 
   3732         case URX_LA_START:
   3733             {
   3734                 // Entering a lookahead block.
   3735                 // Save Stack Ptr, Input Pos.
   3736                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3737                 fData[opValue]   = fStack->size();
   3738                 fData[opValue+1] = fp->fInputIdx;
   3739                 fActiveStart     = fLookStart;          // Set the match region change for
   3740                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   3741             }
   3742             break;
   3743 
   3744         case URX_LA_END:
   3745             {
   3746                 // Leaving a look-ahead block.
   3747                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   3748                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3749                 int32_t stackSize = fStack->size();
   3750                 int32_t newStackSize =(int32_t)fData[opValue];
   3751                 U_ASSERT(stackSize >= newStackSize);
   3752                 if (stackSize > newStackSize) {
   3753                     // Copy the current top frame back to the new (cut back) top frame.
   3754                     //   This makes the capture groups from within the look-ahead
   3755                     //   expression available.
   3756                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3757                     int32_t i;
   3758                     for (i=0; i<fFrameSize; i++) {
   3759                         newFP[i] = ((int64_t *)fp)[i];
   3760                     }
   3761                     fp = (REStackFrame *)newFP;
   3762                     fStack->setSize(newStackSize);
   3763                 }
   3764                 fp->fInputIdx = fData[opValue+1];
   3765 
   3766                 // Restore the active region bounds in the input string; they may have
   3767                 //    been changed because of transparent bounds on a Region.
   3768                 fActiveStart = fRegionStart;
   3769                 fActiveLimit = fRegionLimit;
   3770             }
   3771             break;
   3772 
   3773         case URX_ONECHAR_I:
   3774             if (fp->fInputIdx < fActiveLimit) {
   3775                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3776 
   3777                 UChar32 c = UTEXT_NEXT32(fInputText);
   3778                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3779                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3780                     break;
   3781                 }
   3782             } else {
   3783                 fHitEnd = TRUE;
   3784             }
   3785 
   3786             #ifdef REGEX_SMART_BACKTRACKING
   3787             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   3788                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3789                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3790                     UBool success = FALSE;
   3791                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
   3792                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
   3793                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3794                             success = TRUE;
   3795                             break;
   3796                         } else if (c == U_SENTINEL) {
   3797                             break;
   3798                         }
   3799                         c = UTEXT_PREVIOUS32(fInputText);
   3800                     }
   3801                     if (success) {
   3802                         fHitEnd = FALSE;
   3803                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3804                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3805                         if (fp->fInputIdx > backSearchIndex) {
   3806                             fp = StateSave(fp, fp->fPatIdx, status);
   3807                         }
   3808                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3809                         break;
   3810                     }
   3811                 }
   3812             }
   3813             #endif
   3814 
   3815             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3816             break;
   3817 
   3818         case URX_STRING_I:
   3819             {
   3820                 // Test input against a literal string.
   3821                 // Strings require two slots in the compiled pattern, one for the
   3822                 //   offset to the string text, and one for the length.
   3823                 const UCaseProps *csp = ucase_getSingleton(&status);
   3824                 if (U_SUCCESS(status)) {
   3825                     int32_t stringStartIdx, stringLen;
   3826                     stringStartIdx = opValue;
   3827 
   3828                     op      = (int32_t)pat[fp->fPatIdx];
   3829                     fp->fPatIdx++;
   3830                     opType  = URX_TYPE(op);
   3831                     opValue = URX_VAL(op);
   3832                     U_ASSERT(opType == URX_STRING_LEN);
   3833                     stringLen = opValue;
   3834 
   3835                     const UChar *patternChars = litText+stringStartIdx;
   3836                     const UChar *patternEnd = patternChars+stringLen;
   3837 
   3838                     const UChar *foldChars = NULL;
   3839                     int32_t foldOffset, foldLength;
   3840                     UChar32 c;
   3841 
   3842                     foldOffset = foldLength = 0;
   3843                     UBool success = TRUE;
   3844 
   3845                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3846                     while (patternChars < patternEnd && success) {
   3847                         if(foldOffset < foldLength) {
   3848                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   3849                         } else {
   3850                             c = UTEXT_NEXT32(fInputText);
   3851                             if (c != U_SENTINEL) {
   3852                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   3853                                 if(foldLength >= 0) {
   3854                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   3855                                         foldOffset = 0;
   3856                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   3857                                     } else {
   3858                                         c = foldLength;
   3859                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
   3860                                     }
   3861                                 }
   3862                             }
   3863 
   3864                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3865                         }
   3866 
   3867                         success = FALSE;
   3868                         if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
   3869                             if (U_IS_BMP(c)) {
   3870                                 success = (*patternChars == c);
   3871                                 patternChars += 1;
   3872                             } else if (patternChars+1 < patternEnd) {
   3873                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   3874                                 patternChars += 2;
   3875                             }
   3876                         } else {
   3877                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
   3878                         }
   3879                     }
   3880 
   3881                     if (!success) {
   3882                         #ifdef REGEX_SMART_BACKTRACKING
   3883                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   3884                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   3885                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   3886                                 // Reset to last start point
   3887                                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3888                                 patternChars = litText+stringStartIdx;
   3889 
   3890                                 // Search backwards for a possible start
   3891                                 do {
   3892                                     c = UTEXT_PREVIOUS32(fInputText);
   3893                                     if (c == U_SENTINEL) {
   3894                                         break;
   3895                                     } else {
   3896                                         foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   3897                                         if(foldLength >= 0) {
   3898                                             if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   3899                                                 foldOffset = 0;
   3900                                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   3901                                             } else {
   3902                                                 c = foldLength;
   3903                                                 foldLength = foldOffset; // to avoid reading chars from the folding buffer
   3904                                             }
   3905                                         }
   3906 
   3907                                         if ((U_IS_BMP(c) && *patternChars == c) ||
   3908                                                (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   3909                                             success = TRUE;
   3910                                             break;
   3911                                         }
   3912                                     }
   3913                                 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
   3914 
   3915                                 // And try again
   3916                                 if (success) {
   3917                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3918                                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3919                                     if (fp->fInputIdx > backSearchIndex) {
   3920                                         fp = StateSave(fp, fp->fPatIdx, status);
   3921                                     }
   3922                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
   3923                                     break;
   3924                                 }
   3925                             }
   3926                         }
   3927                         #endif
   3928                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3929                     }
   3930                 }
   3931             }
   3932             break;
   3933 
   3934         case URX_LB_START:
   3935             {
   3936                 // Entering a look-behind block.
   3937                 // Save Stack Ptr, Input Pos.
   3938                 //   TODO:  implement transparent bounds.  Ticket #6067
   3939                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3940                 fData[opValue]   = fStack->size();
   3941                 fData[opValue+1] = fp->fInputIdx;
   3942                 // Init the variable containing the start index for attempted matches.
   3943                 fData[opValue+2] = -1;
   3944                 // Save input string length, then reset to pin any matches to end at
   3945                 //   the current position.
   3946                 fData[opValue+3] = fActiveLimit;
   3947                 fActiveLimit     = fp->fInputIdx;
   3948             }
   3949             break;
   3950 
   3951 
   3952         case URX_LB_CONT:
   3953             {
   3954                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   3955                 //    at all possible input starting positions.
   3956 
   3957                 // Fetch the min and max possible match lengths.  They are the operands
   3958                 //   of this op in the pattern.
   3959                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   3960                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   3961                 U_ASSERT(minML <= maxML);
   3962                 U_ASSERT(minML >= 0);
   3963 
   3964                 // Fetch (from data) the last input index where a match was attempted.
   3965                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   3966                 int64_t  *lbStartIdx = &fData[opValue+2];
   3967                 if (*lbStartIdx < 0) {
   3968                     // First time through loop.
   3969                     *lbStartIdx = fp->fInputIdx - minML;
   3970                 } else {
   3971                     // 2nd through nth time through the loop.
   3972                     // Back up start position for match by one.
   3973                     if (*lbStartIdx == 0) {
   3974                         (*lbStartIdx)--;
   3975                     } else {
   3976                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   3977                         UTEXT_PREVIOUS32(fInputText);
   3978                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3979                     }
   3980                 }
   3981 
   3982                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   3983                     // We have tried all potential match starting points without
   3984                     //  getting a match.  Backtrack out, and out of the
   3985                     //   Look Behind altogether.
   3986                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   3987                     int64_t restoreInputLen = fData[opValue+3];
   3988                     U_ASSERT(restoreInputLen >= fActiveLimit);
   3989                     U_ASSERT(restoreInputLen <= fInputLength);
   3990                     fActiveLimit = restoreInputLen;
   3991                     break;
   3992                 }
   3993 
   3994                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   3995                 //      (successful match will fall off the end of the loop.)
   3996                 fp = StateSave(fp, fp->fPatIdx-3, status);
   3997                 fp->fInputIdx = *lbStartIdx;
   3998             }
   3999             break;
   4000 
   4001         case URX_LB_END:
   4002             // End of a look-behind block, after a successful match.
   4003             {
   4004                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4005                 if (fp->fInputIdx != fActiveLimit) {
   4006                     //  The look-behind expression matched, but the match did not
   4007                     //    extend all the way to the point that we are looking behind from.
   4008                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4009                     //     will retry the match starting at another position or fail
   4010                     //     the look-behind altogether, whichever is appropriate.
   4011                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4012                     break;
   4013                 }
   4014 
   4015                 // Look-behind match is good.  Restore the orignal input string length,
   4016                 //   which had been truncated to pin the end of the lookbehind match to the
   4017                 //   position being looked-behind.
   4018                 int64_t originalInputLen = fData[opValue+3];
   4019                 U_ASSERT(originalInputLen >= fActiveLimit);
   4020                 U_ASSERT(originalInputLen <= fInputLength);
   4021                 fActiveLimit = originalInputLen;
   4022             }
   4023             break;
   4024 
   4025 
   4026         case URX_LBN_CONT:
   4027             {
   4028                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   4029                 //    at all possible input starting positions.
   4030 
   4031                 // Fetch the extra parameters of this op.
   4032                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   4033                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   4034                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   4035                         continueLoc = URX_VAL(continueLoc);
   4036                 U_ASSERT(minML <= maxML);
   4037                 U_ASSERT(minML >= 0);
   4038                 U_ASSERT(continueLoc > fp->fPatIdx);
   4039 
   4040                 // Fetch (from data) the last input index where a match was attempted.
   4041                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4042                 int64_t  *lbStartIdx = &fData[opValue+2];
   4043                 if (*lbStartIdx < 0) {
   4044                     // First time through loop.
   4045                     *lbStartIdx = fp->fInputIdx - minML;
   4046                 } else {
   4047                     // 2nd through nth time through the loop.
   4048                     // Back up start position for match by one.
   4049                     if (*lbStartIdx == 0) {
   4050                         (*lbStartIdx)--;
   4051                     } else {
   4052                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
   4053                         UTEXT_PREVIOUS32(fInputText);
   4054                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4055                     }
   4056                 }
   4057 
   4058                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   4059                     // We have tried all potential match starting points without
   4060                     //  getting a match, which means that the negative lookbehind as
   4061                     //  a whole has succeeded.  Jump forward to the continue location
   4062                     int64_t restoreInputLen = fData[opValue+3];
   4063                     U_ASSERT(restoreInputLen >= fActiveLimit);
   4064                     U_ASSERT(restoreInputLen <= fInputLength);
   4065                     fActiveLimit = restoreInputLen;
   4066                     fp->fPatIdx = continueLoc;
   4067                     break;
   4068                 }
   4069 
   4070                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   4071                 //      (successful match will cause a FAIL out of the loop altogether.)
   4072                 fp = StateSave(fp, fp->fPatIdx-4, status);
   4073                 fp->fInputIdx = *lbStartIdx;
   4074             }
   4075             break;
   4076 
   4077         case URX_LBN_END:
   4078             // End of a negative look-behind block, after a successful match.
   4079             {
   4080                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4081                 if (fp->fInputIdx != fActiveLimit) {
   4082                     //  The look-behind expression matched, but the match did not
   4083                     //    extend all the way to the point that we are looking behind from.
   4084                     //  FAIL out of here, which will take us back to the LB_CONT, which
   4085                     //     will retry the match starting at another position or succeed
   4086                     //     the look-behind altogether, whichever is appropriate.
   4087                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4088                     break;
   4089                 }
   4090 
   4091                 // Look-behind expression matched, which means look-behind test as
   4092                 //   a whole Fails
   4093 
   4094                 //   Restore the orignal input string length, which had been truncated
   4095                 //   inorder to pin the end of the lookbehind match
   4096                 //   to the position being looked-behind.
   4097                 int64_t originalInputLen = fData[opValue+3];
   4098                 U_ASSERT(originalInputLen >= fActiveLimit);
   4099                 U_ASSERT(originalInputLen <= fInputLength);
   4100                 fActiveLimit = originalInputLen;
   4101 
   4102                 // Restore original stack position, discarding any state saved
   4103                 //   by the successful pattern match.
   4104                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4105                 int32_t newStackSize = (int32_t)fData[opValue];
   4106                 U_ASSERT(fStack->size() > newStackSize);
   4107                 fStack->setSize(newStackSize);
   4108 
   4109                 //  FAIL, which will take control back to someplace
   4110                 //  prior to entering the look-behind test.
   4111                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4112             }
   4113             break;
   4114 
   4115 
   4116         case URX_LOOP_SR_I:
   4117             // Loop Initialization for the optimized implementation of
   4118             //     [some character set]*
   4119             //   This op scans through all matching input.
   4120             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4121             {
   4122                 U_ASSERT(opValue > 0 && opValue < sets->size());
   4123                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4124                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   4125 
   4126                 // Loop through input, until either the input is exhausted or
   4127                 //   we reach a character that is not a member of the set.
   4128                 int64_t ix = fp->fInputIdx;
   4129                 UTEXT_SETNATIVEINDEX(fInputText, ix);
   4130                 for (;;) {
   4131                     if (ix >= fActiveLimit) {
   4132                         fHitEnd = TRUE;
   4133                         break;
   4134                     }
   4135                     UChar32 c = UTEXT_NEXT32(fInputText);
   4136                     if (c<256) {
   4137                         if (s8->contains(c) == FALSE) {
   4138                             break;
   4139                         }
   4140                     } else {
   4141                         if (s->contains(c) == FALSE) {
   4142                             break;
   4143                         }
   4144                     }
   4145                     ix = UTEXT_GETNATIVEINDEX(fInputText);
   4146                 }
   4147 
   4148                 // If there were no matching characters, skip over the loop altogether.
   4149                 //   The loop doesn't run at all, a * op always succeeds.
   4150                 if (ix == fp->fInputIdx) {
   4151                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4152                     break;
   4153                 }
   4154 
   4155                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4156                 //   must follow.  It's operand is the stack location
   4157                 //   that holds the starting input index for the match of this [set]*
   4158                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4159                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4160                 int32_t stackLoc = URX_VAL(loopcOp);
   4161                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4162                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4163                 #ifdef REGEX_SMART_BACKTRACKING
   4164                 backSearchIndex = fp->fInputIdx;
   4165                 #endif
   4166                 fp->fInputIdx = ix;
   4167 
   4168                 // Save State to the URX_LOOP_C op that follows this one,
   4169                 //   so that match failures in the following code will return to there.
   4170                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4171                 fp = StateSave(fp, fp->fPatIdx, status);
   4172                 fp->fPatIdx++;
   4173             }
   4174             break;
   4175 
   4176 
   4177         case URX_LOOP_DOT_I:
   4178             // Loop Initialization for the optimized implementation of .*
   4179             //   This op scans through all remaining input.
   4180             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4181             {
   4182                 // Loop through input until the input is exhausted (we reach an end-of-line)
   4183                 // In DOTALL mode, we can just go straight to the end of the input.
   4184                 int64_t ix;
   4185                 if ((opValue & 1) == 1) {
   4186                     // Dot-matches-All mode.  Jump straight to the end of the string.
   4187                     ix = fActiveLimit;
   4188                     fHitEnd = TRUE;
   4189                 } else {
   4190                     // NOT DOT ALL mode.  Line endings do not match '.'
   4191                     // Scan forward until a line ending or end of input.
   4192                     ix = fp->fInputIdx;
   4193                     UTEXT_SETNATIVEINDEX(fInputText, ix);
   4194                     for (;;) {
   4195                         if (ix >= fActiveLimit) {
   4196                             fHitEnd = TRUE;
   4197                             break;
   4198                         }
   4199                         UChar32 c = UTEXT_NEXT32(fInputText);
   4200                         if ((c & 0x7f) <= 0x29) {        // Fast filter of non-new-line-s
   4201                             if ((c == 0x0a) ||            //  0x0a is newline in both modes.
   4202                                ((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   4203                                     (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) {
   4204                                 //  char is a line ending.  Exit the scanning loop.
   4205                                 break;
   4206                             }
   4207                         }
   4208                         ix = UTEXT_GETNATIVEINDEX(fInputText);
   4209                     }
   4210                 }
   4211 
   4212                 // If there were no matching characters, skip over the loop altogether.
   4213                 //   The loop doesn't run at all, a * op always succeeds.
   4214                 if (ix == fp->fInputIdx) {
   4215                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4216                     break;
   4217                 }
   4218 
   4219                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4220                 //   must follow.  It's operand is the stack location
   4221                 //   that holds the starting input index for the match of this .*
   4222                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   4223                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4224                 int32_t stackLoc = URX_VAL(loopcOp);
   4225                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4226                 fp->fExtra[stackLoc] = fp->fInputIdx;
   4227                 #ifdef REGEX_SMART_BACKTRACKING
   4228                 backSearchIndex = fp->fInputIdx;
   4229                 #endif
   4230                 fp->fInputIdx = ix;
   4231 
   4232                 // Save State to the URX_LOOP_C op that follows this one,
   4233                 //   so that match failures in the following code will return to there.
   4234                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4235                 fp = StateSave(fp, fp->fPatIdx, status);
   4236                 fp->fPatIdx++;
   4237             }
   4238             break;
   4239 
   4240 
   4241         case URX_LOOP_C:
   4242             {
   4243                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   4244                 backSearchIndex = fp->fExtra[opValue];
   4245                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   4246                 if (backSearchIndex == fp->fInputIdx) {
   4247                     // We've backed up the input idx to the point that the loop started.
   4248                     // The loop is done.  Leave here without saving state.
   4249                     //  Subsequent failures won't come back here.
   4250                     break;
   4251                 }
   4252                 // Set up for the next iteration of the loop, with input index
   4253                 //   backed up by one from the last time through,
   4254                 //   and a state save to this instruction in case the following code fails again.
   4255                 //   (We're going backwards because this loop emulates stack unwinding, not
   4256                 //    the initial scan forward.)
   4257                 U_ASSERT(fp->fInputIdx > 0);
   4258                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4259                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
   4260                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4261 
   4262                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
   4263                 if (prevC == 0x0a &&
   4264                     fp->fInputIdx > backSearchIndex &&
   4265                     twoPrevC == 0x0d) {
   4266                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   4267                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   4268                         // .*, stepping back over CRLF pair.
   4269                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4270                     }
   4271                 }
   4272 
   4273 
   4274                 fp = StateSave(fp, fp->fPatIdx-1, status);
   4275             }
   4276             break;
   4277 
   4278 
   4279 
   4280         default:
   4281             // Trouble.  The compiled pattern contains an entry with an
   4282             //           unrecognized type tag.
   4283             U_ASSERT(FALSE);
   4284         }
   4285 
   4286         if (U_FAILURE(status)) {
   4287             isMatch = FALSE;
   4288             break;
   4289         }
   4290     }
   4291 
   4292 breakFromLoop:
   4293     fMatch = isMatch;
   4294     if (isMatch) {
   4295         fLastMatchEnd = fMatchEnd;
   4296         fMatchStart   = startIdx;
   4297         fMatchEnd     = fp->fInputIdx;
   4298         if (fTraceDebug) {
   4299             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
   4300         }
   4301     }
   4302     else
   4303     {
   4304         if (fTraceDebug) {
   4305             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   4306         }
   4307     }
   4308 
   4309     fFrame = fp;                // The active stack frame when the engine stopped.
   4310                                 //   Contains the capture group results that we need to
   4311                                 //    access later.
   4312     return;
   4313 }
   4314 
   4315 
   4316 //--------------------------------------------------------------------------------
   4317 //
   4318 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
   4319 //                  assumption that the entire string is available in the UText's
   4320 //                  chunk buffer. For now, that means we can use int32_t indexes,
   4321 //                  except for anything that needs to be saved (like group starts
   4322 //                  and ends).
   4323 //
   4324 //                  startIdx:    begin matching a this index.
   4325 //                  toEnd:       if true, match must extend to end of the input region
   4326 //
   4327 //--------------------------------------------------------------------------------
   4328 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
   4329     UBool       isMatch  = FALSE;      // True if the we have a match.
   4330 
   4331     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
   4332 
   4333     int32_t     op;                    // Operation from the compiled pattern, split into
   4334     int32_t     opType;                //    the opcode
   4335     int32_t     opValue;               //    and the operand value.
   4336 
   4337 #ifdef REGEX_RUN_DEBUG
   4338     if (fTraceDebug)
   4339     {
   4340         printf("MatchAt(startIdx=%ld)\n", startIdx);
   4341         printf("Original Pattern: ");
   4342         UChar32 c = utext_next32From(fPattern->fPattern, 0);
   4343         while (c != U_SENTINEL) {
   4344             if (c<32 || c>256) {
   4345                 c = '.';
   4346             }
   4347             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   4348 
   4349             c = UTEXT_NEXT32(fPattern->fPattern);
   4350         }
   4351         printf("\n");
   4352         printf("Input String: ");
   4353         c = utext_next32From(fInputText, 0);
   4354         while (c != U_SENTINEL) {
   4355             if (c<32 || c>256) {
   4356                 c = '.';
   4357             }
   4358             printf("%c", c);
   4359 
   4360             c = UTEXT_NEXT32(fInputText);
   4361         }
   4362         printf("\n");
   4363         printf("\n");
   4364     }
   4365 #endif
   4366 
   4367     if (U_FAILURE(status)) {
   4368         return;
   4369     }
   4370 
   4371     //  Cache frequently referenced items from the compiled pattern
   4372     //
   4373     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   4374 
   4375     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
   4376     UVector             *sets          = fPattern->fSets;
   4377 
   4378     const UChar         *inputBuf      = fInputText->chunkContents;
   4379 
   4380     fFrameSize = fPattern->fFrameSize;
   4381     REStackFrame        *fp            = resetStack();
   4382 
   4383     fp->fPatIdx   = 0;
   4384     fp->fInputIdx = startIdx;
   4385 
   4386     // Zero out the pattern's static data
   4387     int32_t i;
   4388     for (i = 0; i<fPattern->fDataSize; i++) {
   4389         fData[i] = 0;
   4390     }
   4391 
   4392     //
   4393     //  Main loop for interpreting the compiled pattern.
   4394     //  One iteration of the loop per pattern operation performed.
   4395     //
   4396     for (;;) {
   4397 #if 0
   4398         if (_heapchk() != _HEAPOK) {
   4399             fprintf(stderr, "Heap Trouble\n");
   4400         }
   4401 #endif
   4402 
   4403         op      = (int32_t)pat[fp->fPatIdx];
   4404         opType  = URX_TYPE(op);
   4405         opValue = URX_VAL(op);
   4406 #ifdef REGEX_RUN_DEBUG
   4407         if (fTraceDebug) {
   4408             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4409             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
   4410                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   4411             fPattern->dumpOp(fp->fPatIdx);
   4412         }
   4413 #endif
   4414         fp->fPatIdx++;
   4415 
   4416         switch (opType) {
   4417 
   4418 
   4419         case URX_NOP:
   4420             break;
   4421 
   4422 
   4423         case URX_BACKTRACK:
   4424             // Force a backtrack.  In some circumstances, the pattern compiler
   4425             //   will notice that the pattern can't possibly match anything, and will
   4426             //   emit one of these at that point.
   4427             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4428             break;
   4429 
   4430 
   4431         case URX_ONECHAR:
   4432             if (fp->fInputIdx < fActiveLimit) {
   4433                 UChar32 c;
   4434                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4435                 if (c == opValue) {
   4436                     break;
   4437                 }
   4438             } else {
   4439                 fHitEnd = TRUE;
   4440             }
   4441 
   4442             #ifdef REGEX_SMART_BACKTRACKING
   4443             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   4444                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4445                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4446                     int64_t reverseIndex = fp->fInputIdx;
   4447                     UChar32 c;
   4448                     do {
   4449                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   4450                         if (c == opValue) {
   4451                             break;
   4452                         }
   4453                     } while (reverseIndex > backSearchIndex);
   4454                     if (c == opValue) {
   4455                         fHitEnd = FALSE;
   4456                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4457                         fp->fInputIdx = reverseIndex;
   4458                         if (fp->fInputIdx > backSearchIndex) {
   4459                             fp = StateSave(fp, fp->fPatIdx, status);
   4460                         }
   4461                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4462                         break;
   4463                     }
   4464                 }
   4465             }
   4466             #endif
   4467 
   4468             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4469             break;
   4470 
   4471 
   4472         case URX_STRING:
   4473             {
   4474                 // Test input against a literal string.
   4475                 // Strings require two slots in the compiled pattern, one for the
   4476                 //   offset to the string text, and one for the length.
   4477                 int32_t   stringStartIdx = opValue;
   4478                 int32_t   stringLen;
   4479 
   4480                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
   4481                 fp->fPatIdx++;
   4482                 opType    = URX_TYPE(op);
   4483                 stringLen = URX_VAL(op);
   4484                 U_ASSERT(opType == URX_STRING_LEN);
   4485                 U_ASSERT(stringLen >= 2);
   4486 
   4487                 if (fp->fInputIdx + stringLen > fActiveLimit) {
   4488                     // No match.  String is longer than the remaining input text.
   4489                     fHitEnd = TRUE;          //   TODO:  See ticket 6074
   4490                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4491                     break;
   4492                 }
   4493 
   4494                 const UChar * pInp = inputBuf + fp->fInputIdx;
   4495                 const UChar * pPat = litText+stringStartIdx;
   4496                 const UChar * pEnd = pInp + stringLen;
   4497                 UBool success = FALSE;
   4498                 for(;;) {
   4499                     if (*pInp == *pPat) {
   4500                         pInp++;
   4501                         pPat++;
   4502                         if (pInp == pEnd) {
   4503                             // Successful Match.
   4504                             success = TRUE;
   4505                             break;
   4506                         }
   4507                     } else {
   4508                         // Match failed.
   4509                         break;
   4510                     }
   4511                 }
   4512 
   4513                 if (success) {
   4514                     fp->fInputIdx += stringLen;
   4515                 } else {
   4516                     #ifdef REGEX_SMART_BACKTRACKING
   4517                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   4518                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4519                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4520                             // Reset to last start point
   4521                             int64_t reverseIndex = fp->fInputIdx;
   4522                             UChar32 c;
   4523                             pPat = litText+stringStartIdx;
   4524 
   4525                             // Search backwards for a possible start
   4526                             do {
   4527                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   4528                                 if ((U_IS_BMP(c) && *pPat == c) ||
   4529                                     (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
   4530                                     success = TRUE;
   4531                                     break;
   4532                                 }
   4533                             } while (reverseIndex > backSearchIndex);
   4534 
   4535                             // And try again
   4536                             if (success) {
   4537                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4538                                 fp->fInputIdx = reverseIndex;
   4539                                 if (fp->fInputIdx > backSearchIndex) {
   4540                                     fp = StateSave(fp, fp->fPatIdx, status);
   4541                                 }
   4542                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4543                                 break;
   4544                             }
   4545                         }
   4546                     }
   4547                     #endif
   4548                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4549                 }
   4550             }
   4551             break;
   4552 
   4553 
   4554         case URX_STATE_SAVE:
   4555             fp = StateSave(fp, opValue, status);
   4556             break;
   4557 
   4558 
   4559         case URX_END:
   4560             // The match loop will exit via this path on a successful match,
   4561             //   when we reach the end of the pattern.
   4562             if (toEnd && fp->fInputIdx != fActiveLimit) {
   4563                 // The pattern matched, but not to the end of input.  Try some more.
   4564                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4565                 break;
   4566             }
   4567             isMatch = TRUE;
   4568             goto  breakFromLoop;
   4569 
   4570             // Start and End Capture stack frame variables are laid out out like this:
   4571             //  fp->fExtra[opValue]  - The start of a completed capture group
   4572             //             opValue+1 - The end   of a completed capture group
   4573             //             opValue+2 - the start of a capture group whose end
   4574             //                          has not yet been reached (and might not ever be).
   4575         case URX_START_CAPTURE:
   4576             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4577             fp->fExtra[opValue+2] = fp->fInputIdx;
   4578             break;
   4579 
   4580 
   4581         case URX_END_CAPTURE:
   4582             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4583             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   4584             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   4585             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   4586             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   4587             break;
   4588 
   4589 
   4590         case URX_DOLLAR:                   //  $, test for End of line
   4591             //     or for position before new line at end of input
   4592             if (fp->fInputIdx < fAnchorLimit-2) {
   4593                 // We are no where near the end of input.  Fail.
   4594                 //   This is the common case.  Keep it first.
   4595                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4596                 break;
   4597             }
   4598             if (fp->fInputIdx >= fAnchorLimit) {
   4599                 // We really are at the end of input.  Success.
   4600                 fHitEnd = TRUE;
   4601                 fRequireEnd = TRUE;
   4602                 break;
   4603             }
   4604 
   4605             // If we are positioned just before a new-line that is located at the
   4606             //   end of input, succeed.
   4607             if (fp->fInputIdx == fAnchorLimit-1) {
   4608                 UChar32 c;
   4609                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
   4610 
   4611                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
   4612                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4613                         // At new-line at end of input. Success
   4614                         fHitEnd = TRUE;
   4615                         fRequireEnd = TRUE;
   4616                         break;
   4617                     }
   4618                 }
   4619             } else if (fp->fInputIdx == fAnchorLimit-2 &&
   4620                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
   4621                     fHitEnd = TRUE;
   4622                     fRequireEnd = TRUE;
   4623                     break;                         // At CR/LF at end of input.  Success
   4624             }
   4625 
   4626             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4627 
   4628             break;
   4629 
   4630 
   4631         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   4632             if (fp->fInputIdx >= fAnchorLimit-1) {
   4633                 // Either at the last character of input, or off the end.
   4634                 if (fp->fInputIdx == fAnchorLimit-1) {
   4635                     // At last char of input.  Success if it's a new line.
   4636                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   4637                         fHitEnd = TRUE;
   4638                         fRequireEnd = TRUE;
   4639                         break;
   4640                     }
   4641                 } else {
   4642                     // Off the end of input.  Success.
   4643                     fHitEnd = TRUE;
   4644                     fRequireEnd = TRUE;
   4645                     break;
   4646                 }
   4647             }
   4648 
   4649             // Not at end of input.  Back-track out.
   4650             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4651             break;
   4652 
   4653 
   4654         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   4655             {
   4656                 if (fp->fInputIdx >= fAnchorLimit) {
   4657                     // We really are at the end of input.  Success.
   4658                     fHitEnd = TRUE;
   4659                     fRequireEnd = TRUE;
   4660                     break;
   4661                 }
   4662                 // If we are positioned just before a new-line, succeed.
   4663                 // It makes no difference where the new-line is within the input.
   4664                 UChar32 c = inputBuf[fp->fInputIdx];
   4665                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
   4666                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   4667                     //  In multi-line mode, hitting a new-line just before the end of input does not
   4668                     //   set the hitEnd or requireEnd flags
   4669                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4670                         break;
   4671                     }
   4672                 }
   4673                 // not at a new line.  Fail.
   4674                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4675             }
   4676             break;
   4677 
   4678 
   4679         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   4680             {
   4681                 if (fp->fInputIdx >= fAnchorLimit) {
   4682                     // We really are at the end of input.  Success.
   4683                     fHitEnd = TRUE;
   4684                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
   4685                     break;               //   adding a new-line would not lose the match.
   4686                 }
   4687                 // If we are not positioned just before a new-line, the test fails; backtrack out.
   4688                 // It makes no difference where the new-line is within the input.
   4689                 if (inputBuf[fp->fInputIdx] != 0x0a) {
   4690                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4691                 }
   4692             }
   4693             break;
   4694 
   4695 
   4696         case URX_CARET:                    //  ^, test for start of line
   4697             if (fp->fInputIdx != fAnchorStart) {
   4698                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4699             }
   4700             break;
   4701 
   4702 
   4703         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   4704             {
   4705                 if (fp->fInputIdx == fAnchorStart) {
   4706                     // We are at the start input.  Success.
   4707                     break;
   4708                 }
   4709                 // Check whether character just before the current pos is a new-line
   4710                 //   unless we are at the end of input
   4711                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4712                 if ((fp->fInputIdx < fAnchorLimit) &&
   4713                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   4714                     //  It's a new-line.  ^ is true.  Success.
   4715                     //  TODO:  what should be done with positions between a CR and LF?
   4716                     break;
   4717                 }
   4718                 // Not at the start of a line.  Fail.
   4719                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4720             }
   4721             break;
   4722 
   4723 
   4724         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   4725             {
   4726                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
   4727                 if (fp->fInputIdx <= fAnchorStart) {
   4728                     // We are at the start input.  Success.
   4729                     break;
   4730                 }
   4731                 // Check whether character just before the current pos is a new-line
   4732                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   4733                 UChar  c = inputBuf[fp->fInputIdx - 1];
   4734                 if (c != 0x0a) {
   4735                     // Not at the start of a line.  Back-track out.
   4736                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4737                 }
   4738             }
   4739             break;
   4740 
   4741         case URX_BACKSLASH_B:          // Test for word boundaries
   4742             {
   4743                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
   4744                 success ^= (opValue != 0);     // flip sense for \B
   4745                 if (!success) {
   4746                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4747                 }
   4748             }
   4749             break;
   4750 
   4751 
   4752         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   4753             {
   4754                 UBool success = isUWordBoundary(fp->fInputIdx);
   4755                 success ^= (opValue != 0);     // flip sense for \B
   4756                 if (!success) {
   4757                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4758                 }
   4759             }
   4760             break;
   4761 
   4762 
   4763         case URX_BACKSLASH_D:            // Test for decimal digit
   4764             {
   4765                 if (fp->fInputIdx >= fActiveLimit) {
   4766                     fHitEnd = TRUE;
   4767                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4768                     break;
   4769                 }
   4770 
   4771                 UChar32 c;
   4772                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4773                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   4774                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   4775                 success ^= (opValue != 0);        // flip sense for \D
   4776                 if (!success) {
   4777                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4778                 }
   4779             }
   4780             break;
   4781 
   4782 
   4783         case URX_BACKSLASH_G:          // Test for position at end of previous match
   4784             if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) {
   4785                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4786             }
   4787             break;
   4788 
   4789 
   4790         case URX_BACKSLASH_X:
   4791         //  Match a Grapheme, as defined by Unicode TR 29.
   4792         //  Differs slightly from Perl, which consumes combining marks independently
   4793         //    of context.
   4794         {
   4795 
   4796             // Fail if at end of input
   4797             if (fp->fInputIdx >= fActiveLimit) {
   4798                 fHitEnd = TRUE;
   4799                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4800                 break;
   4801             }
   4802 
   4803             // Examine (and consume) the current char.
   4804             //   Dispatch into a little state machine, based on the char.
   4805             UChar32  c;
   4806             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4807             UnicodeSet **sets = fPattern->fStaticSets;
   4808             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
   4809             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
   4810             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4811             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4812             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4813             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4814             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4815             goto GC_Extend;
   4816 
   4817 
   4818 
   4819 GC_L:
   4820             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4821             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4822             if (sets[URX_GC_L]->contains(c))       goto GC_L;
   4823             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
   4824             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
   4825             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4826             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4827             goto GC_Extend;
   4828 
   4829 GC_V:
   4830             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4831             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4832             if (sets[URX_GC_V]->contains(c))       goto GC_V;
   4833             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4834             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4835             goto GC_Extend;
   4836 
   4837 GC_T:
   4838             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
   4839             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4840             if (sets[URX_GC_T]->contains(c))       goto GC_T;
   4841             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
   4842             goto GC_Extend;
   4843 
   4844 GC_Extend:
   4845             // Combining characters are consumed here
   4846             for (;;) {
   4847                 if (fp->fInputIdx >= fActiveLimit) {
   4848                     break;
   4849                 }
   4850                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4851                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
   4852                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   4853                     break;
   4854                 }
   4855             }
   4856             goto GC_Done;
   4857 
   4858 GC_Control:
   4859             // Most control chars stand alone (don't combine with combining chars),
   4860             //   except for that CR/LF sequence is a single grapheme cluster.
   4861             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
   4862                 fp->fInputIdx++;
   4863             }
   4864 
   4865 GC_Done:
   4866             if (fp->fInputIdx >= fActiveLimit) {
   4867                 fHitEnd = TRUE;
   4868             }
   4869             break;
   4870         }
   4871 
   4872 
   4873 
   4874 
   4875         case URX_BACKSLASH_Z:          // Test for end of Input
   4876             if (fp->fInputIdx < fAnchorLimit) {
   4877                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4878             } else {
   4879                 fHitEnd = TRUE;
   4880                 fRequireEnd = TRUE;
   4881             }
   4882             break;
   4883 
   4884 
   4885 
   4886         case URX_STATIC_SETREF:
   4887             {
   4888                 // Test input character against one of the predefined sets
   4889                 //    (Word Characters, for example)
   4890                 // The high bit of the op value is a flag for the match polarity.
   4891                 //    0:   success if input char is in set.
   4892                 //    1:   success if input char is not in set.
   4893                 if (fp->fInputIdx >= fActiveLimit) {
   4894                     fHitEnd = TRUE;
   4895                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4896                     break;
   4897                 }
   4898 
   4899                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   4900                 opValue &= ~URX_NEG_SET;
   4901                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4902 
   4903                 UChar32 c;
   4904                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4905                 if (c < 256) {
   4906                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4907                     if (s8->contains(c)) {
   4908                         success = !success;
   4909                     }
   4910                 } else {
   4911                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4912                     if (s->contains(c)) {
   4913                         success = !success;
   4914                     }
   4915                 }
   4916                 if (!success) {
   4917                     #ifdef REGEX_SMART_BACKTRACKING
   4918                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   4919                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4920                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4921                             // Try to find it, backwards
   4922                             int64_t reverseIndex = fp->fInputIdx;
   4923                             U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   4924                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
   4925                             do {
   4926                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   4927                                 if (c < 256) {
   4928                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4929                                     if (s8->contains(c)) {
   4930                                         success = !success;
   4931                                     }
   4932                                 } else {
   4933                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4934                                     if (s->contains(c)) {
   4935                                         success = !success;
   4936                                     }
   4937                                 }
   4938                             } while (reverseIndex > backSearchIndex && !success);
   4939 
   4940                             if (success) {
   4941                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4942                                 fp->fInputIdx = reverseIndex;
   4943                                 if (fp->fInputIdx > backSearchIndex) {
   4944                                     fp = StateSave(fp, fp->fPatIdx, status);
   4945                                 }
   4946                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
   4947                                 break;
   4948                             }
   4949                         }
   4950                     }
   4951                     #endif
   4952                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4953                 }
   4954             }
   4955             break;
   4956 
   4957 
   4958         case URX_STAT_SETREF_N:
   4959             {
   4960                 // Test input character for NOT being a member of  one of
   4961                 //    the predefined sets (Word Characters, for example)
   4962                 if (fp->fInputIdx >= fActiveLimit) {
   4963                     fHitEnd = TRUE;
   4964                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   4965                     break;
   4966                 }
   4967 
   4968                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4969 
   4970                 UChar32  c;
   4971                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4972                 if (c < 256) {
   4973                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4974                     if (s8->contains(c) == FALSE) {
   4975                         break;
   4976                     }
   4977                 } else {
   4978                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
   4979                     if (s->contains(c) == FALSE) {
   4980                         break;
   4981                     }
   4982                 }
   4983 
   4984                 #ifdef REGEX_SMART_BACKTRACKING
   4985                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   4986                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   4987                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   4988                         // Try to find it, backwards
   4989                         int64_t reverseIndex = fp->fInputIdx;
   4990                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   4991                         UBool success = FALSE;
   4992                         do {
   4993                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   4994                             if (c < 256) {
   4995                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
   4996                                 if (s8->contains(c) == FALSE) {
   4997                                     success = TRUE;
   4998                                     break;
   4999                                 }
   5000                             } else {
   5001                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
   5002                                 if (s->contains(c) == FALSE) {
   5003                                     success = TRUE;
   5004                                     break;
   5005                                 }
   5006                             }
   5007                         } while (reverseIndex > backSearchIndex);
   5008 
   5009                         if (success) {
   5010                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5011                             fp->fInputIdx = reverseIndex;
   5012                             if (fp->fInputIdx > backSearchIndex) {
   5013                                 fp = StateSave(fp, fp->fPatIdx, status);
   5014                             }
   5015                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5016                             break;
   5017                         }
   5018                     }
   5019                 }
   5020                 #endif
   5021                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5022             }
   5023             break;
   5024 
   5025 
   5026         case URX_SETREF:
   5027             {
   5028                 if (fp->fInputIdx >= fActiveLimit) {
   5029                     fHitEnd = TRUE;
   5030                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5031                     break;
   5032                 }
   5033 
   5034                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5035 
   5036                 // There is input left.  Pick up one char and test it for set membership.
   5037                 UChar32  c;
   5038                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5039                 if (c<256) {
   5040                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5041                     if (s8->contains(c)) {
   5042                         // The character is in the set.  A Match.
   5043                         break;
   5044                     }
   5045                 } else {
   5046                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   5047                     if (s->contains(c)) {
   5048                         // The character is in the set.  A Match.
   5049                         break;
   5050                     }
   5051                 }
   5052 
   5053                 // the character wasn't in the set.
   5054                 #ifdef REGEX_SMART_BACKTRACKING
   5055                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5056                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5057                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5058                         // Try to find it, backwards
   5059                         int64_t reverseIndex = fp->fInputIdx;
   5060                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
   5061                         UBool success = FALSE;
   5062                         do {
   5063                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5064                             if (c < 256) {
   5065                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5066                                 if (s8->contains(c)) {
   5067                                     success = TRUE;
   5068                                     break;
   5069                                 }
   5070                             } else {
   5071                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
   5072                                 if (s->contains(c)) {
   5073                                     success = TRUE;
   5074                                     break;
   5075                                 }
   5076                             }
   5077                         } while (reverseIndex > backSearchIndex);
   5078 
   5079                         if (success) {
   5080                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5081                             fp->fInputIdx = reverseIndex;
   5082                             if (fp->fInputIdx > reverseIndex) {
   5083                                 fp = StateSave(fp, fp->fPatIdx, status);
   5084                             }
   5085                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5086                             break;
   5087                         }
   5088                     }
   5089                 }
   5090                 #endif
   5091                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5092             }
   5093             break;
   5094 
   5095 
   5096         case URX_DOTANY:
   5097             {
   5098                 // . matches anything, but stops at end-of-line.
   5099                 if (fp->fInputIdx >= fActiveLimit) {
   5100                     // At end of input.  Match failed.  Backtrack out.
   5101                     fHitEnd = TRUE;
   5102                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5103                     break;
   5104                 }
   5105 
   5106                 // There is input left.  Advance over one char, unless we've hit end-of-line
   5107                 UChar32  c;
   5108                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5109                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
   5110                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
   5111                     // End of line in normal mode.   . does not match.
   5112                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5113                     break;
   5114                 }
   5115             }
   5116             break;
   5117 
   5118 
   5119         case URX_DOTANY_ALL:
   5120             {
   5121                 // . in dot-matches-all (including new lines) mode
   5122                 if (fp->fInputIdx >= fActiveLimit) {
   5123                     // At end of input.  Match failed.  Backtrack out.
   5124                     fHitEnd = TRUE;
   5125                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5126                     break;
   5127                 }
   5128 
   5129                 // There is input left.  Advance over one char, except if we are
   5130                 //   at a cr/lf, advance over both of them.
   5131                 UChar32 c;
   5132                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5133                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   5134                     // In the case of a CR/LF, we need to advance over both.
   5135                     if (inputBuf[fp->fInputIdx] == 0x0a) {
   5136                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
   5137                     }
   5138                 }
   5139             }
   5140             break;
   5141 
   5142 
   5143         case URX_DOTANY_UNIX:
   5144             {
   5145                 // '.' operator, matches all, but stops at end-of-line.
   5146                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   5147                 if (fp->fInputIdx >= fActiveLimit) {
   5148                     // At end of input.  Match failed.  Backtrack out.
   5149                     fHitEnd = TRUE;
   5150                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5151                     break;
   5152                 }
   5153 
   5154                 // There is input left.  Advance over one char, unless we've hit end-of-line
   5155                 UChar32 c;
   5156                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5157                 if (c == 0x0a) {
   5158                     // End of line in normal mode.   '.' does not match the \n
   5159                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5160                 }
   5161             }
   5162             break;
   5163 
   5164 
   5165         case URX_JMP:
   5166             fp->fPatIdx = opValue;
   5167             break;
   5168 
   5169         case URX_FAIL:
   5170             isMatch = FALSE;
   5171             goto breakFromLoop;
   5172 
   5173         case URX_JMP_SAV:
   5174             U_ASSERT(opValue < fPattern->fCompiledPat->size());
   5175             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   5176             fp->fPatIdx = opValue;                         // Then JMP.
   5177             break;
   5178 
   5179         case URX_JMP_SAV_X:
   5180             // This opcode is used with (x)+, when x can match a zero length string.
   5181             // Same as JMP_SAV, except conditional on the match having made forward progress.
   5182             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   5183             //   data address of the input position at the start of the loop.
   5184             {
   5185                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   5186                 int32_t  stoOp = (int32_t)pat[opValue-1];
   5187                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   5188                 int32_t  frameLoc = URX_VAL(stoOp);
   5189                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   5190                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
   5191                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
   5192                 if (prevInputIdx < fp->fInputIdx) {
   5193                     // The match did make progress.  Repeat the loop.
   5194                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   5195                     fp->fPatIdx = opValue;
   5196                     fp->fExtra[frameLoc] = fp->fInputIdx;
   5197                 }
   5198                 // If the input position did not advance, we do nothing here,
   5199                 //   execution will fall out of the loop.
   5200             }
   5201             break;
   5202 
   5203         case URX_CTR_INIT:
   5204             {
   5205                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5206                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   5207 
   5208                 // Pick up the three extra operands that CTR_INIT has, and
   5209                 //    skip the pattern location counter past
   5210                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5211                 fp->fPatIdx += 3;
   5212                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5213                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5214                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5215                 U_ASSERT(minCount>=0);
   5216                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5217                 U_ASSERT(loopLoc>fp->fPatIdx);
   5218 
   5219                 if (minCount == 0) {
   5220                     fp = StateSave(fp, loopLoc+1, status);
   5221                 }
   5222                 if (maxCount == 0) {
   5223                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5224                 }
   5225             }
   5226             break;
   5227 
   5228         case URX_CTR_LOOP:
   5229             {
   5230                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5231                 int32_t initOp = (int32_t)pat[opValue];
   5232                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   5233                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5234                 int32_t minCount  = (int32_t)pat[opValue+2];
   5235                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5236                 // Increment the counter.  Note: we DIDN'T worry about counter
   5237                 //   overflow, since the data comes from UnicodeStrings, which
   5238                 //   stores its length in an int32_t. Do we have to think about
   5239                 //   this now that we're using UText? Probably not, since the length
   5240                 //   in UChar32s is still an int32_t.
   5241                 (*pCounter)++;
   5242                 U_ASSERT(*pCounter > 0);
   5243                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   5244                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   5245                     break;
   5246                 }
   5247                 if (*pCounter >= minCount) {
   5248                     fp = StateSave(fp, fp->fPatIdx, status);
   5249                 }
   5250                 fp->fPatIdx = opValue + 4;    // Loop back.
   5251             }
   5252             break;
   5253 
   5254         case URX_CTR_INIT_NG:
   5255             {
   5256                 // Initialize a non-greedy loop
   5257                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5258                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
   5259 
   5260                 // Pick up the three extra operands that CTR_INIT has, and
   5261                 //    skip the pattern location counter past
   5262                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5263                 fp->fPatIdx += 3;
   5264                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5265                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
   5266                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
   5267                 U_ASSERT(minCount>=0);
   5268                 U_ASSERT(maxCount>=minCount || maxCount==-1);
   5269                 U_ASSERT(loopLoc>fp->fPatIdx);
   5270 
   5271                 if (minCount == 0) {
   5272                     if (maxCount != 0) {
   5273                         fp = StateSave(fp, fp->fPatIdx, status);
   5274                     }
   5275                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   5276                 }
   5277             }
   5278             break;
   5279 
   5280         case URX_CTR_LOOP_NG:
   5281             {
   5282                 // Non-greedy {min, max} loops
   5283                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5284                 int32_t initOp = (int32_t)pat[opValue];
   5285                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   5286                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5287                 int32_t minCount  = (int32_t)pat[opValue+2];
   5288                 int32_t maxCount  = (int32_t)pat[opValue+3];
   5289                 // Increment the counter.  Note: we DIDN'T worry about counter
   5290                 //   overflow, since the data comes from UnicodeStrings, which
   5291                 //   stores its length in an int32_t. Do we have to think about
   5292                 //   this now that we're using UText? Probably not, since the length
   5293                 //   in UChar32s is still an int32_t.
   5294                 (*pCounter)++;
   5295                 U_ASSERT(*pCounter > 0);
   5296 
   5297                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
   5298                     // The loop has matched the maximum permitted number of times.
   5299                     //   Break out of here with no action.  Matching will
   5300                     //   continue with the following pattern.
   5301                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
   5302                     break;
   5303                 }
   5304 
   5305                 if (*pCounter < minCount) {
   5306                     // We haven't met the minimum number of matches yet.
   5307                     //   Loop back for another one.
   5308                     fp->fPatIdx = opValue + 4;    // Loop back.
   5309                 } else {
   5310                     // We do have the minimum number of matches.
   5311                     //   Fall into the following pattern, but first do
   5312                     //   a state save to the top of the loop, so that a failure
   5313                     //   in the following pattern will try another iteration of the loop.
   5314                     fp = StateSave(fp, opValue + 4, status);
   5315                 }
   5316             }
   5317             break;
   5318 
   5319         case URX_STO_SP:
   5320             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5321             fData[opValue] = fStack->size();
   5322             break;
   5323 
   5324         case URX_LD_SP:
   5325             {
   5326                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5327                 int32_t newStackSize = (int32_t)fData[opValue];
   5328                 U_ASSERT(newStackSize <= fStack->size());
   5329                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5330                 if (newFP == (int64_t *)fp) {
   5331                     break;
   5332                 }
   5333                 int32_t i;
   5334                 for (i=0; i<fFrameSize; i++) {
   5335                     newFP[i] = ((int64_t *)fp)[i];
   5336                 }
   5337                 fp = (REStackFrame *)newFP;
   5338                 fStack->setSize(newStackSize);
   5339             }
   5340             break;
   5341 
   5342         case URX_BACKREF:
   5343         case URX_BACKREF_I:
   5344             {
   5345                 U_ASSERT(opValue < fFrameSize);
   5346                 int64_t groupStartIdx = fp->fExtra[opValue];
   5347                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5348                 U_ASSERT(groupStartIdx <= groupEndIdx);
   5349                 int64_t len = groupEndIdx-groupStartIdx;
   5350                 if (groupStartIdx < 0) {
   5351                     // This capture group has not participated in the match thus far,
   5352                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5353                 }
   5354 
   5355                 if (len == 0) {
   5356                         //   The capture group match was of an empty string.
   5357                         //   Verified by testing:  Perl matches succeed in this case, so
   5358                         //   we do too.
   5359                         break;
   5360                     }
   5361 
   5362                 UBool  haveMatch = FALSE;
   5363                 if (fp->fInputIdx + len <= fActiveLimit) {
   5364                     if (opType == URX_BACKREF) {
   5365                         if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
   5366                             haveMatch = TRUE;
   5367                         }
   5368                     } else {
   5369                         if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
   5370                                   (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
   5371                             haveMatch = TRUE;
   5372                         }
   5373                     }
   5374                 } else {
   5375                     // TODO: probably need to do a partial string comparison, and only
   5376                     //       set HitEnd if the available input matched.  Ticket #6074
   5377                     fHitEnd = TRUE;
   5378                 }
   5379                 if (haveMatch) {
   5380                     fp->fInputIdx += len;     // Match.  Advance current input position.
   5381                 } else {
   5382                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
   5383                 }
   5384             }
   5385             break;
   5386 
   5387         case URX_STO_INP_LOC:
   5388             {
   5389                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   5390                 fp->fExtra[opValue] = fp->fInputIdx;
   5391             }
   5392             break;
   5393 
   5394         case URX_JMPX:
   5395             {
   5396                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
   5397                 fp->fPatIdx += 1;
   5398                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   5399                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   5400                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
   5401                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
   5402                 if (savedInputIdx < fp->fInputIdx) {
   5403                     fp->fPatIdx = opValue;                               // JMP
   5404                 } else {
   5405                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
   5406                 }
   5407             }
   5408             break;
   5409 
   5410         case URX_LA_START:
   5411             {
   5412                 // Entering a lookahead block.
   5413                 // Save Stack Ptr, Input Pos.
   5414                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5415                 fData[opValue]   = fStack->size();
   5416                 fData[opValue+1] = fp->fInputIdx;
   5417                 fActiveStart     = fLookStart;          // Set the match region change for
   5418                 fActiveLimit     = fLookLimit;          //   transparent bounds.
   5419             }
   5420             break;
   5421 
   5422         case URX_LA_END:
   5423             {
   5424                 // Leaving a look-ahead block.
   5425                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   5426                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5427                 int32_t stackSize = fStack->size();
   5428                 int32_t newStackSize = (int32_t)fData[opValue];
   5429                 U_ASSERT(stackSize >= newStackSize);
   5430                 if (stackSize > newStackSize) {
   5431                     // Copy the current top frame back to the new (cut back) top frame.
   5432                     //   This makes the capture groups from within the look-ahead
   5433                     //   expression available.
   5434                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5435                     int32_t i;
   5436                     for (i=0; i<fFrameSize; i++) {
   5437                         newFP[i] = ((int64_t *)fp)[i];
   5438                     }
   5439                     fp = (REStackFrame *)newFP;
   5440                     fStack->setSize(newStackSize);
   5441                 }
   5442                 fp->fInputIdx = fData[opValue+1];
   5443 
   5444                 // Restore the active region bounds in the input string; they may have
   5445                 //    been changed because of transparent bounds on a Region.
   5446                 fActiveStart = fRegionStart;
   5447                 fActiveLimit = fRegionLimit;
   5448             }
   5449             break;
   5450 
   5451         case URX_ONECHAR_I:
   5452             if (fp->fInputIdx < fActiveLimit) {
   5453                 UChar32 c;
   5454                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5455                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5456                     break;
   5457                 }
   5458             } else {
   5459                 fHitEnd = TRUE;
   5460             }
   5461 
   5462             #ifdef REGEX_SMART_BACKTRACKING
   5463             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
   5464                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5465                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5466                     UBool success = FALSE;
   5467                     int64_t reverseIndex = fp->fInputIdx;
   5468                     UChar32 c;
   5469                     while (reverseIndex > backSearchIndex) {
   5470                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5471                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5472                             success = TRUE;
   5473                             break;
   5474                         } else if (c == U_SENTINEL) {
   5475                             break;
   5476                         }
   5477                     }
   5478                     if (success) {
   5479                         fHitEnd = FALSE;
   5480                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5481                         fp->fInputIdx = reverseIndex;
   5482                         if (fp->fInputIdx > backSearchIndex) {
   5483                             fp = StateSave(fp, fp->fPatIdx, status);
   5484                         }
   5485                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5486                         break;
   5487                     }
   5488                 }
   5489             }
   5490             #endif
   5491 
   5492             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5493             break;
   5494 
   5495         case URX_STRING_I:
   5496             {
   5497                 // Test input against a literal string.
   5498                 // Strings require two slots in the compiled pattern, one for the
   5499                 //   offset to the string text, and one for the length.
   5500                 const UCaseProps *csp = ucase_getSingleton(&status);
   5501                 if (U_SUCCESS(status)) {
   5502                     int32_t stringStartIdx, stringLen;
   5503                     stringStartIdx = opValue;
   5504 
   5505                     op      = (int32_t)pat[fp->fPatIdx];
   5506                     fp->fPatIdx++;
   5507                     opType  = URX_TYPE(op);
   5508                     opValue = URX_VAL(op);
   5509                     U_ASSERT(opType == URX_STRING_LEN);
   5510                     stringLen = opValue;
   5511 
   5512                     const UChar *patternChars = litText+stringStartIdx;
   5513                     const UChar *patternEnd = patternChars+stringLen;
   5514 
   5515                     const UChar *foldChars = NULL;
   5516                     int32_t foldOffset, foldLength;
   5517                     UChar32 c;
   5518 
   5519                     #ifdef REGEX_SMART_BACKTRACKING
   5520                     int32_t originalInputIdx = fp->fInputIdx;
   5521                     #endif
   5522                     UBool success = TRUE;
   5523 
   5524                     foldOffset = foldLength = 0;
   5525 
   5526                     while (patternChars < patternEnd && success) {
   5527                         if(foldOffset < foldLength) {
   5528                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5529                         } else {
   5530                             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5531                             foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   5532                             if(foldLength >= 0) {
   5533                                 if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   5534                                     foldOffset = 0;
   5535                                     U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5536                                 } else {
   5537                                     c = foldLength;
   5538                                     foldLength = foldOffset; // to avoid reading chars from the folding buffer
   5539                                 }
   5540                             }
   5541                         }
   5542 
   5543                         if (fp->fInputIdx <= fActiveLimit) {
   5544                             if (U_IS_BMP(c)) {
   5545                                 success = (*patternChars == c);
   5546                                 patternChars += 1;
   5547                             } else if (patternChars+1 < patternEnd) {
   5548                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
   5549                                 patternChars += 2;
   5550                             }
   5551                         } else {
   5552                             success = FALSE;
   5553                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
   5554                         }
   5555                     }
   5556 
   5557                     if (!success) {
   5558                         #ifdef REGEX_SMART_BACKTRACKING
   5559                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
   5560                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
   5561                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
   5562                                 // Reset to last start point
   5563                                 int64_t reverseIndex = originalInputIdx;
   5564                                 patternChars = litText+stringStartIdx;
   5565 
   5566                                 // Search backwards for a possible start
   5567                                 do {
   5568                                     U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
   5569                                     foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
   5570                                     if(foldLength >= 0) {
   5571                                         if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
   5572                                             foldOffset = 0;
   5573                                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
   5574                                         } else {
   5575                                             c = foldLength;
   5576                                             foldLength = foldOffset; // to avoid reading chars from the folding buffer
   5577                                         }
   5578                                     }
   5579 
   5580                                     if ((U_IS_BMP(c) && *patternChars == c) ||
   5581                                            (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
   5582                                         success = TRUE;
   5583                                         break;
   5584                                     }
   5585                                 } while (reverseIndex > backSearchIndex);
   5586 
   5587                                 // And try again
   5588                                 if (success) {
   5589                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5590                                     fp->fInputIdx = reverseIndex;
   5591                                     if (fp->fInputIdx > backSearchIndex) {
   5592                                         fp = StateSave(fp, fp->fPatIdx, status);
   5593                                     }
   5594                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
   5595                                     break;
   5596                                 }
   5597                             }
   5598                         }
   5599                         #endif
   5600                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5601                     }
   5602                 }
   5603             }
   5604             break;
   5605 
   5606         case URX_LB_START:
   5607             {
   5608                 // Entering a look-behind block.
   5609                 // Save Stack Ptr, Input Pos.
   5610                 //   TODO:  implement transparent bounds.  Ticket #6067
   5611                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5612                 fData[opValue]   = fStack->size();
   5613                 fData[opValue+1] = fp->fInputIdx;
   5614                 // Init the variable containing the start index for attempted matches.
   5615                 fData[opValue+2] = -1;
   5616                 // Save input string length, then reset to pin any matches to end at
   5617                 //   the current position.
   5618                 fData[opValue+3] = fActiveLimit;
   5619                 fActiveLimit     = fp->fInputIdx;
   5620             }
   5621             break;
   5622 
   5623 
   5624         case URX_LB_CONT:
   5625             {
   5626                 // Positive Look-Behind, at top of loop checking for matches of LB expression
   5627                 //    at all possible input starting positions.
   5628 
   5629                 // Fetch the min and max possible match lengths.  They are the operands
   5630                 //   of this op in the pattern.
   5631                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
   5632                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
   5633                 U_ASSERT(minML <= maxML);
   5634                 U_ASSERT(minML >= 0);
   5635 
   5636                 // Fetch (from data) the last input index where a match was attempted.
   5637                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5638                 int64_t  *lbStartIdx = &fData[opValue+2];
   5639                 if (*lbStartIdx < 0) {
   5640                     // First time through loop.
   5641                     *lbStartIdx = fp->fInputIdx - minML;
   5642                 } else {
   5643                     // 2nd through nth time through the loop.
   5644                     // Back up start position for match by one.
   5645                     if (*lbStartIdx == 0) {
   5646                         (*lbStartIdx)--;
   5647                     } else {
   5648                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5649                     }
   5650                 }
   5651 
   5652                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5653                     // We have tried all potential match starting points without
   5654                     //  getting a match.  Backtrack out, and out of the
   5655                     //   Look Behind altogether.
   5656                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5657                     int64_t restoreInputLen = fData[opValue+3];
   5658                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5659                     U_ASSERT(restoreInputLen <= fInputLength);
   5660                     fActiveLimit = restoreInputLen;
   5661                     break;
   5662                 }
   5663 
   5664                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5665                 //      (successful match will fall off the end of the loop.)
   5666                 fp = StateSave(fp, fp->fPatIdx-3, status);
   5667                 fp->fInputIdx =  *lbStartIdx;
   5668             }
   5669             break;
   5670 
   5671         case URX_LB_END:
   5672             // End of a look-behind block, after a successful match.
   5673             {
   5674                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5675                 if (fp->fInputIdx != fActiveLimit) {
   5676                     //  The look-behind expression matched, but the match did not
   5677                     //    extend all the way to the point that we are looking behind from.
   5678                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5679                     //     will retry the match starting at another position or fail
   5680                     //     the look-behind altogether, whichever is appropriate.
   5681                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5682                     break;
   5683                 }
   5684 
   5685                 // Look-behind match is good.  Restore the orignal input string length,
   5686                 //   which had been truncated to pin the end of the lookbehind match to the
   5687                 //   position being looked-behind.
   5688                 int64_t originalInputLen = fData[opValue+3];
   5689                 U_ASSERT(originalInputLen >= fActiveLimit);
   5690                 U_ASSERT(originalInputLen <= fInputLength);
   5691                 fActiveLimit = originalInputLen;
   5692             }
   5693             break;
   5694 
   5695 
   5696         case URX_LBN_CONT:
   5697             {
   5698                 // Negative Look-Behind, at top of loop checking for matches of LB expression
   5699                 //    at all possible input starting positions.
   5700 
   5701                 // Fetch the extra parameters of this op.
   5702                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
   5703                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
   5704                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
   5705                 continueLoc = URX_VAL(continueLoc);
   5706                 U_ASSERT(minML <= maxML);
   5707                 U_ASSERT(minML >= 0);
   5708                 U_ASSERT(continueLoc > fp->fPatIdx);
   5709 
   5710                 // Fetch (from data) the last input index where a match was attempted.
   5711                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5712                 int64_t  *lbStartIdx = &fData[opValue+2];
   5713                 if (*lbStartIdx < 0) {
   5714                     // First time through loop.
   5715                     *lbStartIdx = fp->fInputIdx - minML;
   5716                 } else {
   5717                     // 2nd through nth time through the loop.
   5718                     // Back up start position for match by one.
   5719                     if (*lbStartIdx == 0) {
   5720                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
   5721                     } else {
   5722                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
   5723                     }
   5724                 }
   5725 
   5726                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
   5727                     // We have tried all potential match starting points without
   5728                     //  getting a match, which means that the negative lookbehind as
   5729                     //  a whole has succeeded.  Jump forward to the continue location
   5730                     int64_t restoreInputLen = fData[opValue+3];
   5731                     U_ASSERT(restoreInputLen >= fActiveLimit);
   5732                     U_ASSERT(restoreInputLen <= fInputLength);
   5733                     fActiveLimit = restoreInputLen;
   5734                     fp->fPatIdx = continueLoc;
   5735                     break;
   5736                 }
   5737 
   5738                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5739                 //      (successful match will cause a FAIL out of the loop altogether.)
   5740                 fp = StateSave(fp, fp->fPatIdx-4, status);
   5741                 fp->fInputIdx =  *lbStartIdx;
   5742             }
   5743             break;
   5744 
   5745         case URX_LBN_END:
   5746             // End of a negative look-behind block, after a successful match.
   5747             {
   5748                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5749                 if (fp->fInputIdx != fActiveLimit) {
   5750                     //  The look-behind expression matched, but the match did not
   5751                     //    extend all the way to the point that we are looking behind from.
   5752                     //  FAIL out of here, which will take us back to the LB_CONT, which
   5753                     //     will retry the match starting at another position or succeed
   5754                     //     the look-behind altogether, whichever is appropriate.
   5755                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5756                     break;
   5757                 }
   5758 
   5759                 // Look-behind expression matched, which means look-behind test as
   5760                 //   a whole Fails
   5761 
   5762                 //   Restore the orignal input string length, which had been truncated
   5763                 //   inorder to pin the end of the lookbehind match
   5764                 //   to the position being looked-behind.
   5765                 int64_t originalInputLen = fData[opValue+3];
   5766                 U_ASSERT(originalInputLen >= fActiveLimit);
   5767                 U_ASSERT(originalInputLen <= fInputLength);
   5768                 fActiveLimit = originalInputLen;
   5769 
   5770                 // Restore original stack position, discarding any state saved
   5771                 //   by the successful pattern match.
   5772                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5773                 int32_t newStackSize = (int32_t)fData[opValue];
   5774                 U_ASSERT(fStack->size() > newStackSize);
   5775                 fStack->setSize(newStackSize);
   5776 
   5777                 //  FAIL, which will take control back to someplace
   5778                 //  prior to entering the look-behind test.
   5779                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
   5780             }
   5781             break;
   5782 
   5783 
   5784         case URX_LOOP_SR_I:
   5785             // Loop Initialization for the optimized implementation of
   5786             //     [some character set]*
   5787             //   This op scans through all matching input.
   5788             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5789             {
   5790                 U_ASSERT(opValue > 0 && opValue < sets->size());
   5791                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5792                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
   5793 
   5794                 // Loop through input, until either the input is exhausted or
   5795                 //   we reach a character that is not a member of the set.
   5796                 int32_t ix = (int32_t)fp->fInputIdx;
   5797                 for (;;) {
   5798                     if (ix >= fActiveLimit) {
   5799                         fHitEnd = TRUE;
   5800                         break;
   5801                     }
   5802                     UChar32   c;
   5803                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
   5804                     if (c<256) {
   5805                         if (s8->contains(c) == FALSE) {
   5806                             U16_BACK_1(inputBuf, 0, ix);
   5807                             break;
   5808                         }
   5809                     } else {
   5810                         if (s->contains(c) == FALSE) {
   5811                             U16_BACK_1(inputBuf, 0, ix);
   5812                             break;
   5813                         }
   5814                     }
   5815                 }
   5816 
   5817                 // If there were no matching characters, skip over the loop altogether.
   5818                 //   The loop doesn't run at all, a * op always succeeds.
   5819                 if (ix == fp->fInputIdx) {
   5820                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5821                     break;
   5822                 }
   5823 
   5824                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5825                 //   must follow.  It's operand is the stack location
   5826                 //   that holds the starting input index for the match of this [set]*
   5827                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5828                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5829                 int32_t stackLoc = URX_VAL(loopcOp);
   5830                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5831                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5832                 #ifdef REGEX_SMART_BACKTRACKING
   5833                 backSearchIndex = fp->fInputIdx;
   5834                 #endif
   5835                 fp->fInputIdx = ix;
   5836 
   5837                 // Save State to the URX_LOOP_C op that follows this one,
   5838                 //   so that match failures in the following code will return to there.
   5839                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5840                 fp = StateSave(fp, fp->fPatIdx, status);
   5841                 fp->fPatIdx++;
   5842             }
   5843             break;
   5844 
   5845 
   5846         case URX_LOOP_DOT_I:
   5847             // Loop Initialization for the optimized implementation of .*
   5848             //   This op scans through all remaining input.
   5849             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5850             {
   5851                 // Loop through input until the input is exhausted (we reach an end-of-line)
   5852                 // In DOTALL mode, we can just go straight to the end of the input.
   5853                 int32_t ix;
   5854                 if ((opValue & 1) == 1) {
   5855                     // Dot-matches-All mode.  Jump straight to the end of the string.
   5856                     ix = (int32_t)fActiveLimit;
   5857                     fHitEnd = TRUE;
   5858                 } else {
   5859                     // NOT DOT ALL mode.  Line endings do not match '.'
   5860                     // Scan forward until a line ending or end of input.
   5861                     ix = (int32_t)fp->fInputIdx;
   5862                     for (;;) {
   5863                         if (ix >= fActiveLimit) {
   5864                             fHitEnd = TRUE;
   5865                             break;
   5866                         }
   5867                         UChar32   c;
   5868                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
   5869                         if ((c & 0x7f) <= 0x29) {        // Fast filter of non-new-line-s
   5870                             if ((c == 0x0a) ||            //  0x0a is newline in both modes.
   5871                                 ((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   5872                                 (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) {
   5873                                 //  char is a line ending.  Put the input pos back to the
   5874                                 //    line ending char, and exit the scanning loop.
   5875                                 U16_BACK_1(inputBuf, 0, ix);
   5876                                 break;
   5877                             }
   5878                         }
   5879                     }
   5880                 }
   5881 
   5882                 // If there were no matching characters, skip over the loop altogether.
   5883                 //   The loop doesn't run at all, a * op always succeeds.
   5884                 if (ix == fp->fInputIdx) {
   5885                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5886                     break;
   5887                 }
   5888 
   5889                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5890                 //   must follow.  It's operand is the stack location
   5891                 //   that holds the starting input index for the match of this .*
   5892                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
   5893                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5894                 int32_t stackLoc = URX_VAL(loopcOp);
   5895                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5896                 fp->fExtra[stackLoc] = fp->fInputIdx;
   5897                 #ifdef REGEX_SMART_BACKTRACKING
   5898                 backSearchIndex = fp->fInputIdx;
   5899                 #endif
   5900                 fp->fInputIdx = ix;
   5901 
   5902                 // Save State to the URX_LOOP_C op that follows this one,
   5903                 //   so that match failures in the following code will return to there.
   5904                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5905                 fp = StateSave(fp, fp->fPatIdx, status);
   5906                 fp->fPatIdx++;
   5907             }
   5908             break;
   5909 
   5910 
   5911         case URX_LOOP_C:
   5912             {
   5913                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
   5914                 backSearchIndex = (int32_t)fp->fExtra[opValue];
   5915                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
   5916                 if (backSearchIndex == fp->fInputIdx) {
   5917                     // We've backed up the input idx to the point that the loop started.
   5918                     // The loop is done.  Leave here without saving state.
   5919                     //  Subsequent failures won't come back here.
   5920                     break;
   5921                 }
   5922                 // Set up for the next iteration of the loop, with input index
   5923                 //   backed up by one from the last time through,
   5924                 //   and a state save to this instruction in case the following code fails again.
   5925                 //   (We're going backwards because this loop emulates stack unwinding, not
   5926                 //    the initial scan forward.)
   5927                 U_ASSERT(fp->fInputIdx > 0);
   5928                 UChar32 prevC;
   5929                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
   5930 
   5931                 if (prevC == 0x0a &&
   5932                     fp->fInputIdx > backSearchIndex &&
   5933                     inputBuf[fp->fInputIdx-1] == 0x0d) {
   5934                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
   5935                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   5936                         // .*, stepping back over CRLF pair.
   5937                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   5938                     }
   5939                 }
   5940 
   5941 
   5942                 fp = StateSave(fp, fp->fPatIdx-1, status);
   5943             }
   5944             break;
   5945 
   5946 
   5947 
   5948         default:
   5949             // Trouble.  The compiled pattern contains an entry with an
   5950             //           unrecognized type tag.
   5951             U_ASSERT(FALSE);
   5952         }
   5953 
   5954         if (U_FAILURE(status)) {
   5955             isMatch = FALSE;
   5956             break;
   5957         }
   5958     }
   5959 
   5960 breakFromLoop:
   5961     fMatch = isMatch;
   5962     if (isMatch) {
   5963         fLastMatchEnd = fMatchEnd;
   5964         fMatchStart   = startIdx;
   5965         fMatchEnd     = fp->fInputIdx;
   5966         if (fTraceDebug) {
   5967             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
   5968         }
   5969     }
   5970     else
   5971     {
   5972         if (fTraceDebug) {
   5973             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
   5974         }
   5975     }
   5976 
   5977     fFrame = fp;                // The active stack frame when the engine stopped.
   5978     //   Contains the capture group results that we need to
   5979     //    access later.
   5980 
   5981     return;
   5982 }
   5983 
   5984 
   5985 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
   5986 
   5987 U_NAMESPACE_END
   5988 
   5989 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   5990 
   5991