Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  file:  regexcmp.cpp
      5 //
      6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains the ICU regular expression compiler, which is responsible
     10 //  for processing a regular expression pattern into the compiled form that
     11 //  is used by the match finding engine.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     17 
     18 #include "unicode/ustring.h"
     19 #include "unicode/unistr.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/uchriter.h"
     23 #include "unicode/parsepos.h"
     24 #include "unicode/parseerr.h"
     25 #include "unicode/regex.h"
     26 #include "unicode/utf.h"
     27 #include "unicode/utf16.h"
     28 #include "patternprops.h"
     29 #include "putilimp.h"
     30 #include "cmemory.h"
     31 #include "cstr.h"
     32 #include "cstring.h"
     33 #include "uvectr32.h"
     34 #include "uvectr64.h"
     35 #include "uassert.h"
     36 #include "uinvchar.h"
     37 
     38 #include "regeximp.h"
     39 #include "regexcst.h"   // Contains state table for the regex pattern parser.
     40                         //   generated by a Perl script.
     41 #include "regexcmp.h"
     42 #include "regexst.h"
     43 #include "regextxt.h"
     44 
     45 
     46 
     47 U_NAMESPACE_BEGIN
     48 
     49 
     50 //------------------------------------------------------------------------------
     51 //
     52 //  Constructor.
     53 //
     54 //------------------------------------------------------------------------------
     55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
     56    fParenStack(status), fSetStack(status), fSetOpStack(status)
     57 {
     58     // Lazy init of all shared global sets (needed for init()'s empty text)
     59     RegexStaticSets::initGlobals(&status);
     60 
     61     fStatus           = &status;
     62 
     63     fRXPat            = rxp;
     64     fScanIndex        = 0;
     65     fLastChar         = -1;
     66     fPeekChar         = -1;
     67     fLineNum          = 1;
     68     fCharNum          = 0;
     69     fQuoteMode        = FALSE;
     70     fInBackslashQuote = FALSE;
     71     fModeFlags        = fRXPat->fFlags | 0x80000000;
     72     fEOLComments      = TRUE;
     73 
     74     fMatchOpenParen   = -1;
     75     fMatchCloseParen  = -1;
     76     fCaptureName      = NULL;
     77     fLastSetLiteral   = U_SENTINEL;
     78 
     79     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
     80         status = rxp->fDeferredStatus;
     81     }
     82 }
     83 
     84 static const UChar      chAmp       = 0x26;      // '&'
     85 static const UChar      chDash      = 0x2d;      // '-'
     86 
     87 
     88 //------------------------------------------------------------------------------
     89 //
     90 //  Destructor
     91 //
     92 //------------------------------------------------------------------------------
     93 RegexCompile::~RegexCompile() {
     94     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
     95                                  //   compilation stops with a syntax error.
     96 }
     97 
     98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
     99     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
    100 }
    101 
    102 //------------------------------------------------------------------------------
    103 //
    104 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
    105 //                           The state tables are hand-written in the file regexcst.txt,
    106 //                           and converted to the form used here by a perl
    107 //                           script regexcst.pl
    108 //
    109 //------------------------------------------------------------------------------
    110 void    RegexCompile::compile(
    111                          const UnicodeString &pat,   // Source pat to be compiled.
    112                          UParseError &pp,            // Error position info
    113                          UErrorCode &e)              // Error Code
    114 {
    115     fRXPat->fPatternString = new UnicodeString(pat);
    116     UText patternText = UTEXT_INITIALIZER;
    117     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
    118 
    119     if (U_SUCCESS(e)) {
    120         compile(&patternText, pp, e);
    121         utext_close(&patternText);
    122     }
    123 }
    124 
    125 //
    126 //   compile, UText mode
    127 //     All the work is actually done here.
    128 //
    129 void    RegexCompile::compile(
    130                          UText *pat,                 // Source pat to be compiled.
    131                          UParseError &pp,            // Error position info
    132                          UErrorCode &e)              // Error Code
    133 {
    134     fStatus             = &e;
    135     fParseErr           = &pp;
    136     fStackPtr           = 0;
    137     fStack[fStackPtr]   = 0;
    138 
    139     if (U_FAILURE(*fStatus)) {
    140         return;
    141     }
    142 
    143     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
    144     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
    145 
    146     // Prepare the RegexPattern object to receive the compiled pattern.
    147     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
    148     if (U_FAILURE(*fStatus)) {
    149         return;
    150     }
    151     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
    152     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
    153 
    154 
    155     // Initialize the pattern scanning state machine
    156     fPatternLength = utext_nativeLength(pat);
    157     uint16_t                state = 1;
    158     const RegexTableEl      *tableEl;
    159 
    160     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
    161     if (fModeFlags & UREGEX_LITERAL) {
    162         fQuoteMode = TRUE;
    163     }
    164 
    165     nextChar(fC);                        // Fetch the first char from the pattern string.
    166 
    167     //
    168     // Main loop for the regex pattern parsing state machine.
    169     //   Runs once per state transition.
    170     //   Each time through optionally performs, depending on the state table,
    171     //      - an advance to the the next pattern char
    172     //      - an action to be performed.
    173     //      - pushing or popping a state to/from the local state return stack.
    174     //   file regexcst.txt is the source for the state table.  The logic behind
    175     //     recongizing the pattern syntax is there, not here.
    176     //
    177     for (;;) {
    178         //  Bail out if anything has gone wrong.
    179         //  Regex pattern parsing stops on the first error encountered.
    180         if (U_FAILURE(*fStatus)) {
    181             break;
    182         }
    183 
    184         U_ASSERT(state != 0);
    185 
    186         // Find the state table element that matches the input char from the pattern, or the
    187         //    class of the input character.  Start with the first table row for this
    188         //    state, then linearly scan forward until we find a row that matches the
    189         //    character.  The last row for each state always matches all characters, so
    190         //    the search will stop there, if not before.
    191         //
    192         tableEl = &gRuleParseStateTable[state];
    193         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
    194             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
    195 
    196         for (;;) {    // loop through table rows belonging to this state, looking for one
    197                       //   that matches the current input char.
    198             REGEX_SCAN_DEBUG_PRINTF(("."));
    199             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
    200                 // Table row specified an individual character, not a set, and
    201                 //   the input character is not quoted, and
    202                 //   the input character matched it.
    203                 break;
    204             }
    205             if (tableEl->fCharClass == 255) {
    206                 // Table row specified default, match anything character class.
    207                 break;
    208             }
    209             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
    210                 // Table row specified "quoted" and the char was quoted.
    211                 break;
    212             }
    213             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
    214                 // Table row specified eof and we hit eof on the input.
    215                 break;
    216             }
    217 
    218             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
    219                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
    220                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
    221                 U_ASSERT(tableEl->fCharClass <= 137);
    222                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
    223                     // Table row specified a character class, or set of characters,
    224                     //   and the current char matches it.
    225                     break;
    226                 }
    227             }
    228 
    229             // No match on this row, advance to the next  row for this state,
    230             tableEl++;
    231         }
    232         REGEX_SCAN_DEBUG_PRINTF(("\n"));
    233 
    234         //
    235         // We've found the row of the state table that matches the current input
    236         //   character from the rules string.
    237         // Perform any action specified  by this row in the state table.
    238         if (doParseActions(tableEl->fAction) == FALSE) {
    239             // Break out of the state machine loop if the
    240             //   the action signalled some kind of error, or
    241             //   the action was to exit, occurs on normal end-of-rules-input.
    242             break;
    243         }
    244 
    245         if (tableEl->fPushState != 0) {
    246             fStackPtr++;
    247             if (fStackPtr >= kStackSize) {
    248                 error(U_REGEX_INTERNAL_ERROR);
    249                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
    250                 fStackPtr--;
    251             }
    252             fStack[fStackPtr] = tableEl->fPushState;
    253         }
    254 
    255         //
    256         //  NextChar.  This is where characters are actually fetched from the pattern.
    257         //             Happens under control of the 'n' tag in the state table.
    258         //
    259         if (tableEl->fNextChar) {
    260             nextChar(fC);
    261         }
    262 
    263         // Get the next state from the table entry, or from the
    264         //   state stack if the next state was specified as "pop".
    265         if (tableEl->fNextState != 255) {
    266             state = tableEl->fNextState;
    267         } else {
    268             state = fStack[fStackPtr];
    269             fStackPtr--;
    270             if (fStackPtr < 0) {
    271                 // state stack underflow
    272                 // This will occur if the user pattern has mis-matched parentheses,
    273                 //   with extra close parens.
    274                 //
    275                 fStackPtr++;
    276                 error(U_REGEX_MISMATCHED_PAREN);
    277             }
    278         }
    279 
    280     }
    281 
    282     if (U_FAILURE(*fStatus)) {
    283         // Bail out if the pattern had errors.
    284         //   Set stack cleanup:  a successful compile would have left it empty,
    285         //   but errors can leave temporary sets hanging around.
    286         while (!fSetStack.empty()) {
    287             delete (UnicodeSet *)fSetStack.pop();
    288         }
    289         return;
    290     }
    291 
    292     //
    293     // The pattern has now been read and processed, and the compiled code generated.
    294     //
    295 
    296     //
    297     // The pattern's fFrameSize so far has accumulated the requirements for
    298     //   storage for capture parentheses, counters, etc. that are encountered
    299     //   in the pattern.  Add space for the two variables that are always
    300     //   present in the saved state:  the input string position (int64_t) and
    301     //   the position in the compiled pattern.
    302     //
    303     allocateStackData(RESTACKFRAME_HDRCOUNT);
    304 
    305     //
    306     // Optimization pass 1: NOPs, back-references, and case-folding
    307     //
    308     stripNOPs();
    309 
    310     //
    311     // Get bounds for the minimum and maximum length of a string that this
    312     //   pattern can match.  Used to avoid looking for matches in strings that
    313     //   are too short.
    314     //
    315     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
    316 
    317     //
    318     // Optimization pass 2: match start type
    319     //
    320     matchStartType();
    321 
    322     //
    323     // Set up fast latin-1 range sets
    324     //
    325     int32_t numSets = fRXPat->fSets->size();
    326     fRXPat->fSets8 = new Regex8BitSet[numSets];
    327     // Null pointer check.
    328     if (fRXPat->fSets8 == NULL) {
    329         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
    330         return;
    331     }
    332     int32_t i;
    333     for (i=0; i<numSets; i++) {
    334         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
    335         fRXPat->fSets8[i].init(s);
    336     }
    337 
    338 }
    339 
    340 
    341 
    342 
    343 
    344 //------------------------------------------------------------------------------
    345 //
    346 //  doParseAction        Do some action during regex pattern parsing.
    347 //                       Called by the parse state machine.
    348 //
    349 //                       Generation of the match engine PCode happens here, or
    350 //                       in functions called from the parse actions defined here.
    351 //
    352 //
    353 //------------------------------------------------------------------------------
    354 UBool RegexCompile::doParseActions(int32_t action)
    355 {
    356     UBool   returnVal = TRUE;
    357 
    358     switch ((Regex_PatternParseAction)action) {
    359 
    360     case doPatStart:
    361         // Start of pattern compiles to:
    362         //0   SAVE   2        Fall back to position of FAIL
    363         //1   jmp    3
    364         //2   FAIL            Stop if we ever reach here.
    365         //3   NOP             Dummy, so start of pattern looks the same as
    366         //                    the start of an ( grouping.
    367         //4   NOP             Resreved, will be replaced by a save if there are
    368         //                    OR | operators at the top level
    369         appendOp(URX_STATE_SAVE, 2);
    370         appendOp(URX_JMP,  3);
    371         appendOp(URX_FAIL, 0);
    372 
    373         // Standard open nonCapture paren action emits the two NOPs and
    374         //   sets up the paren stack frame.
    375         doParseActions(doOpenNonCaptureParen);
    376         break;
    377 
    378     case doPatFinish:
    379         // We've scanned to the end of the pattern
    380         //  The end of pattern compiles to:
    381         //        URX_END
    382         //    which will stop the runtime match engine.
    383         //  Encountering end of pattern also behaves like a close paren,
    384         //   and forces fixups of the State Save at the beginning of the compiled pattern
    385         //   and of any OR operations at the top level.
    386         //
    387         handleCloseParen();
    388         if (fParenStack.size() > 0) {
    389             // Missing close paren in pattern.
    390             error(U_REGEX_MISMATCHED_PAREN);
    391         }
    392 
    393         // add the END operation to the compiled pattern.
    394         appendOp(URX_END, 0);
    395 
    396         // Terminate the pattern compilation state machine.
    397         returnVal = FALSE;
    398         break;
    399 
    400 
    401 
    402     case doOrOperator:
    403         // Scanning a '|', as in (A|B)
    404         {
    405             // Generate code for any pending literals preceding the '|'
    406             fixLiterals(FALSE);
    407 
    408             // Insert a SAVE operation at the start of the pattern section preceding
    409             //   this OR at this level.  This SAVE will branch the match forward
    410             //   to the right hand side of the OR in the event that the left hand
    411             //   side fails to match and backtracks.  Locate the position for the
    412             //   save from the location on the top of the parentheses stack.
    413             int32_t savePosition = fParenStack.popi();
    414             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
    415             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
    416             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
    417             fRXPat->fCompiledPat->setElementAt(op, savePosition);
    418 
    419             // Append an JMP operation into the compiled pattern.  The operand for
    420             //  the JMP will eventually be the location following the ')' for the
    421             //  group.  This will be patched in later, when the ')' is encountered.
    422             appendOp(URX_JMP, 0);
    423 
    424             // Push the position of the newly added JMP op onto the parentheses stack.
    425             // This registers if for fixup when this block's close paren is encountered.
    426             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    427 
    428             // Append a NOP to the compiled pattern.  This is the slot reserved
    429             //   for a SAVE in the event that there is yet another '|' following
    430             //   this one.
    431             appendOp(URX_NOP, 0);
    432             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    433         }
    434         break;
    435 
    436 
    437     case doBeginNamedCapture:
    438         // Scanning (?<letter.
    439         //   The first letter of the name will come through again under doConinueNamedCapture.
    440         fCaptureName = new UnicodeString();
    441         if (fCaptureName == NULL) {
    442             error(U_MEMORY_ALLOCATION_ERROR);
    443         }
    444         break;
    445 
    446     case  doContinueNamedCapture:
    447         fCaptureName->append(fC.fChar);
    448         break;
    449 
    450     case doBadNamedCapture:
    451         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
    452         break;
    453 
    454     case doOpenCaptureParen:
    455         // Open Capturing Paren, possibly named.
    456         //   Compile to a
    457         //      - NOP, which later may be replaced by a save-state if the
    458         //         parenthesized group gets a * quantifier, followed by
    459         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
    460         //      - NOP, which may later be replaced by a save-state if there
    461         //             is an '|' alternation within the parens.
    462         //
    463         //    Each capture group gets three slots in the save stack frame:
    464         //         0: Capture Group start position (in input string being matched.)
    465         //         1: Capture Group end position.
    466         //         2: Start of Match-in-progress.
    467         //    The first two locations are for a completed capture group, and are
    468         //     referred to by back references and the like.
    469         //    The third location stores the capture start position when an START_CAPTURE is
    470         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
    471         //      END_CAPTURE is encountered.
    472         {
    473             fixLiterals();
    474             appendOp(URX_NOP, 0);
    475             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
    476             appendOp(URX_START_CAPTURE, varsLoc);
    477             appendOp(URX_NOP, 0);
    478 
    479             // On the Parentheses stack, start a new frame and add the postions
    480             //   of the two NOPs.  Depending on what follows in the pattern, the
    481             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    482             //   address of the end of the parenthesized group.
    483             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    484             fParenStack.push(capturing, *fStatus);                        // Frame type.
    485             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
    486             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    487 
    488             // Save the mapping from group number to stack frame variable position.
    489             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
    490 
    491             // If this is a named capture group, add the name->group number mapping.
    492             if (fCaptureName != NULL) {
    493                 int32_t groupNumber = fRXPat->fGroupMap->size();
    494                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
    495                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
    496                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
    497                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
    498                 }
    499             }
    500         }
    501         break;
    502 
    503     case doOpenNonCaptureParen:
    504         // Open non-caputuring (grouping only) Paren.
    505         //   Compile to a
    506         //      - NOP, which later may be replaced by a save-state if the
    507         //         parenthesized group gets a * quantifier, followed by
    508         //      - NOP, which may later be replaced by a save-state if there
    509         //             is an '|' alternation within the parens.
    510         {
    511             fixLiterals();
    512             appendOp(URX_NOP, 0);
    513             appendOp(URX_NOP, 0);
    514 
    515             // On the Parentheses stack, start a new frame and add the postions
    516             //   of the two NOPs.
    517             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    518             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
    519             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    520             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    521         }
    522          break;
    523 
    524 
    525     case doOpenAtomicParen:
    526         // Open Atomic Paren.  (?>
    527         //   Compile to a
    528         //      - NOP, which later may be replaced if the parenthesized group
    529         //         has a quantifier, followed by
    530         //      - STO_SP  save state stack position, so it can be restored at the ")"
    531         //      - NOP, which may later be replaced by a save-state if there
    532         //             is an '|' alternation within the parens.
    533         {
    534             fixLiterals();
    535             appendOp(URX_NOP, 0);
    536             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
    537             appendOp(URX_STO_SP, varLoc);
    538             appendOp(URX_NOP, 0);
    539 
    540             // On the Parentheses stack, start a new frame and add the postions
    541             //   of the two NOPs.  Depending on what follows in the pattern, the
    542             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    543             //   address of the end of the parenthesized group.
    544             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    545             fParenStack.push(atomic, *fStatus);                           // Frame type.
    546             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
    547             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
    548         }
    549         break;
    550 
    551 
    552     case doOpenLookAhead:
    553         // Positive Look-ahead   (?=  stuff  )
    554         //
    555         //   Note:   Addition of transparent input regions, with the need to
    556         //           restore the original regions when failing out of a lookahead
    557         //           block, complicated this sequence.  Some conbined opcodes
    558         //           might make sense - or might not, lookahead aren't that common.
    559         //
    560         //      Caution:  min match length optimization knows about this
    561         //               sequence; don't change without making updates there too.
    562         //
    563         // Compiles to
    564         //    1    START_LA     dataLoc     Saves SP, Input Pos
    565         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
    566         //    3    JMP          6           continue ...
    567         //
    568         //    4.   LA_END                   Look Ahead failed.  Restore regions.
    569         //    5.   BACKTRACK                and back track again.
    570         //
    571         //    6.   NOP              reserved for use by quantifiers on the block.
    572         //                          Look-ahead can't have quantifiers, but paren stack
    573         //                             compile time conventions require the slot anyhow.
    574         //    7.   NOP              may be replaced if there is are '|' ops in the block.
    575         //    8.     code for parenthesized stuff.
    576         //    9.   LA_END
    577         //
    578         //  Two data slots are reserved, for saving the stack ptr and the input position.
    579         {
    580             fixLiterals();
    581             int32_t dataLoc = allocateData(2);
    582             appendOp(URX_LA_START, dataLoc);
    583             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
    584             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
    585             appendOp(URX_LA_END, dataLoc);
    586             appendOp(URX_BACKTRACK, 0);
    587             appendOp(URX_NOP, 0);
    588             appendOp(URX_NOP, 0);
    589 
    590             // On the Parentheses stack, start a new frame and add the postions
    591             //   of the NOPs.
    592             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    593             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
    594             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    595             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    596         }
    597         break;
    598 
    599     case doOpenLookAheadNeg:
    600         // Negated Lookahead.   (?! stuff )
    601         // Compiles to
    602         //    1.    START_LA    dataloc
    603         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
    604         //                                //   which continues with the match.
    605         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
    606         //    4.       code for parenthesized stuff.
    607         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
    608         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
    609         //    7.    END_LA                // Restore match region, in case look-ahead was using
    610         //                                        an alternate (transparent) region.
    611         {
    612             fixLiterals();
    613             int32_t dataLoc = allocateData(2);
    614             appendOp(URX_LA_START, dataLoc);
    615             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
    616             appendOp(URX_NOP, 0);
    617 
    618             // On the Parentheses stack, start a new frame and add the postions
    619             //   of the StateSave and NOP.
    620             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    621             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
    622             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
    623             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    624 
    625             // Instructions #5 - #7 will be added when the ')' is encountered.
    626         }
    627         break;
    628 
    629     case doOpenLookBehind:
    630         {
    631             //   Compile a (?<= look-behind open paren.
    632             //
    633             //          Compiles to
    634             //              0       URX_LB_START     dataLoc
    635             //              1       URX_LB_CONT      dataLoc
    636             //              2                        MinMatchLen
    637             //              3                        MaxMatchLen
    638             //              4       URX_NOP          Standard '(' boilerplate.
    639             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
    640             //              6         <code for LookBehind expression>
    641             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
    642             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
    643             //
    644             //          Allocate a block of matcher data, to contain (when running a match)
    645             //              0:    Stack ptr on entry
    646             //              1:    Input Index on entry
    647             //              2:    Start index of match current match attempt.
    648             //              3:    Original Input String len.
    649 
    650             // Generate match code for any pending literals.
    651             fixLiterals();
    652 
    653             // Allocate data space
    654             int32_t dataLoc = allocateData(4);
    655 
    656             // Emit URX_LB_START
    657             appendOp(URX_LB_START, dataLoc);
    658 
    659             // Emit URX_LB_CONT
    660             appendOp(URX_LB_CONT, dataLoc);
    661             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
    662             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
    663 
    664             // Emit the NOPs
    665             appendOp(URX_NOP, 0);
    666             appendOp(URX_NOP, 0);
    667 
    668             // On the Parentheses stack, start a new frame and add the postions
    669             //   of the URX_LB_CONT and the NOP.
    670             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    671             fParenStack.push(lookBehind, *fStatus);                       // Frame type
    672             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    673             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    674 
    675             // The final two instructions will be added when the ')' is encountered.
    676         }
    677 
    678         break;
    679 
    680     case doOpenLookBehindNeg:
    681         {
    682             //   Compile a (?<! negated look-behind open paren.
    683             //
    684             //          Compiles to
    685             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
    686             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
    687             //              2                        MinMatchLen
    688             //              3                        MaxMatchLen
    689             //              4                        continueLoc (9)
    690             //              5       URX_NOP          Standard '(' boilerplate.
    691             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
    692             //              7         <code for LookBehind expression>
    693             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
    694             //              9       ...
    695             //
    696             //          Allocate a block of matcher data, to contain (when running a match)
    697             //              0:    Stack ptr on entry
    698             //              1:    Input Index on entry
    699             //              2:    Start index of match current match attempt.
    700             //              3:    Original Input String len.
    701 
    702             // Generate match code for any pending literals.
    703             fixLiterals();
    704 
    705             // Allocate data space
    706             int32_t dataLoc = allocateData(4);
    707 
    708             // Emit URX_LB_START
    709             appendOp(URX_LB_START, dataLoc);
    710 
    711             // Emit URX_LBN_CONT
    712             appendOp(URX_LBN_CONT, dataLoc);
    713             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
    714             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
    715             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
    716 
    717             // Emit the NOPs
    718             appendOp(URX_NOP, 0);
    719             appendOp(URX_NOP, 0);
    720 
    721             // On the Parentheses stack, start a new frame and add the postions
    722             //   of the URX_LB_CONT and the NOP.
    723             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    724             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
    725             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    726             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    727 
    728             // The final two instructions will be added when the ')' is encountered.
    729         }
    730         break;
    731 
    732     case doConditionalExpr:
    733         // Conditionals such as (?(1)a:b)
    734     case doPerlInline:
    735         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
    736         error(U_REGEX_UNIMPLEMENTED);
    737         break;
    738 
    739 
    740     case doCloseParen:
    741         handleCloseParen();
    742         if (fParenStack.size() <= 0) {
    743             //  Extra close paren, or missing open paren.
    744             error(U_REGEX_MISMATCHED_PAREN);
    745         }
    746         break;
    747 
    748     case doNOP:
    749         break;
    750 
    751 
    752     case doBadOpenParenType:
    753     case doRuleError:
    754         error(U_REGEX_RULE_SYNTAX);
    755         break;
    756 
    757 
    758     case doMismatchedParenErr:
    759         error(U_REGEX_MISMATCHED_PAREN);
    760         break;
    761 
    762     case doPlus:
    763         //  Normal '+'  compiles to
    764         //     1.   stuff to be repeated  (already built)
    765         //     2.   jmp-sav 1
    766         //     3.   ...
    767         //
    768         //  Or, if the item to be repeated can match a zero length string,
    769         //     1.   STO_INP_LOC  data-loc
    770         //     2.      body of stuff to be repeated
    771         //     3.   JMP_SAV_X    2
    772         //     4.   ...
    773 
    774         //
    775         //  Or, if the item to be repeated is simple
    776         //     1.   Item to be repeated.
    777         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
    778         //     3.   LOOP_C       stack location
    779         {
    780             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
    781             int32_t  frameLoc;
    782 
    783             // Check for simple constructs, which may get special optimized code.
    784             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    785                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
    786 
    787                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
    788                     // Emit optimized code for [char set]+
    789                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    790                     frameLoc = allocateStackData(1);
    791                     appendOp(URX_LOOP_C, frameLoc);
    792                     break;
    793                 }
    794 
    795                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    796                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    797                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    798                     // Emit Optimized code for .+ operations.
    799                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
    800                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    801                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
    802                         loopOpI |= 1;
    803                     }
    804                     if (fModeFlags & UREGEX_UNIX_LINES) {
    805                         loopOpI |= 2;
    806                     }
    807                     appendOp(loopOpI);
    808                     frameLoc = allocateStackData(1);
    809                     appendOp(URX_LOOP_C, frameLoc);
    810                     break;
    811                 }
    812 
    813             }
    814 
    815             // General case.
    816 
    817             // Check for minimum match length of zero, which requires
    818             //    extra loop-breaking code.
    819             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    820                 // Zero length match is possible.
    821                 // Emit the code sequence that can handle it.
    822                 insertOp(topLoc);
    823                 frameLoc = allocateStackData(1);
    824 
    825                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
    826                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
    827 
    828                 appendOp(URX_JMP_SAV_X, topLoc+1);
    829             } else {
    830                 // Simpler code when the repeated body must match something non-empty
    831                 appendOp(URX_JMP_SAV, topLoc);
    832             }
    833         }
    834         break;
    835 
    836     case doNGPlus:
    837         //  Non-greedy '+?'  compiles to
    838         //     1.   stuff to be repeated  (already built)
    839         //     2.   state-save  1
    840         //     3.   ...
    841         {
    842             int32_t topLoc      = blockTopLoc(FALSE);
    843             appendOp(URX_STATE_SAVE, topLoc);
    844         }
    845         break;
    846 
    847 
    848     case doOpt:
    849         // Normal (greedy) ? quantifier.
    850         //  Compiles to
    851         //     1. state save 3
    852         //     2.    body of optional block
    853         //     3. ...
    854         // Insert the state save into the compiled pattern, and we're done.
    855         {
    856             int32_t   saveStateLoc = blockTopLoc(TRUE);
    857             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
    858             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    859         }
    860         break;
    861 
    862     case doNGOpt:
    863         // Non-greedy ?? quantifier
    864         //   compiles to
    865         //    1.  jmp   4
    866         //    2.     body of optional block
    867         //    3   jmp   5
    868         //    4.  state save 2
    869         //    5    ...
    870         //  This code is less than ideal, with two jmps instead of one, because we can only
    871         //  insert one instruction at the top of the block being iterated.
    872         {
    873             int32_t  jmp1_loc = blockTopLoc(TRUE);
    874             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
    875 
    876             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
    877             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
    878 
    879             appendOp(URX_JMP, jmp2_loc+2);
    880 
    881             appendOp(URX_STATE_SAVE, jmp1_loc+1);
    882         }
    883         break;
    884 
    885 
    886     case doStar:
    887         // Normal (greedy) * quantifier.
    888         // Compiles to
    889         //       1.   STATE_SAVE   4
    890         //       2.      body of stuff being iterated over
    891         //       3.   JMP_SAV      2
    892         //       4.   ...
    893         //
    894         // Or, if the body is a simple [Set],
    895         //       1.   LOOP_SR_I    set number
    896         //       2.   LOOP_C       stack location
    897         //       ...
    898         //
    899         // Or if this is a .*
    900         //       1.   LOOP_DOT_I    (. matches all mode flag)
    901         //       2.   LOOP_C        stack location
    902         //
    903         // Or, if the body can match a zero-length string, to inhibit infinite loops,
    904         //       1.   STATE_SAVE   5
    905         //       2.   STO_INP_LOC  data-loc
    906         //       3.      body of stuff
    907         //       4.   JMP_SAV_X    2
    908         //       5.   ...
    909         {
    910             // location of item #1, the STATE_SAVE
    911             int32_t   topLoc = blockTopLoc(FALSE);
    912             int32_t   dataLoc = -1;
    913 
    914             // Check for simple *, where the construct being repeated
    915             //   compiled to single opcode, and might be optimizable.
    916             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    917                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
    918 
    919                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
    920                     // Emit optimized code for a [char set]*
    921                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    922                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    923                     dataLoc = allocateStackData(1);
    924                     appendOp(URX_LOOP_C, dataLoc);
    925                     break;
    926                 }
    927 
    928                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    929                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    930                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    931                     // Emit Optimized code for .* operations.
    932                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
    933                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    934                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
    935                         loopOpI |= 1;
    936                     }
    937                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
    938                         loopOpI |= 2;
    939                     }
    940                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    941                     dataLoc = allocateStackData(1);
    942                     appendOp(URX_LOOP_C, dataLoc);
    943                     break;
    944                 }
    945             }
    946 
    947             // Emit general case code for this *
    948             // The optimizations did not apply.
    949 
    950             int32_t   saveStateLoc = blockTopLoc(TRUE);
    951             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
    952 
    953             // Check for minimum match length of zero, which requires
    954             //    extra loop-breaking code.
    955             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    956                 insertOp(saveStateLoc);
    957                 dataLoc = allocateStackData(1);
    958 
    959                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
    960                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
    961                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
    962             }
    963 
    964             // Locate the position in the compiled pattern where the match will continue
    965             //   after completing the *.   (4 or 5 in the comment above)
    966             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
    967 
    968             // Put together the save state op and store it into the compiled code.
    969             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
    970             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    971 
    972             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
    973             appendOp(jmpOp);
    974         }
    975         break;
    976 
    977     case doNGStar:
    978         // Non-greedy *? quantifier
    979         // compiles to
    980         //     1.   JMP    3
    981         //     2.      body of stuff being iterated over
    982         //     3.   STATE_SAVE  2
    983         //     4    ...
    984         {
    985             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
    986             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
    987             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
    988             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
    989             appendOp(URX_STATE_SAVE, jmpLoc+1);
    990         }
    991         break;
    992 
    993 
    994     case doIntervalInit:
    995         // The '{' opening an interval quantifier was just scanned.
    996         // Init the counter varaiables that will accumulate the values as the digits
    997         //    are scanned.
    998         fIntervalLow = 0;
    999         fIntervalUpper = -1;
   1000         break;
   1001 
   1002     case doIntevalLowerDigit:
   1003         // Scanned a digit from the lower value of an {lower,upper} interval
   1004         {
   1005             int32_t digitValue = u_charDigitValue(fC.fChar);
   1006             U_ASSERT(digitValue >= 0);
   1007             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
   1008             if (val > INT32_MAX) {
   1009                 error(U_REGEX_NUMBER_TOO_BIG);
   1010             } else {
   1011                 fIntervalLow = (int32_t)val;
   1012             }
   1013         }
   1014         break;
   1015 
   1016     case doIntervalUpperDigit:
   1017         // Scanned a digit from the upper value of an {lower,upper} interval
   1018         {
   1019             if (fIntervalUpper < 0) {
   1020                 fIntervalUpper = 0;
   1021             }
   1022             int32_t digitValue = u_charDigitValue(fC.fChar);
   1023             U_ASSERT(digitValue >= 0);
   1024             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
   1025             if (val > INT32_MAX) {
   1026                 error(U_REGEX_NUMBER_TOO_BIG);
   1027             } else {
   1028                 fIntervalUpper = (int32_t)val;
   1029             }
   1030         }
   1031         break;
   1032 
   1033     case doIntervalSame:
   1034         // Scanned a single value interval like {27}.  Upper = Lower.
   1035         fIntervalUpper = fIntervalLow;
   1036         break;
   1037 
   1038     case doInterval:
   1039         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
   1040         if (compileInlineInterval() == FALSE) {
   1041             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1042         }
   1043         break;
   1044 
   1045     case doPossessiveInterval:
   1046         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
   1047         {
   1048             // Remember the loc for the top of the block being looped over.
   1049             //   (Can not reserve a slot in the compiled pattern at this time, because
   1050             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
   1051             //    once per block.)
   1052             int32_t topLoc = blockTopLoc(FALSE);
   1053 
   1054             // Produce normal looping code.
   1055             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1056 
   1057             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
   1058             //  just as if the loop was inclosed in atomic parentheses.
   1059 
   1060             // First the STO_SP before the start of the loop
   1061             insertOp(topLoc);
   1062 
   1063             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
   1064             int32_t  op     = buildOp(URX_STO_SP, varLoc);
   1065             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1066 
   1067             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
   1068             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
   1069             loopOp++;     // point LoopOp after the just-inserted STO_SP
   1070             fRXPat->fCompiledPat->push(loopOp, *fStatus);
   1071 
   1072             // Then the LD_SP after the end of the loop
   1073             appendOp(URX_LD_SP, varLoc);
   1074         }
   1075 
   1076         break;
   1077 
   1078     case doNGInterval:
   1079         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
   1080         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
   1081         break;
   1082 
   1083     case doIntervalError:
   1084         error(U_REGEX_BAD_INTERVAL);
   1085         break;
   1086 
   1087     case doLiteralChar:
   1088         // We've just scanned a "normal" character from the pattern,
   1089         literalChar(fC.fChar);
   1090         break;
   1091 
   1092 
   1093     case doEscapedLiteralChar:
   1094         // We've just scanned an backslashed escaped character with  no
   1095         //   special meaning.  It represents itself.
   1096         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1097             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1098             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1099                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1100              }
   1101         literalChar(fC.fChar);
   1102         break;
   1103 
   1104 
   1105     case doDotAny:
   1106         // scanned a ".",  match any single character.
   1107         {
   1108             fixLiterals(FALSE);
   1109             if (fModeFlags & UREGEX_DOTALL) {
   1110                 appendOp(URX_DOTANY_ALL, 0);
   1111             } else if (fModeFlags & UREGEX_UNIX_LINES) {
   1112                 appendOp(URX_DOTANY_UNIX, 0);
   1113             } else {
   1114                 appendOp(URX_DOTANY, 0);
   1115             }
   1116         }
   1117         break;
   1118 
   1119     case doCaret:
   1120         {
   1121             fixLiterals(FALSE);
   1122             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1123                 appendOp(URX_CARET, 0);
   1124             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1125                 appendOp(URX_CARET_M, 0);
   1126             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1127                 appendOp(URX_CARET, 0);   // Only testing true start of input.
   1128             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1129                 appendOp(URX_CARET_M_UNIX, 0);
   1130             }
   1131         }
   1132         break;
   1133 
   1134     case doDollar:
   1135         {
   1136             fixLiterals(FALSE);
   1137             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1138                 appendOp(URX_DOLLAR, 0);
   1139             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1140                 appendOp(URX_DOLLAR_M, 0);
   1141             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1142                 appendOp(URX_DOLLAR_D, 0);
   1143             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1144                 appendOp(URX_DOLLAR_MD, 0);
   1145             }
   1146         }
   1147         break;
   1148 
   1149     case doBackslashA:
   1150         fixLiterals(FALSE);
   1151         appendOp(URX_CARET, 0);
   1152         break;
   1153 
   1154     case doBackslashB:
   1155         {
   1156             #if  UCONFIG_NO_BREAK_ITERATION==1
   1157             if (fModeFlags & UREGEX_UWORD) {
   1158                 error(U_UNSUPPORTED_ERROR);
   1159             }
   1160             #endif
   1161             fixLiterals(FALSE);
   1162             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1163             appendOp(op, 1);
   1164         }
   1165         break;
   1166 
   1167     case doBackslashb:
   1168         {
   1169             #if  UCONFIG_NO_BREAK_ITERATION==1
   1170             if (fModeFlags & UREGEX_UWORD) {
   1171                 error(U_UNSUPPORTED_ERROR);
   1172             }
   1173             #endif
   1174             fixLiterals(FALSE);
   1175             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1176             appendOp(op, 0);
   1177         }
   1178         break;
   1179 
   1180     case doBackslashD:
   1181         fixLiterals(FALSE);
   1182         appendOp(URX_BACKSLASH_D, 1);
   1183         break;
   1184 
   1185     case doBackslashd:
   1186         fixLiterals(FALSE);
   1187         appendOp(URX_BACKSLASH_D, 0);
   1188         break;
   1189 
   1190     case doBackslashG:
   1191         fixLiterals(FALSE);
   1192         appendOp(URX_BACKSLASH_G, 0);
   1193         break;
   1194 
   1195     case doBackslashH:
   1196         fixLiterals(FALSE);
   1197         appendOp(URX_BACKSLASH_H, 1);
   1198         break;
   1199 
   1200     case doBackslashh:
   1201         fixLiterals(FALSE);
   1202         appendOp(URX_BACKSLASH_H, 0);
   1203         break;
   1204 
   1205     case doBackslashR:
   1206         fixLiterals(FALSE);
   1207         appendOp(URX_BACKSLASH_R, 0);
   1208         break;
   1209 
   1210     case doBackslashS:
   1211         fixLiterals(FALSE);
   1212         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
   1213         break;
   1214 
   1215     case doBackslashs:
   1216         fixLiterals(FALSE);
   1217         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
   1218         break;
   1219 
   1220     case doBackslashV:
   1221         fixLiterals(FALSE);
   1222         appendOp(URX_BACKSLASH_V, 1);
   1223         break;
   1224 
   1225     case doBackslashv:
   1226         fixLiterals(FALSE);
   1227         appendOp(URX_BACKSLASH_V, 0);
   1228         break;
   1229 
   1230     case doBackslashW:
   1231         fixLiterals(FALSE);
   1232         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
   1233         break;
   1234 
   1235     case doBackslashw:
   1236         fixLiterals(FALSE);
   1237         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
   1238         break;
   1239 
   1240     case doBackslashX:
   1241         fixLiterals(FALSE);
   1242         appendOp(URX_BACKSLASH_X, 0);
   1243         break;
   1244 
   1245 
   1246     case doBackslashZ:
   1247         fixLiterals(FALSE);
   1248         appendOp(URX_DOLLAR, 0);
   1249         break;
   1250 
   1251     case doBackslashz:
   1252         fixLiterals(FALSE);
   1253         appendOp(URX_BACKSLASH_Z, 0);
   1254         break;
   1255 
   1256     case doEscapeError:
   1257         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1258         break;
   1259 
   1260     case doExit:
   1261         fixLiterals(FALSE);
   1262         returnVal = FALSE;
   1263         break;
   1264 
   1265     case doProperty:
   1266         {
   1267             fixLiterals(FALSE);
   1268             UnicodeSet *theSet = scanProp();
   1269             compileSet(theSet);
   1270         }
   1271         break;
   1272 
   1273     case doNamedChar:
   1274         {
   1275             UChar32 c = scanNamedChar();
   1276             literalChar(c);
   1277         }
   1278         break;
   1279 
   1280 
   1281     case doBackRef:
   1282         // BackReference.  Somewhat unusual in that the front-end can not completely parse
   1283         //                 the regular expression, because the number of digits to be consumed
   1284         //                 depends on the number of capture groups that have been defined.  So
   1285         //                 we have to do it here instead.
   1286         {
   1287             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
   1288             int32_t  groupNum = 0;
   1289             UChar32  c        = fC.fChar;
   1290 
   1291             for (;;) {
   1292                 // Loop once per digit, for max allowed number of digits in a back reference.
   1293                 int32_t digit = u_charDigitValue(c);
   1294                 groupNum = groupNum * 10 + digit;
   1295                 if (groupNum >= numCaptureGroups) {
   1296                     break;
   1297                 }
   1298                 c = peekCharLL();
   1299                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
   1300                     break;
   1301                 }
   1302                 nextCharLL();
   1303             }
   1304 
   1305             // Scan of the back reference in the source regexp is complete.  Now generate
   1306             //  the compiled code for it.
   1307             // Because capture groups can be forward-referenced by back-references,
   1308             //  we fill the operand with the capture group number.  At the end
   1309             //  of compilation, it will be changed to the variable's location.
   1310             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
   1311                                      //    and shouldn't enter this code path at all.
   1312             fixLiterals(FALSE);
   1313             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1314                 appendOp(URX_BACKREF_I, groupNum);
   1315             } else {
   1316                 appendOp(URX_BACKREF, groupNum);
   1317             }
   1318         }
   1319         break;
   1320 
   1321     case doBeginNamedBackRef:
   1322         U_ASSERT(fCaptureName == NULL);
   1323         fCaptureName = new UnicodeString;
   1324         if (fCaptureName == NULL) {
   1325             error(U_MEMORY_ALLOCATION_ERROR);
   1326         }
   1327         break;
   1328 
   1329     case doContinueNamedBackRef:
   1330         fCaptureName->append(fC.fChar);
   1331         break;
   1332 
   1333     case doCompleteNamedBackRef:
   1334         {
   1335         int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
   1336         if (groupNumber == 0) {
   1337             // Group name has not been defined.
   1338             //   Could be a forward reference. If we choose to support them at some
   1339             //   future time, extra mechanism will be required at this point.
   1340             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   1341         } else {
   1342             // Given the number, handle identically to a \n numbered back reference.
   1343             // See comments above, under doBackRef
   1344             fixLiterals(FALSE);
   1345             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1346                 appendOp(URX_BACKREF_I, groupNumber);
   1347             } else {
   1348                 appendOp(URX_BACKREF, groupNumber);
   1349             }
   1350         }
   1351         delete fCaptureName;
   1352         fCaptureName = NULL;
   1353         break;
   1354         }
   1355 
   1356     case doPossessivePlus:
   1357         // Possessive ++ quantifier.
   1358         // Compiles to
   1359         //       1.   STO_SP
   1360         //       2.      body of stuff being iterated over
   1361         //       3.   STATE_SAVE 5
   1362         //       4.   JMP        2
   1363         //       5.   LD_SP
   1364         //       6.   ...
   1365         //
   1366         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
   1367         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
   1368         //
   1369         {
   1370             // Emit the STO_SP
   1371             int32_t   topLoc = blockTopLoc(TRUE);
   1372             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
   1373             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1374             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1375 
   1376             // Emit the STATE_SAVE
   1377             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
   1378 
   1379             // Emit the JMP
   1380             appendOp(URX_JMP, topLoc+1);
   1381 
   1382             // Emit the LD_SP
   1383             appendOp(URX_LD_SP, stoLoc);
   1384         }
   1385         break;
   1386 
   1387     case doPossessiveStar:
   1388         // Possessive *+ quantifier.
   1389         // Compiles to
   1390         //       1.   STO_SP       loc
   1391         //       2.   STATE_SAVE   5
   1392         //       3.      body of stuff being iterated over
   1393         //       4.   JMP          2
   1394         //       5.   LD_SP        loc
   1395         //       6    ...
   1396         // TODO:  do something to cut back the state stack each time through the loop.
   1397         {
   1398             // Reserve two slots at the top of the block.
   1399             int32_t   topLoc = blockTopLoc(TRUE);
   1400             insertOp(topLoc);
   1401 
   1402             // emit   STO_SP     loc
   1403             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
   1404             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1405             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1406 
   1407             // Emit the SAVE_STATE   5
   1408             int32_t L7 = fRXPat->fCompiledPat->size()+1;
   1409             op = buildOp(URX_STATE_SAVE, L7);
   1410             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1411 
   1412             // Append the JMP operation.
   1413             appendOp(URX_JMP, topLoc+1);
   1414 
   1415             // Emit the LD_SP       loc
   1416             appendOp(URX_LD_SP, stoLoc);
   1417         }
   1418         break;
   1419 
   1420     case doPossessiveOpt:
   1421         // Possessive  ?+ quantifier.
   1422         //  Compiles to
   1423         //     1. STO_SP      loc
   1424         //     2. SAVE_STATE  5
   1425         //     3.    body of optional block
   1426         //     4. LD_SP       loc
   1427         //     5. ...
   1428         //
   1429         {
   1430             // Reserve two slots at the top of the block.
   1431             int32_t   topLoc = blockTopLoc(TRUE);
   1432             insertOp(topLoc);
   1433 
   1434             // Emit the STO_SP
   1435             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
   1436             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1437             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1438 
   1439             // Emit the SAVE_STATE
   1440             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
   1441             op = buildOp(URX_STATE_SAVE, continueLoc);
   1442             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1443 
   1444             // Emit the LD_SP
   1445             appendOp(URX_LD_SP, stoLoc);
   1446         }
   1447         break;
   1448 
   1449 
   1450     case doBeginMatchMode:
   1451         fNewModeFlags = fModeFlags;
   1452         fSetModeFlag  = TRUE;
   1453         break;
   1454 
   1455     case doMatchMode:   //  (?i)    and similar
   1456         {
   1457             int32_t  bit = 0;
   1458             switch (fC.fChar) {
   1459             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
   1460             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
   1461             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
   1462             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
   1463             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
   1464             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
   1465             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
   1466             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
   1467             default:
   1468                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
   1469                                    // by the scanner.
   1470             }
   1471             if (fSetModeFlag) {
   1472                 fNewModeFlags |= bit;
   1473             } else {
   1474                 fNewModeFlags &= ~bit;
   1475             }
   1476         }
   1477         break;
   1478 
   1479     case doSetMatchMode:
   1480         // Emit code to match any pending literals, using the not-yet changed match mode.
   1481         fixLiterals();
   1482 
   1483         // We've got a (?i) or similar.  The match mode is being changed, but
   1484         //   the change is not scoped to a parenthesized block.
   1485         U_ASSERT(fNewModeFlags < 0);
   1486         fModeFlags = fNewModeFlags;
   1487 
   1488         break;
   1489 
   1490 
   1491     case doMatchModeParen:
   1492         // We've got a (?i: or similar.  Begin a parenthesized block, save old
   1493         //   mode flags so they can be restored at the close of the block.
   1494         //
   1495         //   Compile to a
   1496         //      - NOP, which later may be replaced by a save-state if the
   1497         //         parenthesized group gets a * quantifier, followed by
   1498         //      - NOP, which may later be replaced by a save-state if there
   1499         //             is an '|' alternation within the parens.
   1500         {
   1501             fixLiterals(FALSE);
   1502             appendOp(URX_NOP, 0);
   1503             appendOp(URX_NOP, 0);
   1504 
   1505             // On the Parentheses stack, start a new frame and add the postions
   1506             //   of the two NOPs (a normal non-capturing () frame, except for the
   1507             //   saving of the orignal mode flags.)
   1508             fParenStack.push(fModeFlags, *fStatus);
   1509             fParenStack.push(flags, *fStatus);                            // Frame Marker
   1510             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
   1511             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
   1512 
   1513             // Set the current mode flags to the new values.
   1514             U_ASSERT(fNewModeFlags < 0);
   1515             fModeFlags = fNewModeFlags;
   1516         }
   1517         break;
   1518 
   1519     case doBadModeFlag:
   1520         error(U_REGEX_INVALID_FLAG);
   1521         break;
   1522 
   1523     case doSuppressComments:
   1524         // We have just scanned a '(?'.  We now need to prevent the character scanner from
   1525         // treating a '#' as a to-the-end-of-line comment.
   1526         //   (This Perl compatibility just gets uglier and uglier to do...)
   1527         fEOLComments = FALSE;
   1528         break;
   1529 
   1530 
   1531     case doSetAddAmp:
   1532         {
   1533           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1534           set->add(chAmp);
   1535         }
   1536         break;
   1537 
   1538     case doSetAddDash:
   1539         {
   1540           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1541           set->add(chDash);
   1542         }
   1543         break;
   1544 
   1545      case doSetBackslash_s:
   1546         {
   1547          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1548          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
   1549          break;
   1550         }
   1551 
   1552      case doSetBackslash_S:
   1553         {
   1554             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1555             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
   1556             SSet.complement();
   1557             set->addAll(SSet);
   1558             break;
   1559         }
   1560 
   1561     case doSetBackslash_d:
   1562         {
   1563             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1564             // TODO - make a static set, ticket 6058.
   1565             addCategory(set, U_GC_ND_MASK, *fStatus);
   1566             break;
   1567         }
   1568 
   1569     case doSetBackslash_D:
   1570         {
   1571             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1572             UnicodeSet digits;
   1573             // TODO - make a static set, ticket 6058.
   1574             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   1575             digits.complement();
   1576             set->addAll(digits);
   1577             break;
   1578         }
   1579 
   1580     case doSetBackslash_h:
   1581         {
   1582             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1583             UnicodeSet h;
   1584             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   1585             h.add((UChar32)9);   // Tab
   1586             set->addAll(h);
   1587             break;
   1588         }
   1589 
   1590     case doSetBackslash_H:
   1591         {
   1592             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1593             UnicodeSet h;
   1594             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   1595             h.add((UChar32)9);   // Tab
   1596             h.complement();
   1597             set->addAll(h);
   1598             break;
   1599         }
   1600 
   1601     case doSetBackslash_v:
   1602         {
   1603             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1604             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
   1605             set->add((UChar32)0x85);
   1606             set->add((UChar32)0x2028, (UChar32)0x2029);
   1607             break;
   1608         }
   1609 
   1610     case doSetBackslash_V:
   1611         {
   1612             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1613             UnicodeSet v;
   1614             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
   1615             v.add((UChar32)0x85);
   1616             v.add((UChar32)0x2028, (UChar32)0x2029);
   1617             v.complement();
   1618             set->addAll(v);
   1619             break;
   1620         }
   1621 
   1622     case doSetBackslash_w:
   1623         {
   1624             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1625             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
   1626             break;
   1627         }
   1628 
   1629     case doSetBackslash_W:
   1630         {
   1631             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1632             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
   1633             SSet.complement();
   1634             set->addAll(SSet);
   1635             break;
   1636         }
   1637 
   1638     case doSetBegin:
   1639         fixLiterals(FALSE);
   1640         fSetStack.push(new UnicodeSet(), *fStatus);
   1641         fSetOpStack.push(setStart, *fStatus);
   1642         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1643             fSetOpStack.push(setCaseClose, *fStatus);
   1644         }
   1645         break;
   1646 
   1647     case doSetBeginDifference1:
   1648         //  We have scanned something like [[abc]-[
   1649         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
   1650         //  Push a Difference operator, which will cause the new set to be subtracted from what
   1651         //    went before once it is created.
   1652         setPushOp(setDifference1);
   1653         fSetOpStack.push(setStart, *fStatus);
   1654         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1655             fSetOpStack.push(setCaseClose, *fStatus);
   1656         }
   1657         break;
   1658 
   1659     case doSetBeginIntersection1:
   1660         //  We have scanned something like  [[abc]&[
   1661         //   Need both the '&' operator and the open '[' operator.
   1662         setPushOp(setIntersection1);
   1663         fSetOpStack.push(setStart, *fStatus);
   1664         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1665             fSetOpStack.push(setCaseClose, *fStatus);
   1666         }
   1667         break;
   1668 
   1669     case doSetBeginUnion:
   1670         //  We have scanned something like  [[abc][
   1671         //     Need to handle the union operation explicitly [[abc] | [
   1672         setPushOp(setUnion);
   1673         fSetOpStack.push(setStart, *fStatus);
   1674         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1675             fSetOpStack.push(setCaseClose, *fStatus);
   1676         }
   1677         break;
   1678 
   1679     case doSetDifference2:
   1680         // We have scanned something like [abc--
   1681         //   Consider this to unambiguously be a set difference operator.
   1682         setPushOp(setDifference2);
   1683         break;
   1684 
   1685     case doSetEnd:
   1686         // Have encountered the ']' that closes a set.
   1687         //    Force the evaluation of any pending operations within this set,
   1688         //    leave the completed set on the top of the set stack.
   1689         setEval(setEnd);
   1690         U_ASSERT(fSetOpStack.peeki()==setStart);
   1691         fSetOpStack.popi();
   1692         break;
   1693 
   1694     case doSetFinish:
   1695         {
   1696         // Finished a complete set expression, including all nested sets.
   1697         //   The close bracket has already triggered clearing out pending set operators,
   1698         //    the operator stack should be empty and the operand stack should have just
   1699         //    one entry, the result set.
   1700         U_ASSERT(fSetOpStack.empty());
   1701         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
   1702         U_ASSERT(fSetStack.empty());
   1703         compileSet(theSet);
   1704         break;
   1705         }
   1706 
   1707     case doSetIntersection2:
   1708         // Have scanned something like [abc&&
   1709         setPushOp(setIntersection2);
   1710         break;
   1711 
   1712     case doSetLiteral:
   1713         // Union the just-scanned literal character into the set being built.
   1714         //    This operation is the highest precedence set operation, so we can always do
   1715         //    it immediately, without waiting to see what follows.  It is necessary to perform
   1716         //    any pending '-' or '&' operation first, because these have the same precedence
   1717         //    as union-ing in a literal'
   1718         {
   1719             setEval(setUnion);
   1720             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1721             s->add(fC.fChar);
   1722             fLastSetLiteral = fC.fChar;
   1723             break;
   1724         }
   1725 
   1726     case doSetLiteralEscaped:
   1727         // A back-slash escaped literal character was encountered.
   1728         // Processing is the same as with setLiteral, above, with the addition of
   1729         //  the optional check for errors on escaped ASCII letters.
   1730         {
   1731             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1732                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1733                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1734                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1735             }
   1736             setEval(setUnion);
   1737             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1738             s->add(fC.fChar);
   1739             fLastSetLiteral = fC.fChar;
   1740             break;
   1741         }
   1742 
   1743         case doSetNamedChar:
   1744         // Scanning a \N{UNICODE CHARACTER NAME}
   1745         //  Aside from the source of the character, the processing is identical to doSetLiteral,
   1746         //    above.
   1747         {
   1748             UChar32  c = scanNamedChar();
   1749             setEval(setUnion);
   1750             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1751             s->add(c);
   1752             fLastSetLiteral = c;
   1753             break;
   1754         }
   1755 
   1756     case doSetNamedRange:
   1757         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
   1758         // The left character is already in the set, and is saved in fLastSetLiteral.
   1759         // The right side needs to be picked up, the scan is at the 'N'.
   1760         // Lower Limit > Upper limit being an error matches both Java
   1761         //        and ICU UnicodeSet behavior.
   1762         {
   1763             UChar32  c = scanNamedChar();
   1764             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
   1765                 error(U_REGEX_INVALID_RANGE);
   1766             }
   1767             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1768             s->add(fLastSetLiteral, c);
   1769             fLastSetLiteral = c;
   1770             break;
   1771         }
   1772 
   1773 
   1774     case  doSetNegate:
   1775         // Scanned a '^' at the start of a set.
   1776         // Push the negation operator onto the set op stack.
   1777         // A twist for case-insensitive matching:
   1778         //   the case closure operation must happen _before_ negation.
   1779         //   But the case closure operation will already be on the stack if it's required.
   1780         //   This requires checking for case closure, and swapping the stack order
   1781         //    if it is present.
   1782         {
   1783             int32_t  tosOp = fSetOpStack.peeki();
   1784             if (tosOp == setCaseClose) {
   1785                 fSetOpStack.popi();
   1786                 fSetOpStack.push(setNegation, *fStatus);
   1787                 fSetOpStack.push(setCaseClose, *fStatus);
   1788             } else {
   1789                 fSetOpStack.push(setNegation, *fStatus);
   1790             }
   1791         }
   1792         break;
   1793 
   1794     case doSetNoCloseError:
   1795         error(U_REGEX_MISSING_CLOSE_BRACKET);
   1796         break;
   1797 
   1798     case doSetOpError:
   1799         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
   1800         break;
   1801 
   1802     case doSetPosixProp:
   1803         {
   1804             UnicodeSet *s = scanPosixProp();
   1805             if (s != NULL) {
   1806                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
   1807                 tos->addAll(*s);
   1808                 delete s;
   1809             }  // else error.  scanProp() reported the error status already.
   1810         }
   1811         break;
   1812 
   1813     case doSetProp:
   1814         //  Scanned a \p \P within [brackets].
   1815         {
   1816             UnicodeSet *s = scanProp();
   1817             if (s != NULL) {
   1818                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
   1819                 tos->addAll(*s);
   1820                 delete s;
   1821             }  // else error.  scanProp() reported the error status already.
   1822         }
   1823         break;
   1824 
   1825 
   1826     case doSetRange:
   1827         // We have scanned literal-literal.  Add the range to the set.
   1828         // The left character is already in the set, and is saved in fLastSetLiteral.
   1829         // The right side is the current character.
   1830         // Lower Limit > Upper limit being an error matches both Java
   1831         //        and ICU UnicodeSet behavior.
   1832         {
   1833 
   1834         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
   1835             error(U_REGEX_INVALID_RANGE);
   1836         }
   1837         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1838         s->add(fLastSetLiteral, fC.fChar);
   1839         break;
   1840         }
   1841 
   1842     default:
   1843         U_ASSERT(FALSE);
   1844         error(U_REGEX_INTERNAL_ERROR);
   1845         break;
   1846     }
   1847 
   1848     if (U_FAILURE(*fStatus)) {
   1849         returnVal = FALSE;
   1850     }
   1851 
   1852     return returnVal;
   1853 }
   1854 
   1855 
   1856 
   1857 //------------------------------------------------------------------------------
   1858 //
   1859 //   literalChar           We've encountered a literal character from the pattern,
   1860 //                             or an escape sequence that reduces to a character.
   1861 //                         Add it to the string containing all literal chars/strings from
   1862 //                             the pattern.
   1863 //
   1864 //------------------------------------------------------------------------------
   1865 void RegexCompile::literalChar(UChar32 c)  {
   1866     fLiteralChars.append(c);
   1867 }
   1868 
   1869 
   1870 //------------------------------------------------------------------------------
   1871 //
   1872 //    fixLiterals           When compiling something that can follow a literal
   1873 //                          string in a pattern, emit the code to match the
   1874 //                          accumulated literal string.
   1875 //
   1876 //                          Optionally, split the last char of the string off into
   1877 //                          a single "ONE_CHAR" operation, so that quantifiers can
   1878 //                          apply to that char alone.  Example:   abc*
   1879 //                          The * must apply to the 'c' only.
   1880 //
   1881 //------------------------------------------------------------------------------
   1882 void    RegexCompile::fixLiterals(UBool split) {
   1883 
   1884     // If no literal characters have been scanned but not yet had code generated
   1885     //   for them, nothing needs to be done.
   1886     if (fLiteralChars.length() == 0) {
   1887         return;
   1888     }
   1889 
   1890     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
   1891     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
   1892 
   1893     // Split:  We need to  ensure that the last item in the compiled pattern
   1894     //     refers only to the last literal scanned in the pattern, so that
   1895     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
   1896     //     Split before case folding for case insensitive matches.
   1897 
   1898     if (split) {
   1899         fLiteralChars.truncate(indexOfLastCodePoint);
   1900         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
   1901                               //  Note that the truncated literal string may be empty, in which case
   1902                               //  nothing will be emitted.
   1903 
   1904         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
   1905         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
   1906         return;
   1907     }
   1908 
   1909     // If we are doing case-insensitive matching, case fold the string.  This may expand
   1910     //   the string, e.g. the German sharp-s turns into "ss"
   1911     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1912         fLiteralChars.foldCase();
   1913         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
   1914         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
   1915     }
   1916 
   1917     if (indexOfLastCodePoint == 0) {
   1918         // Single character, emit a URX_ONECHAR op to match it.
   1919         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
   1920                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
   1921             appendOp(URX_ONECHAR_I, lastCodePoint);
   1922         } else {
   1923             appendOp(URX_ONECHAR, lastCodePoint);
   1924         }
   1925     } else {
   1926         // Two or more chars, emit a URX_STRING to match them.
   1927         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
   1928             error(U_REGEX_PATTERN_TOO_BIG);
   1929         }
   1930         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1931             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
   1932         } else {
   1933             // TODO here:  add optimization to split case sensitive strings of length two
   1934             //             into two single char ops, for efficiency.
   1935             appendOp(URX_STRING, fRXPat->fLiteralText.length());
   1936         }
   1937         appendOp(URX_STRING_LEN, fLiteralChars.length());
   1938 
   1939         // Add this string into the accumulated strings of the compiled pattern.
   1940         fRXPat->fLiteralText.append(fLiteralChars);
   1941     }
   1942 
   1943     fLiteralChars.remove();
   1944 }
   1945 
   1946 
   1947 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
   1948     if (U_FAILURE(*fStatus)) {
   1949         return 0;
   1950     }
   1951     if (type < 0 || type > 255) {
   1952         U_ASSERT(FALSE);
   1953         error(U_REGEX_INTERNAL_ERROR);
   1954         type = URX_RESERVED_OP;
   1955     }
   1956     if (val > 0x00ffffff) {
   1957         U_ASSERT(FALSE);
   1958         error(U_REGEX_INTERNAL_ERROR);
   1959         val = 0;
   1960     }
   1961     if (val < 0) {
   1962         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
   1963             U_ASSERT(FALSE);
   1964             error(U_REGEX_INTERNAL_ERROR);
   1965             return -1;
   1966         }
   1967         if (URX_TYPE(val) != 0xff) {
   1968             U_ASSERT(FALSE);
   1969             error(U_REGEX_INTERNAL_ERROR);
   1970             return -1;
   1971         }
   1972         type = URX_RESERVED_OP_N;
   1973     }
   1974     return (type << 24) | val;
   1975 }
   1976 
   1977 
   1978 //------------------------------------------------------------------------------
   1979 //
   1980 //   appendOp()             Append a new instruction onto the compiled pattern
   1981 //                          Includes error checking, limiting the size of the
   1982 //                          pattern to lengths that can be represented in the
   1983 //                          24 bit operand field of an instruction.
   1984 //
   1985 //------------------------------------------------------------------------------
   1986 void RegexCompile::appendOp(int32_t op) {
   1987     if (U_FAILURE(*fStatus)) {
   1988         return;
   1989     }
   1990     fRXPat->fCompiledPat->addElement(op, *fStatus);
   1991     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
   1992         error(U_REGEX_PATTERN_TOO_BIG);
   1993     }
   1994 }
   1995 
   1996 void RegexCompile::appendOp(int32_t type, int32_t val) {
   1997     appendOp(buildOp(type, val));
   1998 }
   1999 
   2000 
   2001 //------------------------------------------------------------------------------
   2002 //
   2003 //   insertOp()             Insert a slot for a new opcode into the already
   2004 //                          compiled pattern code.
   2005 //
   2006 //                          Fill the slot with a NOP.  Our caller will replace it
   2007 //                          with what they really wanted.
   2008 //
   2009 //------------------------------------------------------------------------------
   2010 void   RegexCompile::insertOp(int32_t where) {
   2011     UVector64 *code = fRXPat->fCompiledPat;
   2012     U_ASSERT(where>0 && where < code->size());
   2013 
   2014     int32_t  nop = buildOp(URX_NOP, 0);
   2015     code->insertElementAt(nop, where, *fStatus);
   2016 
   2017     // Walk through the pattern, looking for any ops with targets that
   2018     //  were moved down by the insert.  Fix them.
   2019     int32_t loc;
   2020     for (loc=0; loc<code->size(); loc++) {
   2021         int32_t op = (int32_t)code->elementAti(loc);
   2022         int32_t opType = URX_TYPE(op);
   2023         int32_t opValue = URX_VAL(op);
   2024         if ((opType == URX_JMP         ||
   2025             opType == URX_JMPX         ||
   2026             opType == URX_STATE_SAVE   ||
   2027             opType == URX_CTR_LOOP     ||
   2028             opType == URX_CTR_LOOP_NG  ||
   2029             opType == URX_JMP_SAV      ||
   2030             opType == URX_JMP_SAV_X    ||
   2031             opType == URX_RELOC_OPRND)    && opValue > where) {
   2032             // Target location for this opcode is after the insertion point and
   2033             //   needs to be incremented to adjust for the insertion.
   2034             opValue++;
   2035             op = buildOp(opType, opValue);
   2036             code->setElementAt(op, loc);
   2037         }
   2038     }
   2039 
   2040     // Now fix up the parentheses stack.  All positive values in it are locations in
   2041     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
   2042     for (loc=0; loc<fParenStack.size(); loc++) {
   2043         int32_t x = fParenStack.elementAti(loc);
   2044         U_ASSERT(x < code->size());
   2045         if (x>where) {
   2046             x++;
   2047             fParenStack.setElementAt(x, loc);
   2048         }
   2049     }
   2050 
   2051     if (fMatchCloseParen > where) {
   2052         fMatchCloseParen++;
   2053     }
   2054     if (fMatchOpenParen > where) {
   2055         fMatchOpenParen++;
   2056     }
   2057 }
   2058 
   2059 
   2060 //------------------------------------------------------------------------------
   2061 //
   2062 //   allocateData()        Allocate storage in the matcher's static data area.
   2063 //                         Return the index for the newly allocated data.
   2064 //                         The storage won't actually exist until we are running a match
   2065 //                         operation, but the storage indexes are inserted into various
   2066 //                         opcodes while compiling the pattern.
   2067 //
   2068 //------------------------------------------------------------------------------
   2069 int32_t RegexCompile::allocateData(int32_t size) {
   2070     if (U_FAILURE(*fStatus)) {
   2071         return 0;
   2072     }
   2073     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
   2074         error(U_REGEX_INTERNAL_ERROR);
   2075         return 0;
   2076     }
   2077     int32_t dataIndex = fRXPat->fDataSize;
   2078     fRXPat->fDataSize += size;
   2079     if (fRXPat->fDataSize >= 0x00fffff0) {
   2080         error(U_REGEX_INTERNAL_ERROR);
   2081     }
   2082     return dataIndex;
   2083 }
   2084 
   2085 
   2086 //------------------------------------------------------------------------------
   2087 //
   2088 //   allocateStackData()   Allocate space in the back-tracking stack frame.
   2089 //                         Return the index for the newly allocated data.
   2090 //                         The frame indexes are inserted into various
   2091 //                         opcodes while compiling the pattern, meaning that frame
   2092 //                         size must be restricted to the size that will fit
   2093 //                         as an operand (24 bits).
   2094 //
   2095 //------------------------------------------------------------------------------
   2096 int32_t RegexCompile::allocateStackData(int32_t size) {
   2097     if (U_FAILURE(*fStatus)) {
   2098         return 0;
   2099     }
   2100     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
   2101         error(U_REGEX_INTERNAL_ERROR);
   2102         return 0;
   2103     }
   2104     int32_t dataIndex = fRXPat->fFrameSize;
   2105     fRXPat->fFrameSize += size;
   2106     if (fRXPat->fFrameSize >= 0x00fffff0) {
   2107         error(U_REGEX_PATTERN_TOO_BIG);
   2108     }
   2109     return dataIndex;
   2110 }
   2111 
   2112 
   2113 //------------------------------------------------------------------------------
   2114 //
   2115 //   blockTopLoc()          Find or create a location in the compiled pattern
   2116 //                          at the start of the operation or block that has
   2117 //                          just been compiled.  Needed when a quantifier (* or
   2118 //                          whatever) appears, and we need to add an operation
   2119 //                          at the start of the thing being quantified.
   2120 //
   2121 //                          (Parenthesized Blocks) have a slot with a NOP that
   2122 //                          is reserved for this purpose.  .* or similar don't
   2123 //                          and a slot needs to be added.
   2124 //
   2125 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
   2126 //                                         at the returned location.
   2127 //                                 FALSE - just return the address,
   2128 //                                         do not reserve a location there.
   2129 //
   2130 //------------------------------------------------------------------------------
   2131 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
   2132     int32_t   theLoc;
   2133     fixLiterals(TRUE);  // Emit code for any pending literals.
   2134                         //   If last item was a string, emit separate op for the its last char.
   2135     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
   2136     {
   2137         // The item just processed is a parenthesized block.
   2138         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
   2139         U_ASSERT(theLoc > 0);
   2140         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
   2141     }
   2142     else {
   2143         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
   2144         // No slot for STATE_SAVE was pre-reserved in the compiled code.
   2145         // We need to make space now.
   2146         theLoc = fRXPat->fCompiledPat->size()-1;
   2147         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
   2148         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
   2149             // Strings take two opcode, we want the position of the first one.
   2150             // We can have a string at this point if a single character case-folded to two.
   2151             theLoc--;
   2152         }
   2153         if (reserveLoc) {
   2154             int32_t  nop = buildOp(URX_NOP, 0);
   2155             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
   2156         }
   2157     }
   2158     return theLoc;
   2159 }
   2160 
   2161 
   2162 
   2163 //------------------------------------------------------------------------------
   2164 //
   2165 //    handleCloseParen      When compiling a close paren, we need to go back
   2166 //                          and fix up any JMP or SAVE operations within the
   2167 //                          parenthesized block that need to target the end
   2168 //                          of the block.  The locations of these are kept on
   2169 //                          the paretheses stack.
   2170 //
   2171 //                          This function is called both when encountering a
   2172 //                          real ) and at the end of the pattern.
   2173 //
   2174 //------------------------------------------------------------------------------
   2175 void  RegexCompile::handleCloseParen() {
   2176     int32_t   patIdx;
   2177     int32_t   patOp;
   2178     if (fParenStack.size() <= 0) {
   2179         error(U_REGEX_MISMATCHED_PAREN);
   2180         return;
   2181     }
   2182 
   2183     // Emit code for any pending literals.
   2184     fixLiterals(FALSE);
   2185 
   2186     // Fixup any operations within the just-closed parenthesized group
   2187     //    that need to reference the end of the (block).
   2188     //    (The first one popped from the stack is an unused slot for
   2189     //     alternation (OR) state save, but applying the fixup to it does no harm.)
   2190     for (;;) {
   2191         patIdx = fParenStack.popi();
   2192         if (patIdx < 0) {
   2193             // value < 0 flags the start of the frame on the paren stack.
   2194             break;
   2195         }
   2196         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
   2197         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
   2198         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
   2199         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
   2200         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
   2201         fMatchOpenParen     = patIdx;
   2202     }
   2203 
   2204     //  At the close of any parenthesized block, restore the match mode flags  to
   2205     //  the value they had at the open paren.  Saved value is
   2206     //  at the top of the paren stack.
   2207     fModeFlags = fParenStack.popi();
   2208     U_ASSERT(fModeFlags < 0);
   2209 
   2210     // DO any additional fixups, depending on the specific kind of
   2211     // parentesized grouping this is
   2212 
   2213     switch (patIdx) {
   2214     case plain:
   2215     case flags:
   2216         // No additional fixups required.
   2217         //   (Grouping-only parentheses)
   2218         break;
   2219     case capturing:
   2220         // Capturing Parentheses.
   2221         //   Insert a End Capture op into the pattern.
   2222         //   The frame offset of the variables for this cg is obtained from the
   2223         //       start capture op and put it into the end-capture op.
   2224         {
   2225             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
   2226             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
   2227 
   2228             int32_t   frameVarLocation = URX_VAL(captureOp);
   2229             appendOp(URX_END_CAPTURE, frameVarLocation);
   2230         }
   2231         break;
   2232     case atomic:
   2233         // Atomic Parenthesis.
   2234         //   Insert a LD_SP operation to restore the state stack to the position
   2235         //   it was when the atomic parens were entered.
   2236         {
   2237             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
   2238             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
   2239             int32_t   stoLoc = URX_VAL(stoOp);
   2240             appendOp(URX_LD_SP, stoLoc);
   2241         }
   2242         break;
   2243 
   2244     case lookAhead:
   2245         {
   2246             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
   2247             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2248             int32_t dataLoc  = URX_VAL(startOp);
   2249             appendOp(URX_LA_END, dataLoc);
   2250         }
   2251         break;
   2252 
   2253     case negLookAhead:
   2254         {
   2255             // See comment at doOpenLookAheadNeg
   2256             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
   2257             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2258             int32_t dataLoc  = URX_VAL(startOp);
   2259             appendOp(URX_LA_END, dataLoc);
   2260             appendOp(URX_BACKTRACK, 0);
   2261             appendOp(URX_LA_END, dataLoc);
   2262 
   2263             // Patch the URX_SAVE near the top of the block.
   2264             // The destination of the SAVE is the final LA_END that was just added.
   2265             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
   2266             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
   2267             int32_t dest     = fRXPat->fCompiledPat->size()-1;
   2268             saveOp           = buildOp(URX_STATE_SAVE, dest);
   2269             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
   2270         }
   2271         break;
   2272 
   2273     case lookBehind:
   2274         {
   2275             // See comment at doOpenLookBehind.
   2276 
   2277             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
   2278             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
   2279             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2280             int32_t dataLoc  = URX_VAL(startOp);
   2281             appendOp(URX_LB_END, dataLoc);
   2282             appendOp(URX_LA_END, dataLoc);
   2283 
   2284             // Determine the min and max bounds for the length of the
   2285             //  string that the pattern can match.
   2286             //  An unbounded upper limit is an error.
   2287             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2288             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2289             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2290             if (URX_TYPE(maxML) != 0) {
   2291                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2292                 break;
   2293             }
   2294             if (maxML == INT32_MAX) {
   2295                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2296                 break;
   2297             }
   2298             U_ASSERT(minML <= maxML);
   2299 
   2300             // Insert the min and max match len bounds into the URX_LB_CONT op that
   2301             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2302             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
   2303             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
   2304 
   2305         }
   2306         break;
   2307 
   2308 
   2309 
   2310     case lookBehindN:
   2311         {
   2312             // See comment at doOpenLookBehindNeg.
   2313 
   2314             // Append the URX_LBN_END to the compiled pattern.
   2315             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
   2316             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2317             int32_t dataLoc  = URX_VAL(startOp);
   2318             appendOp(URX_LBN_END, dataLoc);
   2319 
   2320             // Determine the min and max bounds for the length of the
   2321             //  string that the pattern can match.
   2322             //  An unbounded upper limit is an error.
   2323             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2324             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2325             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2326             if (URX_TYPE(maxML) != 0) {
   2327                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2328                 break;
   2329             }
   2330             if (maxML == INT32_MAX) {
   2331                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2332                 break;
   2333             }
   2334             U_ASSERT(minML <= maxML);
   2335 
   2336             // Insert the min and max match len bounds into the URX_LB_CONT op that
   2337             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2338             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
   2339             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
   2340 
   2341             // Insert the pattern location to continue at after a successful match
   2342             //  as the last operand of the URX_LBN_CONT
   2343             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
   2344             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
   2345         }
   2346         break;
   2347 
   2348 
   2349 
   2350     default:
   2351         U_ASSERT(FALSE);
   2352     }
   2353 
   2354     // remember the next location in the compiled pattern.
   2355     // The compilation of Quantifiers will look at this to see whether its looping
   2356     //   over a parenthesized block or a single item
   2357     fMatchCloseParen = fRXPat->fCompiledPat->size();
   2358 }
   2359 
   2360 
   2361 
   2362 //------------------------------------------------------------------------------
   2363 //
   2364 //   compileSet       Compile the pattern operations for a reference to a
   2365 //                    UnicodeSet.
   2366 //
   2367 //------------------------------------------------------------------------------
   2368 void        RegexCompile::compileSet(UnicodeSet *theSet)
   2369 {
   2370     if (theSet == NULL) {
   2371         return;
   2372     }
   2373     //  Remove any strings from the set.
   2374     //  There shoudn't be any, but just in case.
   2375     //     (Case Closure can add them; if we had a simple case closure avaialble that
   2376     //      ignored strings, that would be better.)
   2377     theSet->removeAllStrings();
   2378     int32_t  setSize = theSet->size();
   2379 
   2380     switch (setSize) {
   2381     case 0:
   2382         {
   2383             // Set of no elements.   Always fails to match.
   2384             appendOp(URX_BACKTRACK, 0);
   2385             delete theSet;
   2386         }
   2387         break;
   2388 
   2389     case 1:
   2390         {
   2391             // The set contains only a single code point.  Put it into
   2392             //   the compiled pattern as a single char operation rather
   2393             //   than a set, and discard the set itself.
   2394             literalChar(theSet->charAt(0));
   2395             delete theSet;
   2396         }
   2397         break;
   2398 
   2399     default:
   2400         {
   2401             //  The set contains two or more chars.  (the normal case)
   2402             //  Put it into the compiled pattern as a set.
   2403             int32_t setNumber = fRXPat->fSets->size();
   2404             fRXPat->fSets->addElement(theSet, *fStatus);
   2405             appendOp(URX_SETREF, setNumber);
   2406         }
   2407     }
   2408 }
   2409 
   2410 
   2411 //------------------------------------------------------------------------------
   2412 //
   2413 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
   2414 //                      Except for the specific opcodes used, the code is the same
   2415 //                      for all three types (greedy, non-greedy, possessive) of
   2416 //                      intervals.  The opcodes are supplied as parameters.
   2417 //                      (There are two sets of opcodes - greedy & possessive use the
   2418 //                      same ones, while non-greedy has it's own.)
   2419 //
   2420 //                      The code for interval loops has this form:
   2421 //                         0  CTR_INIT   counter loc (in stack frame)
   2422 //                         1             5  patt address of CTR_LOOP at bottom of block
   2423 //                         2             min count
   2424 //                         3             max count   (-1 for unbounded)
   2425 //                         4  ...        block to be iterated over
   2426 //                         5  CTR_LOOP
   2427 //
   2428 //                       In
   2429 //------------------------------------------------------------------------------
   2430 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
   2431 {
   2432     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
   2433     //   four slots in the compiled code.  Reserve them.
   2434     int32_t   topOfBlock = blockTopLoc(TRUE);
   2435     insertOp(topOfBlock);
   2436     insertOp(topOfBlock);
   2437     insertOp(topOfBlock);
   2438 
   2439     // The operands for the CTR_INIT opcode include the index in the matcher data
   2440     //   of the counter.  Allocate it now. There are two data items
   2441     //        counterLoc   -->  Loop counter
   2442     //               +1    -->  Input index (for breaking non-progressing loops)
   2443     //                          (Only present if unbounded upper limit on loop)
   2444     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
   2445     int32_t   counterLoc = allocateStackData(dataSize);
   2446 
   2447     int32_t   op = buildOp(InitOp, counterLoc);
   2448     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
   2449 
   2450     // The second operand of CTR_INIT is the location following the end of the loop.
   2451     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
   2452     //   compilation of something later on causes the code to grow and the target
   2453     //   position to move.
   2454     int32_t loopEnd = fRXPat->fCompiledPat->size();
   2455     op = buildOp(URX_RELOC_OPRND, loopEnd);
   2456     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
   2457 
   2458     // Followed by the min and max counts.
   2459     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
   2460     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
   2461 
   2462     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
   2463     //   Goes at end of the block being looped over, so just append to the code so far.
   2464     appendOp(LoopOp, topOfBlock);
   2465 
   2466     if ((fIntervalLow & 0xff000000) != 0 ||
   2467         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
   2468             error(U_REGEX_NUMBER_TOO_BIG);
   2469         }
   2470 
   2471     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
   2472         error(U_REGEX_MAX_LT_MIN);
   2473     }
   2474 }
   2475 
   2476 
   2477 
   2478 UBool RegexCompile::compileInlineInterval() {
   2479     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
   2480         // Too big to inline.  Fail, which will cause looping code to be generated.
   2481         //   (Upper < Lower picks up unbounded upper and errors, both.)
   2482         return FALSE;
   2483     }
   2484 
   2485     int32_t   topOfBlock = blockTopLoc(FALSE);
   2486     if (fIntervalUpper == 0) {
   2487         // Pathological case.  Attempt no matches, as if the block doesn't exist.
   2488         // Discard the generated code for the block.
   2489         // If the block included parens, discard the info pertaining to them as well.
   2490         fRXPat->fCompiledPat->setSize(topOfBlock);
   2491         if (fMatchOpenParen >= topOfBlock) {
   2492             fMatchOpenParen = -1;
   2493         }
   2494         if (fMatchCloseParen >= topOfBlock) {
   2495             fMatchCloseParen = -1;
   2496         }
   2497         return TRUE;
   2498     }
   2499 
   2500     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
   2501         // The thing being repeated is not a single op, but some
   2502         //   more complex block.  Do it as a loop, not inlines.
   2503         //   Note that things "repeated" a max of once are handled as inline, because
   2504         //     the one copy of the code already generated is just fine.
   2505         return FALSE;
   2506     }
   2507 
   2508     // Pick up the opcode that is to be repeated
   2509     //
   2510     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
   2511 
   2512     // Compute the pattern location where the inline sequence
   2513     //   will end, and set up the state save op that will be needed.
   2514     //
   2515     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
   2516                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
   2517     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
   2518     if (fIntervalLow == 0) {
   2519         insertOp(topOfBlock);
   2520         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
   2521     }
   2522 
   2523 
   2524 
   2525     //  Loop, emitting the op for the thing being repeated each time.
   2526     //    Loop starts at 1 because one instance of the op already exists in the pattern,
   2527     //    it was put there when it was originally encountered.
   2528     int32_t i;
   2529     for (i=1; i<fIntervalUpper; i++ ) {
   2530         if (i >= fIntervalLow) {
   2531             appendOp(saveOp);
   2532         }
   2533         appendOp(op);
   2534     }
   2535     return TRUE;
   2536 }
   2537 
   2538 
   2539 
   2540 //------------------------------------------------------------------------------
   2541 //
   2542 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
   2543 //                         set of characters that could potentially begin a case-insensitive
   2544 //                         match of a string beginning with that character, using full Unicode
   2545 //                         case insensitive matching.
   2546 //
   2547 //          This is used in optimizing find().
   2548 //
   2549 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
   2550 //          misses cases like this:
   2551 //             A string from the pattern begins with 'ss' (although all we know
   2552 //                 in this context is that it begins with 's')
   2553 //             The pattern could match a string beginning with a German sharp-s
   2554 //
   2555 //           To the ordinary case closure for a character c, we add all other
   2556 //           characters cx where the case closure of cx incudes a string form that begins
   2557 //           with the original character c.
   2558 //
   2559 //           This function could be made smarter. The full pattern string is available
   2560 //           and it would be possible to verify that the extra characters being added
   2561 //           to the starting set fully match, rather than having just a first-char of the
   2562 //           folded form match.
   2563 //
   2564 //------------------------------------------------------------------------------
   2565 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
   2566 
   2567 // Machine Generated below.
   2568 // It may need updating with new versions of Unicode.
   2569 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
   2570 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
   2571 
   2572 // Machine Generated Data. Do not hand edit.
   2573     static const UChar32 RECaseFixCodePoints[] = {
   2574         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
   2575         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
   2576         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
   2577         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
   2578         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
   2579 
   2580     static const int16_t RECaseFixStringOffsets[] = {
   2581         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
   2582         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
   2583         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
   2584         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
   2585         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
   2586 
   2587     static const int16_t RECaseFixCounts[] = {
   2588         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
   2589         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
   2590         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
   2591         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
   2592         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
   2593 
   2594     static const UChar RECaseFixData[] = {
   2595         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
   2596         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
   2597         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
   2598         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
   2599         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
   2600         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
   2601         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
   2602         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
   2603         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
   2604         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
   2605         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
   2606 
   2607 // End of machine generated data.
   2608 
   2609     if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
   2610         // This function should never be called with an invalid input character.
   2611         U_ASSERT(FALSE);
   2612         starterChars->clear();
   2613     } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   2614         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
   2615         starterChars->set(caseFoldedC, caseFoldedC);
   2616 
   2617         int32_t i;
   2618         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
   2619             // Simple linear search through the sorted list of interesting code points.
   2620         }
   2621 
   2622         if (RECaseFixCodePoints[i] == c) {
   2623             int32_t dataIndex = RECaseFixStringOffsets[i];
   2624             int32_t numCharsToAdd = RECaseFixCounts[i];
   2625             UChar32 cpToAdd = 0;
   2626             for (int32_t j=0; j<numCharsToAdd; j++) {
   2627                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
   2628                 starterChars->add(cpToAdd);
   2629             }
   2630         }
   2631 
   2632         starterChars->closeOver(USET_CASE_INSENSITIVE);
   2633         starterChars->removeAllStrings();
   2634     } else {
   2635         // Not a cased character. Just return it alone.
   2636         starterChars->set(c, c);
   2637     }
   2638 }
   2639 
   2640 
   2641 // Increment with overflow check.
   2642 // val and delta will both be positive.
   2643 
   2644 static int32_t safeIncrement(int32_t val, int32_t delta) {
   2645     if (INT32_MAX - val > delta) {
   2646         return val + delta;
   2647     } else {
   2648         return INT32_MAX;
   2649     }
   2650 }
   2651 
   2652 
   2653 //------------------------------------------------------------------------------
   2654 //
   2655 //   matchStartType    Determine how a match can start.
   2656 //                     Used to optimize find() operations.
   2657 //
   2658 //                     Operation is very similar to minMatchLength().  Walk the compiled
   2659 //                     pattern, keeping an on-going minimum-match-length.  For any
   2660 //                     op where the min match coming in is zero, add that ops possible
   2661 //                     starting matches to the possible starts for the overall pattern.
   2662 //
   2663 //------------------------------------------------------------------------------
   2664 void   RegexCompile::matchStartType() {
   2665     if (U_FAILURE(*fStatus)) {
   2666         return;
   2667     }
   2668 
   2669 
   2670     int32_t    loc;                    // Location in the pattern of the current op being processed.
   2671     int32_t    op;                     // The op being processed
   2672     int32_t    opType;                 // The opcode type of the op
   2673     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
   2674     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
   2675 
   2676     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
   2677                                        //   could have advanced the position in a match.
   2678                                        //   (Maximum match length so far == 0)
   2679 
   2680     // forwardedLength is a vector holding minimum-match-length values that
   2681     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   2682     //   It must be one longer than the pattern being checked because some  ops
   2683     //   will jmp to a end-of-block+1 location from within a block, and we must
   2684     //   count those when checking the block.
   2685     int32_t end = fRXPat->fCompiledPat->size();
   2686     UVector32  forwardedLength(end+1, *fStatus);
   2687     forwardedLength.setSize(end+1);
   2688     for (loc=3; loc<end; loc++) {
   2689         forwardedLength.setElementAt(INT32_MAX, loc);
   2690     }
   2691 
   2692     for (loc = 3; loc<end; loc++) {
   2693         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   2694         opType = URX_TYPE(op);
   2695 
   2696         // The loop is advancing linearly through the pattern.
   2697         // If the op we are now at was the destination of a branch in the pattern,
   2698         // and that path has a shorter minimum length than the current accumulated value,
   2699         // replace the current accumulated value.
   2700         if (forwardedLength.elementAti(loc) < currentLen) {
   2701             currentLen = forwardedLength.elementAti(loc);
   2702             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   2703         }
   2704 
   2705         switch (opType) {
   2706             // Ops that don't change the total length matched
   2707         case URX_RESERVED_OP:
   2708         case URX_END:
   2709         case URX_FAIL:
   2710         case URX_STRING_LEN:
   2711         case URX_NOP:
   2712         case URX_START_CAPTURE:
   2713         case URX_END_CAPTURE:
   2714         case URX_BACKSLASH_B:
   2715         case URX_BACKSLASH_BU:
   2716         case URX_BACKSLASH_G:
   2717         case URX_BACKSLASH_Z:
   2718         case URX_DOLLAR:
   2719         case URX_DOLLAR_M:
   2720         case URX_DOLLAR_D:
   2721         case URX_DOLLAR_MD:
   2722         case URX_RELOC_OPRND:
   2723         case URX_STO_INP_LOC:
   2724         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   2725         case URX_BACKREF_I:
   2726 
   2727         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   2728         case URX_LD_SP:
   2729             break;
   2730 
   2731         case URX_CARET:
   2732             if (atStart) {
   2733                 fRXPat->fStartType = START_START;
   2734             }
   2735             break;
   2736 
   2737         case URX_CARET_M:
   2738         case URX_CARET_M_UNIX:
   2739             if (atStart) {
   2740                 fRXPat->fStartType = START_LINE;
   2741             }
   2742             break;
   2743 
   2744         case URX_ONECHAR:
   2745             if (currentLen == 0) {
   2746                 // This character could appear at the start of a match.
   2747                 //   Add it to the set of possible starting characters.
   2748                 fRXPat->fInitialChars->add(URX_VAL(op));
   2749                 numInitialStrings += 2;
   2750             }
   2751             currentLen = safeIncrement(currentLen, 1);
   2752             atStart = FALSE;
   2753             break;
   2754 
   2755 
   2756         case URX_SETREF:
   2757             if (currentLen == 0) {
   2758                 int32_t  sn = URX_VAL(op);
   2759                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2760                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
   2761                 fRXPat->fInitialChars->addAll(*s);
   2762                 numInitialStrings += 2;
   2763             }
   2764             currentLen = safeIncrement(currentLen, 1);
   2765             atStart = FALSE;
   2766             break;
   2767 
   2768         case URX_LOOP_SR_I:
   2769             // [Set]*, like a SETREF, above, in what it can match,
   2770             //  but may not match at all, so currentLen is not incremented.
   2771             if (currentLen == 0) {
   2772                 int32_t  sn = URX_VAL(op);
   2773                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2774                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
   2775                 fRXPat->fInitialChars->addAll(*s);
   2776                 numInitialStrings += 2;
   2777             }
   2778             atStart = FALSE;
   2779             break;
   2780 
   2781         case URX_LOOP_DOT_I:
   2782             if (currentLen == 0) {
   2783                 // .* at the start of a pattern.
   2784                 //    Any character can begin the match.
   2785                 fRXPat->fInitialChars->clear();
   2786                 fRXPat->fInitialChars->complement();
   2787                 numInitialStrings += 2;
   2788             }
   2789             atStart = FALSE;
   2790             break;
   2791 
   2792 
   2793         case URX_STATIC_SETREF:
   2794             if (currentLen == 0) {
   2795                 int32_t  sn = URX_VAL(op);
   2796                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
   2797                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
   2798                 fRXPat->fInitialChars->addAll(*s);
   2799                 numInitialStrings += 2;
   2800             }
   2801             currentLen = safeIncrement(currentLen, 1);
   2802             atStart = FALSE;
   2803             break;
   2804 
   2805 
   2806 
   2807         case URX_STAT_SETREF_N:
   2808             if (currentLen == 0) {
   2809                 int32_t  sn = URX_VAL(op);
   2810                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
   2811                 UnicodeSet sc(*s);
   2812                 sc.complement();
   2813                 fRXPat->fInitialChars->addAll(sc);
   2814                 numInitialStrings += 2;
   2815             }
   2816             currentLen = safeIncrement(currentLen, 1);
   2817             atStart = FALSE;
   2818             break;
   2819 
   2820 
   2821 
   2822         case URX_BACKSLASH_D:
   2823             // Digit Char
   2824              if (currentLen == 0) {
   2825                  UnicodeSet s;
   2826                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   2827                  if (URX_VAL(op) != 0) {
   2828                      s.complement();
   2829                  }
   2830                  fRXPat->fInitialChars->addAll(s);
   2831                  numInitialStrings += 2;
   2832             }
   2833             currentLen = safeIncrement(currentLen, 1);
   2834             atStart = FALSE;
   2835             break;
   2836 
   2837 
   2838         case URX_BACKSLASH_H:
   2839             // Horiz white space
   2840             if (currentLen == 0) {
   2841                 UnicodeSet s;
   2842                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   2843                 s.add((UChar32)9);   // Tab
   2844                 if (URX_VAL(op) != 0) {
   2845                     s.complement();
   2846                 }
   2847                 fRXPat->fInitialChars->addAll(s);
   2848                 numInitialStrings += 2;
   2849             }
   2850             currentLen = safeIncrement(currentLen, 1);
   2851             atStart = FALSE;
   2852             break;
   2853 
   2854 
   2855         case URX_BACKSLASH_R:       // Any line ending sequence
   2856         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
   2857             if (currentLen == 0) {
   2858                 UnicodeSet s;
   2859                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
   2860                 s.add((UChar32)0x85);
   2861                 s.add((UChar32)0x2028, (UChar32)0x2029);
   2862                 if (URX_VAL(op) != 0) {
   2863                      // Complement option applies to URX_BACKSLASH_V only.
   2864                      s.complement();
   2865                 }
   2866                 fRXPat->fInitialChars->addAll(s);
   2867                 numInitialStrings += 2;
   2868             }
   2869             currentLen = safeIncrement(currentLen, 1);
   2870             atStart = FALSE;
   2871             break;
   2872 
   2873 
   2874 
   2875         case URX_ONECHAR_I:
   2876             // Case Insensitive Single Character.
   2877             if (currentLen == 0) {
   2878                 UChar32  c = URX_VAL(op);
   2879                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   2880                     UnicodeSet starters(c, c);
   2881                     starters.closeOver(USET_CASE_INSENSITIVE);
   2882                     // findCaseInsensitiveStarters(c, &starters);
   2883                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
   2884                     //   The expanded folding can't match the pattern.
   2885                     fRXPat->fInitialChars->addAll(starters);
   2886                 } else {
   2887                     // Char has no case variants.  Just add it as-is to the
   2888                     //   set of possible starting chars.
   2889                     fRXPat->fInitialChars->add(c);
   2890                 }
   2891                 numInitialStrings += 2;
   2892             }
   2893             currentLen = safeIncrement(currentLen, 1);
   2894             atStart = FALSE;
   2895             break;
   2896 
   2897 
   2898         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   2899         case URX_DOTANY_ALL:    // . matches one or two.
   2900         case URX_DOTANY:
   2901         case URX_DOTANY_UNIX:
   2902             if (currentLen == 0) {
   2903                 // These constructs are all bad news when they appear at the start
   2904                 //   of a match.  Any character can begin the match.
   2905                 fRXPat->fInitialChars->clear();
   2906                 fRXPat->fInitialChars->complement();
   2907                 numInitialStrings += 2;
   2908             }
   2909             currentLen = safeIncrement(currentLen, 1);
   2910             atStart = FALSE;
   2911             break;
   2912 
   2913 
   2914         case URX_JMPX:
   2915             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
   2916             U_FALLTHROUGH;
   2917         case URX_JMP:
   2918             {
   2919                 int32_t  jmpDest = URX_VAL(op);
   2920                 if (jmpDest < loc) {
   2921                     // Loop of some kind.  Can safely ignore, the worst that will happen
   2922                     //  is that we understate the true minimum length
   2923                     currentLen = forwardedLength.elementAti(loc+1);
   2924 
   2925                 } else {
   2926                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   2927                     U_ASSERT(jmpDest <= end+1);
   2928                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
   2929                         forwardedLength.setElementAt(currentLen, jmpDest);
   2930                     }
   2931                 }
   2932             }
   2933             atStart = FALSE;
   2934             break;
   2935 
   2936         case URX_JMP_SAV:
   2937         case URX_JMP_SAV_X:
   2938             // Combo of state save to the next loc, + jmp backwards.
   2939             //   Net effect on min. length computation is nothing.
   2940             atStart = FALSE;
   2941             break;
   2942 
   2943         case URX_BACKTRACK:
   2944             // Fails are kind of like a branch, except that the min length was
   2945             //   propagated already, by the state save.
   2946             currentLen = forwardedLength.elementAti(loc+1);
   2947             atStart = FALSE;
   2948             break;
   2949 
   2950 
   2951         case URX_STATE_SAVE:
   2952             {
   2953                 // State Save, for forward jumps, propagate the current minimum.
   2954                 //             of the state save.
   2955                 int32_t  jmpDest = URX_VAL(op);
   2956                 if (jmpDest > loc) {
   2957                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
   2958                         forwardedLength.setElementAt(currentLen, jmpDest);
   2959                     }
   2960                 }
   2961             }
   2962             atStart = FALSE;
   2963             break;
   2964 
   2965 
   2966 
   2967 
   2968         case URX_STRING:
   2969             {
   2970                 loc++;
   2971                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   2972                 int32_t stringLen   = URX_VAL(stringLenOp);
   2973                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   2974                 U_ASSERT(stringLenOp >= 2);
   2975                 if (currentLen == 0) {
   2976                     // Add the starting character of this string to the set of possible starting
   2977                     //   characters for this pattern.
   2978                     int32_t stringStartIdx = URX_VAL(op);
   2979                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   2980                     fRXPat->fInitialChars->add(c);
   2981 
   2982                     // Remember this string.  After the entire pattern has been checked,
   2983                     //  if nothing else is identified that can start a match, we'll use it.
   2984                     numInitialStrings++;
   2985                     fRXPat->fInitialStringIdx = stringStartIdx;
   2986                     fRXPat->fInitialStringLen = stringLen;
   2987                 }
   2988 
   2989                 currentLen = safeIncrement(currentLen, stringLen);
   2990                 atStart = FALSE;
   2991             }
   2992             break;
   2993 
   2994         case URX_STRING_I:
   2995             {
   2996                 // Case-insensitive string.  Unlike exact-match strings, we won't
   2997                 //   attempt a string search for possible match positions.  But we
   2998                 //   do update the set of possible starting characters.
   2999                 loc++;
   3000                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3001                 int32_t stringLen   = URX_VAL(stringLenOp);
   3002                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   3003                 U_ASSERT(stringLenOp >= 2);
   3004                 if (currentLen == 0) {
   3005                     // Add the starting character of this string to the set of possible starting
   3006                     //   characters for this pattern.
   3007                     int32_t stringStartIdx = URX_VAL(op);
   3008                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   3009                     UnicodeSet s;
   3010                     findCaseInsensitiveStarters(c, &s);
   3011                     fRXPat->fInitialChars->addAll(s);
   3012                     numInitialStrings += 2;  // Matching on an initial string not possible.
   3013                 }
   3014                 currentLen = safeIncrement(currentLen, stringLen);
   3015                 atStart = FALSE;
   3016             }
   3017             break;
   3018 
   3019         case URX_CTR_INIT:
   3020         case URX_CTR_INIT_NG:
   3021             {
   3022                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
   3023                 //   so location must be updated accordingly.
   3024                 // Loop Init Ops.
   3025                 //   If the min loop count == 0
   3026                 //      move loc forwards to the end of the loop, skipping over the body.
   3027                 //   If the min count is > 0,
   3028                 //      continue normal processing of the body of the loop.
   3029                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
   3030                         loopEndLoc   = URX_VAL(loopEndLoc);
   3031                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
   3032                 if (minLoopCount == 0) {
   3033                     // Min Loop Count of 0, treat like a forward branch and
   3034                     //   move the current minimum length up to the target
   3035                     //   (end of loop) location.
   3036                     U_ASSERT(loopEndLoc <= end+1);
   3037                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
   3038                         forwardedLength.setElementAt(currentLen, loopEndLoc);
   3039                     }
   3040                 }
   3041                 loc+=3;  // Skips over operands of CTR_INIT
   3042             }
   3043             atStart = FALSE;
   3044             break;
   3045 
   3046 
   3047         case URX_CTR_LOOP:
   3048         case URX_CTR_LOOP_NG:
   3049             // Loop ops.
   3050             //  The jump is conditional, backwards only.
   3051             atStart = FALSE;
   3052             break;
   3053 
   3054         case URX_LOOP_C:
   3055             // More loop ops.  These state-save to themselves.
   3056             //   don't change the minimum match
   3057             atStart = FALSE;
   3058             break;
   3059 
   3060 
   3061         case URX_LA_START:
   3062         case URX_LB_START:
   3063             {
   3064                 // Look-around.  Scan forward until the matching look-ahead end,
   3065                 //   without processing the look-around block.  This is overly pessimistic.
   3066 
   3067                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
   3068                 //   lookahead contains two LA_END instructions, so count goes up by two
   3069                 //   for each LA_START.
   3070                 int32_t  depth = (opType == URX_LA_START? 2: 1);
   3071                 for (;;) {
   3072                     loc++;
   3073                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3074                     if (URX_TYPE(op) == URX_LA_START) {
   3075                         depth+=2;
   3076                     }
   3077                     if (URX_TYPE(op) == URX_LB_START) {
   3078                         depth++;
   3079                     }
   3080                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
   3081                         depth--;
   3082                         if (depth == 0) {
   3083                             break;
   3084                         }
   3085                     }
   3086                     if (URX_TYPE(op) == URX_STATE_SAVE) {
   3087                         // Need this because neg lookahead blocks will FAIL to outside
   3088                         //   of the block.
   3089                         int32_t  jmpDest = URX_VAL(op);
   3090                         if (jmpDest > loc) {
   3091                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3092                                 forwardedLength.setElementAt(currentLen, jmpDest);
   3093                             }
   3094                         }
   3095                     }
   3096                     U_ASSERT(loc <= end);
   3097                 }
   3098             }
   3099             break;
   3100 
   3101         case URX_LA_END:
   3102         case URX_LB_CONT:
   3103         case URX_LB_END:
   3104         case URX_LBN_CONT:
   3105         case URX_LBN_END:
   3106             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
   3107                                  //  consumed by the scan in URX_LA_START and LB_START
   3108 
   3109             break;
   3110 
   3111         default:
   3112             U_ASSERT(FALSE);
   3113             }
   3114 
   3115         }
   3116 
   3117 
   3118     // We have finished walking through the ops.  Check whether some forward jump
   3119     //   propagated a shorter length to location end+1.
   3120     if (forwardedLength.elementAti(end+1) < currentLen) {
   3121         currentLen = forwardedLength.elementAti(end+1);
   3122     }
   3123 
   3124 
   3125     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
   3126 
   3127 
   3128     // Sort out what we should check for when looking for candidate match start positions.
   3129     // In order of preference,
   3130     //     1.   Start of input text buffer.
   3131     //     2.   A literal string.
   3132     //     3.   Start of line in multi-line mode.
   3133     //     4.   A single literal character.
   3134     //     5.   A character from a set of characters.
   3135     //
   3136     if (fRXPat->fStartType == START_START) {
   3137         // Match only at the start of an input text string.
   3138         //    start type is already set.  We're done.
   3139     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
   3140         // Match beginning only with a literal string.
   3141         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
   3142         U_ASSERT(fRXPat->fInitialChars->contains(c));
   3143         fRXPat->fStartType   = START_STRING;
   3144         fRXPat->fInitialChar = c;
   3145     } else if (fRXPat->fStartType == START_LINE) {
   3146         // Match at start of line in Multi-Line mode.
   3147         // Nothing to do here; everything is already set.
   3148     } else if (fRXPat->fMinMatchLen == 0) {
   3149         // Zero length match possible.  We could start anywhere.
   3150         fRXPat->fStartType = START_NO_INFO;
   3151     } else if (fRXPat->fInitialChars->size() == 1) {
   3152         // All matches begin with the same char.
   3153         fRXPat->fStartType   = START_CHAR;
   3154         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
   3155         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
   3156     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
   3157         fRXPat->fMinMatchLen > 0) {
   3158         // Matches start with a set of character smaller than the set of all chars.
   3159         fRXPat->fStartType = START_SET;
   3160     } else {
   3161         // Matches can start with anything
   3162         fRXPat->fStartType = START_NO_INFO;
   3163     }
   3164 
   3165     return;
   3166 }
   3167 
   3168 
   3169 
   3170 //------------------------------------------------------------------------------
   3171 //
   3172 //   minMatchLength    Calculate the length of the shortest string that could
   3173 //                     match the specified pattern.
   3174 //                     Length is in 16 bit code units, not code points.
   3175 //
   3176 //                     The calculated length may not be exact.  The returned
   3177 //                     value may be shorter than the actual minimum; it must
   3178 //                     never be longer.
   3179 //
   3180 //                     start and end are the range of p-code operations to be
   3181 //                     examined.  The endpoints are included in the range.
   3182 //
   3183 //------------------------------------------------------------------------------
   3184 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
   3185     if (U_FAILURE(*fStatus)) {
   3186         return 0;
   3187     }
   3188 
   3189     U_ASSERT(start <= end);
   3190     U_ASSERT(end < fRXPat->fCompiledPat->size());
   3191 
   3192 
   3193     int32_t    loc;
   3194     int32_t    op;
   3195     int32_t    opType;
   3196     int32_t    currentLen = 0;
   3197 
   3198 
   3199     // forwardedLength is a vector holding minimum-match-length values that
   3200     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   3201     //   It must be one longer than the pattern being checked because some  ops
   3202     //   will jmp to a end-of-block+1 location from within a block, and we must
   3203     //   count those when checking the block.
   3204     UVector32  forwardedLength(end+2, *fStatus);
   3205     forwardedLength.setSize(end+2);
   3206     for (loc=start; loc<=end+1; loc++) {
   3207         forwardedLength.setElementAt(INT32_MAX, loc);
   3208     }
   3209 
   3210     for (loc = start; loc<=end; loc++) {
   3211         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3212         opType = URX_TYPE(op);
   3213 
   3214         // The loop is advancing linearly through the pattern.
   3215         // If the op we are now at was the destination of a branch in the pattern,
   3216         // and that path has a shorter minimum length than the current accumulated value,
   3217         // replace the current accumulated value.
   3218         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
   3219                                                                //   no-match-possible cases.
   3220         if (forwardedLength.elementAti(loc) < currentLen) {
   3221             currentLen = forwardedLength.elementAti(loc);
   3222             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   3223         }
   3224 
   3225         switch (opType) {
   3226             // Ops that don't change the total length matched
   3227         case URX_RESERVED_OP:
   3228         case URX_END:
   3229         case URX_STRING_LEN:
   3230         case URX_NOP:
   3231         case URX_START_CAPTURE:
   3232         case URX_END_CAPTURE:
   3233         case URX_BACKSLASH_B:
   3234         case URX_BACKSLASH_BU:
   3235         case URX_BACKSLASH_G:
   3236         case URX_BACKSLASH_Z:
   3237         case URX_CARET:
   3238         case URX_DOLLAR:
   3239         case URX_DOLLAR_M:
   3240         case URX_DOLLAR_D:
   3241         case URX_DOLLAR_MD:
   3242         case URX_RELOC_OPRND:
   3243         case URX_STO_INP_LOC:
   3244         case URX_CARET_M:
   3245         case URX_CARET_M_UNIX:
   3246         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   3247         case URX_BACKREF_I:
   3248 
   3249         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   3250         case URX_LD_SP:
   3251 
   3252         case URX_JMP_SAV:
   3253         case URX_JMP_SAV_X:
   3254             break;
   3255 
   3256 
   3257             // Ops that match a minimum of one character (one or two 16 bit code units.)
   3258             //
   3259         case URX_ONECHAR:
   3260         case URX_STATIC_SETREF:
   3261         case URX_STAT_SETREF_N:
   3262         case URX_SETREF:
   3263         case URX_BACKSLASH_D:
   3264         case URX_BACKSLASH_H:
   3265         case URX_BACKSLASH_R:
   3266         case URX_BACKSLASH_V:
   3267         case URX_ONECHAR_I:
   3268         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   3269         case URX_DOTANY_ALL:    // . matches one or two.
   3270         case URX_DOTANY:
   3271         case URX_DOTANY_UNIX:
   3272             currentLen = safeIncrement(currentLen, 1);
   3273             break;
   3274 
   3275 
   3276         case URX_JMPX:
   3277             loc++;              // URX_JMPX has an extra operand, ignored here,
   3278                                 //   otherwise processed identically to URX_JMP.
   3279             U_FALLTHROUGH;
   3280         case URX_JMP:
   3281             {
   3282                 int32_t  jmpDest = URX_VAL(op);
   3283                 if (jmpDest < loc) {
   3284                     // Loop of some kind.  Can safely ignore, the worst that will happen
   3285                     //  is that we understate the true minimum length
   3286                     currentLen = forwardedLength.elementAti(loc+1);
   3287                 } else {
   3288                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   3289                     U_ASSERT(jmpDest <= end+1);
   3290                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
   3291                         forwardedLength.setElementAt(currentLen, jmpDest);
   3292                     }
   3293                 }
   3294             }
   3295             break;
   3296 
   3297         case URX_BACKTRACK:
   3298             {
   3299                 // Back-tracks are kind of like a branch, except that the min length was
   3300                 //   propagated already, by the state save.
   3301                 currentLen = forwardedLength.elementAti(loc+1);
   3302             }
   3303             break;
   3304 
   3305 
   3306         case URX_STATE_SAVE:
   3307             {
   3308                 // State Save, for forward jumps, propagate the current minimum.
   3309                 //             of the state save.
   3310                 int32_t  jmpDest = URX_VAL(op);
   3311                 if (jmpDest > loc) {
   3312                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3313                         forwardedLength.setElementAt(currentLen, jmpDest);
   3314                     }
   3315                 }
   3316             }
   3317             break;
   3318 
   3319 
   3320         case URX_STRING:
   3321             {
   3322                 loc++;
   3323                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3324                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3325             }
   3326             break;
   3327 
   3328 
   3329         case URX_STRING_I:
   3330             {
   3331                 loc++;
   3332                 // TODO: with full case folding, matching input text may be shorter than
   3333                 //       the string we have here.  More smarts could put some bounds on it.
   3334                 //       Assume a min length of one for now.  A min length of zero causes
   3335                 //        optimization failures for a pattern like "string"+
   3336                 // currentLen += URX_VAL(stringLenOp);
   3337                 currentLen = safeIncrement(currentLen, 1);
   3338             }
   3339             break;
   3340 
   3341         case URX_CTR_INIT:
   3342         case URX_CTR_INIT_NG:
   3343             {
   3344                 // Loop Init Ops.
   3345                 //   If the min loop count == 0
   3346                 //      move loc forwards to the end of the loop, skipping over the body.
   3347                 //   If the min count is > 0,
   3348                 //      continue normal processing of the body of the loop.
   3349                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
   3350                         loopEndLoc   = URX_VAL(loopEndLoc);
   3351                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
   3352                 if (minLoopCount == 0) {
   3353                     loc = loopEndLoc;
   3354                 } else {
   3355                     loc+=3;  // Skips over operands of CTR_INIT
   3356                 }
   3357             }
   3358             break;
   3359 
   3360 
   3361         case URX_CTR_LOOP:
   3362         case URX_CTR_LOOP_NG:
   3363             // Loop ops.
   3364             //  The jump is conditional, backwards only.
   3365             break;
   3366 
   3367         case URX_LOOP_SR_I:
   3368         case URX_LOOP_DOT_I:
   3369         case URX_LOOP_C:
   3370             // More loop ops.  These state-save to themselves.
   3371             //   don't change the minimum match - could match nothing at all.
   3372             break;
   3373 
   3374 
   3375         case URX_LA_START:
   3376         case URX_LB_START:
   3377             {
   3378                 // Look-around.  Scan forward until the matching look-ahead end,
   3379                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
   3380                 //   it assumes that the look-ahead match might be zero-length.
   3381                 //   TODO:  Positive lookahead could recursively do the block, then continue
   3382                 //          with the longer of the block or the value coming in.  Ticket 6060
   3383                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
   3384                 for (;;) {
   3385                     loc++;
   3386                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3387                     if (URX_TYPE(op) == URX_LA_START) {
   3388                         // The boilerplate for look-ahead includes two LA_END insturctions,
   3389                         //    Depth will be decremented by each one when it is seen.
   3390                         depth += 2;
   3391                     }
   3392                     if (URX_TYPE(op) == URX_LB_START) {
   3393                         depth++;
   3394                     }
   3395                     if (URX_TYPE(op) == URX_LA_END) {
   3396                         depth--;
   3397                         if (depth == 0) {
   3398                             break;
   3399                         }
   3400                     }
   3401                     if (URX_TYPE(op)==URX_LBN_END) {
   3402                         depth--;
   3403                         if (depth == 0) {
   3404                             break;
   3405                         }
   3406                     }
   3407                     if (URX_TYPE(op) == URX_STATE_SAVE) {
   3408                         // Need this because neg lookahead blocks will FAIL to outside
   3409                         //   of the block.
   3410                         int32_t  jmpDest = URX_VAL(op);
   3411                         if (jmpDest > loc) {
   3412                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3413                                 forwardedLength.setElementAt(currentLen, jmpDest);
   3414                             }
   3415                         }
   3416                     }
   3417                     U_ASSERT(loc <= end);
   3418                 }
   3419             }
   3420             break;
   3421 
   3422         case URX_LA_END:
   3423         case URX_LB_CONT:
   3424         case URX_LB_END:
   3425         case URX_LBN_CONT:
   3426         case URX_LBN_END:
   3427             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
   3428             //   range being sized, which happens when measuring size of look-behind blocks.
   3429             break;
   3430 
   3431         default:
   3432             U_ASSERT(FALSE);
   3433             }
   3434 
   3435         }
   3436 
   3437     // We have finished walking through the ops.  Check whether some forward jump
   3438     //   propagated a shorter length to location end+1.
   3439     if (forwardedLength.elementAti(end+1) < currentLen) {
   3440         currentLen = forwardedLength.elementAti(end+1);
   3441         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   3442     }
   3443 
   3444     return currentLen;
   3445 }
   3446 
   3447 //------------------------------------------------------------------------------
   3448 //
   3449 //   maxMatchLength    Calculate the length of the longest string that could
   3450 //                     match the specified pattern.
   3451 //                     Length is in 16 bit code units, not code points.
   3452 //
   3453 //                     The calculated length may not be exact.  The returned
   3454 //                     value may be longer than the actual maximum; it must
   3455 //                     never be shorter.
   3456 //
   3457 //------------------------------------------------------------------------------
   3458 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
   3459     if (U_FAILURE(*fStatus)) {
   3460         return 0;
   3461     }
   3462     U_ASSERT(start <= end);
   3463     U_ASSERT(end < fRXPat->fCompiledPat->size());
   3464 
   3465 
   3466     int32_t    loc;
   3467     int32_t    op;
   3468     int32_t    opType;
   3469     int32_t    currentLen = 0;
   3470     UVector32  forwardedLength(end+1, *fStatus);
   3471     forwardedLength.setSize(end+1);
   3472 
   3473     for (loc=start; loc<=end; loc++) {
   3474         forwardedLength.setElementAt(0, loc);
   3475     }
   3476 
   3477     for (loc = start; loc<=end; loc++) {
   3478         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3479         opType = URX_TYPE(op);
   3480 
   3481         // The loop is advancing linearly through the pattern.
   3482         // If the op we are now at was the destination of a branch in the pattern,
   3483         // and that path has a longer maximum length than the current accumulated value,
   3484         // replace the current accumulated value.
   3485         if (forwardedLength.elementAti(loc) > currentLen) {
   3486             currentLen = forwardedLength.elementAti(loc);
   3487         }
   3488 
   3489         switch (opType) {
   3490             // Ops that don't change the total length matched
   3491         case URX_RESERVED_OP:
   3492         case URX_END:
   3493         case URX_STRING_LEN:
   3494         case URX_NOP:
   3495         case URX_START_CAPTURE:
   3496         case URX_END_CAPTURE:
   3497         case URX_BACKSLASH_B:
   3498         case URX_BACKSLASH_BU:
   3499         case URX_BACKSLASH_G:
   3500         case URX_BACKSLASH_Z:
   3501         case URX_CARET:
   3502         case URX_DOLLAR:
   3503         case URX_DOLLAR_M:
   3504         case URX_DOLLAR_D:
   3505         case URX_DOLLAR_MD:
   3506         case URX_RELOC_OPRND:
   3507         case URX_STO_INP_LOC:
   3508         case URX_CARET_M:
   3509         case URX_CARET_M_UNIX:
   3510 
   3511         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   3512         case URX_LD_SP:
   3513 
   3514         case URX_LB_END:
   3515         case URX_LB_CONT:
   3516         case URX_LBN_CONT:
   3517         case URX_LBN_END:
   3518             break;
   3519 
   3520 
   3521             // Ops that increase that cause an unbounded increase in the length
   3522             //   of a matched string, or that increase it a hard to characterize way.
   3523             //   Call the max length unbounded, and stop further checking.
   3524         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   3525         case URX_BACKREF_I:
   3526         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   3527             currentLen = INT32_MAX;
   3528             break;
   3529 
   3530 
   3531             // Ops that match a max of one character (possibly two 16 bit code units.)
   3532             //
   3533         case URX_STATIC_SETREF:
   3534         case URX_STAT_SETREF_N:
   3535         case URX_SETREF:
   3536         case URX_BACKSLASH_D:
   3537         case URX_BACKSLASH_H:
   3538         case URX_BACKSLASH_R:
   3539         case URX_BACKSLASH_V:
   3540         case URX_ONECHAR_I:
   3541         case URX_DOTANY_ALL:
   3542         case URX_DOTANY:
   3543         case URX_DOTANY_UNIX:
   3544             currentLen = safeIncrement(currentLen, 2);
   3545             break;
   3546 
   3547             // Single literal character.  Increase current max length by one or two,
   3548             //       depending on whether the char is in the supplementary range.
   3549         case URX_ONECHAR:
   3550             currentLen = safeIncrement(currentLen, 1);
   3551             if (URX_VAL(op) > 0x10000) {
   3552                 currentLen = safeIncrement(currentLen, 1);
   3553             }
   3554             break;
   3555 
   3556             // Jumps.
   3557             //
   3558         case URX_JMP:
   3559         case URX_JMPX:
   3560         case URX_JMP_SAV:
   3561         case URX_JMP_SAV_X:
   3562             {
   3563                 int32_t  jmpDest = URX_VAL(op);
   3564                 if (jmpDest < loc) {
   3565                     // Loop of some kind.  Max match length is unbounded.
   3566                     currentLen = INT32_MAX;
   3567                 } else {
   3568                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   3569                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
   3570                         forwardedLength.setElementAt(currentLen, jmpDest);
   3571                     }
   3572                     currentLen = 0;
   3573                 }
   3574             }
   3575             break;
   3576 
   3577         case URX_BACKTRACK:
   3578             // back-tracks are kind of like a branch, except that the max length was
   3579             //   propagated already, by the state save.
   3580             currentLen = forwardedLength.elementAti(loc+1);
   3581             break;
   3582 
   3583 
   3584         case URX_STATE_SAVE:
   3585             {
   3586                 // State Save, for forward jumps, propagate the current minimum.
   3587                 //               of the state save.
   3588                 //             For backwards jumps, they create a loop, maximum
   3589                 //               match length is unbounded.
   3590                 int32_t  jmpDest = URX_VAL(op);
   3591                 if (jmpDest > loc) {
   3592                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
   3593                         forwardedLength.setElementAt(currentLen, jmpDest);
   3594                     }
   3595                 } else {
   3596                     currentLen = INT32_MAX;
   3597                 }
   3598             }
   3599             break;
   3600 
   3601 
   3602 
   3603 
   3604         case URX_STRING:
   3605             {
   3606                 loc++;
   3607                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3608                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3609                 break;
   3610             }
   3611 
   3612         case URX_STRING_I:
   3613             // TODO:  This code assumes that any user string that matches will be no longer
   3614             //        than our compiled string, with case insensitive matching.
   3615             //        Our compiled string has been case-folded already.
   3616             //
   3617             //        Any matching user string will have no more code points than our
   3618             //        compiled (folded) string.  Folding may add code points, but
   3619             //        not remove them.
   3620             //
   3621             //        There is a potential problem if a supplemental code point
   3622             //        case-folds to a BMP code point.  In this case our compiled string
   3623             //        could be shorter (in code units) than a matching user string.
   3624             //
   3625             //        At this time (Unicode 6.1) there are no such characters, and this case
   3626             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
   3627             //        any problematic characters are added to Unicode.
   3628             //
   3629             //        If this happens, we can make a set of the BMP chars that the
   3630             //        troublesome supplementals fold to, scan our string, and bump the
   3631             //        currentLen one extra for each that is found.
   3632             //
   3633             {
   3634                 loc++;
   3635                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3636                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3637             }
   3638             break;
   3639 
   3640         case URX_CTR_INIT:
   3641         case URX_CTR_INIT_NG:
   3642             // For Loops, recursively call this function on the pattern for the loop body,
   3643             //   then multiply the result by the maximum loop count.
   3644             {
   3645                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
   3646                 if (loopEndLoc == loc+4) {
   3647                     // Loop has an empty body. No affect on max match length.
   3648                     // Continue processing with code after the loop end.
   3649                     loc = loopEndLoc;
   3650                     break;
   3651                 }
   3652 
   3653                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
   3654                 if (maxLoopCount == -1) {
   3655                     // Unbounded Loop. No upper bound on match length.
   3656                     currentLen = INT32_MAX;
   3657                     break;
   3658                 }
   3659 
   3660                 U_ASSERT(loopEndLoc >= loc+4);
   3661                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
   3662                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
   3663                 if (updatedLen >= INT32_MAX) {
   3664                     currentLen = INT32_MAX;
   3665                     break;
   3666                 }
   3667                 currentLen = (int32_t)updatedLen;
   3668                 loc = loopEndLoc;
   3669                 break;
   3670             }
   3671 
   3672         case URX_CTR_LOOP:
   3673         case URX_CTR_LOOP_NG:
   3674             // These opcodes will be skipped over by code for URX_CRT_INIT.
   3675             // We shouldn't encounter them here.
   3676             U_ASSERT(FALSE);
   3677             break;
   3678 
   3679         case URX_LOOP_SR_I:
   3680         case URX_LOOP_DOT_I:
   3681         case URX_LOOP_C:
   3682             // For anything to do with loops, make the match length unbounded.
   3683             currentLen = INT32_MAX;
   3684             break;
   3685 
   3686 
   3687 
   3688         case URX_LA_START:
   3689         case URX_LA_END:
   3690             // Look-ahead.  Just ignore, treat the look-ahead block as if
   3691             // it were normal pattern.  Gives a too-long match length,
   3692             //  but good enough for now.
   3693             break;
   3694 
   3695             // End of look-ahead ops should always be consumed by the processing at
   3696             //  the URX_LA_START op.
   3697             // U_ASSERT(FALSE);
   3698             // break;
   3699 
   3700         case URX_LB_START:
   3701             {
   3702                 // Look-behind.  Scan forward until the matching look-around end,
   3703                 //   without processing the look-behind block.
   3704                 int32_t  depth = 0;
   3705                 for (;;) {
   3706                     loc++;
   3707                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3708                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
   3709                         depth++;
   3710                     }
   3711                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
   3712                         if (depth == 0) {
   3713                             break;
   3714                         }
   3715                         depth--;
   3716                     }
   3717                     U_ASSERT(loc < end);
   3718                 }
   3719             }
   3720             break;
   3721 
   3722         default:
   3723             U_ASSERT(FALSE);
   3724         }
   3725 
   3726 
   3727         if (currentLen == INT32_MAX) {
   3728             //  The maximum length is unbounded.
   3729             //  Stop further processing of the pattern.
   3730             break;
   3731         }
   3732 
   3733     }
   3734     return currentLen;
   3735 
   3736 }
   3737 
   3738 
   3739 //------------------------------------------------------------------------------
   3740 //
   3741 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
   3742 //                Extra NOPs are inserted for some constructs during the initial
   3743 //                code generation to provide locations that may be patched later.
   3744 //                Many end up unneeded, and are removed by this function.
   3745 //
   3746 //                In order to minimize the number of passes through the pattern,
   3747 //                back-reference fixup is also performed here (adjusting
   3748 //                back-reference operands to point to the correct frame offsets).
   3749 //
   3750 //------------------------------------------------------------------------------
   3751 void RegexCompile::stripNOPs() {
   3752 
   3753     if (U_FAILURE(*fStatus)) {
   3754         return;
   3755     }
   3756 
   3757     int32_t    end = fRXPat->fCompiledPat->size();
   3758     UVector32  deltas(end, *fStatus);
   3759 
   3760     // Make a first pass over the code, computing the amount that things
   3761     //   will be offset at each location in the original code.
   3762     int32_t   loc;
   3763     int32_t   d = 0;
   3764     for (loc=0; loc<end; loc++) {
   3765         deltas.addElement(d, *fStatus);
   3766         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3767         if (URX_TYPE(op) == URX_NOP) {
   3768             d++;
   3769         }
   3770     }
   3771 
   3772     UnicodeString caseStringBuffer;
   3773 
   3774     // Make a second pass over the code, removing the NOPs by moving following
   3775     //  code up, and patching operands that refer to code locations that
   3776     //  are being moved.  The array of offsets from the first step is used
   3777     //  to compute the new operand values.
   3778     int32_t src;
   3779     int32_t dst = 0;
   3780     for (src=0; src<end; src++) {
   3781         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
   3782         int32_t opType = URX_TYPE(op);
   3783         switch (opType) {
   3784         case URX_NOP:
   3785             break;
   3786 
   3787         case URX_STATE_SAVE:
   3788         case URX_JMP:
   3789         case URX_CTR_LOOP:
   3790         case URX_CTR_LOOP_NG:
   3791         case URX_RELOC_OPRND:
   3792         case URX_JMPX:
   3793         case URX_JMP_SAV:
   3794         case URX_JMP_SAV_X:
   3795             // These are instructions with operands that refer to code locations.
   3796             {
   3797                 int32_t  operandAddress = URX_VAL(op);
   3798                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
   3799                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
   3800                 op = buildOp(opType, fixedOperandAddress);
   3801                 fRXPat->fCompiledPat->setElementAt(op, dst);
   3802                 dst++;
   3803                 break;
   3804             }
   3805 
   3806         case URX_BACKREF:
   3807         case URX_BACKREF_I:
   3808             {
   3809                 int32_t where = URX_VAL(op);
   3810                 if (where > fRXPat->fGroupMap->size()) {
   3811                     error(U_REGEX_INVALID_BACK_REF);
   3812                     break;
   3813                 }
   3814                 where = fRXPat->fGroupMap->elementAti(where-1);
   3815                 op    = buildOp(opType, where);
   3816                 fRXPat->fCompiledPat->setElementAt(op, dst);
   3817                 dst++;
   3818 
   3819                 fRXPat->fNeedsAltInput = TRUE;
   3820                 break;
   3821             }
   3822         case URX_RESERVED_OP:
   3823         case URX_RESERVED_OP_N:
   3824         case URX_BACKTRACK:
   3825         case URX_END:
   3826         case URX_ONECHAR:
   3827         case URX_STRING:
   3828         case URX_STRING_LEN:
   3829         case URX_START_CAPTURE:
   3830         case URX_END_CAPTURE:
   3831         case URX_STATIC_SETREF:
   3832         case URX_STAT_SETREF_N:
   3833         case URX_SETREF:
   3834         case URX_DOTANY:
   3835         case URX_FAIL:
   3836         case URX_BACKSLASH_B:
   3837         case URX_BACKSLASH_BU:
   3838         case URX_BACKSLASH_G:
   3839         case URX_BACKSLASH_X:
   3840         case URX_BACKSLASH_Z:
   3841         case URX_DOTANY_ALL:
   3842         case URX_BACKSLASH_D:
   3843         case URX_CARET:
   3844         case URX_DOLLAR:
   3845         case URX_CTR_INIT:
   3846         case URX_CTR_INIT_NG:
   3847         case URX_DOTANY_UNIX:
   3848         case URX_STO_SP:
   3849         case URX_LD_SP:
   3850         case URX_STO_INP_LOC:
   3851         case URX_LA_START:
   3852         case URX_LA_END:
   3853         case URX_ONECHAR_I:
   3854         case URX_STRING_I:
   3855         case URX_DOLLAR_M:
   3856         case URX_CARET_M:
   3857         case URX_CARET_M_UNIX:
   3858         case URX_LB_START:
   3859         case URX_LB_CONT:
   3860         case URX_LB_END:
   3861         case URX_LBN_CONT:
   3862         case URX_LBN_END:
   3863         case URX_LOOP_SR_I:
   3864         case URX_LOOP_DOT_I:
   3865         case URX_LOOP_C:
   3866         case URX_DOLLAR_D:
   3867         case URX_DOLLAR_MD:
   3868         case URX_BACKSLASH_H:
   3869         case URX_BACKSLASH_R:
   3870         case URX_BACKSLASH_V:
   3871             // These instructions are unaltered by the relocation.
   3872             fRXPat->fCompiledPat->setElementAt(op, dst);
   3873             dst++;
   3874             break;
   3875 
   3876         default:
   3877             // Some op is unaccounted for.
   3878             U_ASSERT(FALSE);
   3879             error(U_REGEX_INTERNAL_ERROR);
   3880         }
   3881     }
   3882 
   3883     fRXPat->fCompiledPat->setSize(dst);
   3884 }
   3885 
   3886 
   3887 
   3888 
   3889 //------------------------------------------------------------------------------
   3890 //
   3891 //  Error         Report a rule parse error.
   3892 //                Only report it if no previous error has been recorded.
   3893 //
   3894 //------------------------------------------------------------------------------
   3895 void RegexCompile::error(UErrorCode e) {
   3896     if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
   3897         *fStatus = e;
   3898         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
   3899         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
   3900         // int64_t. If the values of the latter are out of range for the former,
   3901         // set them to the appropriate "field not supported" values.
   3902         if (fLineNum > 0x7FFFFFFF) {
   3903             fParseErr->line   = 0;
   3904             fParseErr->offset = -1;
   3905         } else if (fCharNum > 0x7FFFFFFF) {
   3906             fParseErr->line   = (int32_t)fLineNum;
   3907             fParseErr->offset = -1;
   3908         } else {
   3909             fParseErr->line   = (int32_t)fLineNum;
   3910             fParseErr->offset = (int32_t)fCharNum;
   3911         }
   3912 
   3913         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
   3914 
   3915         // Fill in the context.
   3916         //   Note: extractBetween() pins supplied indicies to the string bounds.
   3917         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
   3918         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
   3919         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
   3920         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
   3921     }
   3922 }
   3923 
   3924 
   3925 //
   3926 //  Assorted Unicode character constants.
   3927 //     Numeric because there is no portable way to enter them as literals.
   3928 //     (Think EBCDIC).
   3929 //
   3930 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
   3931 static const UChar      chLF        = 0x0a;      // Line Feed
   3932 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
   3933 static const UChar      chDigit0    = 0x30;      // '0'
   3934 static const UChar      chDigit7    = 0x37;      // '9'
   3935 static const UChar      chColon     = 0x3A;      // ':'
   3936 static const UChar      chE         = 0x45;      // 'E'
   3937 static const UChar      chQ         = 0x51;      // 'Q'
   3938 //static const UChar      chN         = 0x4E;      // 'N'
   3939 static const UChar      chP         = 0x50;      // 'P'
   3940 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
   3941 //static const UChar      chLBracket  = 0x5b;      // '['
   3942 static const UChar      chRBracket  = 0x5d;      // ']'
   3943 static const UChar      chUp        = 0x5e;      // '^'
   3944 static const UChar      chLowerP    = 0x70;
   3945 static const UChar      chLBrace    = 0x7b;      // '{'
   3946 static const UChar      chRBrace    = 0x7d;      // '}'
   3947 static const UChar      chNEL       = 0x85;      //    NEL newline variant
   3948 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
   3949 
   3950 
   3951 //------------------------------------------------------------------------------
   3952 //
   3953 //  nextCharLL    Low Level Next Char from the regex pattern.
   3954 //                Get a char from the string, keep track of input position
   3955 //                     for error reporting.
   3956 //
   3957 //------------------------------------------------------------------------------
   3958 UChar32  RegexCompile::nextCharLL() {
   3959     UChar32       ch;
   3960 
   3961     if (fPeekChar != -1) {
   3962         ch = fPeekChar;
   3963         fPeekChar = -1;
   3964         return ch;
   3965     }
   3966 
   3967     // assume we're already in the right place
   3968     ch = UTEXT_NEXT32(fRXPat->fPattern);
   3969     if (ch == U_SENTINEL) {
   3970         return ch;
   3971     }
   3972 
   3973     if (ch == chCR ||
   3974         ch == chNEL ||
   3975         ch == chLS   ||
   3976         (ch == chLF && fLastChar != chCR)) {
   3977         // Character is starting a new line.  Bump up the line number, and
   3978         //  reset the column to 0.
   3979         fLineNum++;
   3980         fCharNum=0;
   3981     }
   3982     else {
   3983         // Character is not starting a new line.  Except in the case of a
   3984         //   LF following a CR, increment the column position.
   3985         if (ch != chLF) {
   3986             fCharNum++;
   3987         }
   3988     }
   3989     fLastChar = ch;
   3990     return ch;
   3991 }
   3992 
   3993 //------------------------------------------------------------------------------
   3994 //
   3995 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
   3996 //                 character without actually getting it.
   3997 //
   3998 //------------------------------------------------------------------------------
   3999 UChar32  RegexCompile::peekCharLL() {
   4000     if (fPeekChar == -1) {
   4001         fPeekChar = nextCharLL();
   4002     }
   4003     return fPeekChar;
   4004 }
   4005 
   4006 
   4007 //------------------------------------------------------------------------------
   4008 //
   4009 //   nextChar     for pattern scanning.  At this level, we handle stripping
   4010 //                out comments and processing some backslash character escapes.
   4011 //                The rest of the pattern grammar is handled at the next level up.
   4012 //
   4013 //------------------------------------------------------------------------------
   4014 void RegexCompile::nextChar(RegexPatternChar &c) {
   4015 
   4016     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4017     c.fChar    = nextCharLL();
   4018     c.fQuoted  = FALSE;
   4019 
   4020     if (fQuoteMode) {
   4021         c.fQuoted = TRUE;
   4022         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
   4023             c.fChar == (UChar32)-1) {
   4024             fQuoteMode = FALSE;  //  Exit quote mode,
   4025             nextCharLL();        // discard the E
   4026             nextChar(c);         // recurse to get the real next char
   4027         }
   4028     }
   4029     else if (fInBackslashQuote) {
   4030         // The current character immediately follows a '\'
   4031         // Don't check for any further escapes, just return it as-is.
   4032         // Don't set c.fQuoted, because that would prevent the state machine from
   4033         //    dispatching on the character.
   4034         fInBackslashQuote = FALSE;
   4035     }
   4036     else
   4037     {
   4038         // We are not in a \Q quoted region \E of the source.
   4039         //
   4040         if (fModeFlags & UREGEX_COMMENTS) {
   4041             //
   4042             // We are in free-spacing and comments mode.
   4043             //  Scan through any white space and comments, until we
   4044             //  reach a significant character or the end of inut.
   4045             for (;;) {
   4046                 if (c.fChar == (UChar32)-1) {
   4047                     break;     // End of Input
   4048                 }
   4049                 if  (c.fChar == chPound && fEOLComments == TRUE) {
   4050                     // Start of a comment.  Consume the rest of it, until EOF or a new line
   4051                     for (;;) {
   4052                         c.fChar = nextCharLL();
   4053                         if (c.fChar == (UChar32)-1 ||  // EOF
   4054                             c.fChar == chCR        ||
   4055                             c.fChar == chLF        ||
   4056                             c.fChar == chNEL       ||
   4057                             c.fChar == chLS)       {
   4058                             break;
   4059                         }
   4060                     }
   4061                 }
   4062                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
   4063                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
   4064                     break;
   4065                 }
   4066                 c.fChar = nextCharLL();
   4067             }
   4068         }
   4069 
   4070         //
   4071         //  check for backslash escaped characters.
   4072         //
   4073         if (c.fChar == chBackSlash) {
   4074             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4075             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
   4076                 //
   4077                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
   4078                 //   Includes \uxxxx, \n, \r, many others.
   4079                 //   Return the single equivalent character.
   4080                 //
   4081                 nextCharLL();                 // get & discard the peeked char.
   4082                 c.fQuoted = TRUE;
   4083 
   4084                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
   4085                     int32_t endIndex = (int32_t)pos;
   4086                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
   4087 
   4088                     if (endIndex == pos) {
   4089                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4090                     }
   4091                     fCharNum += endIndex - pos;
   4092                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
   4093                 } else {
   4094                     int32_t offset = 0;
   4095                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
   4096 
   4097                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
   4098                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
   4099 
   4100                     if (offset == 0) {
   4101                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4102                     } else if (context.lastOffset == offset) {
   4103                         UTEXT_PREVIOUS32(fRXPat->fPattern);
   4104                     } else if (context.lastOffset != offset-1) {
   4105                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
   4106                     }
   4107                     fCharNum += offset;
   4108                 }
   4109             }
   4110             else if (peekCharLL() == chDigit0) {
   4111                 //  Octal Escape, using Java Regexp Conventions
   4112                 //    which are \0 followed by 1-3 octal digits.
   4113                 //    Different from ICU Unescape handling of Octal, which does not
   4114                 //    require the leading 0.
   4115                 //  Java also has the convention of only consuming 2 octal digits if
   4116                 //    the three digit number would be > 0xff
   4117                 //
   4118                 c.fChar = 0;
   4119                 nextCharLL();    // Consume the initial 0.
   4120                 int index;
   4121                 for (index=0; index<3; index++) {
   4122                     int32_t ch = peekCharLL();
   4123                     if (ch<chDigit0 || ch>chDigit7) {
   4124                         if (index==0) {
   4125                            // \0 is not followed by any octal digits.
   4126                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4127                         }
   4128                         break;
   4129                     }
   4130                     c.fChar <<= 3;
   4131                     c.fChar += ch&7;
   4132                     if (c.fChar <= 255) {
   4133                         nextCharLL();
   4134                     } else {
   4135                         // The last digit made the number too big.  Forget we saw it.
   4136                         c.fChar >>= 3;
   4137                     }
   4138                 }
   4139                 c.fQuoted = TRUE;
   4140             }
   4141             else if (peekCharLL() == chQ) {
   4142                 //  "\Q"  enter quote mode, which will continue until "\E"
   4143                 fQuoteMode = TRUE;
   4144                 nextCharLL();       // discard the 'Q'.
   4145                 nextChar(c);        // recurse to get the real next char.
   4146             }
   4147             else
   4148             {
   4149                 // We are in a '\' escape that will be handled by the state table scanner.
   4150                 // Just return the backslash, but remember that the following char is to
   4151                 //  be taken literally.
   4152                 fInBackslashQuote = TRUE;
   4153             }
   4154         }
   4155     }
   4156 
   4157     // re-enable # to end-of-line comments, in case they were disabled.
   4158     // They are disabled by the parser upon seeing '(?', but this lasts for
   4159     //  the fetching of the next character only.
   4160     fEOLComments = TRUE;
   4161 
   4162     // putc(c.fChar, stdout);
   4163 }
   4164 
   4165 
   4166 
   4167 //------------------------------------------------------------------------------
   4168 //
   4169 //  scanNamedChar
   4170 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
   4171 //
   4172 //             The scan position will be at the 'N'.  On return
   4173 //             the scan position should be just after the '}'
   4174 //
   4175 //             Return the UChar32
   4176 //
   4177 //------------------------------------------------------------------------------
   4178 UChar32  RegexCompile::scanNamedChar() {
   4179     if (U_FAILURE(*fStatus)) {
   4180         return 0;
   4181     }
   4182 
   4183     nextChar(fC);
   4184     if (fC.fChar != chLBrace) {
   4185         error(U_REGEX_PROPERTY_SYNTAX);
   4186         return 0;
   4187     }
   4188 
   4189     UnicodeString  charName;
   4190     for (;;) {
   4191         nextChar(fC);
   4192         if (fC.fChar == chRBrace) {
   4193             break;
   4194         }
   4195         if (fC.fChar == -1) {
   4196             error(U_REGEX_PROPERTY_SYNTAX);
   4197             return 0;
   4198         }
   4199         charName.append(fC.fChar);
   4200     }
   4201 
   4202     char name[100];
   4203     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
   4204          (uint32_t)charName.length()>=sizeof(name)) {
   4205         // All Unicode character names have only invariant characters.
   4206         // The API to get a character, given a name, accepts only char *, forcing us to convert,
   4207         //   which requires this error check
   4208         error(U_REGEX_PROPERTY_SYNTAX);
   4209         return 0;
   4210     }
   4211     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
   4212 
   4213     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
   4214     if (U_FAILURE(*fStatus)) {
   4215         error(U_REGEX_PROPERTY_SYNTAX);
   4216     }
   4217 
   4218     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
   4219     return theChar;
   4220 }
   4221 
   4222 //------------------------------------------------------------------------------
   4223 //
   4224 //  scanProp   Construct a UnicodeSet from the text at the current scan
   4225 //             position, which will be of the form \p{whaterver}
   4226 //
   4227 //             The scan position will be at the 'p' or 'P'.  On return
   4228 //             the scan position should be just after the '}'
   4229 //
   4230 //             Return a UnicodeSet, constructed from the \P pattern,
   4231 //             or NULL if the pattern is invalid.
   4232 //
   4233 //------------------------------------------------------------------------------
   4234 UnicodeSet *RegexCompile::scanProp() {
   4235     UnicodeSet    *uset = NULL;
   4236 
   4237     if (U_FAILURE(*fStatus)) {
   4238         return NULL;
   4239     }
   4240     (void)chLowerP;   // Suppress compiler unused variable warning.
   4241     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
   4242     UBool negated = (fC.fChar == chP);
   4243 
   4244     UnicodeString propertyName;
   4245     nextChar(fC);
   4246     if (fC.fChar != chLBrace) {
   4247         error(U_REGEX_PROPERTY_SYNTAX);
   4248         return NULL;
   4249     }
   4250     for (;;) {
   4251         nextChar(fC);
   4252         if (fC.fChar == chRBrace) {
   4253             break;
   4254         }
   4255         if (fC.fChar == -1) {
   4256             // Hit the end of the input string without finding the closing '}'
   4257             error(U_REGEX_PROPERTY_SYNTAX);
   4258             return NULL;
   4259         }
   4260         propertyName.append(fC.fChar);
   4261     }
   4262     uset = createSetForProperty(propertyName, negated);
   4263     nextChar(fC);    // Move input scan to position following the closing '}'
   4264     return uset;
   4265 }
   4266 
   4267 //------------------------------------------------------------------------------
   4268 //
   4269 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
   4270 //             position, which is expected be of the form [:property expression:]
   4271 //
   4272 //             The scan position will be at the opening ':'.  On return
   4273 //             the scan position must be on the closing ']'
   4274 //
   4275 //             Return a UnicodeSet constructed from the pattern,
   4276 //             or NULL if this is not a valid POSIX-style set expression.
   4277 //             If not a property expression, restore the initial scan position
   4278 //                (to the opening ':')
   4279 //
   4280 //               Note:  the opening '[:' is not sufficient to guarantee that
   4281 //                      this is a [:property:] expression.
   4282 //                      [:'+=,] is a perfectly good ordinary set expression that
   4283 //                              happens to include ':' as one of its characters.
   4284 //
   4285 //------------------------------------------------------------------------------
   4286 UnicodeSet *RegexCompile::scanPosixProp() {
   4287     UnicodeSet    *uset = NULL;
   4288 
   4289     if (U_FAILURE(*fStatus)) {
   4290         return NULL;
   4291     }
   4292 
   4293     U_ASSERT(fC.fChar == chColon);
   4294 
   4295     // Save the scanner state.
   4296     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
   4297     int64_t     savedScanIndex        = fScanIndex;
   4298     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4299     UBool       savedQuoteMode        = fQuoteMode;
   4300     UBool       savedInBackslashQuote = fInBackslashQuote;
   4301     UBool       savedEOLComments      = fEOLComments;
   4302     int64_t     savedLineNum          = fLineNum;
   4303     int64_t     savedCharNum          = fCharNum;
   4304     UChar32     savedLastChar         = fLastChar;
   4305     UChar32     savedPeekChar         = fPeekChar;
   4306     RegexPatternChar savedfC          = fC;
   4307 
   4308     // Scan for a closing ].   A little tricky because there are some perverse
   4309     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
   4310     //   ending on the second closing ].
   4311 
   4312     UnicodeString propName;
   4313     UBool         negated  = FALSE;
   4314 
   4315     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
   4316     nextChar(fC);
   4317     if (fC.fChar == chUp) {
   4318        negated = TRUE;
   4319        nextChar(fC);
   4320     }
   4321 
   4322     // Scan for the closing ":]", collecting the property name along the way.
   4323     UBool  sawPropSetTerminator = FALSE;
   4324     for (;;) {
   4325         propName.append(fC.fChar);
   4326         nextChar(fC);
   4327         if (fC.fQuoted || fC.fChar == -1) {
   4328             // Escaped characters or end of input - either says this isn't a [:Property:]
   4329             break;
   4330         }
   4331         if (fC.fChar == chColon) {
   4332             nextChar(fC);
   4333             if (fC.fChar == chRBracket) {
   4334                 sawPropSetTerminator = TRUE;
   4335             }
   4336             break;
   4337         }
   4338     }
   4339 
   4340     if (sawPropSetTerminator) {
   4341         uset = createSetForProperty(propName, negated);
   4342     }
   4343     else
   4344     {
   4345         // No closing ":]".
   4346         //  Restore the original scan position.
   4347         //  The main scanner will retry the input as a normal set expression,
   4348         //    not a [:Property:] expression.
   4349         fScanIndex        = savedScanIndex;
   4350         fQuoteMode        = savedQuoteMode;
   4351         fInBackslashQuote = savedInBackslashQuote;
   4352         fEOLComments      = savedEOLComments;
   4353         fLineNum          = savedLineNum;
   4354         fCharNum          = savedCharNum;
   4355         fLastChar         = savedLastChar;
   4356         fPeekChar         = savedPeekChar;
   4357         fC                = savedfC;
   4358         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
   4359     }
   4360     return uset;
   4361 }
   4362 
   4363 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
   4364     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
   4365     addCategory(set, U_GC_CF_MASK, ec);
   4366 }
   4367 
   4368 //
   4369 //  Create a Unicode Set from a Unicode Property expression.
   4370 //     This is common code underlying both \p{...} ane [:...:] expressions.
   4371 //     Includes trying the Java "properties" that aren't supported as
   4372 //     normal ICU UnicodeSet properties
   4373 //
   4374 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
   4375 
   4376     if (U_FAILURE(*fStatus)) {
   4377         return nullptr;
   4378     }
   4379     LocalPointer<UnicodeSet> set;
   4380     UErrorCode status = U_ZERO_ERROR;
   4381 
   4382     do {      // non-loop, exists to allow breaks from the block.
   4383         //
   4384         //  First try the property as we received it
   4385         //
   4386         UnicodeString   setExpr;
   4387         uint32_t        usetFlags = 0;
   4388         setExpr.append(u"[\\p{", -1);
   4389         setExpr.append(propName);
   4390         setExpr.append(u"}]", -1);
   4391         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   4392             usetFlags |= USET_CASE_INSENSITIVE;
   4393         }
   4394         set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
   4395         if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
   4396             break;
   4397         }
   4398 
   4399         //
   4400         //  The incoming property wasn't directly recognized by ICU.
   4401 
   4402         //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
   4403         //     Java accepts 'word' with mixed case.
   4404         //     Java accepts 'all' only in all lower case.
   4405 
   4406         status = U_ZERO_ERROR;
   4407         if (propName.caseCompare(u"word", -1, 0) == 0) {
   4408             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
   4409             break;
   4410         }
   4411         if (propName.compare(u"all", -1) == 0) {
   4412             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
   4413             break;
   4414         }
   4415 
   4416 
   4417         //    Do Java InBlock expressions
   4418         //
   4419         UnicodeString mPropName = propName;
   4420         if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
   4421             status = U_ZERO_ERROR;
   4422             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
   4423             if (U_FAILURE(status)) {
   4424                 break;
   4425             }
   4426             UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
   4427             set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
   4428             break;
   4429         }
   4430 
   4431         //  Check for the Java form "IsBooleanPropertyValue", which we will recast
   4432         //  as "BooleanPropertyValue". The property value can be either a
   4433         //  a General Category or a Script Name.
   4434 
   4435         if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
   4436             mPropName.remove(0, 2);      // Strip the "Is"
   4437             if (mPropName.indexOf(u'=') >= 0) {
   4438                 // Reject any "Is..." property expression containing an '=', that is,
   4439                 // any non-binary property expression.
   4440                 status = U_REGEX_PROPERTY_SYNTAX;
   4441                 break;
   4442             }
   4443 
   4444             if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
   4445                 mPropName.setTo(u"unassigned", -1);
   4446                 negated = !negated;
   4447             } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
   4448                 mPropName.setTo(u"Titlecase_Letter", -1);
   4449             }
   4450 
   4451             mPropName.insert(0, u"[\\p{", -1);
   4452             mPropName.append(u"}]", -1);
   4453             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
   4454 
   4455             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
   4456                 set->closeOver(USET_CASE_INSENSITIVE);
   4457             }
   4458             break;
   4459 
   4460         }
   4461 
   4462         if (propName.startsWith(u"java", -1)) {
   4463             status = U_ZERO_ERROR;
   4464             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
   4465             if (U_FAILURE(status)) {
   4466                 break;
   4467             }
   4468             //
   4469             //  Try the various Java specific properties.
   4470             //   These all begin with "java"
   4471             //
   4472             if (propName.compare(u"javaDefined", -1) == 0) {
   4473                 addCategory(set.getAlias(), U_GC_CN_MASK, status);
   4474                 set->complement();
   4475             }
   4476             else if (propName.compare(u"javaDigit", -1) == 0) {
   4477                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
   4478             }
   4479             else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
   4480                 addIdentifierIgnorable(set.getAlias(), status);
   4481             }
   4482             else if (propName.compare(u"javaISOControl", -1) == 0) {
   4483                 set->add(0, 0x1F).add(0x7F, 0x9F);
   4484             }
   4485             else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
   4486                 addCategory(set.getAlias(), U_GC_L_MASK, status);
   4487                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
   4488                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
   4489                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
   4490                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
   4491                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
   4492                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
   4493                 addIdentifierIgnorable(set.getAlias(), status);
   4494             }
   4495             else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
   4496                 addCategory(set.getAlias(), U_GC_L_MASK, status);
   4497                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
   4498                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
   4499                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
   4500             }
   4501             else if (propName.compare(u"javaLetter", -1) == 0) {
   4502                 addCategory(set.getAlias(), U_GC_L_MASK, status);
   4503             }
   4504             else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
   4505                 addCategory(set.getAlias(), U_GC_L_MASK, status);
   4506                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
   4507             }
   4508             else if (propName.compare(u"javaLowerCase", -1) == 0) {
   4509                 addCategory(set.getAlias(), U_GC_LL_MASK, status);
   4510             }
   4511             else if (propName.compare(u"javaMirrored", -1) == 0) {
   4512                 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
   4513             }
   4514             else if (propName.compare(u"javaSpaceChar", -1) == 0) {
   4515                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
   4516             }
   4517             else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
   4518                 set->add(0x10000, UnicodeSet::MAX_VALUE);
   4519             }
   4520             else if (propName.compare(u"javaTitleCase", -1) == 0) {
   4521                 addCategory(set.getAlias(), U_GC_LT_MASK, status);
   4522             }
   4523             else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
   4524                 addCategory(set.getAlias(), U_GC_L_MASK, status);
   4525                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
   4526             }
   4527             else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
   4528                 addCategory(set.getAlias(), U_GC_L_MASK, status);
   4529                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
   4530                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
   4531                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
   4532                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
   4533                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
   4534                 addIdentifierIgnorable(set.getAlias(), status);
   4535             }
   4536             else if (propName.compare(u"javaUpperCase", -1) == 0) {
   4537                 addCategory(set.getAlias(), U_GC_LU_MASK, status);
   4538             }
   4539             else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
   4540                 set->add(0, UnicodeSet::MAX_VALUE);
   4541             }
   4542             else if (propName.compare(u"javaWhitespace", -1) == 0) {
   4543                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
   4544                 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
   4545                 set->add(9, 0x0d).add(0x1c, 0x1f);
   4546             } else {
   4547                 status = U_REGEX_PROPERTY_SYNTAX;
   4548             }
   4549 
   4550             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
   4551                 set->closeOver(USET_CASE_INSENSITIVE);
   4552             }
   4553             break;
   4554         }
   4555 
   4556         // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
   4557         // extensions matched it.
   4558         status = U_REGEX_PROPERTY_SYNTAX;
   4559     } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
   4560 
   4561     if (U_SUCCESS(status)) {
   4562         U_ASSERT(set.isValid());
   4563         if (negated) {
   4564             set->complement();
   4565         }
   4566         return set.orphan();
   4567     } else {
   4568         if (status == U_ILLEGAL_ARGUMENT_ERROR) {
   4569             status = U_REGEX_PROPERTY_SYNTAX;
   4570         }
   4571         error(status);
   4572         return nullptr;
   4573     }
   4574 }
   4575 
   4576 
   4577 //
   4578 //  SetEval   Part of the evaluation of [set expressions].
   4579 //            Perform any pending (stacked) operations with precedence
   4580 //            equal or greater to that of the next operator encountered
   4581 //            in the expression.
   4582 //
   4583 void RegexCompile::setEval(int32_t nextOp) {
   4584     UnicodeSet *rightOperand = NULL;
   4585     UnicodeSet *leftOperand  = NULL;
   4586     for (;;) {
   4587         U_ASSERT(fSetOpStack.empty()==FALSE);
   4588         int32_t pendingSetOperation = fSetOpStack.peeki();
   4589         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
   4590             break;
   4591         }
   4592         fSetOpStack.popi();
   4593         U_ASSERT(fSetStack.empty() == FALSE);
   4594         rightOperand = (UnicodeSet *)fSetStack.peek();
   4595         switch (pendingSetOperation) {
   4596             case setNegation:
   4597                 rightOperand->complement();
   4598                 break;
   4599             case setCaseClose:
   4600                 // TODO: need a simple close function.  Ticket 6065
   4601                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
   4602                 rightOperand->removeAllStrings();
   4603                 break;
   4604             case setDifference1:
   4605             case setDifference2:
   4606                 fSetStack.pop();
   4607                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4608                 leftOperand->removeAll(*rightOperand);
   4609                 delete rightOperand;
   4610                 break;
   4611             case setIntersection1:
   4612             case setIntersection2:
   4613                 fSetStack.pop();
   4614                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4615                 leftOperand->retainAll(*rightOperand);
   4616                 delete rightOperand;
   4617                 break;
   4618             case setUnion:
   4619                 fSetStack.pop();
   4620                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4621                 leftOperand->addAll(*rightOperand);
   4622                 delete rightOperand;
   4623                 break;
   4624             default:
   4625                 U_ASSERT(FALSE);
   4626                 break;
   4627             }
   4628         }
   4629     }
   4630 
   4631 void RegexCompile::setPushOp(int32_t op) {
   4632     setEval(op);
   4633     fSetOpStack.push(op, *fStatus);
   4634     fSetStack.push(new UnicodeSet(), *fStatus);
   4635 }
   4636 
   4637 U_NAMESPACE_END
   4638 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   4639 
   4640