Home | History | Annotate | Download | only in i18n
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  file:  regexcmp.cpp
      5 //
      6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains the ICU regular expression compiler, which is responsible
     10 //  for processing a regular expression pattern into the compiled form that
     11 //  is used by the match finding engine.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     17 
     18 #include "unicode/ustring.h"
     19 #include "unicode/unistr.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/uchriter.h"
     23 #include "unicode/parsepos.h"
     24 #include "unicode/parseerr.h"
     25 #include "unicode/regex.h"
     26 #include "unicode/utf.h"
     27 #include "unicode/utf16.h"
     28 #include "patternprops.h"
     29 #include "putilimp.h"
     30 #include "cmemory.h"
     31 #include "cstring.h"
     32 #include "uvectr32.h"
     33 #include "uvectr64.h"
     34 #include "uassert.h"
     35 #include "uinvchar.h"
     36 
     37 #include "regeximp.h"
     38 #include "regexcst.h"   // Contains state table for the regex pattern parser.
     39                         //   generated by a Perl script.
     40 #include "regexcmp.h"
     41 #include "regexst.h"
     42 #include "regextxt.h"
     43 
     44 
     45 
     46 U_NAMESPACE_BEGIN
     47 
     48 
     49 //------------------------------------------------------------------------------
     50 //
     51 //  Constructor.
     52 //
     53 //------------------------------------------------------------------------------
     54 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
     55    fParenStack(status), fSetStack(status), fSetOpStack(status)
     56 {
     57     // Lazy init of all shared global sets (needed for init()'s empty text)
     58     RegexStaticSets::initGlobals(&status);
     59 
     60     fStatus           = &status;
     61 
     62     fRXPat            = rxp;
     63     fScanIndex        = 0;
     64     fLastChar         = -1;
     65     fPeekChar         = -1;
     66     fLineNum          = 1;
     67     fCharNum          = 0;
     68     fQuoteMode        = FALSE;
     69     fInBackslashQuote = FALSE;
     70     fModeFlags        = fRXPat->fFlags | 0x80000000;
     71     fEOLComments      = TRUE;
     72 
     73     fMatchOpenParen   = -1;
     74     fMatchCloseParen  = -1;
     75     fCaptureName      = NULL;
     76     fLastSetLiteral   = U_SENTINEL;
     77 
     78     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
     79         status = rxp->fDeferredStatus;
     80     }
     81 }
     82 
     83 static const UChar      chAmp       = 0x26;      // '&'
     84 static const UChar      chDash      = 0x2d;      // '-'
     85 
     86 
     87 //------------------------------------------------------------------------------
     88 //
     89 //  Destructor
     90 //
     91 //------------------------------------------------------------------------------
     92 RegexCompile::~RegexCompile() {
     93     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
     94                                  //   compilation stops with a syntax error.
     95 }
     96 
     97 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
     98     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
     99 }
    100 
    101 //------------------------------------------------------------------------------
    102 //
    103 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
    104 //                           The state tables are hand-written in the file regexcst.txt,
    105 //                           and converted to the form used here by a perl
    106 //                           script regexcst.pl
    107 //
    108 //------------------------------------------------------------------------------
    109 void    RegexCompile::compile(
    110                          const UnicodeString &pat,   // Source pat to be compiled.
    111                          UParseError &pp,            // Error position info
    112                          UErrorCode &e)              // Error Code
    113 {
    114     fRXPat->fPatternString = new UnicodeString(pat);
    115     UText patternText = UTEXT_INITIALIZER;
    116     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
    117 
    118     if (U_SUCCESS(e)) {
    119         compile(&patternText, pp, e);
    120         utext_close(&patternText);
    121     }
    122 }
    123 
    124 //
    125 //   compile, UText mode
    126 //     All the work is actually done here.
    127 //
    128 void    RegexCompile::compile(
    129                          UText *pat,                 // Source pat to be compiled.
    130                          UParseError &pp,            // Error position info
    131                          UErrorCode &e)              // Error Code
    132 {
    133     fStatus             = &e;
    134     fParseErr           = &pp;
    135     fStackPtr           = 0;
    136     fStack[fStackPtr]   = 0;
    137 
    138     if (U_FAILURE(*fStatus)) {
    139         return;
    140     }
    141 
    142     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
    143     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
    144 
    145     // Prepare the RegexPattern object to receive the compiled pattern.
    146     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
    147     if (U_FAILURE(*fStatus)) {
    148         return;
    149     }
    150     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
    151     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
    152 
    153 
    154     // Initialize the pattern scanning state machine
    155     fPatternLength = utext_nativeLength(pat);
    156     uint16_t                state = 1;
    157     const RegexTableEl      *tableEl;
    158 
    159     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
    160     if (fModeFlags & UREGEX_LITERAL) {
    161         fQuoteMode = TRUE;
    162     }
    163 
    164     nextChar(fC);                        // Fetch the first char from the pattern string.
    165 
    166     //
    167     // Main loop for the regex pattern parsing state machine.
    168     //   Runs once per state transition.
    169     //   Each time through optionally performs, depending on the state table,
    170     //      - an advance to the the next pattern char
    171     //      - an action to be performed.
    172     //      - pushing or popping a state to/from the local state return stack.
    173     //   file regexcst.txt is the source for the state table.  The logic behind
    174     //     recongizing the pattern syntax is there, not here.
    175     //
    176     for (;;) {
    177         //  Bail out if anything has gone wrong.
    178         //  Regex pattern parsing stops on the first error encountered.
    179         if (U_FAILURE(*fStatus)) {
    180             break;
    181         }
    182 
    183         U_ASSERT(state != 0);
    184 
    185         // Find the state table element that matches the input char from the pattern, or the
    186         //    class of the input character.  Start with the first table row for this
    187         //    state, then linearly scan forward until we find a row that matches the
    188         //    character.  The last row for each state always matches all characters, so
    189         //    the search will stop there, if not before.
    190         //
    191         tableEl = &gRuleParseStateTable[state];
    192         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
    193             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
    194 
    195         for (;;) {    // loop through table rows belonging to this state, looking for one
    196                       //   that matches the current input char.
    197             REGEX_SCAN_DEBUG_PRINTF(("."));
    198             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
    199                 // Table row specified an individual character, not a set, and
    200                 //   the input character is not quoted, and
    201                 //   the input character matched it.
    202                 break;
    203             }
    204             if (tableEl->fCharClass == 255) {
    205                 // Table row specified default, match anything character class.
    206                 break;
    207             }
    208             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
    209                 // Table row specified "quoted" and the char was quoted.
    210                 break;
    211             }
    212             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
    213                 // Table row specified eof and we hit eof on the input.
    214                 break;
    215             }
    216 
    217             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
    218                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
    219                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
    220                 U_ASSERT(tableEl->fCharClass <= 137);
    221                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
    222                     // Table row specified a character class, or set of characters,
    223                     //   and the current char matches it.
    224                     break;
    225                 }
    226             }
    227 
    228             // No match on this row, advance to the next  row for this state,
    229             tableEl++;
    230         }
    231         REGEX_SCAN_DEBUG_PRINTF(("\n"));
    232 
    233         //
    234         // We've found the row of the state table that matches the current input
    235         //   character from the rules string.
    236         // Perform any action specified  by this row in the state table.
    237         if (doParseActions(tableEl->fAction) == FALSE) {
    238             // Break out of the state machine loop if the
    239             //   the action signalled some kind of error, or
    240             //   the action was to exit, occurs on normal end-of-rules-input.
    241             break;
    242         }
    243 
    244         if (tableEl->fPushState != 0) {
    245             fStackPtr++;
    246             if (fStackPtr >= kStackSize) {
    247                 error(U_REGEX_INTERNAL_ERROR);
    248                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
    249                 fStackPtr--;
    250             }
    251             fStack[fStackPtr] = tableEl->fPushState;
    252         }
    253 
    254         //
    255         //  NextChar.  This is where characters are actually fetched from the pattern.
    256         //             Happens under control of the 'n' tag in the state table.
    257         //
    258         if (tableEl->fNextChar) {
    259             nextChar(fC);
    260         }
    261 
    262         // Get the next state from the table entry, or from the
    263         //   state stack if the next state was specified as "pop".
    264         if (tableEl->fNextState != 255) {
    265             state = tableEl->fNextState;
    266         } else {
    267             state = fStack[fStackPtr];
    268             fStackPtr--;
    269             if (fStackPtr < 0) {
    270                 // state stack underflow
    271                 // This will occur if the user pattern has mis-matched parentheses,
    272                 //   with extra close parens.
    273                 //
    274                 fStackPtr++;
    275                 error(U_REGEX_MISMATCHED_PAREN);
    276             }
    277         }
    278 
    279     }
    280 
    281     if (U_FAILURE(*fStatus)) {
    282         // Bail out if the pattern had errors.
    283         //   Set stack cleanup:  a successful compile would have left it empty,
    284         //   but errors can leave temporary sets hanging around.
    285         while (!fSetStack.empty()) {
    286             delete (UnicodeSet *)fSetStack.pop();
    287         }
    288         return;
    289     }
    290 
    291     //
    292     // The pattern has now been read and processed, and the compiled code generated.
    293     //
    294 
    295     //
    296     // The pattern's fFrameSize so far has accumulated the requirements for
    297     //   storage for capture parentheses, counters, etc. that are encountered
    298     //   in the pattern.  Add space for the two variables that are always
    299     //   present in the saved state:  the input string position (int64_t) and
    300     //   the position in the compiled pattern.
    301     //
    302     allocateStackData(RESTACKFRAME_HDRCOUNT);
    303 
    304     //
    305     // Optimization pass 1: NOPs, back-references, and case-folding
    306     //
    307     stripNOPs();
    308 
    309     //
    310     // Get bounds for the minimum and maximum length of a string that this
    311     //   pattern can match.  Used to avoid looking for matches in strings that
    312     //   are too short.
    313     //
    314     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
    315 
    316     //
    317     // Optimization pass 2: match start type
    318     //
    319     matchStartType();
    320 
    321     //
    322     // Set up fast latin-1 range sets
    323     //
    324     int32_t numSets = fRXPat->fSets->size();
    325     fRXPat->fSets8 = new Regex8BitSet[numSets];
    326     // Null pointer check.
    327     if (fRXPat->fSets8 == NULL) {
    328         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
    329         return;
    330     }
    331     int32_t i;
    332     for (i=0; i<numSets; i++) {
    333         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
    334         fRXPat->fSets8[i].init(s);
    335     }
    336 
    337 }
    338 
    339 
    340 
    341 
    342 
    343 //------------------------------------------------------------------------------
    344 //
    345 //  doParseAction        Do some action during regex pattern parsing.
    346 //                       Called by the parse state machine.
    347 //
    348 //                       Generation of the match engine PCode happens here, or
    349 //                       in functions called from the parse actions defined here.
    350 //
    351 //
    352 //------------------------------------------------------------------------------
    353 UBool RegexCompile::doParseActions(int32_t action)
    354 {
    355     UBool   returnVal = TRUE;
    356 
    357     switch ((Regex_PatternParseAction)action) {
    358 
    359     case doPatStart:
    360         // Start of pattern compiles to:
    361         //0   SAVE   2        Fall back to position of FAIL
    362         //1   jmp    3
    363         //2   FAIL            Stop if we ever reach here.
    364         //3   NOP             Dummy, so start of pattern looks the same as
    365         //                    the start of an ( grouping.
    366         //4   NOP             Resreved, will be replaced by a save if there are
    367         //                    OR | operators at the top level
    368         appendOp(URX_STATE_SAVE, 2);
    369         appendOp(URX_JMP,  3);
    370         appendOp(URX_FAIL, 0);
    371 
    372         // Standard open nonCapture paren action emits the two NOPs and
    373         //   sets up the paren stack frame.
    374         doParseActions(doOpenNonCaptureParen);
    375         break;
    376 
    377     case doPatFinish:
    378         // We've scanned to the end of the pattern
    379         //  The end of pattern compiles to:
    380         //        URX_END
    381         //    which will stop the runtime match engine.
    382         //  Encountering end of pattern also behaves like a close paren,
    383         //   and forces fixups of the State Save at the beginning of the compiled pattern
    384         //   and of any OR operations at the top level.
    385         //
    386         handleCloseParen();
    387         if (fParenStack.size() > 0) {
    388             // Missing close paren in pattern.
    389             error(U_REGEX_MISMATCHED_PAREN);
    390         }
    391 
    392         // add the END operation to the compiled pattern.
    393         appendOp(URX_END, 0);
    394 
    395         // Terminate the pattern compilation state machine.
    396         returnVal = FALSE;
    397         break;
    398 
    399 
    400 
    401     case doOrOperator:
    402         // Scanning a '|', as in (A|B)
    403         {
    404             // Generate code for any pending literals preceding the '|'
    405             fixLiterals(FALSE);
    406 
    407             // Insert a SAVE operation at the start of the pattern section preceding
    408             //   this OR at this level.  This SAVE will branch the match forward
    409             //   to the right hand side of the OR in the event that the left hand
    410             //   side fails to match and backtracks.  Locate the position for the
    411             //   save from the location on the top of the parentheses stack.
    412             int32_t savePosition = fParenStack.popi();
    413             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
    414             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
    415             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
    416             fRXPat->fCompiledPat->setElementAt(op, savePosition);
    417 
    418             // Append an JMP operation into the compiled pattern.  The operand for
    419             //  the JMP will eventually be the location following the ')' for the
    420             //  group.  This will be patched in later, when the ')' is encountered.
    421             appendOp(URX_JMP, 0);
    422 
    423             // Push the position of the newly added JMP op onto the parentheses stack.
    424             // This registers if for fixup when this block's close paren is encountered.
    425             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    426 
    427             // Append a NOP to the compiled pattern.  This is the slot reserved
    428             //   for a SAVE in the event that there is yet another '|' following
    429             //   this one.
    430             appendOp(URX_NOP, 0);
    431             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    432         }
    433         break;
    434 
    435 
    436     case doBeginNamedCapture:
    437         // Scanning (?<letter.
    438         //   The first letter of the name will come through again under doConinueNamedCapture.
    439         fCaptureName = new UnicodeString();
    440         if (fCaptureName == NULL) {
    441             error(U_MEMORY_ALLOCATION_ERROR);
    442         }
    443         break;
    444 
    445     case  doContinueNamedCapture:
    446         fCaptureName->append(fC.fChar);
    447         break;
    448 
    449     case doBadNamedCapture:
    450         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
    451         break;
    452 
    453     case doOpenCaptureParen:
    454         // Open Capturing Paren, possibly named.
    455         //   Compile to a
    456         //      - NOP, which later may be replaced by a save-state if the
    457         //         parenthesized group gets a * quantifier, followed by
    458         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
    459         //      - NOP, which may later be replaced by a save-state if there
    460         //             is an '|' alternation within the parens.
    461         //
    462         //    Each capture group gets three slots in the save stack frame:
    463         //         0: Capture Group start position (in input string being matched.)
    464         //         1: Capture Group end position.
    465         //         2: Start of Match-in-progress.
    466         //    The first two locations are for a completed capture group, and are
    467         //     referred to by back references and the like.
    468         //    The third location stores the capture start position when an START_CAPTURE is
    469         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
    470         //      END_CAPTURE is encountered.
    471         {
    472             fixLiterals();
    473             appendOp(URX_NOP, 0);
    474             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
    475             appendOp(URX_START_CAPTURE, varsLoc);
    476             appendOp(URX_NOP, 0);
    477 
    478             // On the Parentheses stack, start a new frame and add the postions
    479             //   of the two NOPs.  Depending on what follows in the pattern, the
    480             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    481             //   address of the end of the parenthesized group.
    482             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    483             fParenStack.push(capturing, *fStatus);                        // Frame type.
    484             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
    485             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    486 
    487             // Save the mapping from group number to stack frame variable position.
    488             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
    489 
    490             // If this is a named capture group, add the name->group number mapping.
    491             if (fCaptureName != NULL) {
    492                 int32_t groupNumber = fRXPat->fGroupMap->size();
    493                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
    494                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
    495                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
    496                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
    497                 }
    498             }
    499         }
    500         break;
    501 
    502     case doOpenNonCaptureParen:
    503         // Open non-caputuring (grouping only) Paren.
    504         //   Compile to a
    505         //      - NOP, which later may be replaced by a save-state if the
    506         //         parenthesized group gets a * quantifier, followed by
    507         //      - NOP, which may later be replaced by a save-state if there
    508         //             is an '|' alternation within the parens.
    509         {
    510             fixLiterals();
    511             appendOp(URX_NOP, 0);
    512             appendOp(URX_NOP, 0);
    513 
    514             // On the Parentheses stack, start a new frame and add the postions
    515             //   of the two NOPs.
    516             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    517             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
    518             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    519             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    520         }
    521          break;
    522 
    523 
    524     case doOpenAtomicParen:
    525         // Open Atomic Paren.  (?>
    526         //   Compile to a
    527         //      - NOP, which later may be replaced if the parenthesized group
    528         //         has a quantifier, followed by
    529         //      - STO_SP  save state stack position, so it can be restored at the ")"
    530         //      - NOP, which may later be replaced by a save-state if there
    531         //             is an '|' alternation within the parens.
    532         {
    533             fixLiterals();
    534             appendOp(URX_NOP, 0);
    535             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
    536             appendOp(URX_STO_SP, varLoc);
    537             appendOp(URX_NOP, 0);
    538 
    539             // On the Parentheses stack, start a new frame and add the postions
    540             //   of the two NOPs.  Depending on what follows in the pattern, the
    541             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    542             //   address of the end of the parenthesized group.
    543             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    544             fParenStack.push(atomic, *fStatus);                           // Frame type.
    545             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
    546             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
    547         }
    548         break;
    549 
    550 
    551     case doOpenLookAhead:
    552         // Positive Look-ahead   (?=  stuff  )
    553         //
    554         //   Note:   Addition of transparent input regions, with the need to
    555         //           restore the original regions when failing out of a lookahead
    556         //           block, complicated this sequence.  Some conbined opcodes
    557         //           might make sense - or might not, lookahead aren't that common.
    558         //
    559         //      Caution:  min match length optimization knows about this
    560         //               sequence; don't change without making updates there too.
    561         //
    562         // Compiles to
    563         //    1    START_LA     dataLoc     Saves SP, Input Pos
    564         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
    565         //    3    JMP          6           continue ...
    566         //
    567         //    4.   LA_END                   Look Ahead failed.  Restore regions.
    568         //    5.   BACKTRACK                and back track again.
    569         //
    570         //    6.   NOP              reserved for use by quantifiers on the block.
    571         //                          Look-ahead can't have quantifiers, but paren stack
    572         //                             compile time conventions require the slot anyhow.
    573         //    7.   NOP              may be replaced if there is are '|' ops in the block.
    574         //    8.     code for parenthesized stuff.
    575         //    9.   LA_END
    576         //
    577         //  Two data slots are reserved, for saving the stack ptr and the input position.
    578         {
    579             fixLiterals();
    580             int32_t dataLoc = allocateData(2);
    581             appendOp(URX_LA_START, dataLoc);
    582             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
    583             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
    584             appendOp(URX_LA_END, dataLoc);
    585             appendOp(URX_BACKTRACK, 0);
    586             appendOp(URX_NOP, 0);
    587             appendOp(URX_NOP, 0);
    588 
    589             // On the Parentheses stack, start a new frame and add the postions
    590             //   of the NOPs.
    591             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    592             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
    593             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    594             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    595         }
    596         break;
    597 
    598     case doOpenLookAheadNeg:
    599         // Negated Lookahead.   (?! stuff )
    600         // Compiles to
    601         //    1.    START_LA    dataloc
    602         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
    603         //                                //   which continues with the match.
    604         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
    605         //    4.       code for parenthesized stuff.
    606         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
    607         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
    608         //    7.    END_LA                // Restore match region, in case look-ahead was using
    609         //                                        an alternate (transparent) region.
    610         {
    611             fixLiterals();
    612             int32_t dataLoc = allocateData(2);
    613             appendOp(URX_LA_START, dataLoc);
    614             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
    615             appendOp(URX_NOP, 0);
    616 
    617             // On the Parentheses stack, start a new frame and add the postions
    618             //   of the StateSave and NOP.
    619             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    620             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
    621             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
    622             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    623 
    624             // Instructions #5 - #7 will be added when the ')' is encountered.
    625         }
    626         break;
    627 
    628     case doOpenLookBehind:
    629         {
    630             //   Compile a (?<= look-behind open paren.
    631             //
    632             //          Compiles to
    633             //              0       URX_LB_START     dataLoc
    634             //              1       URX_LB_CONT      dataLoc
    635             //              2                        MinMatchLen
    636             //              3                        MaxMatchLen
    637             //              4       URX_NOP          Standard '(' boilerplate.
    638             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
    639             //              6         <code for LookBehind expression>
    640             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
    641             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
    642             //
    643             //          Allocate a block of matcher data, to contain (when running a match)
    644             //              0:    Stack ptr on entry
    645             //              1:    Input Index on entry
    646             //              2:    Start index of match current match attempt.
    647             //              3:    Original Input String len.
    648 
    649             // Generate match code for any pending literals.
    650             fixLiterals();
    651 
    652             // Allocate data space
    653             int32_t dataLoc = allocateData(4);
    654 
    655             // Emit URX_LB_START
    656             appendOp(URX_LB_START, dataLoc);
    657 
    658             // Emit URX_LB_CONT
    659             appendOp(URX_LB_CONT, dataLoc);
    660             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
    661             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
    662 
    663             // Emit the NOPs
    664             appendOp(URX_NOP, 0);
    665             appendOp(URX_NOP, 0);
    666 
    667             // On the Parentheses stack, start a new frame and add the postions
    668             //   of the URX_LB_CONT and the NOP.
    669             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    670             fParenStack.push(lookBehind, *fStatus);                       // Frame type
    671             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    672             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    673 
    674             // The final two instructions will be added when the ')' is encountered.
    675         }
    676 
    677         break;
    678 
    679     case doOpenLookBehindNeg:
    680         {
    681             //   Compile a (?<! negated look-behind open paren.
    682             //
    683             //          Compiles to
    684             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
    685             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
    686             //              2                        MinMatchLen
    687             //              3                        MaxMatchLen
    688             //              4                        continueLoc (9)
    689             //              5       URX_NOP          Standard '(' boilerplate.
    690             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
    691             //              7         <code for LookBehind expression>
    692             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
    693             //              9       ...
    694             //
    695             //          Allocate a block of matcher data, to contain (when running a match)
    696             //              0:    Stack ptr on entry
    697             //              1:    Input Index on entry
    698             //              2:    Start index of match current match attempt.
    699             //              3:    Original Input String len.
    700 
    701             // Generate match code for any pending literals.
    702             fixLiterals();
    703 
    704             // Allocate data space
    705             int32_t dataLoc = allocateData(4);
    706 
    707             // Emit URX_LB_START
    708             appendOp(URX_LB_START, dataLoc);
    709 
    710             // Emit URX_LBN_CONT
    711             appendOp(URX_LBN_CONT, dataLoc);
    712             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
    713             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
    714             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
    715 
    716             // Emit the NOPs
    717             appendOp(URX_NOP, 0);
    718             appendOp(URX_NOP, 0);
    719 
    720             // On the Parentheses stack, start a new frame and add the postions
    721             //   of the URX_LB_CONT and the NOP.
    722             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    723             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
    724             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    725             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    726 
    727             // The final two instructions will be added when the ')' is encountered.
    728         }
    729         break;
    730 
    731     case doConditionalExpr:
    732         // Conditionals such as (?(1)a:b)
    733     case doPerlInline:
    734         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
    735         error(U_REGEX_UNIMPLEMENTED);
    736         break;
    737 
    738 
    739     case doCloseParen:
    740         handleCloseParen();
    741         if (fParenStack.size() <= 0) {
    742             //  Extra close paren, or missing open paren.
    743             error(U_REGEX_MISMATCHED_PAREN);
    744         }
    745         break;
    746 
    747     case doNOP:
    748         break;
    749 
    750 
    751     case doBadOpenParenType:
    752     case doRuleError:
    753         error(U_REGEX_RULE_SYNTAX);
    754         break;
    755 
    756 
    757     case doMismatchedParenErr:
    758         error(U_REGEX_MISMATCHED_PAREN);
    759         break;
    760 
    761     case doPlus:
    762         //  Normal '+'  compiles to
    763         //     1.   stuff to be repeated  (already built)
    764         //     2.   jmp-sav 1
    765         //     3.   ...
    766         //
    767         //  Or, if the item to be repeated can match a zero length string,
    768         //     1.   STO_INP_LOC  data-loc
    769         //     2.      body of stuff to be repeated
    770         //     3.   JMP_SAV_X    2
    771         //     4.   ...
    772 
    773         //
    774         //  Or, if the item to be repeated is simple
    775         //     1.   Item to be repeated.
    776         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
    777         //     3.   LOOP_C       stack location
    778         {
    779             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
    780             int32_t  frameLoc;
    781 
    782             // Check for simple constructs, which may get special optimized code.
    783             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    784                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
    785 
    786                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
    787                     // Emit optimized code for [char set]+
    788                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    789                     frameLoc = allocateStackData(1);
    790                     appendOp(URX_LOOP_C, frameLoc);
    791                     break;
    792                 }
    793 
    794                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    795                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    796                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    797                     // Emit Optimized code for .+ operations.
    798                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
    799                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    800                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
    801                         loopOpI |= 1;
    802                     }
    803                     if (fModeFlags & UREGEX_UNIX_LINES) {
    804                         loopOpI |= 2;
    805                     }
    806                     appendOp(loopOpI);
    807                     frameLoc = allocateStackData(1);
    808                     appendOp(URX_LOOP_C, frameLoc);
    809                     break;
    810                 }
    811 
    812             }
    813 
    814             // General case.
    815 
    816             // Check for minimum match length of zero, which requires
    817             //    extra loop-breaking code.
    818             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    819                 // Zero length match is possible.
    820                 // Emit the code sequence that can handle it.
    821                 insertOp(topLoc);
    822                 frameLoc = allocateStackData(1);
    823 
    824                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
    825                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
    826 
    827                 appendOp(URX_JMP_SAV_X, topLoc+1);
    828             } else {
    829                 // Simpler code when the repeated body must match something non-empty
    830                 appendOp(URX_JMP_SAV, topLoc);
    831             }
    832         }
    833         break;
    834 
    835     case doNGPlus:
    836         //  Non-greedy '+?'  compiles to
    837         //     1.   stuff to be repeated  (already built)
    838         //     2.   state-save  1
    839         //     3.   ...
    840         {
    841             int32_t topLoc      = blockTopLoc(FALSE);
    842             appendOp(URX_STATE_SAVE, topLoc);
    843         }
    844         break;
    845 
    846 
    847     case doOpt:
    848         // Normal (greedy) ? quantifier.
    849         //  Compiles to
    850         //     1. state save 3
    851         //     2.    body of optional block
    852         //     3. ...
    853         // Insert the state save into the compiled pattern, and we're done.
    854         {
    855             int32_t   saveStateLoc = blockTopLoc(TRUE);
    856             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
    857             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    858         }
    859         break;
    860 
    861     case doNGOpt:
    862         // Non-greedy ?? quantifier
    863         //   compiles to
    864         //    1.  jmp   4
    865         //    2.     body of optional block
    866         //    3   jmp   5
    867         //    4.  state save 2
    868         //    5    ...
    869         //  This code is less than ideal, with two jmps instead of one, because we can only
    870         //  insert one instruction at the top of the block being iterated.
    871         {
    872             int32_t  jmp1_loc = blockTopLoc(TRUE);
    873             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
    874 
    875             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
    876             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
    877 
    878             appendOp(URX_JMP, jmp2_loc+2);
    879 
    880             appendOp(URX_STATE_SAVE, jmp1_loc+1);
    881         }
    882         break;
    883 
    884 
    885     case doStar:
    886         // Normal (greedy) * quantifier.
    887         // Compiles to
    888         //       1.   STATE_SAVE   4
    889         //       2.      body of stuff being iterated over
    890         //       3.   JMP_SAV      2
    891         //       4.   ...
    892         //
    893         // Or, if the body is a simple [Set],
    894         //       1.   LOOP_SR_I    set number
    895         //       2.   LOOP_C       stack location
    896         //       ...
    897         //
    898         // Or if this is a .*
    899         //       1.   LOOP_DOT_I    (. matches all mode flag)
    900         //       2.   LOOP_C        stack location
    901         //
    902         // Or, if the body can match a zero-length string, to inhibit infinite loops,
    903         //       1.   STATE_SAVE   5
    904         //       2.   STO_INP_LOC  data-loc
    905         //       3.      body of stuff
    906         //       4.   JMP_SAV_X    2
    907         //       5.   ...
    908         {
    909             // location of item #1, the STATE_SAVE
    910             int32_t   topLoc = blockTopLoc(FALSE);
    911             int32_t   dataLoc = -1;
    912 
    913             // Check for simple *, where the construct being repeated
    914             //   compiled to single opcode, and might be optimizable.
    915             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    916                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
    917 
    918                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
    919                     // Emit optimized code for a [char set]*
    920                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    921                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    922                     dataLoc = allocateStackData(1);
    923                     appendOp(URX_LOOP_C, dataLoc);
    924                     break;
    925                 }
    926 
    927                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    928                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    929                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    930                     // Emit Optimized code for .* operations.
    931                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
    932                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    933                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
    934                         loopOpI |= 1;
    935                     }
    936                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
    937                         loopOpI |= 2;
    938                     }
    939                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    940                     dataLoc = allocateStackData(1);
    941                     appendOp(URX_LOOP_C, dataLoc);
    942                     break;
    943                 }
    944             }
    945 
    946             // Emit general case code for this *
    947             // The optimizations did not apply.
    948 
    949             int32_t   saveStateLoc = blockTopLoc(TRUE);
    950             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
    951 
    952             // Check for minimum match length of zero, which requires
    953             //    extra loop-breaking code.
    954             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    955                 insertOp(saveStateLoc);
    956                 dataLoc = allocateStackData(1);
    957 
    958                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
    959                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
    960                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
    961             }
    962 
    963             // Locate the position in the compiled pattern where the match will continue
    964             //   after completing the *.   (4 or 5 in the comment above)
    965             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
    966 
    967             // Put together the save state op and store it into the compiled code.
    968             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
    969             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    970 
    971             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
    972             appendOp(jmpOp);
    973         }
    974         break;
    975 
    976     case doNGStar:
    977         // Non-greedy *? quantifier
    978         // compiles to
    979         //     1.   JMP    3
    980         //     2.      body of stuff being iterated over
    981         //     3.   STATE_SAVE  2
    982         //     4    ...
    983         {
    984             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
    985             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
    986             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
    987             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
    988             appendOp(URX_STATE_SAVE, jmpLoc+1);
    989         }
    990         break;
    991 
    992 
    993     case doIntervalInit:
    994         // The '{' opening an interval quantifier was just scanned.
    995         // Init the counter varaiables that will accumulate the values as the digits
    996         //    are scanned.
    997         fIntervalLow = 0;
    998         fIntervalUpper = -1;
    999         break;
   1000 
   1001     case doIntevalLowerDigit:
   1002         // Scanned a digit from the lower value of an {lower,upper} interval
   1003         {
   1004             int32_t digitValue = u_charDigitValue(fC.fChar);
   1005             U_ASSERT(digitValue >= 0);
   1006             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
   1007             if (val > INT32_MAX) {
   1008                 error(U_REGEX_NUMBER_TOO_BIG);
   1009             } else {
   1010                 fIntervalLow = (int32_t)val;
   1011             }
   1012         }
   1013         break;
   1014 
   1015     case doIntervalUpperDigit:
   1016         // Scanned a digit from the upper value of an {lower,upper} interval
   1017         {
   1018             if (fIntervalUpper < 0) {
   1019                 fIntervalUpper = 0;
   1020             }
   1021             int32_t digitValue = u_charDigitValue(fC.fChar);
   1022             U_ASSERT(digitValue >= 0);
   1023             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
   1024             if (val > INT32_MAX) {
   1025                 error(U_REGEX_NUMBER_TOO_BIG);
   1026             } else {
   1027                 fIntervalUpper = (int32_t)val;
   1028             }
   1029         }
   1030         break;
   1031 
   1032     case doIntervalSame:
   1033         // Scanned a single value interval like {27}.  Upper = Lower.
   1034         fIntervalUpper = fIntervalLow;
   1035         break;
   1036 
   1037     case doInterval:
   1038         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
   1039         if (compileInlineInterval() == FALSE) {
   1040             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1041         }
   1042         break;
   1043 
   1044     case doPossessiveInterval:
   1045         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
   1046         {
   1047             // Remember the loc for the top of the block being looped over.
   1048             //   (Can not reserve a slot in the compiled pattern at this time, because
   1049             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
   1050             //    once per block.)
   1051             int32_t topLoc = blockTopLoc(FALSE);
   1052 
   1053             // Produce normal looping code.
   1054             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1055 
   1056             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
   1057             //  just as if the loop was inclosed in atomic parentheses.
   1058 
   1059             // First the STO_SP before the start of the loop
   1060             insertOp(topLoc);
   1061 
   1062             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
   1063             int32_t  op     = buildOp(URX_STO_SP, varLoc);
   1064             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1065 
   1066             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
   1067             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
   1068             loopOp++;     // point LoopOp after the just-inserted STO_SP
   1069             fRXPat->fCompiledPat->push(loopOp, *fStatus);
   1070 
   1071             // Then the LD_SP after the end of the loop
   1072             appendOp(URX_LD_SP, varLoc);
   1073         }
   1074 
   1075         break;
   1076 
   1077     case doNGInterval:
   1078         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
   1079         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
   1080         break;
   1081 
   1082     case doIntervalError:
   1083         error(U_REGEX_BAD_INTERVAL);
   1084         break;
   1085 
   1086     case doLiteralChar:
   1087         // We've just scanned a "normal" character from the pattern,
   1088         literalChar(fC.fChar);
   1089         break;
   1090 
   1091 
   1092     case doEscapedLiteralChar:
   1093         // We've just scanned an backslashed escaped character with  no
   1094         //   special meaning.  It represents itself.
   1095         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1096             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1097             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1098                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1099              }
   1100         literalChar(fC.fChar);
   1101         break;
   1102 
   1103 
   1104     case doDotAny:
   1105         // scanned a ".",  match any single character.
   1106         {
   1107             fixLiterals(FALSE);
   1108             if (fModeFlags & UREGEX_DOTALL) {
   1109                 appendOp(URX_DOTANY_ALL, 0);
   1110             } else if (fModeFlags & UREGEX_UNIX_LINES) {
   1111                 appendOp(URX_DOTANY_UNIX, 0);
   1112             } else {
   1113                 appendOp(URX_DOTANY, 0);
   1114             }
   1115         }
   1116         break;
   1117 
   1118     case doCaret:
   1119         {
   1120             fixLiterals(FALSE);
   1121             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1122                 appendOp(URX_CARET, 0);
   1123             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1124                 appendOp(URX_CARET_M, 0);
   1125             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1126                 appendOp(URX_CARET, 0);   // Only testing true start of input.
   1127             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1128                 appendOp(URX_CARET_M_UNIX, 0);
   1129             }
   1130         }
   1131         break;
   1132 
   1133     case doDollar:
   1134         {
   1135             fixLiterals(FALSE);
   1136             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1137                 appendOp(URX_DOLLAR, 0);
   1138             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1139                 appendOp(URX_DOLLAR_M, 0);
   1140             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1141                 appendOp(URX_DOLLAR_D, 0);
   1142             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1143                 appendOp(URX_DOLLAR_MD, 0);
   1144             }
   1145         }
   1146         break;
   1147 
   1148     case doBackslashA:
   1149         fixLiterals(FALSE);
   1150         appendOp(URX_CARET, 0);
   1151         break;
   1152 
   1153     case doBackslashB:
   1154         {
   1155             #if  UCONFIG_NO_BREAK_ITERATION==1
   1156             if (fModeFlags & UREGEX_UWORD) {
   1157                 error(U_UNSUPPORTED_ERROR);
   1158             }
   1159             #endif
   1160             fixLiterals(FALSE);
   1161             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1162             appendOp(op, 1);
   1163         }
   1164         break;
   1165 
   1166     case doBackslashb:
   1167         {
   1168             #if  UCONFIG_NO_BREAK_ITERATION==1
   1169             if (fModeFlags & UREGEX_UWORD) {
   1170                 error(U_UNSUPPORTED_ERROR);
   1171             }
   1172             #endif
   1173             fixLiterals(FALSE);
   1174             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1175             appendOp(op, 0);
   1176         }
   1177         break;
   1178 
   1179     case doBackslashD:
   1180         fixLiterals(FALSE);
   1181         appendOp(URX_BACKSLASH_D, 1);
   1182         break;
   1183 
   1184     case doBackslashd:
   1185         fixLiterals(FALSE);
   1186         appendOp(URX_BACKSLASH_D, 0);
   1187         break;
   1188 
   1189     case doBackslashG:
   1190         fixLiterals(FALSE);
   1191         appendOp(URX_BACKSLASH_G, 0);
   1192         break;
   1193 
   1194     case doBackslashH:
   1195         fixLiterals(FALSE);
   1196         appendOp(URX_BACKSLASH_H, 1);
   1197         break;
   1198 
   1199     case doBackslashh:
   1200         fixLiterals(FALSE);
   1201         appendOp(URX_BACKSLASH_H, 0);
   1202         break;
   1203 
   1204     case doBackslashR:
   1205         fixLiterals(FALSE);
   1206         appendOp(URX_BACKSLASH_R, 0);
   1207         break;
   1208 
   1209     case doBackslashS:
   1210         fixLiterals(FALSE);
   1211         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
   1212         break;
   1213 
   1214     case doBackslashs:
   1215         fixLiterals(FALSE);
   1216         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
   1217         break;
   1218 
   1219     case doBackslashV:
   1220         fixLiterals(FALSE);
   1221         appendOp(URX_BACKSLASH_V, 1);
   1222         break;
   1223 
   1224     case doBackslashv:
   1225         fixLiterals(FALSE);
   1226         appendOp(URX_BACKSLASH_V, 0);
   1227         break;
   1228 
   1229     case doBackslashW:
   1230         fixLiterals(FALSE);
   1231         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
   1232         break;
   1233 
   1234     case doBackslashw:
   1235         fixLiterals(FALSE);
   1236         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
   1237         break;
   1238 
   1239     case doBackslashX:
   1240         fixLiterals(FALSE);
   1241         appendOp(URX_BACKSLASH_X, 0);
   1242         break;
   1243 
   1244 
   1245     case doBackslashZ:
   1246         fixLiterals(FALSE);
   1247         appendOp(URX_DOLLAR, 0);
   1248         break;
   1249 
   1250     case doBackslashz:
   1251         fixLiterals(FALSE);
   1252         appendOp(URX_BACKSLASH_Z, 0);
   1253         break;
   1254 
   1255     case doEscapeError:
   1256         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1257         break;
   1258 
   1259     case doExit:
   1260         fixLiterals(FALSE);
   1261         returnVal = FALSE;
   1262         break;
   1263 
   1264     case doProperty:
   1265         {
   1266             fixLiterals(FALSE);
   1267             UnicodeSet *theSet = scanProp();
   1268             compileSet(theSet);
   1269         }
   1270         break;
   1271 
   1272     case doNamedChar:
   1273         {
   1274             UChar32 c = scanNamedChar();
   1275             literalChar(c);
   1276         }
   1277         break;
   1278 
   1279 
   1280     case doBackRef:
   1281         // BackReference.  Somewhat unusual in that the front-end can not completely parse
   1282         //                 the regular expression, because the number of digits to be consumed
   1283         //                 depends on the number of capture groups that have been defined.  So
   1284         //                 we have to do it here instead.
   1285         {
   1286             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
   1287             int32_t  groupNum = 0;
   1288             UChar32  c        = fC.fChar;
   1289 
   1290             for (;;) {
   1291                 // Loop once per digit, for max allowed number of digits in a back reference.
   1292                 int32_t digit = u_charDigitValue(c);
   1293                 groupNum = groupNum * 10 + digit;
   1294                 if (groupNum >= numCaptureGroups) {
   1295                     break;
   1296                 }
   1297                 c = peekCharLL();
   1298                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
   1299                     break;
   1300                 }
   1301                 nextCharLL();
   1302             }
   1303 
   1304             // Scan of the back reference in the source regexp is complete.  Now generate
   1305             //  the compiled code for it.
   1306             // Because capture groups can be forward-referenced by back-references,
   1307             //  we fill the operand with the capture group number.  At the end
   1308             //  of compilation, it will be changed to the variable's location.
   1309             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
   1310                                      //    and shouldn't enter this code path at all.
   1311             fixLiterals(FALSE);
   1312             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1313                 appendOp(URX_BACKREF_I, groupNum);
   1314             } else {
   1315                 appendOp(URX_BACKREF, groupNum);
   1316             }
   1317         }
   1318         break;
   1319 
   1320     case doBeginNamedBackRef:
   1321         U_ASSERT(fCaptureName == NULL);
   1322         fCaptureName = new UnicodeString;
   1323         if (fCaptureName == NULL) {
   1324             error(U_MEMORY_ALLOCATION_ERROR);
   1325         }
   1326         break;
   1327 
   1328     case doContinueNamedBackRef:
   1329         fCaptureName->append(fC.fChar);
   1330         break;
   1331 
   1332     case doCompleteNamedBackRef:
   1333         {
   1334         int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
   1335         if (groupNumber == 0) {
   1336             // Group name has not been defined.
   1337             //   Could be a forward reference. If we choose to support them at some
   1338             //   future time, extra mechanism will be required at this point.
   1339             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   1340         } else {
   1341             // Given the number, handle identically to a \n numbered back reference.
   1342             // See comments above, under doBackRef
   1343             fixLiterals(FALSE);
   1344             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1345                 appendOp(URX_BACKREF_I, groupNumber);
   1346             } else {
   1347                 appendOp(URX_BACKREF, groupNumber);
   1348             }
   1349         }
   1350         delete fCaptureName;
   1351         fCaptureName = NULL;
   1352         break;
   1353         }
   1354 
   1355     case doPossessivePlus:
   1356         // Possessive ++ quantifier.
   1357         // Compiles to
   1358         //       1.   STO_SP
   1359         //       2.      body of stuff being iterated over
   1360         //       3.   STATE_SAVE 5
   1361         //       4.   JMP        2
   1362         //       5.   LD_SP
   1363         //       6.   ...
   1364         //
   1365         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
   1366         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
   1367         //
   1368         {
   1369             // Emit the STO_SP
   1370             int32_t   topLoc = blockTopLoc(TRUE);
   1371             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
   1372             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1373             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1374 
   1375             // Emit the STATE_SAVE
   1376             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
   1377 
   1378             // Emit the JMP
   1379             appendOp(URX_JMP, topLoc+1);
   1380 
   1381             // Emit the LD_SP
   1382             appendOp(URX_LD_SP, stoLoc);
   1383         }
   1384         break;
   1385 
   1386     case doPossessiveStar:
   1387         // Possessive *+ quantifier.
   1388         // Compiles to
   1389         //       1.   STO_SP       loc
   1390         //       2.   STATE_SAVE   5
   1391         //       3.      body of stuff being iterated over
   1392         //       4.   JMP          2
   1393         //       5.   LD_SP        loc
   1394         //       6    ...
   1395         // TODO:  do something to cut back the state stack each time through the loop.
   1396         {
   1397             // Reserve two slots at the top of the block.
   1398             int32_t   topLoc = blockTopLoc(TRUE);
   1399             insertOp(topLoc);
   1400 
   1401             // emit   STO_SP     loc
   1402             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
   1403             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1404             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1405 
   1406             // Emit the SAVE_STATE   5
   1407             int32_t L7 = fRXPat->fCompiledPat->size()+1;
   1408             op = buildOp(URX_STATE_SAVE, L7);
   1409             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1410 
   1411             // Append the JMP operation.
   1412             appendOp(URX_JMP, topLoc+1);
   1413 
   1414             // Emit the LD_SP       loc
   1415             appendOp(URX_LD_SP, stoLoc);
   1416         }
   1417         break;
   1418 
   1419     case doPossessiveOpt:
   1420         // Possessive  ?+ quantifier.
   1421         //  Compiles to
   1422         //     1. STO_SP      loc
   1423         //     2. SAVE_STATE  5
   1424         //     3.    body of optional block
   1425         //     4. LD_SP       loc
   1426         //     5. ...
   1427         //
   1428         {
   1429             // Reserve two slots at the top of the block.
   1430             int32_t   topLoc = blockTopLoc(TRUE);
   1431             insertOp(topLoc);
   1432 
   1433             // Emit the STO_SP
   1434             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
   1435             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1436             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1437 
   1438             // Emit the SAVE_STATE
   1439             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
   1440             op = buildOp(URX_STATE_SAVE, continueLoc);
   1441             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1442 
   1443             // Emit the LD_SP
   1444             appendOp(URX_LD_SP, stoLoc);
   1445         }
   1446         break;
   1447 
   1448 
   1449     case doBeginMatchMode:
   1450         fNewModeFlags = fModeFlags;
   1451         fSetModeFlag  = TRUE;
   1452         break;
   1453 
   1454     case doMatchMode:   //  (?i)    and similar
   1455         {
   1456             int32_t  bit = 0;
   1457             switch (fC.fChar) {
   1458             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
   1459             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
   1460             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
   1461             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
   1462             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
   1463             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
   1464             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
   1465             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
   1466             default:
   1467                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
   1468                                    // by the scanner.
   1469             }
   1470             if (fSetModeFlag) {
   1471                 fNewModeFlags |= bit;
   1472             } else {
   1473                 fNewModeFlags &= ~bit;
   1474             }
   1475         }
   1476         break;
   1477 
   1478     case doSetMatchMode:
   1479         // Emit code to match any pending literals, using the not-yet changed match mode.
   1480         fixLiterals();
   1481 
   1482         // We've got a (?i) or similar.  The match mode is being changed, but
   1483         //   the change is not scoped to a parenthesized block.
   1484         U_ASSERT(fNewModeFlags < 0);
   1485         fModeFlags = fNewModeFlags;
   1486 
   1487         break;
   1488 
   1489 
   1490     case doMatchModeParen:
   1491         // We've got a (?i: or similar.  Begin a parenthesized block, save old
   1492         //   mode flags so they can be restored at the close of the block.
   1493         //
   1494         //   Compile to a
   1495         //      - NOP, which later may be replaced by a save-state if the
   1496         //         parenthesized group gets a * quantifier, followed by
   1497         //      - NOP, which may later be replaced by a save-state if there
   1498         //             is an '|' alternation within the parens.
   1499         {
   1500             fixLiterals(FALSE);
   1501             appendOp(URX_NOP, 0);
   1502             appendOp(URX_NOP, 0);
   1503 
   1504             // On the Parentheses stack, start a new frame and add the postions
   1505             //   of the two NOPs (a normal non-capturing () frame, except for the
   1506             //   saving of the orignal mode flags.)
   1507             fParenStack.push(fModeFlags, *fStatus);
   1508             fParenStack.push(flags, *fStatus);                            // Frame Marker
   1509             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
   1510             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
   1511 
   1512             // Set the current mode flags to the new values.
   1513             U_ASSERT(fNewModeFlags < 0);
   1514             fModeFlags = fNewModeFlags;
   1515         }
   1516         break;
   1517 
   1518     case doBadModeFlag:
   1519         error(U_REGEX_INVALID_FLAG);
   1520         break;
   1521 
   1522     case doSuppressComments:
   1523         // We have just scanned a '(?'.  We now need to prevent the character scanner from
   1524         // treating a '#' as a to-the-end-of-line comment.
   1525         //   (This Perl compatibility just gets uglier and uglier to do...)
   1526         fEOLComments = FALSE;
   1527         break;
   1528 
   1529 
   1530     case doSetAddAmp:
   1531         {
   1532           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1533           set->add(chAmp);
   1534         }
   1535         break;
   1536 
   1537     case doSetAddDash:
   1538         {
   1539           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1540           set->add(chDash);
   1541         }
   1542         break;
   1543 
   1544      case doSetBackslash_s:
   1545         {
   1546          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1547          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
   1548          break;
   1549         }
   1550 
   1551      case doSetBackslash_S:
   1552         {
   1553             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1554             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
   1555             SSet.complement();
   1556             set->addAll(SSet);
   1557             break;
   1558         }
   1559 
   1560     case doSetBackslash_d:
   1561         {
   1562             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1563             // TODO - make a static set, ticket 6058.
   1564             addCategory(set, U_GC_ND_MASK, *fStatus);
   1565             break;
   1566         }
   1567 
   1568     case doSetBackslash_D:
   1569         {
   1570             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1571             UnicodeSet digits;
   1572             // TODO - make a static set, ticket 6058.
   1573             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   1574             digits.complement();
   1575             set->addAll(digits);
   1576             break;
   1577         }
   1578 
   1579     case doSetBackslash_h:
   1580         {
   1581             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1582             UnicodeSet h;
   1583             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   1584             h.add((UChar32)9);   // Tab
   1585             set->addAll(h);
   1586             break;
   1587         }
   1588 
   1589     case doSetBackslash_H:
   1590         {
   1591             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1592             UnicodeSet h;
   1593             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   1594             h.add((UChar32)9);   // Tab
   1595             h.complement();
   1596             set->addAll(h);
   1597             break;
   1598         }
   1599 
   1600     case doSetBackslash_v:
   1601         {
   1602             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1603             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
   1604             set->add((UChar32)0x85);
   1605             set->add((UChar32)0x2028, (UChar32)0x2029);
   1606             break;
   1607         }
   1608 
   1609     case doSetBackslash_V:
   1610         {
   1611             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1612             UnicodeSet v;
   1613             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
   1614             v.add((UChar32)0x85);
   1615             v.add((UChar32)0x2028, (UChar32)0x2029);
   1616             v.complement();
   1617             set->addAll(v);
   1618             break;
   1619         }
   1620 
   1621     case doSetBackslash_w:
   1622         {
   1623             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1624             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
   1625             break;
   1626         }
   1627 
   1628     case doSetBackslash_W:
   1629         {
   1630             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1631             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
   1632             SSet.complement();
   1633             set->addAll(SSet);
   1634             break;
   1635         }
   1636 
   1637     case doSetBegin:
   1638         fixLiterals(FALSE);
   1639         fSetStack.push(new UnicodeSet(), *fStatus);
   1640         fSetOpStack.push(setStart, *fStatus);
   1641         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1642             fSetOpStack.push(setCaseClose, *fStatus);
   1643         }
   1644         break;
   1645 
   1646     case doSetBeginDifference1:
   1647         //  We have scanned something like [[abc]-[
   1648         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
   1649         //  Push a Difference operator, which will cause the new set to be subtracted from what
   1650         //    went before once it is created.
   1651         setPushOp(setDifference1);
   1652         fSetOpStack.push(setStart, *fStatus);
   1653         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1654             fSetOpStack.push(setCaseClose, *fStatus);
   1655         }
   1656         break;
   1657 
   1658     case doSetBeginIntersection1:
   1659         //  We have scanned something like  [[abc]&[
   1660         //   Need both the '&' operator and the open '[' operator.
   1661         setPushOp(setIntersection1);
   1662         fSetOpStack.push(setStart, *fStatus);
   1663         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1664             fSetOpStack.push(setCaseClose, *fStatus);
   1665         }
   1666         break;
   1667 
   1668     case doSetBeginUnion:
   1669         //  We have scanned something like  [[abc][
   1670         //     Need to handle the union operation explicitly [[abc] | [
   1671         setPushOp(setUnion);
   1672         fSetOpStack.push(setStart, *fStatus);
   1673         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1674             fSetOpStack.push(setCaseClose, *fStatus);
   1675         }
   1676         break;
   1677 
   1678     case doSetDifference2:
   1679         // We have scanned something like [abc--
   1680         //   Consider this to unambiguously be a set difference operator.
   1681         setPushOp(setDifference2);
   1682         break;
   1683 
   1684     case doSetEnd:
   1685         // Have encountered the ']' that closes a set.
   1686         //    Force the evaluation of any pending operations within this set,
   1687         //    leave the completed set on the top of the set stack.
   1688         setEval(setEnd);
   1689         U_ASSERT(fSetOpStack.peeki()==setStart);
   1690         fSetOpStack.popi();
   1691         break;
   1692 
   1693     case doSetFinish:
   1694         {
   1695         // Finished a complete set expression, including all nested sets.
   1696         //   The close bracket has already triggered clearing out pending set operators,
   1697         //    the operator stack should be empty and the operand stack should have just
   1698         //    one entry, the result set.
   1699         U_ASSERT(fSetOpStack.empty());
   1700         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
   1701         U_ASSERT(fSetStack.empty());
   1702         compileSet(theSet);
   1703         break;
   1704         }
   1705 
   1706     case doSetIntersection2:
   1707         // Have scanned something like [abc&&
   1708         setPushOp(setIntersection2);
   1709         break;
   1710 
   1711     case doSetLiteral:
   1712         // Union the just-scanned literal character into the set being built.
   1713         //    This operation is the highest precedence set operation, so we can always do
   1714         //    it immediately, without waiting to see what follows.  It is necessary to perform
   1715         //    any pending '-' or '&' operation first, because these have the same precedence
   1716         //    as union-ing in a literal'
   1717         {
   1718             setEval(setUnion);
   1719             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1720             s->add(fC.fChar);
   1721             fLastSetLiteral = fC.fChar;
   1722             break;
   1723         }
   1724 
   1725     case doSetLiteralEscaped:
   1726         // A back-slash escaped literal character was encountered.
   1727         // Processing is the same as with setLiteral, above, with the addition of
   1728         //  the optional check for errors on escaped ASCII letters.
   1729         {
   1730             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1731                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1732                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1733                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1734             }
   1735             setEval(setUnion);
   1736             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1737             s->add(fC.fChar);
   1738             fLastSetLiteral = fC.fChar;
   1739             break;
   1740         }
   1741 
   1742         case doSetNamedChar:
   1743         // Scanning a \N{UNICODE CHARACTER NAME}
   1744         //  Aside from the source of the character, the processing is identical to doSetLiteral,
   1745         //    above.
   1746         {
   1747             UChar32  c = scanNamedChar();
   1748             setEval(setUnion);
   1749             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1750             s->add(c);
   1751             fLastSetLiteral = c;
   1752             break;
   1753         }
   1754 
   1755     case doSetNamedRange:
   1756         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
   1757         // The left character is already in the set, and is saved in fLastSetLiteral.
   1758         // The right side needs to be picked up, the scan is at the 'N'.
   1759         // Lower Limit > Upper limit being an error matches both Java
   1760         //        and ICU UnicodeSet behavior.
   1761         {
   1762             UChar32  c = scanNamedChar();
   1763             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
   1764                 error(U_REGEX_INVALID_RANGE);
   1765             }
   1766             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1767             s->add(fLastSetLiteral, c);
   1768             fLastSetLiteral = c;
   1769             break;
   1770         }
   1771 
   1772 
   1773     case  doSetNegate:
   1774         // Scanned a '^' at the start of a set.
   1775         // Push the negation operator onto the set op stack.
   1776         // A twist for case-insensitive matching:
   1777         //   the case closure operation must happen _before_ negation.
   1778         //   But the case closure operation will already be on the stack if it's required.
   1779         //   This requires checking for case closure, and swapping the stack order
   1780         //    if it is present.
   1781         {
   1782             int32_t  tosOp = fSetOpStack.peeki();
   1783             if (tosOp == setCaseClose) {
   1784                 fSetOpStack.popi();
   1785                 fSetOpStack.push(setNegation, *fStatus);
   1786                 fSetOpStack.push(setCaseClose, *fStatus);
   1787             } else {
   1788                 fSetOpStack.push(setNegation, *fStatus);
   1789             }
   1790         }
   1791         break;
   1792 
   1793     case doSetNoCloseError:
   1794         error(U_REGEX_MISSING_CLOSE_BRACKET);
   1795         break;
   1796 
   1797     case doSetOpError:
   1798         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
   1799         break;
   1800 
   1801     case doSetPosixProp:
   1802         {
   1803             UnicodeSet *s = scanPosixProp();
   1804             if (s != NULL) {
   1805                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
   1806                 tos->addAll(*s);
   1807                 delete s;
   1808             }  // else error.  scanProp() reported the error status already.
   1809         }
   1810         break;
   1811 
   1812     case doSetProp:
   1813         //  Scanned a \p \P within [brackets].
   1814         {
   1815             UnicodeSet *s = scanProp();
   1816             if (s != NULL) {
   1817                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
   1818                 tos->addAll(*s);
   1819                 delete s;
   1820             }  // else error.  scanProp() reported the error status already.
   1821         }
   1822         break;
   1823 
   1824 
   1825     case doSetRange:
   1826         // We have scanned literal-literal.  Add the range to the set.
   1827         // The left character is already in the set, and is saved in fLastSetLiteral.
   1828         // The right side is the current character.
   1829         // Lower Limit > Upper limit being an error matches both Java
   1830         //        and ICU UnicodeSet behavior.
   1831         {
   1832 
   1833         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
   1834             error(U_REGEX_INVALID_RANGE);
   1835         }
   1836         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1837         s->add(fLastSetLiteral, fC.fChar);
   1838         break;
   1839         }
   1840 
   1841     default:
   1842         U_ASSERT(FALSE);
   1843         error(U_REGEX_INTERNAL_ERROR);
   1844         break;
   1845     }
   1846 
   1847     if (U_FAILURE(*fStatus)) {
   1848         returnVal = FALSE;
   1849     }
   1850 
   1851     return returnVal;
   1852 }
   1853 
   1854 
   1855 
   1856 //------------------------------------------------------------------------------
   1857 //
   1858 //   literalChar           We've encountered a literal character from the pattern,
   1859 //                             or an escape sequence that reduces to a character.
   1860 //                         Add it to the string containing all literal chars/strings from
   1861 //                             the pattern.
   1862 //
   1863 //------------------------------------------------------------------------------
   1864 void RegexCompile::literalChar(UChar32 c)  {
   1865     fLiteralChars.append(c);
   1866 }
   1867 
   1868 
   1869 //------------------------------------------------------------------------------
   1870 //
   1871 //    fixLiterals           When compiling something that can follow a literal
   1872 //                          string in a pattern, emit the code to match the
   1873 //                          accumulated literal string.
   1874 //
   1875 //                          Optionally, split the last char of the string off into
   1876 //                          a single "ONE_CHAR" operation, so that quantifiers can
   1877 //                          apply to that char alone.  Example:   abc*
   1878 //                          The * must apply to the 'c' only.
   1879 //
   1880 //------------------------------------------------------------------------------
   1881 void    RegexCompile::fixLiterals(UBool split) {
   1882 
   1883     // If no literal characters have been scanned but not yet had code generated
   1884     //   for them, nothing needs to be done.
   1885     if (fLiteralChars.length() == 0) {
   1886         return;
   1887     }
   1888 
   1889     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
   1890     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
   1891 
   1892     // Split:  We need to  ensure that the last item in the compiled pattern
   1893     //     refers only to the last literal scanned in the pattern, so that
   1894     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
   1895     //     Split before case folding for case insensitive matches.
   1896 
   1897     if (split) {
   1898         fLiteralChars.truncate(indexOfLastCodePoint);
   1899         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
   1900                               //  Note that the truncated literal string may be empty, in which case
   1901                               //  nothing will be emitted.
   1902 
   1903         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
   1904         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
   1905         return;
   1906     }
   1907 
   1908     // If we are doing case-insensitive matching, case fold the string.  This may expand
   1909     //   the string, e.g. the German sharp-s turns into "ss"
   1910     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1911         fLiteralChars.foldCase();
   1912         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
   1913         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
   1914     }
   1915 
   1916     if (indexOfLastCodePoint == 0) {
   1917         // Single character, emit a URX_ONECHAR op to match it.
   1918         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
   1919                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
   1920             appendOp(URX_ONECHAR_I, lastCodePoint);
   1921         } else {
   1922             appendOp(URX_ONECHAR, lastCodePoint);
   1923         }
   1924     } else {
   1925         // Two or more chars, emit a URX_STRING to match them.
   1926         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
   1927             error(U_REGEX_PATTERN_TOO_BIG);
   1928         }
   1929         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1930             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
   1931         } else {
   1932             // TODO here:  add optimization to split case sensitive strings of length two
   1933             //             into two single char ops, for efficiency.
   1934             appendOp(URX_STRING, fRXPat->fLiteralText.length());
   1935         }
   1936         appendOp(URX_STRING_LEN, fLiteralChars.length());
   1937 
   1938         // Add this string into the accumulated strings of the compiled pattern.
   1939         fRXPat->fLiteralText.append(fLiteralChars);
   1940     }
   1941 
   1942     fLiteralChars.remove();
   1943 }
   1944 
   1945 
   1946 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
   1947     if (U_FAILURE(*fStatus)) {
   1948         return 0;
   1949     }
   1950     if (type < 0 || type > 255) {
   1951         U_ASSERT(FALSE);
   1952         error(U_REGEX_INTERNAL_ERROR);
   1953         type = URX_RESERVED_OP;
   1954     }
   1955     if (val > 0x00ffffff) {
   1956         U_ASSERT(FALSE);
   1957         error(U_REGEX_INTERNAL_ERROR);
   1958         val = 0;
   1959     }
   1960     if (val < 0) {
   1961         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
   1962             U_ASSERT(FALSE);
   1963             error(U_REGEX_INTERNAL_ERROR);
   1964             return -1;
   1965         }
   1966         if (URX_TYPE(val) != 0xff) {
   1967             U_ASSERT(FALSE);
   1968             error(U_REGEX_INTERNAL_ERROR);
   1969             return -1;
   1970         }
   1971         type = URX_RESERVED_OP_N;
   1972     }
   1973     return (type << 24) | val;
   1974 }
   1975 
   1976 
   1977 //------------------------------------------------------------------------------
   1978 //
   1979 //   appendOp()             Append a new instruction onto the compiled pattern
   1980 //                          Includes error checking, limiting the size of the
   1981 //                          pattern to lengths that can be represented in the
   1982 //                          24 bit operand field of an instruction.
   1983 //
   1984 //------------------------------------------------------------------------------
   1985 void RegexCompile::appendOp(int32_t op) {
   1986     if (U_FAILURE(*fStatus)) {
   1987         return;
   1988     }
   1989     fRXPat->fCompiledPat->addElement(op, *fStatus);
   1990     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
   1991         error(U_REGEX_PATTERN_TOO_BIG);
   1992     }
   1993 }
   1994 
   1995 void RegexCompile::appendOp(int32_t type, int32_t val) {
   1996     appendOp(buildOp(type, val));
   1997 }
   1998 
   1999 
   2000 //------------------------------------------------------------------------------
   2001 //
   2002 //   insertOp()             Insert a slot for a new opcode into the already
   2003 //                          compiled pattern code.
   2004 //
   2005 //                          Fill the slot with a NOP.  Our caller will replace it
   2006 //                          with what they really wanted.
   2007 //
   2008 //------------------------------------------------------------------------------
   2009 void   RegexCompile::insertOp(int32_t where) {
   2010     UVector64 *code = fRXPat->fCompiledPat;
   2011     U_ASSERT(where>0 && where < code->size());
   2012 
   2013     int32_t  nop = buildOp(URX_NOP, 0);
   2014     code->insertElementAt(nop, where, *fStatus);
   2015 
   2016     // Walk through the pattern, looking for any ops with targets that
   2017     //  were moved down by the insert.  Fix them.
   2018     int32_t loc;
   2019     for (loc=0; loc<code->size(); loc++) {
   2020         int32_t op = (int32_t)code->elementAti(loc);
   2021         int32_t opType = URX_TYPE(op);
   2022         int32_t opValue = URX_VAL(op);
   2023         if ((opType == URX_JMP         ||
   2024             opType == URX_JMPX         ||
   2025             opType == URX_STATE_SAVE   ||
   2026             opType == URX_CTR_LOOP     ||
   2027             opType == URX_CTR_LOOP_NG  ||
   2028             opType == URX_JMP_SAV      ||
   2029             opType == URX_JMP_SAV_X    ||
   2030             opType == URX_RELOC_OPRND)    && opValue > where) {
   2031             // Target location for this opcode is after the insertion point and
   2032             //   needs to be incremented to adjust for the insertion.
   2033             opValue++;
   2034             op = buildOp(opType, opValue);
   2035             code->setElementAt(op, loc);
   2036         }
   2037     }
   2038 
   2039     // Now fix up the parentheses stack.  All positive values in it are locations in
   2040     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
   2041     for (loc=0; loc<fParenStack.size(); loc++) {
   2042         int32_t x = fParenStack.elementAti(loc);
   2043         U_ASSERT(x < code->size());
   2044         if (x>where) {
   2045             x++;
   2046             fParenStack.setElementAt(x, loc);
   2047         }
   2048     }
   2049 
   2050     if (fMatchCloseParen > where) {
   2051         fMatchCloseParen++;
   2052     }
   2053     if (fMatchOpenParen > where) {
   2054         fMatchOpenParen++;
   2055     }
   2056 }
   2057 
   2058 
   2059 //------------------------------------------------------------------------------
   2060 //
   2061 //   allocateData()        Allocate storage in the matcher's static data area.
   2062 //                         Return the index for the newly allocated data.
   2063 //                         The storage won't actually exist until we are running a match
   2064 //                         operation, but the storage indexes are inserted into various
   2065 //                         opcodes while compiling the pattern.
   2066 //
   2067 //------------------------------------------------------------------------------
   2068 int32_t RegexCompile::allocateData(int32_t size) {
   2069     if (U_FAILURE(*fStatus)) {
   2070         return 0;
   2071     }
   2072     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
   2073         error(U_REGEX_INTERNAL_ERROR);
   2074         return 0;
   2075     }
   2076     int32_t dataIndex = fRXPat->fDataSize;
   2077     fRXPat->fDataSize += size;
   2078     if (fRXPat->fDataSize >= 0x00fffff0) {
   2079         error(U_REGEX_INTERNAL_ERROR);
   2080     }
   2081     return dataIndex;
   2082 }
   2083 
   2084 
   2085 //------------------------------------------------------------------------------
   2086 //
   2087 //   allocateStackData()   Allocate space in the back-tracking stack frame.
   2088 //                         Return the index for the newly allocated data.
   2089 //                         The frame indexes are inserted into various
   2090 //                         opcodes while compiling the pattern, meaning that frame
   2091 //                         size must be restricted to the size that will fit
   2092 //                         as an operand (24 bits).
   2093 //
   2094 //------------------------------------------------------------------------------
   2095 int32_t RegexCompile::allocateStackData(int32_t size) {
   2096     if (U_FAILURE(*fStatus)) {
   2097         return 0;
   2098     }
   2099     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
   2100         error(U_REGEX_INTERNAL_ERROR);
   2101         return 0;
   2102     }
   2103     int32_t dataIndex = fRXPat->fFrameSize;
   2104     fRXPat->fFrameSize += size;
   2105     if (fRXPat->fFrameSize >= 0x00fffff0) {
   2106         error(U_REGEX_PATTERN_TOO_BIG);
   2107     }
   2108     return dataIndex;
   2109 }
   2110 
   2111 
   2112 //------------------------------------------------------------------------------
   2113 //
   2114 //   blockTopLoc()          Find or create a location in the compiled pattern
   2115 //                          at the start of the operation or block that has
   2116 //                          just been compiled.  Needed when a quantifier (* or
   2117 //                          whatever) appears, and we need to add an operation
   2118 //                          at the start of the thing being quantified.
   2119 //
   2120 //                          (Parenthesized Blocks) have a slot with a NOP that
   2121 //                          is reserved for this purpose.  .* or similar don't
   2122 //                          and a slot needs to be added.
   2123 //
   2124 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
   2125 //                                         at the returned location.
   2126 //                                 FALSE - just return the address,
   2127 //                                         do not reserve a location there.
   2128 //
   2129 //------------------------------------------------------------------------------
   2130 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
   2131     int32_t   theLoc;
   2132     fixLiterals(TRUE);  // Emit code for any pending literals.
   2133                         //   If last item was a string, emit separate op for the its last char.
   2134     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
   2135     {
   2136         // The item just processed is a parenthesized block.
   2137         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
   2138         U_ASSERT(theLoc > 0);
   2139         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
   2140     }
   2141     else {
   2142         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
   2143         // No slot for STATE_SAVE was pre-reserved in the compiled code.
   2144         // We need to make space now.
   2145         theLoc = fRXPat->fCompiledPat->size()-1;
   2146         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
   2147         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
   2148             // Strings take two opcode, we want the position of the first one.
   2149             // We can have a string at this point if a single character case-folded to two.
   2150             theLoc--;
   2151         }
   2152         if (reserveLoc) {
   2153             int32_t  nop = buildOp(URX_NOP, 0);
   2154             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
   2155         }
   2156     }
   2157     return theLoc;
   2158 }
   2159 
   2160 
   2161 
   2162 //------------------------------------------------------------------------------
   2163 //
   2164 //    handleCloseParen      When compiling a close paren, we need to go back
   2165 //                          and fix up any JMP or SAVE operations within the
   2166 //                          parenthesized block that need to target the end
   2167 //                          of the block.  The locations of these are kept on
   2168 //                          the paretheses stack.
   2169 //
   2170 //                          This function is called both when encountering a
   2171 //                          real ) and at the end of the pattern.
   2172 //
   2173 //------------------------------------------------------------------------------
   2174 void  RegexCompile::handleCloseParen() {
   2175     int32_t   patIdx;
   2176     int32_t   patOp;
   2177     if (fParenStack.size() <= 0) {
   2178         error(U_REGEX_MISMATCHED_PAREN);
   2179         return;
   2180     }
   2181 
   2182     // Emit code for any pending literals.
   2183     fixLiterals(FALSE);
   2184 
   2185     // Fixup any operations within the just-closed parenthesized group
   2186     //    that need to reference the end of the (block).
   2187     //    (The first one popped from the stack is an unused slot for
   2188     //     alternation (OR) state save, but applying the fixup to it does no harm.)
   2189     for (;;) {
   2190         patIdx = fParenStack.popi();
   2191         if (patIdx < 0) {
   2192             // value < 0 flags the start of the frame on the paren stack.
   2193             break;
   2194         }
   2195         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
   2196         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
   2197         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
   2198         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
   2199         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
   2200         fMatchOpenParen     = patIdx;
   2201     }
   2202 
   2203     //  At the close of any parenthesized block, restore the match mode flags  to
   2204     //  the value they had at the open paren.  Saved value is
   2205     //  at the top of the paren stack.
   2206     fModeFlags = fParenStack.popi();
   2207     U_ASSERT(fModeFlags < 0);
   2208 
   2209     // DO any additional fixups, depending on the specific kind of
   2210     // parentesized grouping this is
   2211 
   2212     switch (patIdx) {
   2213     case plain:
   2214     case flags:
   2215         // No additional fixups required.
   2216         //   (Grouping-only parentheses)
   2217         break;
   2218     case capturing:
   2219         // Capturing Parentheses.
   2220         //   Insert a End Capture op into the pattern.
   2221         //   The frame offset of the variables for this cg is obtained from the
   2222         //       start capture op and put it into the end-capture op.
   2223         {
   2224             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
   2225             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
   2226 
   2227             int32_t   frameVarLocation = URX_VAL(captureOp);
   2228             appendOp(URX_END_CAPTURE, frameVarLocation);
   2229         }
   2230         break;
   2231     case atomic:
   2232         // Atomic Parenthesis.
   2233         //   Insert a LD_SP operation to restore the state stack to the position
   2234         //   it was when the atomic parens were entered.
   2235         {
   2236             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
   2237             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
   2238             int32_t   stoLoc = URX_VAL(stoOp);
   2239             appendOp(URX_LD_SP, stoLoc);
   2240         }
   2241         break;
   2242 
   2243     case lookAhead:
   2244         {
   2245             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
   2246             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2247             int32_t dataLoc  = URX_VAL(startOp);
   2248             appendOp(URX_LA_END, dataLoc);
   2249         }
   2250         break;
   2251 
   2252     case negLookAhead:
   2253         {
   2254             // See comment at doOpenLookAheadNeg
   2255             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
   2256             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2257             int32_t dataLoc  = URX_VAL(startOp);
   2258             appendOp(URX_LA_END, dataLoc);
   2259             appendOp(URX_BACKTRACK, 0);
   2260             appendOp(URX_LA_END, dataLoc);
   2261 
   2262             // Patch the URX_SAVE near the top of the block.
   2263             // The destination of the SAVE is the final LA_END that was just added.
   2264             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
   2265             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
   2266             int32_t dest     = fRXPat->fCompiledPat->size()-1;
   2267             saveOp           = buildOp(URX_STATE_SAVE, dest);
   2268             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
   2269         }
   2270         break;
   2271 
   2272     case lookBehind:
   2273         {
   2274             // See comment at doOpenLookBehind.
   2275 
   2276             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
   2277             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
   2278             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2279             int32_t dataLoc  = URX_VAL(startOp);
   2280             appendOp(URX_LB_END, dataLoc);
   2281             appendOp(URX_LA_END, dataLoc);
   2282 
   2283             // Determine the min and max bounds for the length of the
   2284             //  string that the pattern can match.
   2285             //  An unbounded upper limit is an error.
   2286             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2287             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2288             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2289             if (URX_TYPE(maxML) != 0) {
   2290                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2291                 break;
   2292             }
   2293             if (maxML == INT32_MAX) {
   2294                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2295                 break;
   2296             }
   2297             U_ASSERT(minML <= maxML);
   2298 
   2299             // Insert the min and max match len bounds into the URX_LB_CONT op that
   2300             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2301             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
   2302             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
   2303 
   2304         }
   2305         break;
   2306 
   2307 
   2308 
   2309     case lookBehindN:
   2310         {
   2311             // See comment at doOpenLookBehindNeg.
   2312 
   2313             // Append the URX_LBN_END to the compiled pattern.
   2314             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
   2315             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2316             int32_t dataLoc  = URX_VAL(startOp);
   2317             appendOp(URX_LBN_END, dataLoc);
   2318 
   2319             // Determine the min and max bounds for the length of the
   2320             //  string that the pattern can match.
   2321             //  An unbounded upper limit is an error.
   2322             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2323             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2324             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2325             if (URX_TYPE(maxML) != 0) {
   2326                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2327                 break;
   2328             }
   2329             if (maxML == INT32_MAX) {
   2330                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2331                 break;
   2332             }
   2333             U_ASSERT(minML <= maxML);
   2334 
   2335             // Insert the min and max match len bounds into the URX_LB_CONT op that
   2336             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2337             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
   2338             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
   2339 
   2340             // Insert the pattern location to continue at after a successful match
   2341             //  as the last operand of the URX_LBN_CONT
   2342             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
   2343             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
   2344         }
   2345         break;
   2346 
   2347 
   2348 
   2349     default:
   2350         U_ASSERT(FALSE);
   2351     }
   2352 
   2353     // remember the next location in the compiled pattern.
   2354     // The compilation of Quantifiers will look at this to see whether its looping
   2355     //   over a parenthesized block or a single item
   2356     fMatchCloseParen = fRXPat->fCompiledPat->size();
   2357 }
   2358 
   2359 
   2360 
   2361 //------------------------------------------------------------------------------
   2362 //
   2363 //   compileSet       Compile the pattern operations for a reference to a
   2364 //                    UnicodeSet.
   2365 //
   2366 //------------------------------------------------------------------------------
   2367 void        RegexCompile::compileSet(UnicodeSet *theSet)
   2368 {
   2369     if (theSet == NULL) {
   2370         return;
   2371     }
   2372     //  Remove any strings from the set.
   2373     //  There shoudn't be any, but just in case.
   2374     //     (Case Closure can add them; if we had a simple case closure avaialble that
   2375     //      ignored strings, that would be better.)
   2376     theSet->removeAllStrings();
   2377     int32_t  setSize = theSet->size();
   2378 
   2379     switch (setSize) {
   2380     case 0:
   2381         {
   2382             // Set of no elements.   Always fails to match.
   2383             appendOp(URX_BACKTRACK, 0);
   2384             delete theSet;
   2385         }
   2386         break;
   2387 
   2388     case 1:
   2389         {
   2390             // The set contains only a single code point.  Put it into
   2391             //   the compiled pattern as a single char operation rather
   2392             //   than a set, and discard the set itself.
   2393             literalChar(theSet->charAt(0));
   2394             delete theSet;
   2395         }
   2396         break;
   2397 
   2398     default:
   2399         {
   2400             //  The set contains two or more chars.  (the normal case)
   2401             //  Put it into the compiled pattern as a set.
   2402             int32_t setNumber = fRXPat->fSets->size();
   2403             fRXPat->fSets->addElement(theSet, *fStatus);
   2404             appendOp(URX_SETREF, setNumber);
   2405         }
   2406     }
   2407 }
   2408 
   2409 
   2410 //------------------------------------------------------------------------------
   2411 //
   2412 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
   2413 //                      Except for the specific opcodes used, the code is the same
   2414 //                      for all three types (greedy, non-greedy, possessive) of
   2415 //                      intervals.  The opcodes are supplied as parameters.
   2416 //                      (There are two sets of opcodes - greedy & possessive use the
   2417 //                      same ones, while non-greedy has it's own.)
   2418 //
   2419 //                      The code for interval loops has this form:
   2420 //                         0  CTR_INIT   counter loc (in stack frame)
   2421 //                         1             5  patt address of CTR_LOOP at bottom of block
   2422 //                         2             min count
   2423 //                         3             max count   (-1 for unbounded)
   2424 //                         4  ...        block to be iterated over
   2425 //                         5  CTR_LOOP
   2426 //
   2427 //                       In
   2428 //------------------------------------------------------------------------------
   2429 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
   2430 {
   2431     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
   2432     //   four slots in the compiled code.  Reserve them.
   2433     int32_t   topOfBlock = blockTopLoc(TRUE);
   2434     insertOp(topOfBlock);
   2435     insertOp(topOfBlock);
   2436     insertOp(topOfBlock);
   2437 
   2438     // The operands for the CTR_INIT opcode include the index in the matcher data
   2439     //   of the counter.  Allocate it now. There are two data items
   2440     //        counterLoc   -->  Loop counter
   2441     //               +1    -->  Input index (for breaking non-progressing loops)
   2442     //                          (Only present if unbounded upper limit on loop)
   2443     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
   2444     int32_t   counterLoc = allocateStackData(dataSize);
   2445 
   2446     int32_t   op = buildOp(InitOp, counterLoc);
   2447     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
   2448 
   2449     // The second operand of CTR_INIT is the location following the end of the loop.
   2450     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
   2451     //   compilation of something later on causes the code to grow and the target
   2452     //   position to move.
   2453     int32_t loopEnd = fRXPat->fCompiledPat->size();
   2454     op = buildOp(URX_RELOC_OPRND, loopEnd);
   2455     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
   2456 
   2457     // Followed by the min and max counts.
   2458     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
   2459     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
   2460 
   2461     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
   2462     //   Goes at end of the block being looped over, so just append to the code so far.
   2463     appendOp(LoopOp, topOfBlock);
   2464 
   2465     if ((fIntervalLow & 0xff000000) != 0 ||
   2466         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
   2467             error(U_REGEX_NUMBER_TOO_BIG);
   2468         }
   2469 
   2470     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
   2471         error(U_REGEX_MAX_LT_MIN);
   2472     }
   2473 }
   2474 
   2475 
   2476 
   2477 UBool RegexCompile::compileInlineInterval() {
   2478     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
   2479         // Too big to inline.  Fail, which will cause looping code to be generated.
   2480         //   (Upper < Lower picks up unbounded upper and errors, both.)
   2481         return FALSE;
   2482     }
   2483 
   2484     int32_t   topOfBlock = blockTopLoc(FALSE);
   2485     if (fIntervalUpper == 0) {
   2486         // Pathological case.  Attempt no matches, as if the block doesn't exist.
   2487         // Discard the generated code for the block.
   2488         // If the block included parens, discard the info pertaining to them as well.
   2489         fRXPat->fCompiledPat->setSize(topOfBlock);
   2490         if (fMatchOpenParen >= topOfBlock) {
   2491             fMatchOpenParen = -1;
   2492         }
   2493         if (fMatchCloseParen >= topOfBlock) {
   2494             fMatchCloseParen = -1;
   2495         }
   2496         return TRUE;
   2497     }
   2498 
   2499     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
   2500         // The thing being repeated is not a single op, but some
   2501         //   more complex block.  Do it as a loop, not inlines.
   2502         //   Note that things "repeated" a max of once are handled as inline, because
   2503         //     the one copy of the code already generated is just fine.
   2504         return FALSE;
   2505     }
   2506 
   2507     // Pick up the opcode that is to be repeated
   2508     //
   2509     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
   2510 
   2511     // Compute the pattern location where the inline sequence
   2512     //   will end, and set up the state save op that will be needed.
   2513     //
   2514     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
   2515                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
   2516     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
   2517     if (fIntervalLow == 0) {
   2518         insertOp(topOfBlock);
   2519         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
   2520     }
   2521 
   2522 
   2523 
   2524     //  Loop, emitting the op for the thing being repeated each time.
   2525     //    Loop starts at 1 because one instance of the op already exists in the pattern,
   2526     //    it was put there when it was originally encountered.
   2527     int32_t i;
   2528     for (i=1; i<fIntervalUpper; i++ ) {
   2529         if (i >= fIntervalLow) {
   2530             appendOp(saveOp);
   2531         }
   2532         appendOp(op);
   2533     }
   2534     return TRUE;
   2535 }
   2536 
   2537 
   2538 
   2539 //------------------------------------------------------------------------------
   2540 //
   2541 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
   2542 //                         set of characters that could potentially begin a case-insensitive
   2543 //                         match of a string beginning with that character, using full Unicode
   2544 //                         case insensitive matching.
   2545 //
   2546 //          This is used in optimizing find().
   2547 //
   2548 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
   2549 //          misses cases like this:
   2550 //             A string from the pattern begins with 'ss' (although all we know
   2551 //                 in this context is that it begins with 's')
   2552 //             The pattern could match a string beginning with a German sharp-s
   2553 //
   2554 //           To the ordinary case closure for a character c, we add all other
   2555 //           characters cx where the case closure of cx incudes a string form that begins
   2556 //           with the original character c.
   2557 //
   2558 //           This function could be made smarter. The full pattern string is available
   2559 //           and it would be possible to verify that the extra characters being added
   2560 //           to the starting set fully match, rather than having just a first-char of the
   2561 //           folded form match.
   2562 //
   2563 //------------------------------------------------------------------------------
   2564 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
   2565 
   2566 // Machine Generated below.
   2567 // It may need updating with new versions of Unicode.
   2568 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
   2569 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
   2570 
   2571 // Machine Generated Data. Do not hand edit.
   2572     static const UChar32 RECaseFixCodePoints[] = {
   2573         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
   2574         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
   2575         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
   2576         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
   2577         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
   2578 
   2579     static const int16_t RECaseFixStringOffsets[] = {
   2580         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
   2581         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
   2582         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
   2583         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
   2584         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
   2585 
   2586     static const int16_t RECaseFixCounts[] = {
   2587         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
   2588         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
   2589         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
   2590         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
   2591         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
   2592 
   2593     static const UChar RECaseFixData[] = {
   2594         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
   2595         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
   2596         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
   2597         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
   2598         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
   2599         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
   2600         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
   2601         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
   2602         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
   2603         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
   2604         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
   2605 
   2606 // End of machine generated data.
   2607 
   2608     if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
   2609         // This function should never be called with an invalid input character.
   2610         U_ASSERT(FALSE);
   2611         starterChars->clear();
   2612     } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   2613         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
   2614         starterChars->set(caseFoldedC, caseFoldedC);
   2615 
   2616         int32_t i;
   2617         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
   2618             // Simple linear search through the sorted list of interesting code points.
   2619         }
   2620 
   2621         if (RECaseFixCodePoints[i] == c) {
   2622             int32_t dataIndex = RECaseFixStringOffsets[i];
   2623             int32_t numCharsToAdd = RECaseFixCounts[i];
   2624             UChar32 cpToAdd = 0;
   2625             for (int32_t j=0; j<numCharsToAdd; j++) {
   2626                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
   2627                 starterChars->add(cpToAdd);
   2628             }
   2629         }
   2630 
   2631         starterChars->closeOver(USET_CASE_INSENSITIVE);
   2632         starterChars->removeAllStrings();
   2633     } else {
   2634         // Not a cased character. Just return it alone.
   2635         starterChars->set(c, c);
   2636     }
   2637 }
   2638 
   2639 
   2640 
   2641 
   2642 //------------------------------------------------------------------------------
   2643 //
   2644 //   matchStartType    Determine how a match can start.
   2645 //                     Used to optimize find() operations.
   2646 //
   2647 //                     Operation is very similar to minMatchLength().  Walk the compiled
   2648 //                     pattern, keeping an on-going minimum-match-length.  For any
   2649 //                     op where the min match coming in is zero, add that ops possible
   2650 //                     starting matches to the possible starts for the overall pattern.
   2651 //
   2652 //------------------------------------------------------------------------------
   2653 void   RegexCompile::matchStartType() {
   2654     if (U_FAILURE(*fStatus)) {
   2655         return;
   2656     }
   2657 
   2658 
   2659     int32_t    loc;                    // Location in the pattern of the current op being processed.
   2660     int32_t    op;                     // The op being processed
   2661     int32_t    opType;                 // The opcode type of the op
   2662     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
   2663     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
   2664 
   2665     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
   2666                                        //   could have advanced the position in a match.
   2667                                        //   (Maximum match length so far == 0)
   2668 
   2669     // forwardedLength is a vector holding minimum-match-length values that
   2670     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   2671     //   It must be one longer than the pattern being checked because some  ops
   2672     //   will jmp to a end-of-block+1 location from within a block, and we must
   2673     //   count those when checking the block.
   2674     int32_t end = fRXPat->fCompiledPat->size();
   2675     UVector32  forwardedLength(end+1, *fStatus);
   2676     forwardedLength.setSize(end+1);
   2677     for (loc=3; loc<end; loc++) {
   2678         forwardedLength.setElementAt(INT32_MAX, loc);
   2679     }
   2680 
   2681     for (loc = 3; loc<end; loc++) {
   2682         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   2683         opType = URX_TYPE(op);
   2684 
   2685         // The loop is advancing linearly through the pattern.
   2686         // If the op we are now at was the destination of a branch in the pattern,
   2687         // and that path has a shorter minimum length than the current accumulated value,
   2688         // replace the current accumulated value.
   2689         if (forwardedLength.elementAti(loc) < currentLen) {
   2690             currentLen = forwardedLength.elementAti(loc);
   2691             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   2692         }
   2693 
   2694         switch (opType) {
   2695             // Ops that don't change the total length matched
   2696         case URX_RESERVED_OP:
   2697         case URX_END:
   2698         case URX_FAIL:
   2699         case URX_STRING_LEN:
   2700         case URX_NOP:
   2701         case URX_START_CAPTURE:
   2702         case URX_END_CAPTURE:
   2703         case URX_BACKSLASH_B:
   2704         case URX_BACKSLASH_BU:
   2705         case URX_BACKSLASH_G:
   2706         case URX_BACKSLASH_Z:
   2707         case URX_DOLLAR:
   2708         case URX_DOLLAR_M:
   2709         case URX_DOLLAR_D:
   2710         case URX_DOLLAR_MD:
   2711         case URX_RELOC_OPRND:
   2712         case URX_STO_INP_LOC:
   2713         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   2714         case URX_BACKREF_I:
   2715 
   2716         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   2717         case URX_LD_SP:
   2718             break;
   2719 
   2720         case URX_CARET:
   2721             if (atStart) {
   2722                 fRXPat->fStartType = START_START;
   2723             }
   2724             break;
   2725 
   2726         case URX_CARET_M:
   2727         case URX_CARET_M_UNIX:
   2728             if (atStart) {
   2729                 fRXPat->fStartType = START_LINE;
   2730             }
   2731             break;
   2732 
   2733         case URX_ONECHAR:
   2734             if (currentLen == 0) {
   2735                 // This character could appear at the start of a match.
   2736                 //   Add it to the set of possible starting characters.
   2737                 fRXPat->fInitialChars->add(URX_VAL(op));
   2738                 numInitialStrings += 2;
   2739             }
   2740             currentLen++;
   2741             atStart = FALSE;
   2742             break;
   2743 
   2744 
   2745         case URX_SETREF:
   2746             if (currentLen == 0) {
   2747                 int32_t  sn = URX_VAL(op);
   2748                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2749                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
   2750                 fRXPat->fInitialChars->addAll(*s);
   2751                 numInitialStrings += 2;
   2752             }
   2753             currentLen++;
   2754             atStart = FALSE;
   2755             break;
   2756 
   2757         case URX_LOOP_SR_I:
   2758             // [Set]*, like a SETREF, above, in what it can match,
   2759             //  but may not match at all, so currentLen is not incremented.
   2760             if (currentLen == 0) {
   2761                 int32_t  sn = URX_VAL(op);
   2762                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2763                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
   2764                 fRXPat->fInitialChars->addAll(*s);
   2765                 numInitialStrings += 2;
   2766             }
   2767             atStart = FALSE;
   2768             break;
   2769 
   2770         case URX_LOOP_DOT_I:
   2771             if (currentLen == 0) {
   2772                 // .* at the start of a pattern.
   2773                 //    Any character can begin the match.
   2774                 fRXPat->fInitialChars->clear();
   2775                 fRXPat->fInitialChars->complement();
   2776                 numInitialStrings += 2;
   2777             }
   2778             atStart = FALSE;
   2779             break;
   2780 
   2781 
   2782         case URX_STATIC_SETREF:
   2783             if (currentLen == 0) {
   2784                 int32_t  sn = URX_VAL(op);
   2785                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
   2786                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
   2787                 fRXPat->fInitialChars->addAll(*s);
   2788                 numInitialStrings += 2;
   2789             }
   2790             currentLen++;
   2791             atStart = FALSE;
   2792             break;
   2793 
   2794 
   2795 
   2796         case URX_STAT_SETREF_N:
   2797             if (currentLen == 0) {
   2798                 int32_t  sn = URX_VAL(op);
   2799                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
   2800                 UnicodeSet sc(*s);
   2801                 sc.complement();
   2802                 fRXPat->fInitialChars->addAll(sc);
   2803                 numInitialStrings += 2;
   2804             }
   2805             currentLen++;
   2806             atStart = FALSE;
   2807             break;
   2808 
   2809 
   2810 
   2811         case URX_BACKSLASH_D:
   2812             // Digit Char
   2813              if (currentLen == 0) {
   2814                  UnicodeSet s;
   2815                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   2816                  if (URX_VAL(op) != 0) {
   2817                      s.complement();
   2818                  }
   2819                  fRXPat->fInitialChars->addAll(s);
   2820                  numInitialStrings += 2;
   2821             }
   2822             currentLen++;
   2823             atStart = FALSE;
   2824             break;
   2825 
   2826 
   2827         case URX_BACKSLASH_H:
   2828             // Horiz white space
   2829             if (currentLen == 0) {
   2830                 UnicodeSet s;
   2831                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   2832                 s.add((UChar32)9);   // Tab
   2833                 if (URX_VAL(op) != 0) {
   2834                     s.complement();
   2835                 }
   2836                 fRXPat->fInitialChars->addAll(s);
   2837                 numInitialStrings += 2;
   2838             }
   2839             currentLen++;
   2840             atStart = FALSE;
   2841             break;
   2842 
   2843 
   2844         case URX_BACKSLASH_R:       // Any line ending sequence
   2845         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
   2846             if (currentLen == 0) {
   2847                 UnicodeSet s;
   2848                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
   2849                 s.add((UChar32)0x85);
   2850                 s.add((UChar32)0x2028, (UChar32)0x2029);
   2851                 if (URX_VAL(op) != 0) {
   2852                      // Complement option applies to URX_BACKSLASH_V only.
   2853                      s.complement();
   2854                 }
   2855                 fRXPat->fInitialChars->addAll(s);
   2856                 numInitialStrings += 2;
   2857             }
   2858             currentLen++;
   2859             atStart = FALSE;
   2860             break;
   2861 
   2862 
   2863 
   2864         case URX_ONECHAR_I:
   2865             // Case Insensitive Single Character.
   2866             if (currentLen == 0) {
   2867                 UChar32  c = URX_VAL(op);
   2868                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   2869                     UnicodeSet starters(c, c);
   2870                     starters.closeOver(USET_CASE_INSENSITIVE);
   2871                     // findCaseInsensitiveStarters(c, &starters);
   2872                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
   2873                     //   The expanded folding can't match the pattern.
   2874                     fRXPat->fInitialChars->addAll(starters);
   2875                 } else {
   2876                     // Char has no case variants.  Just add it as-is to the
   2877                     //   set of possible starting chars.
   2878                     fRXPat->fInitialChars->add(c);
   2879                 }
   2880                 numInitialStrings += 2;
   2881             }
   2882             currentLen++;
   2883             atStart = FALSE;
   2884             break;
   2885 
   2886 
   2887         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   2888         case URX_DOTANY_ALL:    // . matches one or two.
   2889         case URX_DOTANY:
   2890         case URX_DOTANY_UNIX:
   2891             if (currentLen == 0) {
   2892                 // These constructs are all bad news when they appear at the start
   2893                 //   of a match.  Any character can begin the match.
   2894                 fRXPat->fInitialChars->clear();
   2895                 fRXPat->fInitialChars->complement();
   2896                 numInitialStrings += 2;
   2897             }
   2898             currentLen++;
   2899             atStart = FALSE;
   2900             break;
   2901 
   2902 
   2903         case URX_JMPX:
   2904             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
   2905             U_FALLTHROUGH;
   2906         case URX_JMP:
   2907             {
   2908                 int32_t  jmpDest = URX_VAL(op);
   2909                 if (jmpDest < loc) {
   2910                     // Loop of some kind.  Can safely ignore, the worst that will happen
   2911                     //  is that we understate the true minimum length
   2912                     currentLen = forwardedLength.elementAti(loc+1);
   2913 
   2914                 } else {
   2915                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   2916                     U_ASSERT(jmpDest <= end+1);
   2917                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
   2918                         forwardedLength.setElementAt(currentLen, jmpDest);
   2919                     }
   2920                 }
   2921             }
   2922             atStart = FALSE;
   2923             break;
   2924 
   2925         case URX_JMP_SAV:
   2926         case URX_JMP_SAV_X:
   2927             // Combo of state save to the next loc, + jmp backwards.
   2928             //   Net effect on min. length computation is nothing.
   2929             atStart = FALSE;
   2930             break;
   2931 
   2932         case URX_BACKTRACK:
   2933             // Fails are kind of like a branch, except that the min length was
   2934             //   propagated already, by the state save.
   2935             currentLen = forwardedLength.elementAti(loc+1);
   2936             atStart = FALSE;
   2937             break;
   2938 
   2939 
   2940         case URX_STATE_SAVE:
   2941             {
   2942                 // State Save, for forward jumps, propagate the current minimum.
   2943                 //             of the state save.
   2944                 int32_t  jmpDest = URX_VAL(op);
   2945                 if (jmpDest > loc) {
   2946                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
   2947                         forwardedLength.setElementAt(currentLen, jmpDest);
   2948                     }
   2949                 }
   2950             }
   2951             atStart = FALSE;
   2952             break;
   2953 
   2954 
   2955 
   2956 
   2957         case URX_STRING:
   2958             {
   2959                 loc++;
   2960                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   2961                 int32_t stringLen   = URX_VAL(stringLenOp);
   2962                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   2963                 U_ASSERT(stringLenOp >= 2);
   2964                 if (currentLen == 0) {
   2965                     // Add the starting character of this string to the set of possible starting
   2966                     //   characters for this pattern.
   2967                     int32_t stringStartIdx = URX_VAL(op);
   2968                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   2969                     fRXPat->fInitialChars->add(c);
   2970 
   2971                     // Remember this string.  After the entire pattern has been checked,
   2972                     //  if nothing else is identified that can start a match, we'll use it.
   2973                     numInitialStrings++;
   2974                     fRXPat->fInitialStringIdx = stringStartIdx;
   2975                     fRXPat->fInitialStringLen = stringLen;
   2976                 }
   2977 
   2978                 currentLen += stringLen;
   2979                 atStart = FALSE;
   2980             }
   2981             break;
   2982 
   2983         case URX_STRING_I:
   2984             {
   2985                 // Case-insensitive string.  Unlike exact-match strings, we won't
   2986                 //   attempt a string search for possible match positions.  But we
   2987                 //   do update the set of possible starting characters.
   2988                 loc++;
   2989                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   2990                 int32_t stringLen   = URX_VAL(stringLenOp);
   2991                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   2992                 U_ASSERT(stringLenOp >= 2);
   2993                 if (currentLen == 0) {
   2994                     // Add the starting character of this string to the set of possible starting
   2995                     //   characters for this pattern.
   2996                     int32_t stringStartIdx = URX_VAL(op);
   2997                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   2998                     UnicodeSet s;
   2999                     findCaseInsensitiveStarters(c, &s);
   3000                     fRXPat->fInitialChars->addAll(s);
   3001                     numInitialStrings += 2;  // Matching on an initial string not possible.
   3002                 }
   3003                 currentLen += stringLen;
   3004                 atStart = FALSE;
   3005             }
   3006             break;
   3007 
   3008         case URX_CTR_INIT:
   3009         case URX_CTR_INIT_NG:
   3010             {
   3011                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
   3012                 //   so location must be updated accordingly.
   3013                 // Loop Init Ops.
   3014                 //   If the min loop count == 0
   3015                 //      move loc forwards to the end of the loop, skipping over the body.
   3016                 //   If the min count is > 0,
   3017                 //      continue normal processing of the body of the loop.
   3018                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
   3019                         loopEndLoc   = URX_VAL(loopEndLoc);
   3020                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
   3021                 if (minLoopCount == 0) {
   3022                     // Min Loop Count of 0, treat like a forward branch and
   3023                     //   move the current minimum length up to the target
   3024                     //   (end of loop) location.
   3025                     U_ASSERT(loopEndLoc <= end+1);
   3026                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
   3027                         forwardedLength.setElementAt(currentLen, loopEndLoc);
   3028                     }
   3029                 }
   3030                 loc+=3;  // Skips over operands of CTR_INIT
   3031             }
   3032             atStart = FALSE;
   3033             break;
   3034 
   3035 
   3036         case URX_CTR_LOOP:
   3037         case URX_CTR_LOOP_NG:
   3038             // Loop ops.
   3039             //  The jump is conditional, backwards only.
   3040             atStart = FALSE;
   3041             break;
   3042 
   3043         case URX_LOOP_C:
   3044             // More loop ops.  These state-save to themselves.
   3045             //   don't change the minimum match
   3046             atStart = FALSE;
   3047             break;
   3048 
   3049 
   3050         case URX_LA_START:
   3051         case URX_LB_START:
   3052             {
   3053                 // Look-around.  Scan forward until the matching look-ahead end,
   3054                 //   without processing the look-around block.  This is overly pessimistic.
   3055 
   3056                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
   3057                 //   lookahead contains two LA_END instructions, so count goes up by two
   3058                 //   for each LA_START.
   3059                 int32_t  depth = (opType == URX_LA_START? 2: 1);
   3060                 for (;;) {
   3061                     loc++;
   3062                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3063                     if (URX_TYPE(op) == URX_LA_START) {
   3064                         depth+=2;
   3065                     }
   3066                     if (URX_TYPE(op) == URX_LB_START) {
   3067                         depth++;
   3068                     }
   3069                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
   3070                         depth--;
   3071                         if (depth == 0) {
   3072                             break;
   3073                         }
   3074                     }
   3075                     if (URX_TYPE(op) == URX_STATE_SAVE) {
   3076                         // Need this because neg lookahead blocks will FAIL to outside
   3077                         //   of the block.
   3078                         int32_t  jmpDest = URX_VAL(op);
   3079                         if (jmpDest > loc) {
   3080                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3081                                 forwardedLength.setElementAt(currentLen, jmpDest);
   3082                             }
   3083                         }
   3084                     }
   3085                     U_ASSERT(loc <= end);
   3086                 }
   3087             }
   3088             break;
   3089 
   3090         case URX_LA_END:
   3091         case URX_LB_CONT:
   3092         case URX_LB_END:
   3093         case URX_LBN_CONT:
   3094         case URX_LBN_END:
   3095             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
   3096                                  //  consumed by the scan in URX_LA_START and LB_START
   3097 
   3098             break;
   3099 
   3100         default:
   3101             U_ASSERT(FALSE);
   3102             }
   3103 
   3104         }
   3105 
   3106 
   3107     // We have finished walking through the ops.  Check whether some forward jump
   3108     //   propagated a shorter length to location end+1.
   3109     if (forwardedLength.elementAti(end+1) < currentLen) {
   3110         currentLen = forwardedLength.elementAti(end+1);
   3111     }
   3112 
   3113 
   3114     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
   3115 
   3116 
   3117     // Sort out what we should check for when looking for candidate match start positions.
   3118     // In order of preference,
   3119     //     1.   Start of input text buffer.
   3120     //     2.   A literal string.
   3121     //     3.   Start of line in multi-line mode.
   3122     //     4.   A single literal character.
   3123     //     5.   A character from a set of characters.
   3124     //
   3125     if (fRXPat->fStartType == START_START) {
   3126         // Match only at the start of an input text string.
   3127         //    start type is already set.  We're done.
   3128     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
   3129         // Match beginning only with a literal string.
   3130         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
   3131         U_ASSERT(fRXPat->fInitialChars->contains(c));
   3132         fRXPat->fStartType   = START_STRING;
   3133         fRXPat->fInitialChar = c;
   3134     } else if (fRXPat->fStartType == START_LINE) {
   3135         // Match at start of line in Multi-Line mode.
   3136         // Nothing to do here; everything is already set.
   3137     } else if (fRXPat->fMinMatchLen == 0) {
   3138         // Zero length match possible.  We could start anywhere.
   3139         fRXPat->fStartType = START_NO_INFO;
   3140     } else if (fRXPat->fInitialChars->size() == 1) {
   3141         // All matches begin with the same char.
   3142         fRXPat->fStartType   = START_CHAR;
   3143         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
   3144         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
   3145     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
   3146         fRXPat->fMinMatchLen > 0) {
   3147         // Matches start with a set of character smaller than the set of all chars.
   3148         fRXPat->fStartType = START_SET;
   3149     } else {
   3150         // Matches can start with anything
   3151         fRXPat->fStartType = START_NO_INFO;
   3152     }
   3153 
   3154     return;
   3155 }
   3156 
   3157 
   3158 
   3159 //------------------------------------------------------------------------------
   3160 //
   3161 //   minMatchLength    Calculate the length of the shortest string that could
   3162 //                     match the specified pattern.
   3163 //                     Length is in 16 bit code units, not code points.
   3164 //
   3165 //                     The calculated length may not be exact.  The returned
   3166 //                     value may be shorter than the actual minimum; it must
   3167 //                     never be longer.
   3168 //
   3169 //                     start and end are the range of p-code operations to be
   3170 //                     examined.  The endpoints are included in the range.
   3171 //
   3172 //------------------------------------------------------------------------------
   3173 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
   3174     if (U_FAILURE(*fStatus)) {
   3175         return 0;
   3176     }
   3177 
   3178     U_ASSERT(start <= end);
   3179     U_ASSERT(end < fRXPat->fCompiledPat->size());
   3180 
   3181 
   3182     int32_t    loc;
   3183     int32_t    op;
   3184     int32_t    opType;
   3185     int32_t    currentLen = 0;
   3186 
   3187 
   3188     // forwardedLength is a vector holding minimum-match-length values that
   3189     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   3190     //   It must be one longer than the pattern being checked because some  ops
   3191     //   will jmp to a end-of-block+1 location from within a block, and we must
   3192     //   count those when checking the block.
   3193     UVector32  forwardedLength(end+2, *fStatus);
   3194     forwardedLength.setSize(end+2);
   3195     for (loc=start; loc<=end+1; loc++) {
   3196         forwardedLength.setElementAt(INT32_MAX, loc);
   3197     }
   3198 
   3199     for (loc = start; loc<=end; loc++) {
   3200         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3201         opType = URX_TYPE(op);
   3202 
   3203         // The loop is advancing linearly through the pattern.
   3204         // If the op we are now at was the destination of a branch in the pattern,
   3205         // and that path has a shorter minimum length than the current accumulated value,
   3206         // replace the current accumulated value.
   3207         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
   3208                                                                //   no-match-possible cases.
   3209         if (forwardedLength.elementAti(loc) < currentLen) {
   3210             currentLen = forwardedLength.elementAti(loc);
   3211             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   3212         }
   3213 
   3214         switch (opType) {
   3215             // Ops that don't change the total length matched
   3216         case URX_RESERVED_OP:
   3217         case URX_END:
   3218         case URX_STRING_LEN:
   3219         case URX_NOP:
   3220         case URX_START_CAPTURE:
   3221         case URX_END_CAPTURE:
   3222         case URX_BACKSLASH_B:
   3223         case URX_BACKSLASH_BU:
   3224         case URX_BACKSLASH_G:
   3225         case URX_BACKSLASH_Z:
   3226         case URX_CARET:
   3227         case URX_DOLLAR:
   3228         case URX_DOLLAR_M:
   3229         case URX_DOLLAR_D:
   3230         case URX_DOLLAR_MD:
   3231         case URX_RELOC_OPRND:
   3232         case URX_STO_INP_LOC:
   3233         case URX_CARET_M:
   3234         case URX_CARET_M_UNIX:
   3235         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   3236         case URX_BACKREF_I:
   3237 
   3238         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   3239         case URX_LD_SP:
   3240 
   3241         case URX_JMP_SAV:
   3242         case URX_JMP_SAV_X:
   3243             break;
   3244 
   3245 
   3246             // Ops that match a minimum of one character (one or two 16 bit code units.)
   3247             //
   3248         case URX_ONECHAR:
   3249         case URX_STATIC_SETREF:
   3250         case URX_STAT_SETREF_N:
   3251         case URX_SETREF:
   3252         case URX_BACKSLASH_D:
   3253         case URX_BACKSLASH_H:
   3254         case URX_BACKSLASH_R:
   3255         case URX_BACKSLASH_V:
   3256         case URX_ONECHAR_I:
   3257         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   3258         case URX_DOTANY_ALL:    // . matches one or two.
   3259         case URX_DOTANY:
   3260         case URX_DOTANY_UNIX:
   3261             currentLen++;
   3262             break;
   3263 
   3264 
   3265         case URX_JMPX:
   3266             loc++;              // URX_JMPX has an extra operand, ignored here,
   3267                                 //   otherwise processed identically to URX_JMP.
   3268             U_FALLTHROUGH;
   3269         case URX_JMP:
   3270             {
   3271                 int32_t  jmpDest = URX_VAL(op);
   3272                 if (jmpDest < loc) {
   3273                     // Loop of some kind.  Can safely ignore, the worst that will happen
   3274                     //  is that we understate the true minimum length
   3275                     currentLen = forwardedLength.elementAti(loc+1);
   3276                 } else {
   3277                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   3278                     U_ASSERT(jmpDest <= end+1);
   3279                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
   3280                         forwardedLength.setElementAt(currentLen, jmpDest);
   3281                     }
   3282                 }
   3283             }
   3284             break;
   3285 
   3286         case URX_BACKTRACK:
   3287             {
   3288                 // Back-tracks are kind of like a branch, except that the min length was
   3289                 //   propagated already, by the state save.
   3290                 currentLen = forwardedLength.elementAti(loc+1);
   3291             }
   3292             break;
   3293 
   3294 
   3295         case URX_STATE_SAVE:
   3296             {
   3297                 // State Save, for forward jumps, propagate the current minimum.
   3298                 //             of the state save.
   3299                 int32_t  jmpDest = URX_VAL(op);
   3300                 if (jmpDest > loc) {
   3301                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3302                         forwardedLength.setElementAt(currentLen, jmpDest);
   3303                     }
   3304                 }
   3305             }
   3306             break;
   3307 
   3308 
   3309         case URX_STRING:
   3310             {
   3311                 loc++;
   3312                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3313                 currentLen += URX_VAL(stringLenOp);
   3314             }
   3315             break;
   3316 
   3317 
   3318         case URX_STRING_I:
   3319             {
   3320                 loc++;
   3321                 // TODO: with full case folding, matching input text may be shorter than
   3322                 //       the string we have here.  More smarts could put some bounds on it.
   3323                 //       Assume a min length of one for now.  A min length of zero causes
   3324                 //        optimization failures for a pattern like "string"+
   3325                 // currentLen += URX_VAL(stringLenOp);
   3326                 currentLen += 1;
   3327             }
   3328             break;
   3329 
   3330         case URX_CTR_INIT:
   3331         case URX_CTR_INIT_NG:
   3332             {
   3333                 // Loop Init Ops.
   3334                 //   If the min loop count == 0
   3335                 //      move loc forwards to the end of the loop, skipping over the body.
   3336                 //   If the min count is > 0,
   3337                 //      continue normal processing of the body of the loop.
   3338                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
   3339                         loopEndLoc   = URX_VAL(loopEndLoc);
   3340                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
   3341                 if (minLoopCount == 0) {
   3342                     loc = loopEndLoc;
   3343                 } else {
   3344                     loc+=3;  // Skips over operands of CTR_INIT
   3345                 }
   3346             }
   3347             break;
   3348 
   3349 
   3350         case URX_CTR_LOOP:
   3351         case URX_CTR_LOOP_NG:
   3352             // Loop ops.
   3353             //  The jump is conditional, backwards only.
   3354             break;
   3355 
   3356         case URX_LOOP_SR_I:
   3357         case URX_LOOP_DOT_I:
   3358         case URX_LOOP_C:
   3359             // More loop ops.  These state-save to themselves.
   3360             //   don't change the minimum match - could match nothing at all.
   3361             break;
   3362 
   3363 
   3364         case URX_LA_START:
   3365         case URX_LB_START:
   3366             {
   3367                 // Look-around.  Scan forward until the matching look-ahead end,
   3368                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
   3369                 //   it assumes that the look-ahead match might be zero-length.
   3370                 //   TODO:  Positive lookahead could recursively do the block, then continue
   3371                 //          with the longer of the block or the value coming in.  Ticket 6060
   3372                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
   3373                 for (;;) {
   3374                     loc++;
   3375                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3376                     if (URX_TYPE(op) == URX_LA_START) {
   3377                         // The boilerplate for look-ahead includes two LA_END insturctions,
   3378                         //    Depth will be decremented by each one when it is seen.
   3379                         depth += 2;
   3380                     }
   3381                     if (URX_TYPE(op) == URX_LB_START) {
   3382                         depth++;
   3383                     }
   3384                     if (URX_TYPE(op) == URX_LA_END) {
   3385                         depth--;
   3386                         if (depth == 0) {
   3387                             break;
   3388                         }
   3389                     }
   3390                     if (URX_TYPE(op)==URX_LBN_END) {
   3391                         depth--;
   3392                         if (depth == 0) {
   3393                             break;
   3394                         }
   3395                     }
   3396                     if (URX_TYPE(op) == URX_STATE_SAVE) {
   3397                         // Need this because neg lookahead blocks will FAIL to outside
   3398                         //   of the block.
   3399                         int32_t  jmpDest = URX_VAL(op);
   3400                         if (jmpDest > loc) {
   3401                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3402                                 forwardedLength.setElementAt(currentLen, jmpDest);
   3403                             }
   3404                         }
   3405                     }
   3406                     U_ASSERT(loc <= end);
   3407                 }
   3408             }
   3409             break;
   3410 
   3411         case URX_LA_END:
   3412         case URX_LB_CONT:
   3413         case URX_LB_END:
   3414         case URX_LBN_CONT:
   3415         case URX_LBN_END:
   3416             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
   3417             //   range being sized, which happens when measuring size of look-behind blocks.
   3418             break;
   3419 
   3420         default:
   3421             U_ASSERT(FALSE);
   3422             }
   3423 
   3424         }
   3425 
   3426     // We have finished walking through the ops.  Check whether some forward jump
   3427     //   propagated a shorter length to location end+1.
   3428     if (forwardedLength.elementAti(end+1) < currentLen) {
   3429         currentLen = forwardedLength.elementAti(end+1);
   3430         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   3431     }
   3432 
   3433     return currentLen;
   3434 }
   3435 
   3436 // Increment with overflow check.
   3437 // val and delta will both be positive.
   3438 
   3439 static int32_t safeIncrement(int32_t val, int32_t delta) {
   3440     if (INT32_MAX - val > delta) {
   3441         return val + delta;
   3442     } else {
   3443         return INT32_MAX;
   3444     }
   3445 }
   3446 
   3447 
   3448 //------------------------------------------------------------------------------
   3449 //
   3450 //   maxMatchLength    Calculate the length of the longest string that could
   3451 //                     match the specified pattern.
   3452 //                     Length is in 16 bit code units, not code points.
   3453 //
   3454 //                     The calculated length may not be exact.  The returned
   3455 //                     value may be longer than the actual maximum; it must
   3456 //                     never be shorter.
   3457 //
   3458 //------------------------------------------------------------------------------
   3459 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
   3460     if (U_FAILURE(*fStatus)) {
   3461         return 0;
   3462     }
   3463     U_ASSERT(start <= end);
   3464     U_ASSERT(end < fRXPat->fCompiledPat->size());
   3465 
   3466 
   3467     int32_t    loc;
   3468     int32_t    op;
   3469     int32_t    opType;
   3470     int32_t    currentLen = 0;
   3471     UVector32  forwardedLength(end+1, *fStatus);
   3472     forwardedLength.setSize(end+1);
   3473 
   3474     for (loc=start; loc<=end; loc++) {
   3475         forwardedLength.setElementAt(0, loc);
   3476     }
   3477 
   3478     for (loc = start; loc<=end; loc++) {
   3479         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3480         opType = URX_TYPE(op);
   3481 
   3482         // The loop is advancing linearly through the pattern.
   3483         // If the op we are now at was the destination of a branch in the pattern,
   3484         // and that path has a longer maximum length than the current accumulated value,
   3485         // replace the current accumulated value.
   3486         if (forwardedLength.elementAti(loc) > currentLen) {
   3487             currentLen = forwardedLength.elementAti(loc);
   3488         }
   3489 
   3490         switch (opType) {
   3491             // Ops that don't change the total length matched
   3492         case URX_RESERVED_OP:
   3493         case URX_END:
   3494         case URX_STRING_LEN:
   3495         case URX_NOP:
   3496         case URX_START_CAPTURE:
   3497         case URX_END_CAPTURE:
   3498         case URX_BACKSLASH_B:
   3499         case URX_BACKSLASH_BU:
   3500         case URX_BACKSLASH_G:
   3501         case URX_BACKSLASH_Z:
   3502         case URX_CARET:
   3503         case URX_DOLLAR:
   3504         case URX_DOLLAR_M:
   3505         case URX_DOLLAR_D:
   3506         case URX_DOLLAR_MD:
   3507         case URX_RELOC_OPRND:
   3508         case URX_STO_INP_LOC:
   3509         case URX_CARET_M:
   3510         case URX_CARET_M_UNIX:
   3511 
   3512         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   3513         case URX_LD_SP:
   3514 
   3515         case URX_LB_END:
   3516         case URX_LB_CONT:
   3517         case URX_LBN_CONT:
   3518         case URX_LBN_END:
   3519             break;
   3520 
   3521 
   3522             // Ops that increase that cause an unbounded increase in the length
   3523             //   of a matched string, or that increase it a hard to characterize way.
   3524             //   Call the max length unbounded, and stop further checking.
   3525         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   3526         case URX_BACKREF_I:
   3527         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   3528             currentLen = INT32_MAX;
   3529             break;
   3530 
   3531 
   3532             // Ops that match a max of one character (possibly two 16 bit code units.)
   3533             //
   3534         case URX_STATIC_SETREF:
   3535         case URX_STAT_SETREF_N:
   3536         case URX_SETREF:
   3537         case URX_BACKSLASH_D:
   3538         case URX_BACKSLASH_H:
   3539         case URX_BACKSLASH_R:
   3540         case URX_BACKSLASH_V:
   3541         case URX_ONECHAR_I:
   3542         case URX_DOTANY_ALL:
   3543         case URX_DOTANY:
   3544         case URX_DOTANY_UNIX:
   3545             currentLen = safeIncrement(currentLen, 2);
   3546             break;
   3547 
   3548             // Single literal character.  Increase current max length by one or two,
   3549             //       depending on whether the char is in the supplementary range.
   3550         case URX_ONECHAR:
   3551             currentLen = safeIncrement(currentLen, 1);
   3552             if (URX_VAL(op) > 0x10000) {
   3553                 currentLen = safeIncrement(currentLen, 1);
   3554             }
   3555             break;
   3556 
   3557             // Jumps.
   3558             //
   3559         case URX_JMP:
   3560         case URX_JMPX:
   3561         case URX_JMP_SAV:
   3562         case URX_JMP_SAV_X:
   3563             {
   3564                 int32_t  jmpDest = URX_VAL(op);
   3565                 if (jmpDest < loc) {
   3566                     // Loop of some kind.  Max match length is unbounded.
   3567                     currentLen = INT32_MAX;
   3568                 } else {
   3569                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   3570                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
   3571                         forwardedLength.setElementAt(currentLen, jmpDest);
   3572                     }
   3573                     currentLen = 0;
   3574                 }
   3575             }
   3576             break;
   3577 
   3578         case URX_BACKTRACK:
   3579             // back-tracks are kind of like a branch, except that the max length was
   3580             //   propagated already, by the state save.
   3581             currentLen = forwardedLength.elementAti(loc+1);
   3582             break;
   3583 
   3584 
   3585         case URX_STATE_SAVE:
   3586             {
   3587                 // State Save, for forward jumps, propagate the current minimum.
   3588                 //               of the state save.
   3589                 //             For backwards jumps, they create a loop, maximum
   3590                 //               match length is unbounded.
   3591                 int32_t  jmpDest = URX_VAL(op);
   3592                 if (jmpDest > loc) {
   3593                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
   3594                         forwardedLength.setElementAt(currentLen, jmpDest);
   3595                     }
   3596                 } else {
   3597                     currentLen = INT32_MAX;
   3598                 }
   3599             }
   3600             break;
   3601 
   3602 
   3603 
   3604 
   3605         case URX_STRING:
   3606             {
   3607                 loc++;
   3608                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3609                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3610                 break;
   3611             }
   3612 
   3613         case URX_STRING_I:
   3614             // TODO:  This code assumes that any user string that matches will be no longer
   3615             //        than our compiled string, with case insensitive matching.
   3616             //        Our compiled string has been case-folded already.
   3617             //
   3618             //        Any matching user string will have no more code points than our
   3619             //        compiled (folded) string.  Folding may add code points, but
   3620             //        not remove them.
   3621             //
   3622             //        There is a potential problem if a supplemental code point
   3623             //        case-folds to a BMP code point.  In this case our compiled string
   3624             //        could be shorter (in code units) than a matching user string.
   3625             //
   3626             //        At this time (Unicode 6.1) there are no such characters, and this case
   3627             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
   3628             //        any problematic characters are added to Unicode.
   3629             //
   3630             //        If this happens, we can make a set of the BMP chars that the
   3631             //        troublesome supplementals fold to, scan our string, and bump the
   3632             //        currentLen one extra for each that is found.
   3633             //
   3634             {
   3635                 loc++;
   3636                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3637                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3638             }
   3639             break;
   3640 
   3641         case URX_CTR_INIT:
   3642         case URX_CTR_INIT_NG:
   3643             // For Loops, recursively call this function on the pattern for the loop body,
   3644             //   then multiply the result by the maximum loop count.
   3645             {
   3646                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
   3647                 if (loopEndLoc == loc+4) {
   3648                     // Loop has an empty body. No affect on max match length.
   3649                     // Continue processing with code after the loop end.
   3650                     loc = loopEndLoc;
   3651                     break;
   3652                 }
   3653 
   3654                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
   3655                 if (maxLoopCount == -1) {
   3656                     // Unbounded Loop. No upper bound on match length.
   3657                     currentLen = INT32_MAX;
   3658                     break;
   3659                 }
   3660 
   3661                 U_ASSERT(loopEndLoc >= loc+4);
   3662                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
   3663                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
   3664                 if (updatedLen >= INT32_MAX) {
   3665                     currentLen = INT32_MAX;
   3666                     break;
   3667                 }
   3668                 currentLen = (int32_t)updatedLen;
   3669                 loc = loopEndLoc;
   3670                 break;
   3671             }
   3672 
   3673         case URX_CTR_LOOP:
   3674         case URX_CTR_LOOP_NG:
   3675             // These opcodes will be skipped over by code for URX_CRT_INIT.
   3676             // We shouldn't encounter them here.
   3677             U_ASSERT(FALSE);
   3678             break;
   3679 
   3680         case URX_LOOP_SR_I:
   3681         case URX_LOOP_DOT_I:
   3682         case URX_LOOP_C:
   3683             // For anything to do with loops, make the match length unbounded.
   3684             currentLen = INT32_MAX;
   3685             break;
   3686 
   3687 
   3688 
   3689         case URX_LA_START:
   3690         case URX_LA_END:
   3691             // Look-ahead.  Just ignore, treat the look-ahead block as if
   3692             // it were normal pattern.  Gives a too-long match length,
   3693             //  but good enough for now.
   3694             break;
   3695 
   3696             // End of look-ahead ops should always be consumed by the processing at
   3697             //  the URX_LA_START op.
   3698             // U_ASSERT(FALSE);
   3699             // break;
   3700 
   3701         case URX_LB_START:
   3702             {
   3703                 // Look-behind.  Scan forward until the matching look-around end,
   3704                 //   without processing the look-behind block.
   3705                 int32_t  depth = 0;
   3706                 for (;;) {
   3707                     loc++;
   3708                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3709                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
   3710                         depth++;
   3711                     }
   3712                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
   3713                         if (depth == 0) {
   3714                             break;
   3715                         }
   3716                         depth--;
   3717                     }
   3718                     U_ASSERT(loc < end);
   3719                 }
   3720             }
   3721             break;
   3722 
   3723         default:
   3724             U_ASSERT(FALSE);
   3725         }
   3726 
   3727 
   3728         if (currentLen == INT32_MAX) {
   3729             //  The maximum length is unbounded.
   3730             //  Stop further processing of the pattern.
   3731             break;
   3732         }
   3733 
   3734     }
   3735     return currentLen;
   3736 
   3737 }
   3738 
   3739 
   3740 //------------------------------------------------------------------------------
   3741 //
   3742 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
   3743 //                Extra NOPs are inserted for some constructs during the initial
   3744 //                code generation to provide locations that may be patched later.
   3745 //                Many end up unneeded, and are removed by this function.
   3746 //
   3747 //                In order to minimize the number of passes through the pattern,
   3748 //                back-reference fixup is also performed here (adjusting
   3749 //                back-reference operands to point to the correct frame offsets).
   3750 //
   3751 //------------------------------------------------------------------------------
   3752 void RegexCompile::stripNOPs() {
   3753 
   3754     if (U_FAILURE(*fStatus)) {
   3755         return;
   3756     }
   3757 
   3758     int32_t    end = fRXPat->fCompiledPat->size();
   3759     UVector32  deltas(end, *fStatus);
   3760 
   3761     // Make a first pass over the code, computing the amount that things
   3762     //   will be offset at each location in the original code.
   3763     int32_t   loc;
   3764     int32_t   d = 0;
   3765     for (loc=0; loc<end; loc++) {
   3766         deltas.addElement(d, *fStatus);
   3767         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3768         if (URX_TYPE(op) == URX_NOP) {
   3769             d++;
   3770         }
   3771     }
   3772 
   3773     UnicodeString caseStringBuffer;
   3774 
   3775     // Make a second pass over the code, removing the NOPs by moving following
   3776     //  code up, and patching operands that refer to code locations that
   3777     //  are being moved.  The array of offsets from the first step is used
   3778     //  to compute the new operand values.
   3779     int32_t src;
   3780     int32_t dst = 0;
   3781     for (src=0; src<end; src++) {
   3782         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
   3783         int32_t opType = URX_TYPE(op);
   3784         switch (opType) {
   3785         case URX_NOP:
   3786             break;
   3787 
   3788         case URX_STATE_SAVE:
   3789         case URX_JMP:
   3790         case URX_CTR_LOOP:
   3791         case URX_CTR_LOOP_NG:
   3792         case URX_RELOC_OPRND:
   3793         case URX_JMPX:
   3794         case URX_JMP_SAV:
   3795         case URX_JMP_SAV_X:
   3796             // These are instructions with operands that refer to code locations.
   3797             {
   3798                 int32_t  operandAddress = URX_VAL(op);
   3799                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
   3800                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
   3801                 op = buildOp(opType, fixedOperandAddress);
   3802                 fRXPat->fCompiledPat->setElementAt(op, dst);
   3803                 dst++;
   3804                 break;
   3805             }
   3806 
   3807         case URX_BACKREF:
   3808         case URX_BACKREF_I:
   3809             {
   3810                 int32_t where = URX_VAL(op);
   3811                 if (where > fRXPat->fGroupMap->size()) {
   3812                     error(U_REGEX_INVALID_BACK_REF);
   3813                     break;
   3814                 }
   3815                 where = fRXPat->fGroupMap->elementAti(where-1);
   3816                 op    = buildOp(opType, where);
   3817                 fRXPat->fCompiledPat->setElementAt(op, dst);
   3818                 dst++;
   3819 
   3820                 fRXPat->fNeedsAltInput = TRUE;
   3821                 break;
   3822             }
   3823         case URX_RESERVED_OP:
   3824         case URX_RESERVED_OP_N:
   3825         case URX_BACKTRACK:
   3826         case URX_END:
   3827         case URX_ONECHAR:
   3828         case URX_STRING:
   3829         case URX_STRING_LEN:
   3830         case URX_START_CAPTURE:
   3831         case URX_END_CAPTURE:
   3832         case URX_STATIC_SETREF:
   3833         case URX_STAT_SETREF_N:
   3834         case URX_SETREF:
   3835         case URX_DOTANY:
   3836         case URX_FAIL:
   3837         case URX_BACKSLASH_B:
   3838         case URX_BACKSLASH_BU:
   3839         case URX_BACKSLASH_G:
   3840         case URX_BACKSLASH_X:
   3841         case URX_BACKSLASH_Z:
   3842         case URX_DOTANY_ALL:
   3843         case URX_BACKSLASH_D:
   3844         case URX_CARET:
   3845         case URX_DOLLAR:
   3846         case URX_CTR_INIT:
   3847         case URX_CTR_INIT_NG:
   3848         case URX_DOTANY_UNIX:
   3849         case URX_STO_SP:
   3850         case URX_LD_SP:
   3851         case URX_STO_INP_LOC:
   3852         case URX_LA_START:
   3853         case URX_LA_END:
   3854         case URX_ONECHAR_I:
   3855         case URX_STRING_I:
   3856         case URX_DOLLAR_M:
   3857         case URX_CARET_M:
   3858         case URX_CARET_M_UNIX:
   3859         case URX_LB_START:
   3860         case URX_LB_CONT:
   3861         case URX_LB_END:
   3862         case URX_LBN_CONT:
   3863         case URX_LBN_END:
   3864         case URX_LOOP_SR_I:
   3865         case URX_LOOP_DOT_I:
   3866         case URX_LOOP_C:
   3867         case URX_DOLLAR_D:
   3868         case URX_DOLLAR_MD:
   3869         case URX_BACKSLASH_H:
   3870         case URX_BACKSLASH_R:
   3871         case URX_BACKSLASH_V:
   3872             // These instructions are unaltered by the relocation.
   3873             fRXPat->fCompiledPat->setElementAt(op, dst);
   3874             dst++;
   3875             break;
   3876 
   3877         default:
   3878             // Some op is unaccounted for.
   3879             U_ASSERT(FALSE);
   3880             error(U_REGEX_INTERNAL_ERROR);
   3881         }
   3882     }
   3883 
   3884     fRXPat->fCompiledPat->setSize(dst);
   3885 }
   3886 
   3887 
   3888 
   3889 
   3890 //------------------------------------------------------------------------------
   3891 //
   3892 //  Error         Report a rule parse error.
   3893 //                Only report it if no previous error has been recorded.
   3894 //
   3895 //------------------------------------------------------------------------------
   3896 void RegexCompile::error(UErrorCode e) {
   3897     if (U_SUCCESS(*fStatus)) {
   3898         *fStatus = e;
   3899         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
   3900         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
   3901         // int64_t. If the values of the latter are out of range for the former,
   3902         // set them to the appropriate "field not supported" values.
   3903         if (fLineNum > 0x7FFFFFFF) {
   3904             fParseErr->line   = 0;
   3905             fParseErr->offset = -1;
   3906         } else if (fCharNum > 0x7FFFFFFF) {
   3907             fParseErr->line   = (int32_t)fLineNum;
   3908             fParseErr->offset = -1;
   3909         } else {
   3910             fParseErr->line   = (int32_t)fLineNum;
   3911             fParseErr->offset = (int32_t)fCharNum;
   3912         }
   3913 
   3914         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
   3915 
   3916         // Fill in the context.
   3917         //   Note: extractBetween() pins supplied indicies to the string bounds.
   3918         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
   3919         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
   3920         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
   3921         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
   3922     }
   3923 }
   3924 
   3925 
   3926 //
   3927 //  Assorted Unicode character constants.
   3928 //     Numeric because there is no portable way to enter them as literals.
   3929 //     (Think EBCDIC).
   3930 //
   3931 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
   3932 static const UChar      chLF        = 0x0a;      // Line Feed
   3933 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
   3934 static const UChar      chDigit0    = 0x30;      // '0'
   3935 static const UChar      chDigit7    = 0x37;      // '9'
   3936 static const UChar      chColon     = 0x3A;      // ':'
   3937 static const UChar      chE         = 0x45;      // 'E'
   3938 static const UChar      chQ         = 0x51;      // 'Q'
   3939 //static const UChar      chN         = 0x4E;      // 'N'
   3940 static const UChar      chP         = 0x50;      // 'P'
   3941 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
   3942 //static const UChar      chLBracket  = 0x5b;      // '['
   3943 static const UChar      chRBracket  = 0x5d;      // ']'
   3944 static const UChar      chUp        = 0x5e;      // '^'
   3945 static const UChar      chLowerP    = 0x70;
   3946 static const UChar      chLBrace    = 0x7b;      // '{'
   3947 static const UChar      chRBrace    = 0x7d;      // '}'
   3948 static const UChar      chNEL       = 0x85;      //    NEL newline variant
   3949 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
   3950 
   3951 
   3952 //------------------------------------------------------------------------------
   3953 //
   3954 //  nextCharLL    Low Level Next Char from the regex pattern.
   3955 //                Get a char from the string, keep track of input position
   3956 //                     for error reporting.
   3957 //
   3958 //------------------------------------------------------------------------------
   3959 UChar32  RegexCompile::nextCharLL() {
   3960     UChar32       ch;
   3961 
   3962     if (fPeekChar != -1) {
   3963         ch = fPeekChar;
   3964         fPeekChar = -1;
   3965         return ch;
   3966     }
   3967 
   3968     // assume we're already in the right place
   3969     ch = UTEXT_NEXT32(fRXPat->fPattern);
   3970     if (ch == U_SENTINEL) {
   3971         return ch;
   3972     }
   3973 
   3974     if (ch == chCR ||
   3975         ch == chNEL ||
   3976         ch == chLS   ||
   3977         (ch == chLF && fLastChar != chCR)) {
   3978         // Character is starting a new line.  Bump up the line number, and
   3979         //  reset the column to 0.
   3980         fLineNum++;
   3981         fCharNum=0;
   3982     }
   3983     else {
   3984         // Character is not starting a new line.  Except in the case of a
   3985         //   LF following a CR, increment the column position.
   3986         if (ch != chLF) {
   3987             fCharNum++;
   3988         }
   3989     }
   3990     fLastChar = ch;
   3991     return ch;
   3992 }
   3993 
   3994 //------------------------------------------------------------------------------
   3995 //
   3996 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
   3997 //                 character without actually getting it.
   3998 //
   3999 //------------------------------------------------------------------------------
   4000 UChar32  RegexCompile::peekCharLL() {
   4001     if (fPeekChar == -1) {
   4002         fPeekChar = nextCharLL();
   4003     }
   4004     return fPeekChar;
   4005 }
   4006 
   4007 
   4008 //------------------------------------------------------------------------------
   4009 //
   4010 //   nextChar     for pattern scanning.  At this level, we handle stripping
   4011 //                out comments and processing some backslash character escapes.
   4012 //                The rest of the pattern grammar is handled at the next level up.
   4013 //
   4014 //------------------------------------------------------------------------------
   4015 void RegexCompile::nextChar(RegexPatternChar &c) {
   4016 
   4017     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4018     c.fChar    = nextCharLL();
   4019     c.fQuoted  = FALSE;
   4020 
   4021     if (fQuoteMode) {
   4022         c.fQuoted = TRUE;
   4023         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
   4024             c.fChar == (UChar32)-1) {
   4025             fQuoteMode = FALSE;  //  Exit quote mode,
   4026             nextCharLL();        // discard the E
   4027             nextChar(c);         // recurse to get the real next char
   4028         }
   4029     }
   4030     else if (fInBackslashQuote) {
   4031         // The current character immediately follows a '\'
   4032         // Don't check for any further escapes, just return it as-is.
   4033         // Don't set c.fQuoted, because that would prevent the state machine from
   4034         //    dispatching on the character.
   4035         fInBackslashQuote = FALSE;
   4036     }
   4037     else
   4038     {
   4039         // We are not in a \Q quoted region \E of the source.
   4040         //
   4041         if (fModeFlags & UREGEX_COMMENTS) {
   4042             //
   4043             // We are in free-spacing and comments mode.
   4044             //  Scan through any white space and comments, until we
   4045             //  reach a significant character or the end of inut.
   4046             for (;;) {
   4047                 if (c.fChar == (UChar32)-1) {
   4048                     break;     // End of Input
   4049                 }
   4050                 if  (c.fChar == chPound && fEOLComments == TRUE) {
   4051                     // Start of a comment.  Consume the rest of it, until EOF or a new line
   4052                     for (;;) {
   4053                         c.fChar = nextCharLL();
   4054                         if (c.fChar == (UChar32)-1 ||  // EOF
   4055                             c.fChar == chCR        ||
   4056                             c.fChar == chLF        ||
   4057                             c.fChar == chNEL       ||
   4058                             c.fChar == chLS)       {
   4059                             break;
   4060                         }
   4061                     }
   4062                 }
   4063                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
   4064                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
   4065                     break;
   4066                 }
   4067                 c.fChar = nextCharLL();
   4068             }
   4069         }
   4070 
   4071         //
   4072         //  check for backslash escaped characters.
   4073         //
   4074         if (c.fChar == chBackSlash) {
   4075             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4076             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
   4077                 //
   4078                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
   4079                 //   Includes \uxxxx, \n, \r, many others.
   4080                 //   Return the single equivalent character.
   4081                 //
   4082                 nextCharLL();                 // get & discard the peeked char.
   4083                 c.fQuoted = TRUE;
   4084 
   4085                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
   4086                     int32_t endIndex = (int32_t)pos;
   4087                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
   4088 
   4089                     if (endIndex == pos) {
   4090                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4091                     }
   4092                     fCharNum += endIndex - pos;
   4093                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
   4094                 } else {
   4095                     int32_t offset = 0;
   4096                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
   4097 
   4098                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
   4099                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
   4100 
   4101                     if (offset == 0) {
   4102                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4103                     } else if (context.lastOffset == offset) {
   4104                         UTEXT_PREVIOUS32(fRXPat->fPattern);
   4105                     } else if (context.lastOffset != offset-1) {
   4106                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
   4107                     }
   4108                     fCharNum += offset;
   4109                 }
   4110             }
   4111             else if (peekCharLL() == chDigit0) {
   4112                 //  Octal Escape, using Java Regexp Conventions
   4113                 //    which are \0 followed by 1-3 octal digits.
   4114                 //    Different from ICU Unescape handling of Octal, which does not
   4115                 //    require the leading 0.
   4116                 //  Java also has the convention of only consuming 2 octal digits if
   4117                 //    the three digit number would be > 0xff
   4118                 //
   4119                 c.fChar = 0;
   4120                 nextCharLL();    // Consume the initial 0.
   4121                 int index;
   4122                 for (index=0; index<3; index++) {
   4123                     int32_t ch = peekCharLL();
   4124                     if (ch<chDigit0 || ch>chDigit7) {
   4125                         if (index==0) {
   4126                            // \0 is not followed by any octal digits.
   4127                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4128                         }
   4129                         break;
   4130                     }
   4131                     c.fChar <<= 3;
   4132                     c.fChar += ch&7;
   4133                     if (c.fChar <= 255) {
   4134                         nextCharLL();
   4135                     } else {
   4136                         // The last digit made the number too big.  Forget we saw it.
   4137                         c.fChar >>= 3;
   4138                     }
   4139                 }
   4140                 c.fQuoted = TRUE;
   4141             }
   4142             else if (peekCharLL() == chQ) {
   4143                 //  "\Q"  enter quote mode, which will continue until "\E"
   4144                 fQuoteMode = TRUE;
   4145                 nextCharLL();       // discard the 'Q'.
   4146                 nextChar(c);        // recurse to get the real next char.
   4147             }
   4148             else
   4149             {
   4150                 // We are in a '\' escape that will be handled by the state table scanner.
   4151                 // Just return the backslash, but remember that the following char is to
   4152                 //  be taken literally.
   4153                 fInBackslashQuote = TRUE;
   4154             }
   4155         }
   4156     }
   4157 
   4158     // re-enable # to end-of-line comments, in case they were disabled.
   4159     // They are disabled by the parser upon seeing '(?', but this lasts for
   4160     //  the fetching of the next character only.
   4161     fEOLComments = TRUE;
   4162 
   4163     // putc(c.fChar, stdout);
   4164 }
   4165 
   4166 
   4167 
   4168 //------------------------------------------------------------------------------
   4169 //
   4170 //  scanNamedChar
   4171 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
   4172 //
   4173 //             The scan position will be at the 'N'.  On return
   4174 //             the scan position should be just after the '}'
   4175 //
   4176 //             Return the UChar32
   4177 //
   4178 //------------------------------------------------------------------------------
   4179 UChar32  RegexCompile::scanNamedChar() {
   4180     if (U_FAILURE(*fStatus)) {
   4181         return 0;
   4182     }
   4183 
   4184     nextChar(fC);
   4185     if (fC.fChar != chLBrace) {
   4186         error(U_REGEX_PROPERTY_SYNTAX);
   4187         return 0;
   4188     }
   4189 
   4190     UnicodeString  charName;
   4191     for (;;) {
   4192         nextChar(fC);
   4193         if (fC.fChar == chRBrace) {
   4194             break;
   4195         }
   4196         if (fC.fChar == -1) {
   4197             error(U_REGEX_PROPERTY_SYNTAX);
   4198             return 0;
   4199         }
   4200         charName.append(fC.fChar);
   4201     }
   4202 
   4203     char name[100];
   4204     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
   4205          (uint32_t)charName.length()>=sizeof(name)) {
   4206         // All Unicode character names have only invariant characters.
   4207         // The API to get a character, given a name, accepts only char *, forcing us to convert,
   4208         //   which requires this error check
   4209         error(U_REGEX_PROPERTY_SYNTAX);
   4210         return 0;
   4211     }
   4212     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
   4213 
   4214     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
   4215     if (U_FAILURE(*fStatus)) {
   4216         error(U_REGEX_PROPERTY_SYNTAX);
   4217     }
   4218 
   4219     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
   4220     return theChar;
   4221 }
   4222 
   4223 //------------------------------------------------------------------------------
   4224 //
   4225 //  scanProp   Construct a UnicodeSet from the text at the current scan
   4226 //             position, which will be of the form \p{whaterver}
   4227 //
   4228 //             The scan position will be at the 'p' or 'P'.  On return
   4229 //             the scan position should be just after the '}'
   4230 //
   4231 //             Return a UnicodeSet, constructed from the \P pattern,
   4232 //             or NULL if the pattern is invalid.
   4233 //
   4234 //------------------------------------------------------------------------------
   4235 UnicodeSet *RegexCompile::scanProp() {
   4236     UnicodeSet    *uset = NULL;
   4237 
   4238     if (U_FAILURE(*fStatus)) {
   4239         return NULL;
   4240     }
   4241     (void)chLowerP;   // Suppress compiler unused variable warning.
   4242     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
   4243     UBool negated = (fC.fChar == chP);
   4244 
   4245     UnicodeString propertyName;
   4246     nextChar(fC);
   4247     if (fC.fChar != chLBrace) {
   4248         error(U_REGEX_PROPERTY_SYNTAX);
   4249         return NULL;
   4250     }
   4251     for (;;) {
   4252         nextChar(fC);
   4253         if (fC.fChar == chRBrace) {
   4254             break;
   4255         }
   4256         if (fC.fChar == -1) {
   4257             // Hit the end of the input string without finding the closing '}'
   4258             error(U_REGEX_PROPERTY_SYNTAX);
   4259             return NULL;
   4260         }
   4261         propertyName.append(fC.fChar);
   4262     }
   4263     uset = createSetForProperty(propertyName, negated);
   4264     nextChar(fC);    // Move input scan to position following the closing '}'
   4265     return uset;
   4266 }
   4267 
   4268 //------------------------------------------------------------------------------
   4269 //
   4270 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
   4271 //             position, which is expected be of the form [:property expression:]
   4272 //
   4273 //             The scan position will be at the opening ':'.  On return
   4274 //             the scan position must be on the closing ']'
   4275 //
   4276 //             Return a UnicodeSet constructed from the pattern,
   4277 //             or NULL if this is not a valid POSIX-style set expression.
   4278 //             If not a property expression, restore the initial scan position
   4279 //                (to the opening ':')
   4280 //
   4281 //               Note:  the opening '[:' is not sufficient to guarantee that
   4282 //                      this is a [:property:] expression.
   4283 //                      [:'+=,] is a perfectly good ordinary set expression that
   4284 //                              happens to include ':' as one of its characters.
   4285 //
   4286 //------------------------------------------------------------------------------
   4287 UnicodeSet *RegexCompile::scanPosixProp() {
   4288     UnicodeSet    *uset = NULL;
   4289 
   4290     if (U_FAILURE(*fStatus)) {
   4291         return NULL;
   4292     }
   4293 
   4294     U_ASSERT(fC.fChar == chColon);
   4295 
   4296     // Save the scanner state.
   4297     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
   4298     int64_t     savedScanIndex        = fScanIndex;
   4299     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4300     UBool       savedQuoteMode        = fQuoteMode;
   4301     UBool       savedInBackslashQuote = fInBackslashQuote;
   4302     UBool       savedEOLComments      = fEOLComments;
   4303     int64_t     savedLineNum          = fLineNum;
   4304     int64_t     savedCharNum          = fCharNum;
   4305     UChar32     savedLastChar         = fLastChar;
   4306     UChar32     savedPeekChar         = fPeekChar;
   4307     RegexPatternChar savedfC          = fC;
   4308 
   4309     // Scan for a closing ].   A little tricky because there are some perverse
   4310     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
   4311     //   ending on the second closing ].
   4312 
   4313     UnicodeString propName;
   4314     UBool         negated  = FALSE;
   4315 
   4316     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
   4317     nextChar(fC);
   4318     if (fC.fChar == chUp) {
   4319        negated = TRUE;
   4320        nextChar(fC);
   4321     }
   4322 
   4323     // Scan for the closing ":]", collecting the property name along the way.
   4324     UBool  sawPropSetTerminator = FALSE;
   4325     for (;;) {
   4326         propName.append(fC.fChar);
   4327         nextChar(fC);
   4328         if (fC.fQuoted || fC.fChar == -1) {
   4329             // Escaped characters or end of input - either says this isn't a [:Property:]
   4330             break;
   4331         }
   4332         if (fC.fChar == chColon) {
   4333             nextChar(fC);
   4334             if (fC.fChar == chRBracket) {
   4335                 sawPropSetTerminator = TRUE;
   4336             }
   4337             break;
   4338         }
   4339     }
   4340 
   4341     if (sawPropSetTerminator) {
   4342         uset = createSetForProperty(propName, negated);
   4343     }
   4344     else
   4345     {
   4346         // No closing ":]".
   4347         //  Restore the original scan position.
   4348         //  The main scanner will retry the input as a normal set expression,
   4349         //    not a [:Property:] expression.
   4350         fScanIndex        = savedScanIndex;
   4351         fQuoteMode        = savedQuoteMode;
   4352         fInBackslashQuote = savedInBackslashQuote;
   4353         fEOLComments      = savedEOLComments;
   4354         fLineNum          = savedLineNum;
   4355         fCharNum          = savedCharNum;
   4356         fLastChar         = savedLastChar;
   4357         fPeekChar         = savedPeekChar;
   4358         fC                = savedfC;
   4359         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
   4360     }
   4361     return uset;
   4362 }
   4363 
   4364 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
   4365     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
   4366     addCategory(set, U_GC_CF_MASK, ec);
   4367 }
   4368 
   4369 //
   4370 //  Create a Unicode Set from a Unicode Property expression.
   4371 //     This is common code underlying both \p{...} ane [:...:] expressions.
   4372 //     Includes trying the Java "properties" that aren't supported as
   4373 //     normal ICU UnicodeSet properties
   4374 //
   4375 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
   4376 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
   4377 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
   4378     UnicodeString   setExpr;
   4379     UnicodeSet      *set;
   4380     uint32_t        usetFlags = 0;
   4381 
   4382     if (U_FAILURE(*fStatus)) {
   4383         return NULL;
   4384     }
   4385 
   4386     //
   4387     //  First try the property as we received it
   4388     //
   4389     if (negated) {
   4390         setExpr.append(negSetPrefix, -1);
   4391     } else {
   4392         setExpr.append(posSetPrefix, -1);
   4393     }
   4394     setExpr.append(propName);
   4395     setExpr.append(chRBrace);
   4396     setExpr.append(chRBracket);
   4397     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   4398         usetFlags |= USET_CASE_INSENSITIVE;
   4399     }
   4400     set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
   4401     if (U_SUCCESS(*fStatus)) {
   4402        return set;
   4403     }
   4404     delete set;
   4405     set = NULL;
   4406 
   4407     //
   4408     //  The property as it was didn't work.
   4409 
   4410     //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX
   4411     //     or standard Java, but many other regular expression packages do recognize it.
   4412 
   4413     if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
   4414         *fStatus = U_ZERO_ERROR;
   4415         set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
   4416         if (set == NULL) {
   4417             *fStatus = U_MEMORY_ALLOCATION_ERROR;
   4418             return set;
   4419         }
   4420         if (negated) {
   4421             set->complement();
   4422         }
   4423         return set;
   4424     }
   4425 
   4426 
   4427     //    Do Java fixes -
   4428     //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
   4429     //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
   4430     //
   4431     //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
   4432     //                        is accepted by Java.  The property part of the name is compared
   4433     //                        case-insenstively.  The spaces must be exactly as shown, either
   4434     //                        all there, or all omitted, with exactly one at each position
   4435     //                        if they are present.  From checking against JDK 1.6
   4436     //
   4437     //       This code should be removed when ICU properties support the Java  compatibility names
   4438     //          (ICU 4.0?)
   4439     //
   4440     UnicodeString mPropName = propName;
   4441     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
   4442         mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
   4443     }
   4444     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
   4445         mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
   4446         mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
   4447     }
   4448     else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
   4449         mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
   4450     }
   4451 
   4452     //    See if the property looks like a Java "InBlockName", which
   4453     //    we will recast as "Block=BlockName"
   4454     //
   4455     static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
   4456     static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
   4457     if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
   4458         setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
   4459         setExpr.append(BLOCK, -1);
   4460         setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
   4461         setExpr.append(chRBrace);
   4462         setExpr.append(chRBracket);
   4463         *fStatus = U_ZERO_ERROR;
   4464         set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
   4465         if (U_SUCCESS(*fStatus)) {
   4466             return set;
   4467         }
   4468         delete set;
   4469         set = NULL;
   4470     }
   4471 
   4472     if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
   4473         propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
   4474     {
   4475         UErrorCode localStatus = U_ZERO_ERROR;
   4476         //setExpr.remove();
   4477         set = new UnicodeSet();
   4478         //
   4479         //  Try the various Java specific properties.
   4480         //   These all begin with "java"
   4481         //
   4482         if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
   4483             addCategory(set, U_GC_CN_MASK, localStatus);
   4484             set->complement();
   4485         }
   4486         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
   4487             addCategory(set, U_GC_ND_MASK, localStatus);
   4488         }
   4489         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
   4490             addIdentifierIgnorable(set, localStatus);
   4491         }
   4492         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
   4493             set->add(0, 0x1F).add(0x7F, 0x9F);
   4494         }
   4495         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
   4496             addCategory(set, U_GC_L_MASK, localStatus);
   4497             addCategory(set, U_GC_SC_MASK, localStatus);
   4498             addCategory(set, U_GC_PC_MASK, localStatus);
   4499             addCategory(set, U_GC_ND_MASK, localStatus);
   4500             addCategory(set, U_GC_NL_MASK, localStatus);
   4501             addCategory(set, U_GC_MC_MASK, localStatus);
   4502             addCategory(set, U_GC_MN_MASK, localStatus);
   4503             addIdentifierIgnorable(set, localStatus);
   4504         }
   4505         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
   4506             addCategory(set, U_GC_L_MASK, localStatus);
   4507             addCategory(set, U_GC_NL_MASK, localStatus);
   4508             addCategory(set, U_GC_SC_MASK, localStatus);
   4509             addCategory(set, U_GC_PC_MASK, localStatus);
   4510         }
   4511         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
   4512             addCategory(set, U_GC_L_MASK, localStatus);
   4513         }
   4514         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
   4515             addCategory(set, U_GC_L_MASK, localStatus);
   4516             addCategory(set, U_GC_ND_MASK, localStatus);
   4517         }
   4518         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
   4519             addCategory(set, U_GC_LL_MASK, localStatus);
   4520         }
   4521         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
   4522             set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
   4523         }
   4524         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
   4525             addCategory(set, U_GC_Z_MASK, localStatus);
   4526         }
   4527         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
   4528             set->add(0x10000, UnicodeSet::MAX_VALUE);
   4529         }
   4530         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
   4531             addCategory(set, U_GC_LT_MASK, localStatus);
   4532         }
   4533         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
   4534             addCategory(set, U_GC_L_MASK, localStatus);
   4535             addCategory(set, U_GC_NL_MASK, localStatus);
   4536         }
   4537         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
   4538             addCategory(set, U_GC_L_MASK, localStatus);
   4539             addCategory(set, U_GC_PC_MASK, localStatus);
   4540             addCategory(set, U_GC_ND_MASK, localStatus);
   4541             addCategory(set, U_GC_NL_MASK, localStatus);
   4542             addCategory(set, U_GC_MC_MASK, localStatus);
   4543             addCategory(set, U_GC_MN_MASK, localStatus);
   4544             addIdentifierIgnorable(set, localStatus);
   4545         }
   4546         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
   4547             addCategory(set, U_GC_LU_MASK, localStatus);
   4548         }
   4549         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
   4550             set->add(0, UnicodeSet::MAX_VALUE);
   4551         }
   4552         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
   4553             addCategory(set, U_GC_Z_MASK, localStatus);
   4554             set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
   4555             set->add(9, 0x0d).add(0x1c, 0x1f);
   4556         }
   4557         else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
   4558             set->add(0, UnicodeSet::MAX_VALUE);
   4559         }
   4560 
   4561         if (U_SUCCESS(localStatus) && !set->isEmpty()) {
   4562             *fStatus = U_ZERO_ERROR;
   4563             if (usetFlags & USET_CASE_INSENSITIVE) {
   4564                 set->closeOver(USET_CASE_INSENSITIVE);
   4565             }
   4566             if (negated) {
   4567                 set->complement();
   4568             }
   4569             return set;
   4570         }
   4571         delete set;
   4572         set = NULL;
   4573     }
   4574     error(*fStatus);
   4575     return NULL;
   4576 }
   4577 
   4578 
   4579 
   4580 //
   4581 //  SetEval   Part of the evaluation of [set expressions].
   4582 //            Perform any pending (stacked) operations with precedence
   4583 //            equal or greater to that of the next operator encountered
   4584 //            in the expression.
   4585 //
   4586 void RegexCompile::setEval(int32_t nextOp) {
   4587     UnicodeSet *rightOperand = NULL;
   4588     UnicodeSet *leftOperand  = NULL;
   4589     for (;;) {
   4590         U_ASSERT(fSetOpStack.empty()==FALSE);
   4591         int32_t pendingSetOperation = fSetOpStack.peeki();
   4592         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
   4593             break;
   4594         }
   4595         fSetOpStack.popi();
   4596         U_ASSERT(fSetStack.empty() == FALSE);
   4597         rightOperand = (UnicodeSet *)fSetStack.peek();
   4598         switch (pendingSetOperation) {
   4599             case setNegation:
   4600                 rightOperand->complement();
   4601                 break;
   4602             case setCaseClose:
   4603                 // TODO: need a simple close function.  Ticket 6065
   4604                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
   4605                 rightOperand->removeAllStrings();
   4606                 break;
   4607             case setDifference1:
   4608             case setDifference2:
   4609                 fSetStack.pop();
   4610                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4611                 leftOperand->removeAll(*rightOperand);
   4612                 delete rightOperand;
   4613                 break;
   4614             case setIntersection1:
   4615             case setIntersection2:
   4616                 fSetStack.pop();
   4617                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4618                 leftOperand->retainAll(*rightOperand);
   4619                 delete rightOperand;
   4620                 break;
   4621             case setUnion:
   4622                 fSetStack.pop();
   4623                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4624                 leftOperand->addAll(*rightOperand);
   4625                 delete rightOperand;
   4626                 break;
   4627             default:
   4628                 U_ASSERT(FALSE);
   4629                 break;
   4630             }
   4631         }
   4632     }
   4633 
   4634 void RegexCompile::setPushOp(int32_t op) {
   4635     setEval(op);
   4636     fSetOpStack.push(op, *fStatus);
   4637     fSetStack.push(new UnicodeSet(), *fStatus);
   4638 }
   4639 
   4640 U_NAMESPACE_END
   4641 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   4642 
   4643