Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  file:  regexcmp.cpp
      5 //
      6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains the ICU regular expression compiler, which is responsible
     10 //  for processing a regular expression pattern into the compiled form that
     11 //  is used by the match finding engine.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     17 
     18 #include "unicode/ustring.h"
     19 #include "unicode/unistr.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/uchriter.h"
     23 #include "unicode/parsepos.h"
     24 #include "unicode/parseerr.h"
     25 #include "unicode/regex.h"
     26 #include "unicode/utf.h"
     27 #include "unicode/utf16.h"
     28 #include "patternprops.h"
     29 #include "putilimp.h"
     30 #include "cmemory.h"
     31 #include "cstring.h"
     32 #include "uvectr32.h"
     33 #include "uvectr64.h"
     34 #include "uassert.h"
     35 #include "uinvchar.h"
     36 
     37 #include "regeximp.h"
     38 #include "regexcst.h"   // Contains state table for the regex pattern parser.
     39                         //   generated by a Perl script.
     40 #include "regexcmp.h"
     41 #include "regexst.h"
     42 #include "regextxt.h"
     43 
     44 
     45 
     46 U_NAMESPACE_BEGIN
     47 
     48 
     49 //------------------------------------------------------------------------------
     50 //
     51 //  Constructor.
     52 //
     53 //------------------------------------------------------------------------------
     54 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
     55    fParenStack(status), fSetStack(status), fSetOpStack(status)
     56 {
     57     // Lazy init of all shared global sets (needed for init()'s empty text)
     58     RegexStaticSets::initGlobals(&status);
     59 
     60     fStatus           = &status;
     61 
     62     fRXPat            = rxp;
     63     fScanIndex        = 0;
     64     fLastChar         = -1;
     65     fPeekChar         = -1;
     66     fLineNum          = 1;
     67     fCharNum          = 0;
     68     fQuoteMode        = FALSE;
     69     fInBackslashQuote = FALSE;
     70     fModeFlags        = fRXPat->fFlags | 0x80000000;
     71     fEOLComments      = TRUE;
     72 
     73     fMatchOpenParen   = -1;
     74     fMatchCloseParen  = -1;
     75     fCaptureName      = NULL;
     76     fLastSetLiteral   = U_SENTINEL;
     77 
     78     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
     79         status = rxp->fDeferredStatus;
     80     }
     81 }
     82 
     83 static const UChar      chAmp       = 0x26;      // '&'
     84 static const UChar      chDash      = 0x2d;      // '-'
     85 
     86 
     87 //------------------------------------------------------------------------------
     88 //
     89 //  Destructor
     90 //
     91 //------------------------------------------------------------------------------
     92 RegexCompile::~RegexCompile() {
     93     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
     94                                  //   compilation stops with a syntax error.
     95 }
     96 
     97 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
     98     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
     99 }
    100 
    101 //------------------------------------------------------------------------------
    102 //
    103 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
    104 //                           The state tables are hand-written in the file regexcst.txt,
    105 //                           and converted to the form used here by a perl
    106 //                           script regexcst.pl
    107 //
    108 //------------------------------------------------------------------------------
    109 void    RegexCompile::compile(
    110                          const UnicodeString &pat,   // Source pat to be compiled.
    111                          UParseError &pp,            // Error position info
    112                          UErrorCode &e)              // Error Code
    113 {
    114     fRXPat->fPatternString = new UnicodeString(pat);
    115     UText patternText = UTEXT_INITIALIZER;
    116     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
    117 
    118     if (U_SUCCESS(e)) {
    119         compile(&patternText, pp, e);
    120         utext_close(&patternText);
    121     }
    122 }
    123 
    124 //
    125 //   compile, UText mode
    126 //     All the work is actually done here.
    127 //
    128 void    RegexCompile::compile(
    129                          UText *pat,                 // Source pat to be compiled.
    130                          UParseError &pp,            // Error position info
    131                          UErrorCode &e)              // Error Code
    132 {
    133     fStatus             = &e;
    134     fParseErr           = &pp;
    135     fStackPtr           = 0;
    136     fStack[fStackPtr]   = 0;
    137 
    138     if (U_FAILURE(*fStatus)) {
    139         return;
    140     }
    141 
    142     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
    143     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
    144 
    145     // Prepare the RegexPattern object to receive the compiled pattern.
    146     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
    147     if (U_FAILURE(*fStatus)) {
    148         return;
    149     }
    150     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
    151     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
    152 
    153 
    154     // Initialize the pattern scanning state machine
    155     fPatternLength = utext_nativeLength(pat);
    156     uint16_t                state = 1;
    157     const RegexTableEl      *tableEl;
    158 
    159     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
    160     if (fModeFlags & UREGEX_LITERAL) {
    161         fQuoteMode = TRUE;
    162     }
    163 
    164     nextChar(fC);                        // Fetch the first char from the pattern string.
    165 
    166     //
    167     // Main loop for the regex pattern parsing state machine.
    168     //   Runs once per state transition.
    169     //   Each time through optionally performs, depending on the state table,
    170     //      - an advance to the the next pattern char
    171     //      - an action to be performed.
    172     //      - pushing or popping a state to/from the local state return stack.
    173     //   file regexcst.txt is the source for the state table.  The logic behind
    174     //     recongizing the pattern syntax is there, not here.
    175     //
    176     for (;;) {
    177         //  Bail out if anything has gone wrong.
    178         //  Regex pattern parsing stops on the first error encountered.
    179         if (U_FAILURE(*fStatus)) {
    180             break;
    181         }
    182 
    183         U_ASSERT(state != 0);
    184 
    185         // Find the state table element that matches the input char from the pattern, or the
    186         //    class of the input character.  Start with the first table row for this
    187         //    state, then linearly scan forward until we find a row that matches the
    188         //    character.  The last row for each state always matches all characters, so
    189         //    the search will stop there, if not before.
    190         //
    191         tableEl = &gRuleParseStateTable[state];
    192         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
    193             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
    194 
    195         for (;;) {    // loop through table rows belonging to this state, looking for one
    196                       //   that matches the current input char.
    197             REGEX_SCAN_DEBUG_PRINTF(("."));
    198             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
    199                 // Table row specified an individual character, not a set, and
    200                 //   the input character is not quoted, and
    201                 //   the input character matched it.
    202                 break;
    203             }
    204             if (tableEl->fCharClass == 255) {
    205                 // Table row specified default, match anything character class.
    206                 break;
    207             }
    208             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
    209                 // Table row specified "quoted" and the char was quoted.
    210                 break;
    211             }
    212             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
    213                 // Table row specified eof and we hit eof on the input.
    214                 break;
    215             }
    216 
    217             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
    218                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
    219                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
    220                 U_ASSERT(tableEl->fCharClass <= 137);
    221                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
    222                     // Table row specified a character class, or set of characters,
    223                     //   and the current char matches it.
    224                     break;
    225                 }
    226             }
    227 
    228             // No match on this row, advance to the next  row for this state,
    229             tableEl++;
    230         }
    231         REGEX_SCAN_DEBUG_PRINTF(("\n"));
    232 
    233         //
    234         // We've found the row of the state table that matches the current input
    235         //   character from the rules string.
    236         // Perform any action specified  by this row in the state table.
    237         if (doParseActions(tableEl->fAction) == FALSE) {
    238             // Break out of the state machine loop if the
    239             //   the action signalled some kind of error, or
    240             //   the action was to exit, occurs on normal end-of-rules-input.
    241             break;
    242         }
    243 
    244         if (tableEl->fPushState != 0) {
    245             fStackPtr++;
    246             if (fStackPtr >= kStackSize) {
    247                 error(U_REGEX_INTERNAL_ERROR);
    248                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
    249                 fStackPtr--;
    250             }
    251             fStack[fStackPtr] = tableEl->fPushState;
    252         }
    253 
    254         //
    255         //  NextChar.  This is where characters are actually fetched from the pattern.
    256         //             Happens under control of the 'n' tag in the state table.
    257         //
    258         if (tableEl->fNextChar) {
    259             nextChar(fC);
    260         }
    261 
    262         // Get the next state from the table entry, or from the
    263         //   state stack if the next state was specified as "pop".
    264         if (tableEl->fNextState != 255) {
    265             state = tableEl->fNextState;
    266         } else {
    267             state = fStack[fStackPtr];
    268             fStackPtr--;
    269             if (fStackPtr < 0) {
    270                 // state stack underflow
    271                 // This will occur if the user pattern has mis-matched parentheses,
    272                 //   with extra close parens.
    273                 //
    274                 fStackPtr++;
    275                 error(U_REGEX_MISMATCHED_PAREN);
    276             }
    277         }
    278 
    279     }
    280 
    281     if (U_FAILURE(*fStatus)) {
    282         // Bail out if the pattern had errors.
    283         //   Set stack cleanup:  a successful compile would have left it empty,
    284         //   but errors can leave temporary sets hanging around.
    285         while (!fSetStack.empty()) {
    286             delete (UnicodeSet *)fSetStack.pop();
    287         }
    288         return;
    289     }
    290 
    291     //
    292     // The pattern has now been read and processed, and the compiled code generated.
    293     //
    294 
    295     //
    296     // The pattern's fFrameSize so far has accumulated the requirements for
    297     //   storage for capture parentheses, counters, etc. that are encountered
    298     //   in the pattern.  Add space for the two variables that are always
    299     //   present in the saved state:  the input string position (int64_t) and
    300     //   the position in the compiled pattern.
    301     //
    302     allocateStackData(RESTACKFRAME_HDRCOUNT);
    303 
    304     //
    305     // Optimization pass 1: NOPs, back-references, and case-folding
    306     //
    307     stripNOPs();
    308 
    309     //
    310     // Get bounds for the minimum and maximum length of a string that this
    311     //   pattern can match.  Used to avoid looking for matches in strings that
    312     //   are too short.
    313     //
    314     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
    315 
    316     //
    317     // Optimization pass 2: match start type
    318     //
    319     matchStartType();
    320 
    321     //
    322     // Set up fast latin-1 range sets
    323     //
    324     int32_t numSets = fRXPat->fSets->size();
    325     fRXPat->fSets8 = new Regex8BitSet[numSets];
    326     // Null pointer check.
    327     if (fRXPat->fSets8 == NULL) {
    328         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
    329         return;
    330     }
    331     int32_t i;
    332     for (i=0; i<numSets; i++) {
    333         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
    334         fRXPat->fSets8[i].init(s);
    335     }
    336 
    337 }
    338 
    339 
    340 
    341 
    342 
    343 //------------------------------------------------------------------------------
    344 //
    345 //  doParseAction        Do some action during regex pattern parsing.
    346 //                       Called by the parse state machine.
    347 //
    348 //                       Generation of the match engine PCode happens here, or
    349 //                       in functions called from the parse actions defined here.
    350 //
    351 //
    352 //------------------------------------------------------------------------------
    353 UBool RegexCompile::doParseActions(int32_t action)
    354 {
    355     UBool   returnVal = TRUE;
    356 
    357     switch ((Regex_PatternParseAction)action) {
    358 
    359     case doPatStart:
    360         // Start of pattern compiles to:
    361         //0   SAVE   2        Fall back to position of FAIL
    362         //1   jmp    3
    363         //2   FAIL            Stop if we ever reach here.
    364         //3   NOP             Dummy, so start of pattern looks the same as
    365         //                    the start of an ( grouping.
    366         //4   NOP             Resreved, will be replaced by a save if there are
    367         //                    OR | operators at the top level
    368         appendOp(URX_STATE_SAVE, 2);
    369         appendOp(URX_JMP,  3);
    370         appendOp(URX_FAIL, 0);
    371 
    372         // Standard open nonCapture paren action emits the two NOPs and
    373         //   sets up the paren stack frame.
    374         doParseActions(doOpenNonCaptureParen);
    375         break;
    376 
    377     case doPatFinish:
    378         // We've scanned to the end of the pattern
    379         //  The end of pattern compiles to:
    380         //        URX_END
    381         //    which will stop the runtime match engine.
    382         //  Encountering end of pattern also behaves like a close paren,
    383         //   and forces fixups of the State Save at the beginning of the compiled pattern
    384         //   and of any OR operations at the top level.
    385         //
    386         handleCloseParen();
    387         if (fParenStack.size() > 0) {
    388             // Missing close paren in pattern.
    389             error(U_REGEX_MISMATCHED_PAREN);
    390         }
    391 
    392         // add the END operation to the compiled pattern.
    393         appendOp(URX_END, 0);
    394 
    395         // Terminate the pattern compilation state machine.
    396         returnVal = FALSE;
    397         break;
    398 
    399 
    400 
    401     case doOrOperator:
    402         // Scanning a '|', as in (A|B)
    403         {
    404             // Generate code for any pending literals preceding the '|'
    405             fixLiterals(FALSE);
    406 
    407             // Insert a SAVE operation at the start of the pattern section preceding
    408             //   this OR at this level.  This SAVE will branch the match forward
    409             //   to the right hand side of the OR in the event that the left hand
    410             //   side fails to match and backtracks.  Locate the position for the
    411             //   save from the location on the top of the parentheses stack.
    412             int32_t savePosition = fParenStack.popi();
    413             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
    414             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
    415             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
    416             fRXPat->fCompiledPat->setElementAt(op, savePosition);
    417 
    418             // Append an JMP operation into the compiled pattern.  The operand for
    419             //  the JMP will eventually be the location following the ')' for the
    420             //  group.  This will be patched in later, when the ')' is encountered.
    421             appendOp(URX_JMP, 0);
    422 
    423             // Push the position of the newly added JMP op onto the parentheses stack.
    424             // This registers if for fixup when this block's close paren is encountered.
    425             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    426 
    427             // Append a NOP to the compiled pattern.  This is the slot reserved
    428             //   for a SAVE in the event that there is yet another '|' following
    429             //   this one.
    430             appendOp(URX_NOP, 0);
    431             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    432         }
    433         break;
    434 
    435 
    436     case doBeginNamedCapture:
    437         // Scanning (?<letter.
    438         //   The first letter of the name will come through again under doConinueNamedCapture.
    439         fCaptureName = new UnicodeString();
    440         if (fCaptureName == NULL) {
    441             error(U_MEMORY_ALLOCATION_ERROR);
    442         }
    443         break;
    444 
    445     case  doContinueNamedCapture:
    446         fCaptureName->append(fC.fChar);
    447         break;
    448 
    449     case doBadNamedCapture:
    450         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
    451         break;
    452 
    453     case doOpenCaptureParen:
    454         // Open Capturing Paren, possibly named.
    455         //   Compile to a
    456         //      - NOP, which later may be replaced by a save-state if the
    457         //         parenthesized group gets a * quantifier, followed by
    458         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
    459         //      - NOP, which may later be replaced by a save-state if there
    460         //             is an '|' alternation within the parens.
    461         //
    462         //    Each capture group gets three slots in the save stack frame:
    463         //         0: Capture Group start position (in input string being matched.)
    464         //         1: Capture Group end position.
    465         //         2: Start of Match-in-progress.
    466         //    The first two locations are for a completed capture group, and are
    467         //     referred to by back references and the like.
    468         //    The third location stores the capture start position when an START_CAPTURE is
    469         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
    470         //      END_CAPTURE is encountered.
    471         {
    472             fixLiterals();
    473             appendOp(URX_NOP, 0);
    474             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
    475             appendOp(URX_START_CAPTURE, varsLoc);
    476             appendOp(URX_NOP, 0);
    477 
    478             // On the Parentheses stack, start a new frame and add the postions
    479             //   of the two NOPs.  Depending on what follows in the pattern, the
    480             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    481             //   address of the end of the parenthesized group.
    482             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    483             fParenStack.push(capturing, *fStatus);                        // Frame type.
    484             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
    485             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    486 
    487             // Save the mapping from group number to stack frame variable position.
    488             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
    489 
    490             // If this is a named capture group, add the name->group number mapping.
    491             if (fCaptureName != NULL) {
    492                 int32_t groupNumber = fRXPat->fGroupMap->size();
    493                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
    494                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
    495                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
    496                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
    497                 }
    498             }
    499         }
    500         break;
    501 
    502     case doOpenNonCaptureParen:
    503         // Open non-caputuring (grouping only) Paren.
    504         //   Compile to a
    505         //      - NOP, which later may be replaced by a save-state if the
    506         //         parenthesized group gets a * quantifier, followed by
    507         //      - NOP, which may later be replaced by a save-state if there
    508         //             is an '|' alternation within the parens.
    509         {
    510             fixLiterals();
    511             appendOp(URX_NOP, 0);
    512             appendOp(URX_NOP, 0);
    513 
    514             // On the Parentheses stack, start a new frame and add the postions
    515             //   of the two NOPs.
    516             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    517             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
    518             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    519             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    520         }
    521          break;
    522 
    523 
    524     case doOpenAtomicParen:
    525         // Open Atomic Paren.  (?>
    526         //   Compile to a
    527         //      - NOP, which later may be replaced if the parenthesized group
    528         //         has a quantifier, followed by
    529         //      - STO_SP  save state stack position, so it can be restored at the ")"
    530         //      - NOP, which may later be replaced by a save-state if there
    531         //             is an '|' alternation within the parens.
    532         {
    533             fixLiterals();
    534             appendOp(URX_NOP, 0);
    535             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
    536             appendOp(URX_STO_SP, varLoc);
    537             appendOp(URX_NOP, 0);
    538 
    539             // On the Parentheses stack, start a new frame and add the postions
    540             //   of the two NOPs.  Depending on what follows in the pattern, the
    541             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    542             //   address of the end of the parenthesized group.
    543             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    544             fParenStack.push(atomic, *fStatus);                           // Frame type.
    545             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
    546             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
    547         }
    548         break;
    549 
    550 
    551     case doOpenLookAhead:
    552         // Positive Look-ahead   (?=  stuff  )
    553         //
    554         //   Note:   Addition of transparent input regions, with the need to
    555         //           restore the original regions when failing out of a lookahead
    556         //           block, complicated this sequence.  Some conbined opcodes
    557         //           might make sense - or might not, lookahead aren't that common.
    558         //
    559         //      Caution:  min match length optimization knows about this
    560         //               sequence; don't change without making updates there too.
    561         //
    562         // Compiles to
    563         //    1    START_LA     dataLoc     Saves SP, Input Pos
    564         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
    565         //    3    JMP          6           continue ...
    566         //
    567         //    4.   LA_END                   Look Ahead failed.  Restore regions.
    568         //    5.   BACKTRACK                and back track again.
    569         //
    570         //    6.   NOP              reserved for use by quantifiers on the block.
    571         //                          Look-ahead can't have quantifiers, but paren stack
    572         //                             compile time conventions require the slot anyhow.
    573         //    7.   NOP              may be replaced if there is are '|' ops in the block.
    574         //    8.     code for parenthesized stuff.
    575         //    9.   LA_END
    576         //
    577         //  Two data slots are reserved, for saving the stack ptr and the input position.
    578         {
    579             fixLiterals();
    580             int32_t dataLoc = allocateData(2);
    581             appendOp(URX_LA_START, dataLoc);
    582             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
    583             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
    584             appendOp(URX_LA_END, dataLoc);
    585             appendOp(URX_BACKTRACK, 0);
    586             appendOp(URX_NOP, 0);
    587             appendOp(URX_NOP, 0);
    588 
    589             // On the Parentheses stack, start a new frame and add the postions
    590             //   of the NOPs.
    591             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    592             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
    593             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    594             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    595         }
    596         break;
    597 
    598     case doOpenLookAheadNeg:
    599         // Negated Lookahead.   (?! stuff )
    600         // Compiles to
    601         //    1.    START_LA    dataloc
    602         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
    603         //                                //   which continues with the match.
    604         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
    605         //    4.       code for parenthesized stuff.
    606         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
    607         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
    608         //    7.    END_LA                // Restore match region, in case look-ahead was using
    609         //                                        an alternate (transparent) region.
    610         {
    611             fixLiterals();
    612             int32_t dataLoc = allocateData(2);
    613             appendOp(URX_LA_START, dataLoc);
    614             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
    615             appendOp(URX_NOP, 0);
    616 
    617             // On the Parentheses stack, start a new frame and add the postions
    618             //   of the StateSave and NOP.
    619             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    620             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
    621             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
    622             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    623 
    624             // Instructions #5 - #7 will be added when the ')' is encountered.
    625         }
    626         break;
    627 
    628     case doOpenLookBehind:
    629         {
    630             //   Compile a (?<= look-behind open paren.
    631             //
    632             //          Compiles to
    633             //              0       URX_LB_START     dataLoc
    634             //              1       URX_LB_CONT      dataLoc
    635             //              2                        MinMatchLen
    636             //              3                        MaxMatchLen
    637             //              4       URX_NOP          Standard '(' boilerplate.
    638             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
    639             //              6         <code for LookBehind expression>
    640             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
    641             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
    642             //
    643             //          Allocate a block of matcher data, to contain (when running a match)
    644             //              0:    Stack ptr on entry
    645             //              1:    Input Index on entry
    646             //              2:    Start index of match current match attempt.
    647             //              3:    Original Input String len.
    648 
    649             // Generate match code for any pending literals.
    650             fixLiterals();
    651 
    652             // Allocate data space
    653             int32_t dataLoc = allocateData(4);
    654 
    655             // Emit URX_LB_START
    656             appendOp(URX_LB_START, dataLoc);
    657 
    658             // Emit URX_LB_CONT
    659             appendOp(URX_LB_CONT, dataLoc);
    660             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
    661             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
    662 
    663             // Emit the NOPs
    664             appendOp(URX_NOP, 0);
    665             appendOp(URX_NOP, 0);
    666 
    667             // On the Parentheses stack, start a new frame and add the postions
    668             //   of the URX_LB_CONT and the NOP.
    669             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    670             fParenStack.push(lookBehind, *fStatus);                       // Frame type
    671             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    672             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    673 
    674             // The final two instructions will be added when the ')' is encountered.
    675         }
    676 
    677         break;
    678 
    679     case doOpenLookBehindNeg:
    680         {
    681             //   Compile a (?<! negated look-behind open paren.
    682             //
    683             //          Compiles to
    684             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
    685             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
    686             //              2                        MinMatchLen
    687             //              3                        MaxMatchLen
    688             //              4                        continueLoc (9)
    689             //              5       URX_NOP          Standard '(' boilerplate.
    690             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
    691             //              7         <code for LookBehind expression>
    692             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
    693             //              9       ...
    694             //
    695             //          Allocate a block of matcher data, to contain (when running a match)
    696             //              0:    Stack ptr on entry
    697             //              1:    Input Index on entry
    698             //              2:    Start index of match current match attempt.
    699             //              3:    Original Input String len.
    700 
    701             // Generate match code for any pending literals.
    702             fixLiterals();
    703 
    704             // Allocate data space
    705             int32_t dataLoc = allocateData(4);
    706 
    707             // Emit URX_LB_START
    708             appendOp(URX_LB_START, dataLoc);
    709 
    710             // Emit URX_LBN_CONT
    711             appendOp(URX_LBN_CONT, dataLoc);
    712             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
    713             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
    714             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
    715 
    716             // Emit the NOPs
    717             appendOp(URX_NOP, 0);
    718             appendOp(URX_NOP, 0);
    719 
    720             // On the Parentheses stack, start a new frame and add the postions
    721             //   of the URX_LB_CONT and the NOP.
    722             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    723             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
    724             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    725             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    726 
    727             // The final two instructions will be added when the ')' is encountered.
    728         }
    729         break;
    730 
    731     case doConditionalExpr:
    732         // Conditionals such as (?(1)a:b)
    733     case doPerlInline:
    734         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
    735         error(U_REGEX_UNIMPLEMENTED);
    736         break;
    737 
    738 
    739     case doCloseParen:
    740         handleCloseParen();
    741         if (fParenStack.size() <= 0) {
    742             //  Extra close paren, or missing open paren.
    743             error(U_REGEX_MISMATCHED_PAREN);
    744         }
    745         break;
    746 
    747     case doNOP:
    748         break;
    749 
    750 
    751     case doBadOpenParenType:
    752     case doRuleError:
    753         error(U_REGEX_RULE_SYNTAX);
    754         break;
    755 
    756 
    757     case doMismatchedParenErr:
    758         error(U_REGEX_MISMATCHED_PAREN);
    759         break;
    760 
    761     case doPlus:
    762         //  Normal '+'  compiles to
    763         //     1.   stuff to be repeated  (already built)
    764         //     2.   jmp-sav 1
    765         //     3.   ...
    766         //
    767         //  Or, if the item to be repeated can match a zero length string,
    768         //     1.   STO_INP_LOC  data-loc
    769         //     2.      body of stuff to be repeated
    770         //     3.   JMP_SAV_X    2
    771         //     4.   ...
    772 
    773         //
    774         //  Or, if the item to be repeated is simple
    775         //     1.   Item to be repeated.
    776         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
    777         //     3.   LOOP_C       stack location
    778         {
    779             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
    780             int32_t  frameLoc;
    781 
    782             // Check for simple constructs, which may get special optimized code.
    783             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    784                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
    785 
    786                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
    787                     // Emit optimized code for [char set]+
    788                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    789                     frameLoc = allocateStackData(1);
    790                     appendOp(URX_LOOP_C, frameLoc);
    791                     break;
    792                 }
    793 
    794                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    795                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    796                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    797                     // Emit Optimized code for .+ operations.
    798                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
    799                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    800                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
    801                         loopOpI |= 1;
    802                     }
    803                     if (fModeFlags & UREGEX_UNIX_LINES) {
    804                         loopOpI |= 2;
    805                     }
    806                     appendOp(loopOpI);
    807                     frameLoc = allocateStackData(1);
    808                     appendOp(URX_LOOP_C, frameLoc);
    809                     break;
    810                 }
    811 
    812             }
    813 
    814             // General case.
    815 
    816             // Check for minimum match length of zero, which requires
    817             //    extra loop-breaking code.
    818             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    819                 // Zero length match is possible.
    820                 // Emit the code sequence that can handle it.
    821                 insertOp(topLoc);
    822                 frameLoc = allocateStackData(1);
    823 
    824                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
    825                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
    826 
    827                 appendOp(URX_JMP_SAV_X, topLoc+1);
    828             } else {
    829                 // Simpler code when the repeated body must match something non-empty
    830                 appendOp(URX_JMP_SAV, topLoc);
    831             }
    832         }
    833         break;
    834 
    835     case doNGPlus:
    836         //  Non-greedy '+?'  compiles to
    837         //     1.   stuff to be repeated  (already built)
    838         //     2.   state-save  1
    839         //     3.   ...
    840         {
    841             int32_t topLoc      = blockTopLoc(FALSE);
    842             appendOp(URX_STATE_SAVE, topLoc);
    843         }
    844         break;
    845 
    846 
    847     case doOpt:
    848         // Normal (greedy) ? quantifier.
    849         //  Compiles to
    850         //     1. state save 3
    851         //     2.    body of optional block
    852         //     3. ...
    853         // Insert the state save into the compiled pattern, and we're done.
    854         {
    855             int32_t   saveStateLoc = blockTopLoc(TRUE);
    856             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
    857             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    858         }
    859         break;
    860 
    861     case doNGOpt:
    862         // Non-greedy ?? quantifier
    863         //   compiles to
    864         //    1.  jmp   4
    865         //    2.     body of optional block
    866         //    3   jmp   5
    867         //    4.  state save 2
    868         //    5    ...
    869         //  This code is less than ideal, with two jmps instead of one, because we can only
    870         //  insert one instruction at the top of the block being iterated.
    871         {
    872             int32_t  jmp1_loc = blockTopLoc(TRUE);
    873             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
    874 
    875             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
    876             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
    877 
    878             appendOp(URX_JMP, jmp2_loc+2);
    879 
    880             appendOp(URX_STATE_SAVE, jmp1_loc+1);
    881         }
    882         break;
    883 
    884 
    885     case doStar:
    886         // Normal (greedy) * quantifier.
    887         // Compiles to
    888         //       1.   STATE_SAVE   4
    889         //       2.      body of stuff being iterated over
    890         //       3.   JMP_SAV      2
    891         //       4.   ...
    892         //
    893         // Or, if the body is a simple [Set],
    894         //       1.   LOOP_SR_I    set number
    895         //       2.   LOOP_C       stack location
    896         //       ...
    897         //
    898         // Or if this is a .*
    899         //       1.   LOOP_DOT_I    (. matches all mode flag)
    900         //       2.   LOOP_C        stack location
    901         //
    902         // Or, if the body can match a zero-length string, to inhibit infinite loops,
    903         //       1.   STATE_SAVE   5
    904         //       2.   STO_INP_LOC  data-loc
    905         //       3.      body of stuff
    906         //       4.   JMP_SAV_X    2
    907         //       5.   ...
    908         {
    909             // location of item #1, the STATE_SAVE
    910             int32_t   topLoc = blockTopLoc(FALSE);
    911             int32_t   dataLoc = -1;
    912 
    913             // Check for simple *, where the construct being repeated
    914             //   compiled to single opcode, and might be optimizable.
    915             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    916                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
    917 
    918                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
    919                     // Emit optimized code for a [char set]*
    920                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    921                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    922                     dataLoc = allocateStackData(1);
    923                     appendOp(URX_LOOP_C, dataLoc);
    924                     break;
    925                 }
    926 
    927                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    928                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    929                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    930                     // Emit Optimized code for .* operations.
    931                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
    932                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    933                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
    934                         loopOpI |= 1;
    935                     }
    936                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
    937                         loopOpI |= 2;
    938                     }
    939                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    940                     dataLoc = allocateStackData(1);
    941                     appendOp(URX_LOOP_C, dataLoc);
    942                     break;
    943                 }
    944             }
    945 
    946             // Emit general case code for this *
    947             // The optimizations did not apply.
    948 
    949             int32_t   saveStateLoc = blockTopLoc(TRUE);
    950             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
    951 
    952             // Check for minimum match length of zero, which requires
    953             //    extra loop-breaking code.
    954             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    955                 insertOp(saveStateLoc);
    956                 dataLoc = allocateStackData(1);
    957 
    958                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
    959                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
    960                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
    961             }
    962 
    963             // Locate the position in the compiled pattern where the match will continue
    964             //   after completing the *.   (4 or 5 in the comment above)
    965             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
    966 
    967             // Put together the save state op and store it into the compiled code.
    968             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
    969             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    970 
    971             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
    972             appendOp(jmpOp);
    973         }
    974         break;
    975 
    976     case doNGStar:
    977         // Non-greedy *? quantifier
    978         // compiles to
    979         //     1.   JMP    3
    980         //     2.      body of stuff being iterated over
    981         //     3.   STATE_SAVE  2
    982         //     4    ...
    983         {
    984             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
    985             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
    986             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
    987             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
    988             appendOp(URX_STATE_SAVE, jmpLoc+1);
    989         }
    990         break;
    991 
    992 
    993     case doIntervalInit:
    994         // The '{' opening an interval quantifier was just scanned.
    995         // Init the counter varaiables that will accumulate the values as the digits
    996         //    are scanned.
    997         fIntervalLow = 0;
    998         fIntervalUpper = -1;
    999         break;
   1000 
   1001     case doIntevalLowerDigit:
   1002         // Scanned a digit from the lower value of an {lower,upper} interval
   1003         {
   1004             int32_t digitValue = u_charDigitValue(fC.fChar);
   1005             U_ASSERT(digitValue >= 0);
   1006             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
   1007             if (val > INT32_MAX) {
   1008                 error(U_REGEX_NUMBER_TOO_BIG);
   1009             } else {
   1010                 fIntervalLow = (int32_t)val;
   1011             }
   1012         }
   1013         break;
   1014 
   1015     case doIntervalUpperDigit:
   1016         // Scanned a digit from the upper value of an {lower,upper} interval
   1017         {
   1018             if (fIntervalUpper < 0) {
   1019                 fIntervalUpper = 0;
   1020             }
   1021             int32_t digitValue = u_charDigitValue(fC.fChar);
   1022             U_ASSERT(digitValue >= 0);
   1023             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
   1024             if (val > INT32_MAX) {
   1025                 error(U_REGEX_NUMBER_TOO_BIG);
   1026             } else {
   1027                 fIntervalUpper = (int32_t)val;
   1028             }
   1029         }
   1030         break;
   1031 
   1032     case doIntervalSame:
   1033         // Scanned a single value interval like {27}.  Upper = Lower.
   1034         fIntervalUpper = fIntervalLow;
   1035         break;
   1036 
   1037     case doInterval:
   1038         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
   1039         if (compileInlineInterval() == FALSE) {
   1040             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1041         }
   1042         break;
   1043 
   1044     case doPossessiveInterval:
   1045         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
   1046         {
   1047             // Remember the loc for the top of the block being looped over.
   1048             //   (Can not reserve a slot in the compiled pattern at this time, because
   1049             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
   1050             //    once per block.)
   1051             int32_t topLoc = blockTopLoc(FALSE);
   1052 
   1053             // Produce normal looping code.
   1054             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1055 
   1056             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
   1057             //  just as if the loop was inclosed in atomic parentheses.
   1058 
   1059             // First the STO_SP before the start of the loop
   1060             insertOp(topLoc);
   1061 
   1062             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
   1063             int32_t  op     = buildOp(URX_STO_SP, varLoc);
   1064             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1065 
   1066             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
   1067             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
   1068             loopOp++;     // point LoopOp after the just-inserted STO_SP
   1069             fRXPat->fCompiledPat->push(loopOp, *fStatus);
   1070 
   1071             // Then the LD_SP after the end of the loop
   1072             appendOp(URX_LD_SP, varLoc);
   1073         }
   1074 
   1075         break;
   1076 
   1077     case doNGInterval:
   1078         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
   1079         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
   1080         break;
   1081 
   1082     case doIntervalError:
   1083         error(U_REGEX_BAD_INTERVAL);
   1084         break;
   1085 
   1086     case doLiteralChar:
   1087         // We've just scanned a "normal" character from the pattern,
   1088         literalChar(fC.fChar);
   1089         break;
   1090 
   1091 
   1092     case doEscapedLiteralChar:
   1093         // We've just scanned an backslashed escaped character with  no
   1094         //   special meaning.  It represents itself.
   1095         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1096             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1097             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1098                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1099              }
   1100         literalChar(fC.fChar);
   1101         break;
   1102 
   1103 
   1104     case doDotAny:
   1105         // scanned a ".",  match any single character.
   1106         {
   1107             fixLiterals(FALSE);
   1108             if (fModeFlags & UREGEX_DOTALL) {
   1109                 appendOp(URX_DOTANY_ALL, 0);
   1110             } else if (fModeFlags & UREGEX_UNIX_LINES) {
   1111                 appendOp(URX_DOTANY_UNIX, 0);
   1112             } else {
   1113                 appendOp(URX_DOTANY, 0);
   1114             }
   1115         }
   1116         break;
   1117 
   1118     case doCaret:
   1119         {
   1120             fixLiterals(FALSE);
   1121             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1122                 appendOp(URX_CARET, 0);
   1123             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1124                 appendOp(URX_CARET_M, 0);
   1125             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1126                 appendOp(URX_CARET, 0);   // Only testing true start of input.
   1127             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1128                 appendOp(URX_CARET_M_UNIX, 0);
   1129             }
   1130         }
   1131         break;
   1132 
   1133     case doDollar:
   1134         {
   1135             fixLiterals(FALSE);
   1136             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1137                 appendOp(URX_DOLLAR, 0);
   1138             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1139                 appendOp(URX_DOLLAR_M, 0);
   1140             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1141                 appendOp(URX_DOLLAR_D, 0);
   1142             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1143                 appendOp(URX_DOLLAR_MD, 0);
   1144             }
   1145         }
   1146         break;
   1147 
   1148     case doBackslashA:
   1149         fixLiterals(FALSE);
   1150         appendOp(URX_CARET, 0);
   1151         break;
   1152 
   1153     case doBackslashB:
   1154         {
   1155             #if  UCONFIG_NO_BREAK_ITERATION==1
   1156             if (fModeFlags & UREGEX_UWORD) {
   1157                 error(U_UNSUPPORTED_ERROR);
   1158             }
   1159             #endif
   1160             fixLiterals(FALSE);
   1161             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1162             appendOp(op, 1);
   1163         }
   1164         break;
   1165 
   1166     case doBackslashb:
   1167         {
   1168             #if  UCONFIG_NO_BREAK_ITERATION==1
   1169             if (fModeFlags & UREGEX_UWORD) {
   1170                 error(U_UNSUPPORTED_ERROR);
   1171             }
   1172             #endif
   1173             fixLiterals(FALSE);
   1174             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1175             appendOp(op, 0);
   1176         }
   1177         break;
   1178 
   1179     case doBackslashD:
   1180         fixLiterals(FALSE);
   1181         appendOp(URX_BACKSLASH_D, 1);
   1182         break;
   1183 
   1184     case doBackslashd:
   1185         fixLiterals(FALSE);
   1186         appendOp(URX_BACKSLASH_D, 0);
   1187         break;
   1188 
   1189     case doBackslashG:
   1190         fixLiterals(FALSE);
   1191         appendOp(URX_BACKSLASH_G, 0);
   1192         break;
   1193 
   1194     case doBackslashH:
   1195         fixLiterals(FALSE);
   1196         appendOp(URX_BACKSLASH_H, 1);
   1197         break;
   1198 
   1199     case doBackslashh:
   1200         fixLiterals(FALSE);
   1201         appendOp(URX_BACKSLASH_H, 0);
   1202         break;
   1203 
   1204     case doBackslashR:
   1205         fixLiterals(FALSE);
   1206         appendOp(URX_BACKSLASH_R, 0);
   1207         break;
   1208 
   1209     case doBackslashS:
   1210         fixLiterals(FALSE);
   1211         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
   1212         break;
   1213 
   1214     case doBackslashs:
   1215         fixLiterals(FALSE);
   1216         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
   1217         break;
   1218 
   1219     case doBackslashV:
   1220         fixLiterals(FALSE);
   1221         appendOp(URX_BACKSLASH_V, 1);
   1222         break;
   1223 
   1224     case doBackslashv:
   1225         fixLiterals(FALSE);
   1226         appendOp(URX_BACKSLASH_V, 0);
   1227         break;
   1228 
   1229     case doBackslashW:
   1230         fixLiterals(FALSE);
   1231         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
   1232         break;
   1233 
   1234     case doBackslashw:
   1235         fixLiterals(FALSE);
   1236         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
   1237         break;
   1238 
   1239     case doBackslashX:
   1240         fixLiterals(FALSE);
   1241         appendOp(URX_BACKSLASH_X, 0);
   1242         break;
   1243 
   1244 
   1245     case doBackslashZ:
   1246         fixLiterals(FALSE);
   1247         appendOp(URX_DOLLAR, 0);
   1248         break;
   1249 
   1250     case doBackslashz:
   1251         fixLiterals(FALSE);
   1252         appendOp(URX_BACKSLASH_Z, 0);
   1253         break;
   1254 
   1255     case doEscapeError:
   1256         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1257         break;
   1258 
   1259     case doExit:
   1260         fixLiterals(FALSE);
   1261         returnVal = FALSE;
   1262         break;
   1263 
   1264     case doProperty:
   1265         {
   1266             fixLiterals(FALSE);
   1267             UnicodeSet *theSet = scanProp();
   1268             compileSet(theSet);
   1269         }
   1270         break;
   1271 
   1272     case doNamedChar:
   1273         {
   1274             UChar32 c = scanNamedChar();
   1275             literalChar(c);
   1276         }
   1277         break;
   1278 
   1279 
   1280     case doBackRef:
   1281         // BackReference.  Somewhat unusual in that the front-end can not completely parse
   1282         //                 the regular expression, because the number of digits to be consumed
   1283         //                 depends on the number of capture groups that have been defined.  So
   1284         //                 we have to do it here instead.
   1285         {
   1286             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
   1287             int32_t  groupNum = 0;
   1288             UChar32  c        = fC.fChar;
   1289 
   1290             for (;;) {
   1291                 // Loop once per digit, for max allowed number of digits in a back reference.
   1292                 int32_t digit = u_charDigitValue(c);
   1293                 groupNum = groupNum * 10 + digit;
   1294                 if (groupNum >= numCaptureGroups) {
   1295                     break;
   1296                 }
   1297                 c = peekCharLL();
   1298                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
   1299                     break;
   1300                 }
   1301                 nextCharLL();
   1302             }
   1303 
   1304             // Scan of the back reference in the source regexp is complete.  Now generate
   1305             //  the compiled code for it.
   1306             // Because capture groups can be forward-referenced by back-references,
   1307             //  we fill the operand with the capture group number.  At the end
   1308             //  of compilation, it will be changed to the variable's location.
   1309             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
   1310                                      //    and shouldn't enter this code path at all.
   1311             fixLiterals(FALSE);
   1312             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1313                 appendOp(URX_BACKREF_I, groupNum);
   1314             } else {
   1315                 appendOp(URX_BACKREF, groupNum);
   1316             }
   1317         }
   1318         break;
   1319 
   1320     case doBeginNamedBackRef:
   1321         U_ASSERT(fCaptureName == NULL);
   1322         fCaptureName = new UnicodeString;
   1323         if (fCaptureName == NULL) {
   1324             error(U_MEMORY_ALLOCATION_ERROR);
   1325         }
   1326         break;
   1327 
   1328     case doContinueNamedBackRef:
   1329         fCaptureName->append(fC.fChar);
   1330         break;
   1331 
   1332     case doCompleteNamedBackRef:
   1333         {
   1334         int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
   1335         if (groupNumber == 0) {
   1336             // Group name has not been defined.
   1337             //   Could be a forward reference. If we choose to support them at some
   1338             //   future time, extra mechanism will be required at this point.
   1339             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   1340         } else {
   1341             // Given the number, handle identically to a \n numbered back reference.
   1342             // See comments above, under doBackRef
   1343             fixLiterals(FALSE);
   1344             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1345                 appendOp(URX_BACKREF_I, groupNumber);
   1346             } else {
   1347                 appendOp(URX_BACKREF, groupNumber);
   1348             }
   1349         }
   1350         delete fCaptureName;
   1351         fCaptureName = NULL;
   1352         break;
   1353         }
   1354 
   1355     case doPossessivePlus:
   1356         // Possessive ++ quantifier.
   1357         // Compiles to
   1358         //       1.   STO_SP
   1359         //       2.      body of stuff being iterated over
   1360         //       3.   STATE_SAVE 5
   1361         //       4.   JMP        2
   1362         //       5.   LD_SP
   1363         //       6.   ...
   1364         //
   1365         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
   1366         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
   1367         //
   1368         {
   1369             // Emit the STO_SP
   1370             int32_t   topLoc = blockTopLoc(TRUE);
   1371             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
   1372             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1373             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1374 
   1375             // Emit the STATE_SAVE
   1376             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
   1377 
   1378             // Emit the JMP
   1379             appendOp(URX_JMP, topLoc+1);
   1380 
   1381             // Emit the LD_SP
   1382             appendOp(URX_LD_SP, stoLoc);
   1383         }
   1384         break;
   1385 
   1386     case doPossessiveStar:
   1387         // Possessive *+ quantifier.
   1388         // Compiles to
   1389         //       1.   STO_SP       loc
   1390         //       2.   STATE_SAVE   5
   1391         //       3.      body of stuff being iterated over
   1392         //       4.   JMP          2
   1393         //       5.   LD_SP        loc
   1394         //       6    ...
   1395         // TODO:  do something to cut back the state stack each time through the loop.
   1396         {
   1397             // Reserve two slots at the top of the block.
   1398             int32_t   topLoc = blockTopLoc(TRUE);
   1399             insertOp(topLoc);
   1400 
   1401             // emit   STO_SP     loc
   1402             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
   1403             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1404             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1405 
   1406             // Emit the SAVE_STATE   5
   1407             int32_t L7 = fRXPat->fCompiledPat->size()+1;
   1408             op = buildOp(URX_STATE_SAVE, L7);
   1409             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1410 
   1411             // Append the JMP operation.
   1412             appendOp(URX_JMP, topLoc+1);
   1413 
   1414             // Emit the LD_SP       loc
   1415             appendOp(URX_LD_SP, stoLoc);
   1416         }
   1417         break;
   1418 
   1419     case doPossessiveOpt:
   1420         // Possessive  ?+ quantifier.
   1421         //  Compiles to
   1422         //     1. STO_SP      loc
   1423         //     2. SAVE_STATE  5
   1424         //     3.    body of optional block
   1425         //     4. LD_SP       loc
   1426         //     5. ...
   1427         //
   1428         {
   1429             // Reserve two slots at the top of the block.
   1430             int32_t   topLoc = blockTopLoc(TRUE);
   1431             insertOp(topLoc);
   1432 
   1433             // Emit the STO_SP
   1434             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
   1435             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1436             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1437 
   1438             // Emit the SAVE_STATE
   1439             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
   1440             op = buildOp(URX_STATE_SAVE, continueLoc);
   1441             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1442 
   1443             // Emit the LD_SP
   1444             appendOp(URX_LD_SP, stoLoc);
   1445         }
   1446         break;
   1447 
   1448 
   1449     case doBeginMatchMode:
   1450         fNewModeFlags = fModeFlags;
   1451         fSetModeFlag  = TRUE;
   1452         break;
   1453 
   1454     case doMatchMode:   //  (?i)    and similar
   1455         {
   1456             int32_t  bit = 0;
   1457             switch (fC.fChar) {
   1458             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
   1459             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
   1460             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
   1461             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
   1462             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
   1463             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
   1464             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
   1465             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
   1466             default:
   1467                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
   1468                                    // by the scanner.
   1469             }
   1470             if (fSetModeFlag) {
   1471                 fNewModeFlags |= bit;
   1472             } else {
   1473                 fNewModeFlags &= ~bit;
   1474             }
   1475         }
   1476         break;
   1477 
   1478     case doSetMatchMode:
   1479         // Emit code to match any pending literals, using the not-yet changed match mode.
   1480         fixLiterals();
   1481 
   1482         // We've got a (?i) or similar.  The match mode is being changed, but
   1483         //   the change is not scoped to a parenthesized block.
   1484         U_ASSERT(fNewModeFlags < 0);
   1485         fModeFlags = fNewModeFlags;
   1486 
   1487         break;
   1488 
   1489 
   1490     case doMatchModeParen:
   1491         // We've got a (?i: or similar.  Begin a parenthesized block, save old
   1492         //   mode flags so they can be restored at the close of the block.
   1493         //
   1494         //   Compile to a
   1495         //      - NOP, which later may be replaced by a save-state if the
   1496         //         parenthesized group gets a * quantifier, followed by
   1497         //      - NOP, which may later be replaced by a save-state if there
   1498         //             is an '|' alternation within the parens.
   1499         {
   1500             fixLiterals(FALSE);
   1501             appendOp(URX_NOP, 0);
   1502             appendOp(URX_NOP, 0);
   1503 
   1504             // On the Parentheses stack, start a new frame and add the postions
   1505             //   of the two NOPs (a normal non-capturing () frame, except for the
   1506             //   saving of the orignal mode flags.)
   1507             fParenStack.push(fModeFlags, *fStatus);
   1508             fParenStack.push(flags, *fStatus);                            // Frame Marker
   1509             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
   1510             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
   1511 
   1512             // Set the current mode flags to the new values.
   1513             U_ASSERT(fNewModeFlags < 0);
   1514             fModeFlags = fNewModeFlags;
   1515         }
   1516         break;
   1517 
   1518     case doBadModeFlag:
   1519         error(U_REGEX_INVALID_FLAG);
   1520         break;
   1521 
   1522     case doSuppressComments:
   1523         // We have just scanned a '(?'.  We now need to prevent the character scanner from
   1524         // treating a '#' as a to-the-end-of-line comment.
   1525         //   (This Perl compatibility just gets uglier and uglier to do...)
   1526         fEOLComments = FALSE;
   1527         break;
   1528 
   1529 
   1530     case doSetAddAmp:
   1531         {
   1532           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1533           set->add(chAmp);
   1534         }
   1535         break;
   1536 
   1537     case doSetAddDash:
   1538         {
   1539           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1540           set->add(chDash);
   1541         }
   1542         break;
   1543 
   1544      case doSetBackslash_s:
   1545         {
   1546          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1547          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
   1548          break;
   1549         }
   1550 
   1551      case doSetBackslash_S:
   1552         {
   1553             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1554             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
   1555             SSet.complement();
   1556             set->addAll(SSet);
   1557             break;
   1558         }
   1559 
   1560     case doSetBackslash_d:
   1561         {
   1562             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1563             // TODO - make a static set, ticket 6058.
   1564             addCategory(set, U_GC_ND_MASK, *fStatus);
   1565             break;
   1566         }
   1567 
   1568     case doSetBackslash_D:
   1569         {
   1570             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1571             UnicodeSet digits;
   1572             // TODO - make a static set, ticket 6058.
   1573             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   1574             digits.complement();
   1575             set->addAll(digits);
   1576             break;
   1577         }
   1578 
   1579     case doSetBackslash_h:
   1580         {
   1581             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1582             UnicodeSet h;
   1583             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   1584             h.add((UChar32)9);   // Tab
   1585             set->addAll(h);
   1586             break;
   1587         }
   1588 
   1589     case doSetBackslash_H:
   1590         {
   1591             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1592             UnicodeSet h;
   1593             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   1594             h.add((UChar32)9);   // Tab
   1595             h.complement();
   1596             set->addAll(h);
   1597             break;
   1598         }
   1599 
   1600     case doSetBackslash_v:
   1601         {
   1602             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1603             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
   1604             set->add((UChar32)0x85);
   1605             set->add((UChar32)0x2028, (UChar32)0x2029);
   1606             break;
   1607         }
   1608 
   1609     case doSetBackslash_V:
   1610         {
   1611             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1612             UnicodeSet v;
   1613             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
   1614             v.add((UChar32)0x85);
   1615             v.add((UChar32)0x2028, (UChar32)0x2029);
   1616             v.complement();
   1617             set->addAll(v);
   1618             break;
   1619         }
   1620 
   1621     case doSetBackslash_w:
   1622         {
   1623             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1624             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
   1625             break;
   1626         }
   1627 
   1628     case doSetBackslash_W:
   1629         {
   1630             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1631             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
   1632             SSet.complement();
   1633             set->addAll(SSet);
   1634             break;
   1635         }
   1636 
   1637     case doSetBegin:
   1638         fixLiterals(FALSE);
   1639         fSetStack.push(new UnicodeSet(), *fStatus);
   1640         fSetOpStack.push(setStart, *fStatus);
   1641         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1642             fSetOpStack.push(setCaseClose, *fStatus);
   1643         }
   1644         break;
   1645 
   1646     case doSetBeginDifference1:
   1647         //  We have scanned something like [[abc]-[
   1648         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
   1649         //  Push a Difference operator, which will cause the new set to be subtracted from what
   1650         //    went before once it is created.
   1651         setPushOp(setDifference1);
   1652         fSetOpStack.push(setStart, *fStatus);
   1653         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1654             fSetOpStack.push(setCaseClose, *fStatus);
   1655         }
   1656         break;
   1657 
   1658     case doSetBeginIntersection1:
   1659         //  We have scanned something like  [[abc]&[
   1660         //   Need both the '&' operator and the open '[' operator.
   1661         setPushOp(setIntersection1);
   1662         fSetOpStack.push(setStart, *fStatus);
   1663         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1664             fSetOpStack.push(setCaseClose, *fStatus);
   1665         }
   1666         break;
   1667 
   1668     case doSetBeginUnion:
   1669         //  We have scanned something like  [[abc][
   1670         //     Need to handle the union operation explicitly [[abc] | [
   1671         setPushOp(setUnion);
   1672         fSetOpStack.push(setStart, *fStatus);
   1673         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1674             fSetOpStack.push(setCaseClose, *fStatus);
   1675         }
   1676         break;
   1677 
   1678     case doSetDifference2:
   1679         // We have scanned something like [abc--
   1680         //   Consider this to unambiguously be a set difference operator.
   1681         setPushOp(setDifference2);
   1682         break;
   1683 
   1684     case doSetEnd:
   1685         // Have encountered the ']' that closes a set.
   1686         //    Force the evaluation of any pending operations within this set,
   1687         //    leave the completed set on the top of the set stack.
   1688         setEval(setEnd);
   1689         U_ASSERT(fSetOpStack.peeki()==setStart);
   1690         fSetOpStack.popi();
   1691         break;
   1692 
   1693     case doSetFinish:
   1694         {
   1695         // Finished a complete set expression, including all nested sets.
   1696         //   The close bracket has already triggered clearing out pending set operators,
   1697         //    the operator stack should be empty and the operand stack should have just
   1698         //    one entry, the result set.
   1699         U_ASSERT(fSetOpStack.empty());
   1700         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
   1701         U_ASSERT(fSetStack.empty());
   1702         compileSet(theSet);
   1703         break;
   1704         }
   1705 
   1706     case doSetIntersection2:
   1707         // Have scanned something like [abc&&
   1708         setPushOp(setIntersection2);
   1709         break;
   1710 
   1711     case doSetLiteral:
   1712         // Union the just-scanned literal character into the set being built.
   1713         //    This operation is the highest precedence set operation, so we can always do
   1714         //    it immediately, without waiting to see what follows.  It is necessary to perform
   1715         //    any pending '-' or '&' operation first, because these have the same precedence
   1716         //    as union-ing in a literal'
   1717         {
   1718             setEval(setUnion);
   1719             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1720             s->add(fC.fChar);
   1721             fLastSetLiteral = fC.fChar;
   1722             break;
   1723         }
   1724 
   1725     case doSetLiteralEscaped:
   1726         // A back-slash escaped literal character was encountered.
   1727         // Processing is the same as with setLiteral, above, with the addition of
   1728         //  the optional check for errors on escaped ASCII letters.
   1729         {
   1730             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1731                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1732                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1733                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1734             }
   1735             setEval(setUnion);
   1736             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1737             s->add(fC.fChar);
   1738             fLastSetLiteral = fC.fChar;
   1739             break;
   1740         }
   1741 
   1742         case doSetNamedChar:
   1743         // Scanning a \N{UNICODE CHARACTER NAME}
   1744         //  Aside from the source of the character, the processing is identical to doSetLiteral,
   1745         //    above.
   1746         {
   1747             UChar32  c = scanNamedChar();
   1748             setEval(setUnion);
   1749             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1750             s->add(c);
   1751             fLastSetLiteral = c;
   1752             break;
   1753         }
   1754 
   1755     case doSetNamedRange:
   1756         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
   1757         // The left character is already in the set, and is saved in fLastSetLiteral.
   1758         // The right side needs to be picked up, the scan is at the 'N'.
   1759         // Lower Limit > Upper limit being an error matches both Java
   1760         //        and ICU UnicodeSet behavior.
   1761         {
   1762             UChar32  c = scanNamedChar();
   1763             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
   1764                 error(U_REGEX_INVALID_RANGE);
   1765             }
   1766             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1767             s->add(fLastSetLiteral, c);
   1768             fLastSetLiteral = c;
   1769             break;
   1770         }
   1771 
   1772 
   1773     case  doSetNegate:
   1774         // Scanned a '^' at the start of a set.
   1775         // Push the negation operator onto the set op stack.
   1776         // A twist for case-insensitive matching:
   1777         //   the case closure operation must happen _before_ negation.
   1778         //   But the case closure operation will already be on the stack if it's required.
   1779         //   This requires checking for case closure, and swapping the stack order
   1780         //    if it is present.
   1781         {
   1782             int32_t  tosOp = fSetOpStack.peeki();
   1783             if (tosOp == setCaseClose) {
   1784                 fSetOpStack.popi();
   1785                 fSetOpStack.push(setNegation, *fStatus);
   1786                 fSetOpStack.push(setCaseClose, *fStatus);
   1787             } else {
   1788                 fSetOpStack.push(setNegation, *fStatus);
   1789             }
   1790         }
   1791         break;
   1792 
   1793     case doSetNoCloseError:
   1794         error(U_REGEX_MISSING_CLOSE_BRACKET);
   1795         break;
   1796 
   1797     case doSetOpError:
   1798         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
   1799         break;
   1800 
   1801     case doSetPosixProp:
   1802         {
   1803             UnicodeSet *s = scanPosixProp();
   1804             if (s != NULL) {
   1805                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
   1806                 tos->addAll(*s);
   1807                 delete s;
   1808             }  // else error.  scanProp() reported the error status already.
   1809         }
   1810         break;
   1811 
   1812     case doSetProp:
   1813         //  Scanned a \p \P within [brackets].
   1814         {
   1815             UnicodeSet *s = scanProp();
   1816             if (s != NULL) {
   1817                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
   1818                 tos->addAll(*s);
   1819                 delete s;
   1820             }  // else error.  scanProp() reported the error status already.
   1821         }
   1822         break;
   1823 
   1824 
   1825     case doSetRange:
   1826         // We have scanned literal-literal.  Add the range to the set.
   1827         // The left character is already in the set, and is saved in fLastSetLiteral.
   1828         // The right side is the current character.
   1829         // Lower Limit > Upper limit being an error matches both Java
   1830         //        and ICU UnicodeSet behavior.
   1831         {
   1832 
   1833         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
   1834             error(U_REGEX_INVALID_RANGE);
   1835         }
   1836         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1837         s->add(fLastSetLiteral, fC.fChar);
   1838         break;
   1839         }
   1840 
   1841     default:
   1842         U_ASSERT(FALSE);
   1843         error(U_REGEX_INTERNAL_ERROR);
   1844         break;
   1845     }
   1846 
   1847     if (U_FAILURE(*fStatus)) {
   1848         returnVal = FALSE;
   1849     }
   1850 
   1851     return returnVal;
   1852 }
   1853 
   1854 
   1855 
   1856 //------------------------------------------------------------------------------
   1857 //
   1858 //   literalChar           We've encountered a literal character from the pattern,
   1859 //                             or an escape sequence that reduces to a character.
   1860 //                         Add it to the string containing all literal chars/strings from
   1861 //                             the pattern.
   1862 //
   1863 //------------------------------------------------------------------------------
   1864 void RegexCompile::literalChar(UChar32 c)  {
   1865     fLiteralChars.append(c);
   1866 }
   1867 
   1868 
   1869 //------------------------------------------------------------------------------
   1870 //
   1871 //    fixLiterals           When compiling something that can follow a literal
   1872 //                          string in a pattern, emit the code to match the
   1873 //                          accumulated literal string.
   1874 //
   1875 //                          Optionally, split the last char of the string off into
   1876 //                          a single "ONE_CHAR" operation, so that quantifiers can
   1877 //                          apply to that char alone.  Example:   abc*
   1878 //                          The * must apply to the 'c' only.
   1879 //
   1880 //------------------------------------------------------------------------------
   1881 void    RegexCompile::fixLiterals(UBool split) {
   1882 
   1883     // If no literal characters have been scanned but not yet had code generated
   1884     //   for them, nothing needs to be done.
   1885     if (fLiteralChars.length() == 0) {
   1886         return;
   1887     }
   1888 
   1889     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
   1890     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
   1891 
   1892     // Split:  We need to  ensure that the last item in the compiled pattern
   1893     //     refers only to the last literal scanned in the pattern, so that
   1894     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
   1895     //     Split before case folding for case insensitive matches.
   1896 
   1897     if (split) {
   1898         fLiteralChars.truncate(indexOfLastCodePoint);
   1899         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
   1900                               //  Note that the truncated literal string may be empty, in which case
   1901                               //  nothing will be emitted.
   1902 
   1903         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
   1904         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
   1905         return;
   1906     }
   1907 
   1908     // If we are doing case-insensitive matching, case fold the string.  This may expand
   1909     //   the string, e.g. the German sharp-s turns into "ss"
   1910     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1911         fLiteralChars.foldCase();
   1912         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
   1913         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
   1914     }
   1915 
   1916     if (indexOfLastCodePoint == 0) {
   1917         // Single character, emit a URX_ONECHAR op to match it.
   1918         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
   1919                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
   1920             appendOp(URX_ONECHAR_I, lastCodePoint);
   1921         } else {
   1922             appendOp(URX_ONECHAR, lastCodePoint);
   1923         }
   1924     } else {
   1925         // Two or more chars, emit a URX_STRING to match them.
   1926         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
   1927             error(U_REGEX_PATTERN_TOO_BIG);
   1928         }
   1929         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1930             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
   1931         } else {
   1932             // TODO here:  add optimization to split case sensitive strings of length two
   1933             //             into two single char ops, for efficiency.
   1934             appendOp(URX_STRING, fRXPat->fLiteralText.length());
   1935         }
   1936         appendOp(URX_STRING_LEN, fLiteralChars.length());
   1937 
   1938         // Add this string into the accumulated strings of the compiled pattern.
   1939         fRXPat->fLiteralText.append(fLiteralChars);
   1940     }
   1941 
   1942     fLiteralChars.remove();
   1943 }
   1944 
   1945 
   1946 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
   1947     if (U_FAILURE(*fStatus)) {
   1948         return 0;
   1949     }
   1950     if (type < 0 || type > 255) {
   1951         U_ASSERT(FALSE);
   1952         error(U_REGEX_INTERNAL_ERROR);
   1953         type = URX_RESERVED_OP;
   1954     }
   1955     if (val > 0x00ffffff) {
   1956         U_ASSERT(FALSE);
   1957         error(U_REGEX_INTERNAL_ERROR);
   1958         val = 0;
   1959     }
   1960     if (val < 0) {
   1961         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
   1962             U_ASSERT(FALSE);
   1963             error(U_REGEX_INTERNAL_ERROR);
   1964             return -1;
   1965         }
   1966         if (URX_TYPE(val) != 0xff) {
   1967             U_ASSERT(FALSE);
   1968             error(U_REGEX_INTERNAL_ERROR);
   1969             return -1;
   1970         }
   1971         type = URX_RESERVED_OP_N;
   1972     }
   1973     return (type << 24) | val;
   1974 }
   1975 
   1976 
   1977 //------------------------------------------------------------------------------
   1978 //
   1979 //   appendOp()             Append a new instruction onto the compiled pattern
   1980 //                          Includes error checking, limiting the size of the
   1981 //                          pattern to lengths that can be represented in the
   1982 //                          24 bit operand field of an instruction.
   1983 //
   1984 //------------------------------------------------------------------------------
   1985 void RegexCompile::appendOp(int32_t op) {
   1986     if (U_FAILURE(*fStatus)) {
   1987         return;
   1988     }
   1989     fRXPat->fCompiledPat->addElement(op, *fStatus);
   1990     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
   1991         error(U_REGEX_PATTERN_TOO_BIG);
   1992     }
   1993 }
   1994 
   1995 void RegexCompile::appendOp(int32_t type, int32_t val) {
   1996     appendOp(buildOp(type, val));
   1997 }
   1998 
   1999 
   2000 //------------------------------------------------------------------------------
   2001 //
   2002 //   insertOp()             Insert a slot for a new opcode into the already
   2003 //                          compiled pattern code.
   2004 //
   2005 //                          Fill the slot with a NOP.  Our caller will replace it
   2006 //                          with what they really wanted.
   2007 //
   2008 //------------------------------------------------------------------------------
   2009 void   RegexCompile::insertOp(int32_t where) {
   2010     UVector64 *code = fRXPat->fCompiledPat;
   2011     U_ASSERT(where>0 && where < code->size());
   2012 
   2013     int32_t  nop = buildOp(URX_NOP, 0);
   2014     code->insertElementAt(nop, where, *fStatus);
   2015 
   2016     // Walk through the pattern, looking for any ops with targets that
   2017     //  were moved down by the insert.  Fix them.
   2018     int32_t loc;
   2019     for (loc=0; loc<code->size(); loc++) {
   2020         int32_t op = (int32_t)code->elementAti(loc);
   2021         int32_t opType = URX_TYPE(op);
   2022         int32_t opValue = URX_VAL(op);
   2023         if ((opType == URX_JMP         ||
   2024             opType == URX_JMPX         ||
   2025             opType == URX_STATE_SAVE   ||
   2026             opType == URX_CTR_LOOP     ||
   2027             opType == URX_CTR_LOOP_NG  ||
   2028             opType == URX_JMP_SAV      ||
   2029             opType == URX_JMP_SAV_X    ||
   2030             opType == URX_RELOC_OPRND)    && opValue > where) {
   2031             // Target location for this opcode is after the insertion point and
   2032             //   needs to be incremented to adjust for the insertion.
   2033             opValue++;
   2034             op = buildOp(opType, opValue);
   2035             code->setElementAt(op, loc);
   2036         }
   2037     }
   2038 
   2039     // Now fix up the parentheses stack.  All positive values in it are locations in
   2040     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
   2041     for (loc=0; loc<fParenStack.size(); loc++) {
   2042         int32_t x = fParenStack.elementAti(loc);
   2043         U_ASSERT(x < code->size());
   2044         if (x>where) {
   2045             x++;
   2046             fParenStack.setElementAt(x, loc);
   2047         }
   2048     }
   2049 
   2050     if (fMatchCloseParen > where) {
   2051         fMatchCloseParen++;
   2052     }
   2053     if (fMatchOpenParen > where) {
   2054         fMatchOpenParen++;
   2055     }
   2056 }
   2057 
   2058 
   2059 //------------------------------------------------------------------------------
   2060 //
   2061 //   allocateData()        Allocate storage in the matcher's static data area.
   2062 //                         Return the index for the newly allocated data.
   2063 //                         The storage won't actually exist until we are running a match
   2064 //                         operation, but the storage indexes are inserted into various
   2065 //                         opcodes while compiling the pattern.
   2066 //
   2067 //------------------------------------------------------------------------------
   2068 int32_t RegexCompile::allocateData(int32_t size) {
   2069     if (U_FAILURE(*fStatus)) {
   2070         return 0;
   2071     }
   2072     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
   2073         error(U_REGEX_INTERNAL_ERROR);
   2074         return 0;
   2075     }
   2076     int32_t dataIndex = fRXPat->fDataSize;
   2077     fRXPat->fDataSize += size;
   2078     if (fRXPat->fDataSize >= 0x00fffff0) {
   2079         error(U_REGEX_INTERNAL_ERROR);
   2080     }
   2081     return dataIndex;
   2082 }
   2083 
   2084 
   2085 //------------------------------------------------------------------------------
   2086 //
   2087 //   allocateStackData()   Allocate space in the back-tracking stack frame.
   2088 //                         Return the index for the newly allocated data.
   2089 //                         The frame indexes are inserted into various
   2090 //                         opcodes while compiling the pattern, meaning that frame
   2091 //                         size must be restricted to the size that will fit
   2092 //                         as an operand (24 bits).
   2093 //
   2094 //------------------------------------------------------------------------------
   2095 int32_t RegexCompile::allocateStackData(int32_t size) {
   2096     if (U_FAILURE(*fStatus)) {
   2097         return 0;
   2098     }
   2099     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
   2100         error(U_REGEX_INTERNAL_ERROR);
   2101         return 0;
   2102     }
   2103     int32_t dataIndex = fRXPat->fFrameSize;
   2104     fRXPat->fFrameSize += size;
   2105     if (fRXPat->fFrameSize >= 0x00fffff0) {
   2106         error(U_REGEX_PATTERN_TOO_BIG);
   2107     }
   2108     return dataIndex;
   2109 }
   2110 
   2111 
   2112 //------------------------------------------------------------------------------
   2113 //
   2114 //   blockTopLoc()          Find or create a location in the compiled pattern
   2115 //                          at the start of the operation or block that has
   2116 //                          just been compiled.  Needed when a quantifier (* or
   2117 //                          whatever) appears, and we need to add an operation
   2118 //                          at the start of the thing being quantified.
   2119 //
   2120 //                          (Parenthesized Blocks) have a slot with a NOP that
   2121 //                          is reserved for this purpose.  .* or similar don't
   2122 //                          and a slot needs to be added.
   2123 //
   2124 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
   2125 //                                         at the returned location.
   2126 //                                 FALSE - just return the address,
   2127 //                                         do not reserve a location there.
   2128 //
   2129 //------------------------------------------------------------------------------
   2130 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
   2131     int32_t   theLoc;
   2132     fixLiterals(TRUE);  // Emit code for any pending literals.
   2133                         //   If last item was a string, emit separate op for the its last char.
   2134     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
   2135     {
   2136         // The item just processed is a parenthesized block.
   2137         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
   2138         U_ASSERT(theLoc > 0);
   2139         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
   2140     }
   2141     else {
   2142         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
   2143         // No slot for STATE_SAVE was pre-reserved in the compiled code.
   2144         // We need to make space now.
   2145         theLoc = fRXPat->fCompiledPat->size()-1;
   2146         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
   2147         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
   2148             // Strings take two opcode, we want the position of the first one.
   2149             // We can have a string at this point if a single character case-folded to two.
   2150             theLoc--;
   2151         }
   2152         if (reserveLoc) {
   2153             int32_t  nop = buildOp(URX_NOP, 0);
   2154             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
   2155         }
   2156     }
   2157     return theLoc;
   2158 }
   2159 
   2160 
   2161 
   2162 //------------------------------------------------------------------------------
   2163 //
   2164 //    handleCloseParen      When compiling a close paren, we need to go back
   2165 //                          and fix up any JMP or SAVE operations within the
   2166 //                          parenthesized block that need to target the end
   2167 //                          of the block.  The locations of these are kept on
   2168 //                          the paretheses stack.
   2169 //
   2170 //                          This function is called both when encountering a
   2171 //                          real ) and at the end of the pattern.
   2172 //
   2173 //------------------------------------------------------------------------------
   2174 void  RegexCompile::handleCloseParen() {
   2175     int32_t   patIdx;
   2176     int32_t   patOp;
   2177     if (fParenStack.size() <= 0) {
   2178         error(U_REGEX_MISMATCHED_PAREN);
   2179         return;
   2180     }
   2181 
   2182     // Emit code for any pending literals.
   2183     fixLiterals(FALSE);
   2184 
   2185     // Fixup any operations within the just-closed parenthesized group
   2186     //    that need to reference the end of the (block).
   2187     //    (The first one popped from the stack is an unused slot for
   2188     //     alternation (OR) state save, but applying the fixup to it does no harm.)
   2189     for (;;) {
   2190         patIdx = fParenStack.popi();
   2191         if (patIdx < 0) {
   2192             // value < 0 flags the start of the frame on the paren stack.
   2193             break;
   2194         }
   2195         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
   2196         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
   2197         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
   2198         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
   2199         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
   2200         fMatchOpenParen     = patIdx;
   2201     }
   2202 
   2203     //  At the close of any parenthesized block, restore the match mode flags  to
   2204     //  the value they had at the open paren.  Saved value is
   2205     //  at the top of the paren stack.
   2206     fModeFlags = fParenStack.popi();
   2207     U_ASSERT(fModeFlags < 0);
   2208 
   2209     // DO any additional fixups, depending on the specific kind of
   2210     // parentesized grouping this is
   2211 
   2212     switch (patIdx) {
   2213     case plain:
   2214     case flags:
   2215         // No additional fixups required.
   2216         //   (Grouping-only parentheses)
   2217         break;
   2218     case capturing:
   2219         // Capturing Parentheses.
   2220         //   Insert a End Capture op into the pattern.
   2221         //   The frame offset of the variables for this cg is obtained from the
   2222         //       start capture op and put it into the end-capture op.
   2223         {
   2224             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
   2225             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
   2226 
   2227             int32_t   frameVarLocation = URX_VAL(captureOp);
   2228             appendOp(URX_END_CAPTURE, frameVarLocation);
   2229         }
   2230         break;
   2231     case atomic:
   2232         // Atomic Parenthesis.
   2233         //   Insert a LD_SP operation to restore the state stack to the position
   2234         //   it was when the atomic parens were entered.
   2235         {
   2236             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
   2237             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
   2238             int32_t   stoLoc = URX_VAL(stoOp);
   2239             appendOp(URX_LD_SP, stoLoc);
   2240         }
   2241         break;
   2242 
   2243     case lookAhead:
   2244         {
   2245             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
   2246             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2247             int32_t dataLoc  = URX_VAL(startOp);
   2248             appendOp(URX_LA_END, dataLoc);
   2249         }
   2250         break;
   2251 
   2252     case negLookAhead:
   2253         {
   2254             // See comment at doOpenLookAheadNeg
   2255             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
   2256             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2257             int32_t dataLoc  = URX_VAL(startOp);
   2258             appendOp(URX_LA_END, dataLoc);
   2259             appendOp(URX_BACKTRACK, 0);
   2260             appendOp(URX_LA_END, dataLoc);
   2261 
   2262             // Patch the URX_SAVE near the top of the block.
   2263             // The destination of the SAVE is the final LA_END that was just added.
   2264             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
   2265             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
   2266             int32_t dest     = fRXPat->fCompiledPat->size()-1;
   2267             saveOp           = buildOp(URX_STATE_SAVE, dest);
   2268             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
   2269         }
   2270         break;
   2271 
   2272     case lookBehind:
   2273         {
   2274             // See comment at doOpenLookBehind.
   2275 
   2276             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
   2277             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
   2278             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2279             int32_t dataLoc  = URX_VAL(startOp);
   2280             appendOp(URX_LB_END, dataLoc);
   2281             appendOp(URX_LA_END, dataLoc);
   2282 
   2283             // Determine the min and max bounds for the length of the
   2284             //  string that the pattern can match.
   2285             //  An unbounded upper limit is an error.
   2286             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2287             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2288             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2289             if (URX_TYPE(maxML) != 0) {
   2290                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2291                 break;
   2292             }
   2293             if (maxML == INT32_MAX) {
   2294                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2295                 break;
   2296             }
   2297             U_ASSERT(minML <= maxML);
   2298 
   2299             // Insert the min and max match len bounds into the URX_LB_CONT op that
   2300             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2301             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
   2302             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
   2303 
   2304         }
   2305         break;
   2306 
   2307 
   2308 
   2309     case lookBehindN:
   2310         {
   2311             // See comment at doOpenLookBehindNeg.
   2312 
   2313             // Append the URX_LBN_END to the compiled pattern.
   2314             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
   2315             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2316             int32_t dataLoc  = URX_VAL(startOp);
   2317             appendOp(URX_LBN_END, dataLoc);
   2318 
   2319             // Determine the min and max bounds for the length of the
   2320             //  string that the pattern can match.
   2321             //  An unbounded upper limit is an error.
   2322             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2323             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2324             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2325             if (URX_TYPE(maxML) != 0) {
   2326                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2327                 break;
   2328             }
   2329             if (maxML == INT32_MAX) {
   2330                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2331                 break;
   2332             }
   2333             U_ASSERT(minML <= maxML);
   2334 
   2335             // Insert the min and max match len bounds into the URX_LB_CONT op that
   2336             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2337             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
   2338             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
   2339 
   2340             // Insert the pattern location to continue at after a successful match
   2341             //  as the last operand of the URX_LBN_CONT
   2342             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
   2343             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
   2344         }
   2345         break;
   2346 
   2347 
   2348 
   2349     default:
   2350         U_ASSERT(FALSE);
   2351     }
   2352 
   2353     // remember the next location in the compiled pattern.
   2354     // The compilation of Quantifiers will look at this to see whether its looping
   2355     //   over a parenthesized block or a single item
   2356     fMatchCloseParen = fRXPat->fCompiledPat->size();
   2357 }
   2358 
   2359 
   2360 
   2361 //------------------------------------------------------------------------------
   2362 //
   2363 //   compileSet       Compile the pattern operations for a reference to a
   2364 //                    UnicodeSet.
   2365 //
   2366 //------------------------------------------------------------------------------
   2367 void        RegexCompile::compileSet(UnicodeSet *theSet)
   2368 {
   2369     if (theSet == NULL) {
   2370         return;
   2371     }
   2372     //  Remove any strings from the set.
   2373     //  There shoudn't be any, but just in case.
   2374     //     (Case Closure can add them; if we had a simple case closure avaialble that
   2375     //      ignored strings, that would be better.)
   2376     theSet->removeAllStrings();
   2377     int32_t  setSize = theSet->size();
   2378 
   2379     switch (setSize) {
   2380     case 0:
   2381         {
   2382             // Set of no elements.   Always fails to match.
   2383             appendOp(URX_BACKTRACK, 0);
   2384             delete theSet;
   2385         }
   2386         break;
   2387 
   2388     case 1:
   2389         {
   2390             // The set contains only a single code point.  Put it into
   2391             //   the compiled pattern as a single char operation rather
   2392             //   than a set, and discard the set itself.
   2393             literalChar(theSet->charAt(0));
   2394             delete theSet;
   2395         }
   2396         break;
   2397 
   2398     default:
   2399         {
   2400             //  The set contains two or more chars.  (the normal case)
   2401             //  Put it into the compiled pattern as a set.
   2402             int32_t setNumber = fRXPat->fSets->size();
   2403             fRXPat->fSets->addElement(theSet, *fStatus);
   2404             appendOp(URX_SETREF, setNumber);
   2405         }
   2406     }
   2407 }
   2408 
   2409 
   2410 //------------------------------------------------------------------------------
   2411 //
   2412 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
   2413 //                      Except for the specific opcodes used, the code is the same
   2414 //                      for all three types (greedy, non-greedy, possessive) of
   2415 //                      intervals.  The opcodes are supplied as parameters.
   2416 //                      (There are two sets of opcodes - greedy & possessive use the
   2417 //                      same ones, while non-greedy has it's own.)
   2418 //
   2419 //                      The code for interval loops has this form:
   2420 //                         0  CTR_INIT   counter loc (in stack frame)
   2421 //                         1             5  patt address of CTR_LOOP at bottom of block
   2422 //                         2             min count
   2423 //                         3             max count   (-1 for unbounded)
   2424 //                         4  ...        block to be iterated over
   2425 //                         5  CTR_LOOP
   2426 //
   2427 //                       In
   2428 //------------------------------------------------------------------------------
   2429 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
   2430 {
   2431     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
   2432     //   four slots in the compiled code.  Reserve them.
   2433     int32_t   topOfBlock = blockTopLoc(TRUE);
   2434     insertOp(topOfBlock);
   2435     insertOp(topOfBlock);
   2436     insertOp(topOfBlock);
   2437 
   2438     // The operands for the CTR_INIT opcode include the index in the matcher data
   2439     //   of the counter.  Allocate it now. There are two data items
   2440     //        counterLoc   -->  Loop counter
   2441     //               +1    -->  Input index (for breaking non-progressing loops)
   2442     //                          (Only present if unbounded upper limit on loop)
   2443     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
   2444     int32_t   counterLoc = allocateStackData(dataSize);
   2445 
   2446     int32_t   op = buildOp(InitOp, counterLoc);
   2447     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
   2448 
   2449     // The second operand of CTR_INIT is the location following the end of the loop.
   2450     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
   2451     //   compilation of something later on causes the code to grow and the target
   2452     //   position to move.
   2453     int32_t loopEnd = fRXPat->fCompiledPat->size();
   2454     op = buildOp(URX_RELOC_OPRND, loopEnd);
   2455     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
   2456 
   2457     // Followed by the min and max counts.
   2458     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
   2459     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
   2460 
   2461     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
   2462     //   Goes at end of the block being looped over, so just append to the code so far.
   2463     appendOp(LoopOp, topOfBlock);
   2464 
   2465     if ((fIntervalLow & 0xff000000) != 0 ||
   2466         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
   2467             error(U_REGEX_NUMBER_TOO_BIG);
   2468         }
   2469 
   2470     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
   2471         error(U_REGEX_MAX_LT_MIN);
   2472     }
   2473 }
   2474 
   2475 
   2476 
   2477 UBool RegexCompile::compileInlineInterval() {
   2478     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
   2479         // Too big to inline.  Fail, which will cause looping code to be generated.
   2480         //   (Upper < Lower picks up unbounded upper and errors, both.)
   2481         return FALSE;
   2482     }
   2483 
   2484     int32_t   topOfBlock = blockTopLoc(FALSE);
   2485     if (fIntervalUpper == 0) {
   2486         // Pathological case.  Attempt no matches, as if the block doesn't exist.
   2487         // Discard the generated code for the block.
   2488         // If the block included parens, discard the info pertaining to them as well.
   2489         fRXPat->fCompiledPat->setSize(topOfBlock);
   2490         if (fMatchOpenParen >= topOfBlock) {
   2491             fMatchOpenParen = -1;
   2492         }
   2493         if (fMatchCloseParen >= topOfBlock) {
   2494             fMatchCloseParen = -1;
   2495         }
   2496         return TRUE;
   2497     }
   2498 
   2499     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
   2500         // The thing being repeated is not a single op, but some
   2501         //   more complex block.  Do it as a loop, not inlines.
   2502         //   Note that things "repeated" a max of once are handled as inline, because
   2503         //     the one copy of the code already generated is just fine.
   2504         return FALSE;
   2505     }
   2506 
   2507     // Pick up the opcode that is to be repeated
   2508     //
   2509     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
   2510 
   2511     // Compute the pattern location where the inline sequence
   2512     //   will end, and set up the state save op that will be needed.
   2513     //
   2514     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
   2515                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
   2516     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
   2517     if (fIntervalLow == 0) {
   2518         insertOp(topOfBlock);
   2519         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
   2520     }
   2521 
   2522 
   2523 
   2524     //  Loop, emitting the op for the thing being repeated each time.
   2525     //    Loop starts at 1 because one instance of the op already exists in the pattern,
   2526     //    it was put there when it was originally encountered.
   2527     int32_t i;
   2528     for (i=1; i<fIntervalUpper; i++ ) {
   2529         if (i >= fIntervalLow) {
   2530             appendOp(saveOp);
   2531         }
   2532         appendOp(op);
   2533     }
   2534     return TRUE;
   2535 }
   2536 
   2537 
   2538 
   2539 //------------------------------------------------------------------------------
   2540 //
   2541 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
   2542 //                         set of characters that could potentially begin a case-insensitive
   2543 //                         match of a string beginning with that character, using full Unicode
   2544 //                         case insensitive matching.
   2545 //
   2546 //          This is used in optimizing find().
   2547 //
   2548 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
   2549 //          misses cases like this:
   2550 //             A string from the pattern begins with 'ss' (although all we know
   2551 //                 in this context is that it begins with 's')
   2552 //             The pattern could match a string beginning with a German sharp-s
   2553 //
   2554 //           To the ordinary case closure for a character c, we add all other
   2555 //           characters cx where the case closure of cx incudes a string form that begins
   2556 //           with the original character c.
   2557 //
   2558 //           This function could be made smarter. The full pattern string is available
   2559 //           and it would be possible to verify that the extra characters being added
   2560 //           to the starting set fully match, rather than having just a first-char of the
   2561 //           folded form match.
   2562 //
   2563 //------------------------------------------------------------------------------
   2564 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
   2565 
   2566 // Machine Generated below.
   2567 // It may need updating with new versions of Unicode.
   2568 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
   2569 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
   2570 
   2571 // Machine Generated Data. Do not hand edit.
   2572     static const UChar32 RECaseFixCodePoints[] = {
   2573         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
   2574         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
   2575         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
   2576         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
   2577         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
   2578 
   2579     static const int16_t RECaseFixStringOffsets[] = {
   2580         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
   2581         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
   2582         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
   2583         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
   2584         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
   2585 
   2586     static const int16_t RECaseFixCounts[] = {
   2587         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
   2588         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
   2589         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
   2590         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
   2591         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
   2592 
   2593     static const UChar RECaseFixData[] = {
   2594         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
   2595         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
   2596         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
   2597         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
   2598         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
   2599         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
   2600         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
   2601         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
   2602         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
   2603         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
   2604         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
   2605 
   2606 // End of machine generated data.
   2607 
   2608     if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
   2609         // This function should never be called with an invalid input character.
   2610         U_ASSERT(FALSE);
   2611         starterChars->clear();
   2612     } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   2613         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
   2614         starterChars->set(caseFoldedC, caseFoldedC);
   2615 
   2616         int32_t i;
   2617         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
   2618             // Simple linear search through the sorted list of interesting code points.
   2619         }
   2620 
   2621         if (RECaseFixCodePoints[i] == c) {
   2622             int32_t dataIndex = RECaseFixStringOffsets[i];
   2623             int32_t numCharsToAdd = RECaseFixCounts[i];
   2624             UChar32 cpToAdd = 0;
   2625             for (int32_t j=0; j<numCharsToAdd; j++) {
   2626                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
   2627                 starterChars->add(cpToAdd);
   2628             }
   2629         }
   2630 
   2631         starterChars->closeOver(USET_CASE_INSENSITIVE);
   2632         starterChars->removeAllStrings();
   2633     } else {
   2634         // Not a cased character. Just return it alone.
   2635         starterChars->set(c, c);
   2636     }
   2637 }
   2638 
   2639 
   2640 // Increment with overflow check.
   2641 // val and delta will both be positive.
   2642 
   2643 static int32_t safeIncrement(int32_t val, int32_t delta) {
   2644     if (INT32_MAX - val > delta) {
   2645         return val + delta;
   2646     } else {
   2647         return INT32_MAX;
   2648     }
   2649 }
   2650 
   2651 
   2652 //------------------------------------------------------------------------------
   2653 //
   2654 //   matchStartType    Determine how a match can start.
   2655 //                     Used to optimize find() operations.
   2656 //
   2657 //                     Operation is very similar to minMatchLength().  Walk the compiled
   2658 //                     pattern, keeping an on-going minimum-match-length.  For any
   2659 //                     op where the min match coming in is zero, add that ops possible
   2660 //                     starting matches to the possible starts for the overall pattern.
   2661 //
   2662 //------------------------------------------------------------------------------
   2663 void   RegexCompile::matchStartType() {
   2664     if (U_FAILURE(*fStatus)) {
   2665         return;
   2666     }
   2667 
   2668 
   2669     int32_t    loc;                    // Location in the pattern of the current op being processed.
   2670     int32_t    op;                     // The op being processed
   2671     int32_t    opType;                 // The opcode type of the op
   2672     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
   2673     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
   2674 
   2675     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
   2676                                        //   could have advanced the position in a match.
   2677                                        //   (Maximum match length so far == 0)
   2678 
   2679     // forwardedLength is a vector holding minimum-match-length values that
   2680     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   2681     //   It must be one longer than the pattern being checked because some  ops
   2682     //   will jmp to a end-of-block+1 location from within a block, and we must
   2683     //   count those when checking the block.
   2684     int32_t end = fRXPat->fCompiledPat->size();
   2685     UVector32  forwardedLength(end+1, *fStatus);
   2686     forwardedLength.setSize(end+1);
   2687     for (loc=3; loc<end; loc++) {
   2688         forwardedLength.setElementAt(INT32_MAX, loc);
   2689     }
   2690 
   2691     for (loc = 3; loc<end; loc++) {
   2692         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   2693         opType = URX_TYPE(op);
   2694 
   2695         // The loop is advancing linearly through the pattern.
   2696         // If the op we are now at was the destination of a branch in the pattern,
   2697         // and that path has a shorter minimum length than the current accumulated value,
   2698         // replace the current accumulated value.
   2699         if (forwardedLength.elementAti(loc) < currentLen) {
   2700             currentLen = forwardedLength.elementAti(loc);
   2701             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   2702         }
   2703 
   2704         switch (opType) {
   2705             // Ops that don't change the total length matched
   2706         case URX_RESERVED_OP:
   2707         case URX_END:
   2708         case URX_FAIL:
   2709         case URX_STRING_LEN:
   2710         case URX_NOP:
   2711         case URX_START_CAPTURE:
   2712         case URX_END_CAPTURE:
   2713         case URX_BACKSLASH_B:
   2714         case URX_BACKSLASH_BU:
   2715         case URX_BACKSLASH_G:
   2716         case URX_BACKSLASH_Z:
   2717         case URX_DOLLAR:
   2718         case URX_DOLLAR_M:
   2719         case URX_DOLLAR_D:
   2720         case URX_DOLLAR_MD:
   2721         case URX_RELOC_OPRND:
   2722         case URX_STO_INP_LOC:
   2723         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   2724         case URX_BACKREF_I:
   2725 
   2726         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   2727         case URX_LD_SP:
   2728             break;
   2729 
   2730         case URX_CARET:
   2731             if (atStart) {
   2732                 fRXPat->fStartType = START_START;
   2733             }
   2734             break;
   2735 
   2736         case URX_CARET_M:
   2737         case URX_CARET_M_UNIX:
   2738             if (atStart) {
   2739                 fRXPat->fStartType = START_LINE;
   2740             }
   2741             break;
   2742 
   2743         case URX_ONECHAR:
   2744             if (currentLen == 0) {
   2745                 // This character could appear at the start of a match.
   2746                 //   Add it to the set of possible starting characters.
   2747                 fRXPat->fInitialChars->add(URX_VAL(op));
   2748                 numInitialStrings += 2;
   2749             }
   2750             currentLen = safeIncrement(currentLen, 1);
   2751             atStart = FALSE;
   2752             break;
   2753 
   2754 
   2755         case URX_SETREF:
   2756             if (currentLen == 0) {
   2757                 int32_t  sn = URX_VAL(op);
   2758                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2759                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
   2760                 fRXPat->fInitialChars->addAll(*s);
   2761                 numInitialStrings += 2;
   2762             }
   2763             currentLen = safeIncrement(currentLen, 1);
   2764             atStart = FALSE;
   2765             break;
   2766 
   2767         case URX_LOOP_SR_I:
   2768             // [Set]*, like a SETREF, above, in what it can match,
   2769             //  but may not match at all, so currentLen is not incremented.
   2770             if (currentLen == 0) {
   2771                 int32_t  sn = URX_VAL(op);
   2772                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2773                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
   2774                 fRXPat->fInitialChars->addAll(*s);
   2775                 numInitialStrings += 2;
   2776             }
   2777             atStart = FALSE;
   2778             break;
   2779 
   2780         case URX_LOOP_DOT_I:
   2781             if (currentLen == 0) {
   2782                 // .* at the start of a pattern.
   2783                 //    Any character can begin the match.
   2784                 fRXPat->fInitialChars->clear();
   2785                 fRXPat->fInitialChars->complement();
   2786                 numInitialStrings += 2;
   2787             }
   2788             atStart = FALSE;
   2789             break;
   2790 
   2791 
   2792         case URX_STATIC_SETREF:
   2793             if (currentLen == 0) {
   2794                 int32_t  sn = URX_VAL(op);
   2795                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
   2796                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
   2797                 fRXPat->fInitialChars->addAll(*s);
   2798                 numInitialStrings += 2;
   2799             }
   2800             currentLen = safeIncrement(currentLen, 1);
   2801             atStart = FALSE;
   2802             break;
   2803 
   2804 
   2805 
   2806         case URX_STAT_SETREF_N:
   2807             if (currentLen == 0) {
   2808                 int32_t  sn = URX_VAL(op);
   2809                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
   2810                 UnicodeSet sc(*s);
   2811                 sc.complement();
   2812                 fRXPat->fInitialChars->addAll(sc);
   2813                 numInitialStrings += 2;
   2814             }
   2815             currentLen = safeIncrement(currentLen, 1);
   2816             atStart = FALSE;
   2817             break;
   2818 
   2819 
   2820 
   2821         case URX_BACKSLASH_D:
   2822             // Digit Char
   2823              if (currentLen == 0) {
   2824                  UnicodeSet s;
   2825                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   2826                  if (URX_VAL(op) != 0) {
   2827                      s.complement();
   2828                  }
   2829                  fRXPat->fInitialChars->addAll(s);
   2830                  numInitialStrings += 2;
   2831             }
   2832             currentLen = safeIncrement(currentLen, 1);
   2833             atStart = FALSE;
   2834             break;
   2835 
   2836 
   2837         case URX_BACKSLASH_H:
   2838             // Horiz white space
   2839             if (currentLen == 0) {
   2840                 UnicodeSet s;
   2841                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   2842                 s.add((UChar32)9);   // Tab
   2843                 if (URX_VAL(op) != 0) {
   2844                     s.complement();
   2845                 }
   2846                 fRXPat->fInitialChars->addAll(s);
   2847                 numInitialStrings += 2;
   2848             }
   2849             currentLen = safeIncrement(currentLen, 1);
   2850             atStart = FALSE;
   2851             break;
   2852 
   2853 
   2854         case URX_BACKSLASH_R:       // Any line ending sequence
   2855         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
   2856             if (currentLen == 0) {
   2857                 UnicodeSet s;
   2858                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
   2859                 s.add((UChar32)0x85);
   2860                 s.add((UChar32)0x2028, (UChar32)0x2029);
   2861                 if (URX_VAL(op) != 0) {
   2862                      // Complement option applies to URX_BACKSLASH_V only.
   2863                      s.complement();
   2864                 }
   2865                 fRXPat->fInitialChars->addAll(s);
   2866                 numInitialStrings += 2;
   2867             }
   2868             currentLen = safeIncrement(currentLen, 1);
   2869             atStart = FALSE;
   2870             break;
   2871 
   2872 
   2873 
   2874         case URX_ONECHAR_I:
   2875             // Case Insensitive Single Character.
   2876             if (currentLen == 0) {
   2877                 UChar32  c = URX_VAL(op);
   2878                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   2879                     UnicodeSet starters(c, c);
   2880                     starters.closeOver(USET_CASE_INSENSITIVE);
   2881                     // findCaseInsensitiveStarters(c, &starters);
   2882                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
   2883                     //   The expanded folding can't match the pattern.
   2884                     fRXPat->fInitialChars->addAll(starters);
   2885                 } else {
   2886                     // Char has no case variants.  Just add it as-is to the
   2887                     //   set of possible starting chars.
   2888                     fRXPat->fInitialChars->add(c);
   2889                 }
   2890                 numInitialStrings += 2;
   2891             }
   2892             currentLen = safeIncrement(currentLen, 1);
   2893             atStart = FALSE;
   2894             break;
   2895 
   2896 
   2897         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   2898         case URX_DOTANY_ALL:    // . matches one or two.
   2899         case URX_DOTANY:
   2900         case URX_DOTANY_UNIX:
   2901             if (currentLen == 0) {
   2902                 // These constructs are all bad news when they appear at the start
   2903                 //   of a match.  Any character can begin the match.
   2904                 fRXPat->fInitialChars->clear();
   2905                 fRXPat->fInitialChars->complement();
   2906                 numInitialStrings += 2;
   2907             }
   2908             currentLen = safeIncrement(currentLen, 1);
   2909             atStart = FALSE;
   2910             break;
   2911 
   2912 
   2913         case URX_JMPX:
   2914             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
   2915             U_FALLTHROUGH;
   2916         case URX_JMP:
   2917             {
   2918                 int32_t  jmpDest = URX_VAL(op);
   2919                 if (jmpDest < loc) {
   2920                     // Loop of some kind.  Can safely ignore, the worst that will happen
   2921                     //  is that we understate the true minimum length
   2922                     currentLen = forwardedLength.elementAti(loc+1);
   2923 
   2924                 } else {
   2925                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   2926                     U_ASSERT(jmpDest <= end+1);
   2927                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
   2928                         forwardedLength.setElementAt(currentLen, jmpDest);
   2929                     }
   2930                 }
   2931             }
   2932             atStart = FALSE;
   2933             break;
   2934 
   2935         case URX_JMP_SAV:
   2936         case URX_JMP_SAV_X:
   2937             // Combo of state save to the next loc, + jmp backwards.
   2938             //   Net effect on min. length computation is nothing.
   2939             atStart = FALSE;
   2940             break;
   2941 
   2942         case URX_BACKTRACK:
   2943             // Fails are kind of like a branch, except that the min length was
   2944             //   propagated already, by the state save.
   2945             currentLen = forwardedLength.elementAti(loc+1);
   2946             atStart = FALSE;
   2947             break;
   2948 
   2949 
   2950         case URX_STATE_SAVE:
   2951             {
   2952                 // State Save, for forward jumps, propagate the current minimum.
   2953                 //             of the state save.
   2954                 int32_t  jmpDest = URX_VAL(op);
   2955                 if (jmpDest > loc) {
   2956                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
   2957                         forwardedLength.setElementAt(currentLen, jmpDest);
   2958                     }
   2959                 }
   2960             }
   2961             atStart = FALSE;
   2962             break;
   2963 
   2964 
   2965 
   2966 
   2967         case URX_STRING:
   2968             {
   2969                 loc++;
   2970                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   2971                 int32_t stringLen   = URX_VAL(stringLenOp);
   2972                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   2973                 U_ASSERT(stringLenOp >= 2);
   2974                 if (currentLen == 0) {
   2975                     // Add the starting character of this string to the set of possible starting
   2976                     //   characters for this pattern.
   2977                     int32_t stringStartIdx = URX_VAL(op);
   2978                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   2979                     fRXPat->fInitialChars->add(c);
   2980 
   2981                     // Remember this string.  After the entire pattern has been checked,
   2982                     //  if nothing else is identified that can start a match, we'll use it.
   2983                     numInitialStrings++;
   2984                     fRXPat->fInitialStringIdx = stringStartIdx;
   2985                     fRXPat->fInitialStringLen = stringLen;
   2986                 }
   2987 
   2988                 currentLen = safeIncrement(currentLen, stringLen);
   2989                 atStart = FALSE;
   2990             }
   2991             break;
   2992 
   2993         case URX_STRING_I:
   2994             {
   2995                 // Case-insensitive string.  Unlike exact-match strings, we won't
   2996                 //   attempt a string search for possible match positions.  But we
   2997                 //   do update the set of possible starting characters.
   2998                 loc++;
   2999                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3000                 int32_t stringLen   = URX_VAL(stringLenOp);
   3001                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   3002                 U_ASSERT(stringLenOp >= 2);
   3003                 if (currentLen == 0) {
   3004                     // Add the starting character of this string to the set of possible starting
   3005                     //   characters for this pattern.
   3006                     int32_t stringStartIdx = URX_VAL(op);
   3007                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   3008                     UnicodeSet s;
   3009                     findCaseInsensitiveStarters(c, &s);
   3010                     fRXPat->fInitialChars->addAll(s);
   3011                     numInitialStrings += 2;  // Matching on an initial string not possible.
   3012                 }
   3013                 currentLen = safeIncrement(currentLen, stringLen);
   3014                 atStart = FALSE;
   3015             }
   3016             break;
   3017 
   3018         case URX_CTR_INIT:
   3019         case URX_CTR_INIT_NG:
   3020             {
   3021                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
   3022                 //   so location must be updated accordingly.
   3023                 // Loop Init Ops.
   3024                 //   If the min loop count == 0
   3025                 //      move loc forwards to the end of the loop, skipping over the body.
   3026                 //   If the min count is > 0,
   3027                 //      continue normal processing of the body of the loop.
   3028                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
   3029                         loopEndLoc   = URX_VAL(loopEndLoc);
   3030                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
   3031                 if (minLoopCount == 0) {
   3032                     // Min Loop Count of 0, treat like a forward branch and
   3033                     //   move the current minimum length up to the target
   3034                     //   (end of loop) location.
   3035                     U_ASSERT(loopEndLoc <= end+1);
   3036                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
   3037                         forwardedLength.setElementAt(currentLen, loopEndLoc);
   3038                     }
   3039                 }
   3040                 loc+=3;  // Skips over operands of CTR_INIT
   3041             }
   3042             atStart = FALSE;
   3043             break;
   3044 
   3045 
   3046         case URX_CTR_LOOP:
   3047         case URX_CTR_LOOP_NG:
   3048             // Loop ops.
   3049             //  The jump is conditional, backwards only.
   3050             atStart = FALSE;
   3051             break;
   3052 
   3053         case URX_LOOP_C:
   3054             // More loop ops.  These state-save to themselves.
   3055             //   don't change the minimum match
   3056             atStart = FALSE;
   3057             break;
   3058 
   3059 
   3060         case URX_LA_START:
   3061         case URX_LB_START:
   3062             {
   3063                 // Look-around.  Scan forward until the matching look-ahead end,
   3064                 //   without processing the look-around block.  This is overly pessimistic.
   3065 
   3066                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
   3067                 //   lookahead contains two LA_END instructions, so count goes up by two
   3068                 //   for each LA_START.
   3069                 int32_t  depth = (opType == URX_LA_START? 2: 1);
   3070                 for (;;) {
   3071                     loc++;
   3072                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3073                     if (URX_TYPE(op) == URX_LA_START) {
   3074                         depth+=2;
   3075                     }
   3076                     if (URX_TYPE(op) == URX_LB_START) {
   3077                         depth++;
   3078                     }
   3079                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
   3080                         depth--;
   3081                         if (depth == 0) {
   3082                             break;
   3083                         }
   3084                     }
   3085                     if (URX_TYPE(op) == URX_STATE_SAVE) {
   3086                         // Need this because neg lookahead blocks will FAIL to outside
   3087                         //   of the block.
   3088                         int32_t  jmpDest = URX_VAL(op);
   3089                         if (jmpDest > loc) {
   3090                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3091                                 forwardedLength.setElementAt(currentLen, jmpDest);
   3092                             }
   3093                         }
   3094                     }
   3095                     U_ASSERT(loc <= end);
   3096                 }
   3097             }
   3098             break;
   3099 
   3100         case URX_LA_END:
   3101         case URX_LB_CONT:
   3102         case URX_LB_END:
   3103         case URX_LBN_CONT:
   3104         case URX_LBN_END:
   3105             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
   3106                                  //  consumed by the scan in URX_LA_START and LB_START
   3107 
   3108             break;
   3109 
   3110         default:
   3111             U_ASSERT(FALSE);
   3112             }
   3113 
   3114         }
   3115 
   3116 
   3117     // We have finished walking through the ops.  Check whether some forward jump
   3118     //   propagated a shorter length to location end+1.
   3119     if (forwardedLength.elementAti(end+1) < currentLen) {
   3120         currentLen = forwardedLength.elementAti(end+1);
   3121     }
   3122 
   3123 
   3124     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
   3125 
   3126 
   3127     // Sort out what we should check for when looking for candidate match start positions.
   3128     // In order of preference,
   3129     //     1.   Start of input text buffer.
   3130     //     2.   A literal string.
   3131     //     3.   Start of line in multi-line mode.
   3132     //     4.   A single literal character.
   3133     //     5.   A character from a set of characters.
   3134     //
   3135     if (fRXPat->fStartType == START_START) {
   3136         // Match only at the start of an input text string.
   3137         //    start type is already set.  We're done.
   3138     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
   3139         // Match beginning only with a literal string.
   3140         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
   3141         U_ASSERT(fRXPat->fInitialChars->contains(c));
   3142         fRXPat->fStartType   = START_STRING;
   3143         fRXPat->fInitialChar = c;
   3144     } else if (fRXPat->fStartType == START_LINE) {
   3145         // Match at start of line in Multi-Line mode.
   3146         // Nothing to do here; everything is already set.
   3147     } else if (fRXPat->fMinMatchLen == 0) {
   3148         // Zero length match possible.  We could start anywhere.
   3149         fRXPat->fStartType = START_NO_INFO;
   3150     } else if (fRXPat->fInitialChars->size() == 1) {
   3151         // All matches begin with the same char.
   3152         fRXPat->fStartType   = START_CHAR;
   3153         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
   3154         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
   3155     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
   3156         fRXPat->fMinMatchLen > 0) {
   3157         // Matches start with a set of character smaller than the set of all chars.
   3158         fRXPat->fStartType = START_SET;
   3159     } else {
   3160         // Matches can start with anything
   3161         fRXPat->fStartType = START_NO_INFO;
   3162     }
   3163 
   3164     return;
   3165 }
   3166 
   3167 
   3168 
   3169 //------------------------------------------------------------------------------
   3170 //
   3171 //   minMatchLength    Calculate the length of the shortest string that could
   3172 //                     match the specified pattern.
   3173 //                     Length is in 16 bit code units, not code points.
   3174 //
   3175 //                     The calculated length may not be exact.  The returned
   3176 //                     value may be shorter than the actual minimum; it must
   3177 //                     never be longer.
   3178 //
   3179 //                     start and end are the range of p-code operations to be
   3180 //                     examined.  The endpoints are included in the range.
   3181 //
   3182 //------------------------------------------------------------------------------
   3183 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
   3184     if (U_FAILURE(*fStatus)) {
   3185         return 0;
   3186     }
   3187 
   3188     U_ASSERT(start <= end);
   3189     U_ASSERT(end < fRXPat->fCompiledPat->size());
   3190 
   3191 
   3192     int32_t    loc;
   3193     int32_t    op;
   3194     int32_t    opType;
   3195     int32_t    currentLen = 0;
   3196 
   3197 
   3198     // forwardedLength is a vector holding minimum-match-length values that
   3199     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   3200     //   It must be one longer than the pattern being checked because some  ops
   3201     //   will jmp to a end-of-block+1 location from within a block, and we must
   3202     //   count those when checking the block.
   3203     UVector32  forwardedLength(end+2, *fStatus);
   3204     forwardedLength.setSize(end+2);
   3205     for (loc=start; loc<=end+1; loc++) {
   3206         forwardedLength.setElementAt(INT32_MAX, loc);
   3207     }
   3208 
   3209     for (loc = start; loc<=end; loc++) {
   3210         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3211         opType = URX_TYPE(op);
   3212 
   3213         // The loop is advancing linearly through the pattern.
   3214         // If the op we are now at was the destination of a branch in the pattern,
   3215         // and that path has a shorter minimum length than the current accumulated value,
   3216         // replace the current accumulated value.
   3217         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
   3218                                                                //   no-match-possible cases.
   3219         if (forwardedLength.elementAti(loc) < currentLen) {
   3220             currentLen = forwardedLength.elementAti(loc);
   3221             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   3222         }
   3223 
   3224         switch (opType) {
   3225             // Ops that don't change the total length matched
   3226         case URX_RESERVED_OP:
   3227         case URX_END:
   3228         case URX_STRING_LEN:
   3229         case URX_NOP:
   3230         case URX_START_CAPTURE:
   3231         case URX_END_CAPTURE:
   3232         case URX_BACKSLASH_B:
   3233         case URX_BACKSLASH_BU:
   3234         case URX_BACKSLASH_G:
   3235         case URX_BACKSLASH_Z:
   3236         case URX_CARET:
   3237         case URX_DOLLAR:
   3238         case URX_DOLLAR_M:
   3239         case URX_DOLLAR_D:
   3240         case URX_DOLLAR_MD:
   3241         case URX_RELOC_OPRND:
   3242         case URX_STO_INP_LOC:
   3243         case URX_CARET_M:
   3244         case URX_CARET_M_UNIX:
   3245         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   3246         case URX_BACKREF_I:
   3247 
   3248         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   3249         case URX_LD_SP:
   3250 
   3251         case URX_JMP_SAV:
   3252         case URX_JMP_SAV_X:
   3253             break;
   3254 
   3255 
   3256             // Ops that match a minimum of one character (one or two 16 bit code units.)
   3257             //
   3258         case URX_ONECHAR:
   3259         case URX_STATIC_SETREF:
   3260         case URX_STAT_SETREF_N:
   3261         case URX_SETREF:
   3262         case URX_BACKSLASH_D:
   3263         case URX_BACKSLASH_H:
   3264         case URX_BACKSLASH_R:
   3265         case URX_BACKSLASH_V:
   3266         case URX_ONECHAR_I:
   3267         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   3268         case URX_DOTANY_ALL:    // . matches one or two.
   3269         case URX_DOTANY:
   3270         case URX_DOTANY_UNIX:
   3271             currentLen = safeIncrement(currentLen, 1);
   3272             break;
   3273 
   3274 
   3275         case URX_JMPX:
   3276             loc++;              // URX_JMPX has an extra operand, ignored here,
   3277                                 //   otherwise processed identically to URX_JMP.
   3278             U_FALLTHROUGH;
   3279         case URX_JMP:
   3280             {
   3281                 int32_t  jmpDest = URX_VAL(op);
   3282                 if (jmpDest < loc) {
   3283                     // Loop of some kind.  Can safely ignore, the worst that will happen
   3284                     //  is that we understate the true minimum length
   3285                     currentLen = forwardedLength.elementAti(loc+1);
   3286                 } else {
   3287                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   3288                     U_ASSERT(jmpDest <= end+1);
   3289                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
   3290                         forwardedLength.setElementAt(currentLen, jmpDest);
   3291                     }
   3292                 }
   3293             }
   3294             break;
   3295 
   3296         case URX_BACKTRACK:
   3297             {
   3298                 // Back-tracks are kind of like a branch, except that the min length was
   3299                 //   propagated already, by the state save.
   3300                 currentLen = forwardedLength.elementAti(loc+1);
   3301             }
   3302             break;
   3303 
   3304 
   3305         case URX_STATE_SAVE:
   3306             {
   3307                 // State Save, for forward jumps, propagate the current minimum.
   3308                 //             of the state save.
   3309                 int32_t  jmpDest = URX_VAL(op);
   3310                 if (jmpDest > loc) {
   3311                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3312                         forwardedLength.setElementAt(currentLen, jmpDest);
   3313                     }
   3314                 }
   3315             }
   3316             break;
   3317 
   3318 
   3319         case URX_STRING:
   3320             {
   3321                 loc++;
   3322                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3323                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3324             }
   3325             break;
   3326 
   3327 
   3328         case URX_STRING_I:
   3329             {
   3330                 loc++;
   3331                 // TODO: with full case folding, matching input text may be shorter than
   3332                 //       the string we have here.  More smarts could put some bounds on it.
   3333                 //       Assume a min length of one for now.  A min length of zero causes
   3334                 //        optimization failures for a pattern like "string"+
   3335                 // currentLen += URX_VAL(stringLenOp);
   3336                 currentLen = safeIncrement(currentLen, 1);
   3337             }
   3338             break;
   3339 
   3340         case URX_CTR_INIT:
   3341         case URX_CTR_INIT_NG:
   3342             {
   3343                 // Loop Init Ops.
   3344                 //   If the min loop count == 0
   3345                 //      move loc forwards to the end of the loop, skipping over the body.
   3346                 //   If the min count is > 0,
   3347                 //      continue normal processing of the body of the loop.
   3348                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
   3349                         loopEndLoc   = URX_VAL(loopEndLoc);
   3350                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
   3351                 if (minLoopCount == 0) {
   3352                     loc = loopEndLoc;
   3353                 } else {
   3354                     loc+=3;  // Skips over operands of CTR_INIT
   3355                 }
   3356             }
   3357             break;
   3358 
   3359 
   3360         case URX_CTR_LOOP:
   3361         case URX_CTR_LOOP_NG:
   3362             // Loop ops.
   3363             //  The jump is conditional, backwards only.
   3364             break;
   3365 
   3366         case URX_LOOP_SR_I:
   3367         case URX_LOOP_DOT_I:
   3368         case URX_LOOP_C:
   3369             // More loop ops.  These state-save to themselves.
   3370             //   don't change the minimum match - could match nothing at all.
   3371             break;
   3372 
   3373 
   3374         case URX_LA_START:
   3375         case URX_LB_START:
   3376             {
   3377                 // Look-around.  Scan forward until the matching look-ahead end,
   3378                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
   3379                 //   it assumes that the look-ahead match might be zero-length.
   3380                 //   TODO:  Positive lookahead could recursively do the block, then continue
   3381                 //          with the longer of the block or the value coming in.  Ticket 6060
   3382                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
   3383                 for (;;) {
   3384                     loc++;
   3385                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3386                     if (URX_TYPE(op) == URX_LA_START) {
   3387                         // The boilerplate for look-ahead includes two LA_END insturctions,
   3388                         //    Depth will be decremented by each one when it is seen.
   3389                         depth += 2;
   3390                     }
   3391                     if (URX_TYPE(op) == URX_LB_START) {
   3392                         depth++;
   3393                     }
   3394                     if (URX_TYPE(op) == URX_LA_END) {
   3395                         depth--;
   3396                         if (depth == 0) {
   3397                             break;
   3398                         }
   3399                     }
   3400                     if (URX_TYPE(op)==URX_LBN_END) {
   3401                         depth--;
   3402                         if (depth == 0) {
   3403                             break;
   3404                         }
   3405                     }
   3406                     if (URX_TYPE(op) == URX_STATE_SAVE) {
   3407                         // Need this because neg lookahead blocks will FAIL to outside
   3408                         //   of the block.
   3409                         int32_t  jmpDest = URX_VAL(op);
   3410                         if (jmpDest > loc) {
   3411                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3412                                 forwardedLength.setElementAt(currentLen, jmpDest);
   3413                             }
   3414                         }
   3415                     }
   3416                     U_ASSERT(loc <= end);
   3417                 }
   3418             }
   3419             break;
   3420 
   3421         case URX_LA_END:
   3422         case URX_LB_CONT:
   3423         case URX_LB_END:
   3424         case URX_LBN_CONT:
   3425         case URX_LBN_END:
   3426             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
   3427             //   range being sized, which happens when measuring size of look-behind blocks.
   3428             break;
   3429 
   3430         default:
   3431             U_ASSERT(FALSE);
   3432             }
   3433 
   3434         }
   3435 
   3436     // We have finished walking through the ops.  Check whether some forward jump
   3437     //   propagated a shorter length to location end+1.
   3438     if (forwardedLength.elementAti(end+1) < currentLen) {
   3439         currentLen = forwardedLength.elementAti(end+1);
   3440         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   3441     }
   3442 
   3443     return currentLen;
   3444 }
   3445 
   3446 //------------------------------------------------------------------------------
   3447 //
   3448 //   maxMatchLength    Calculate the length of the longest string that could
   3449 //                     match the specified pattern.
   3450 //                     Length is in 16 bit code units, not code points.
   3451 //
   3452 //                     The calculated length may not be exact.  The returned
   3453 //                     value may be longer than the actual maximum; it must
   3454 //                     never be shorter.
   3455 //
   3456 //------------------------------------------------------------------------------
   3457 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
   3458     if (U_FAILURE(*fStatus)) {
   3459         return 0;
   3460     }
   3461     U_ASSERT(start <= end);
   3462     U_ASSERT(end < fRXPat->fCompiledPat->size());
   3463 
   3464 
   3465     int32_t    loc;
   3466     int32_t    op;
   3467     int32_t    opType;
   3468     int32_t    currentLen = 0;
   3469     UVector32  forwardedLength(end+1, *fStatus);
   3470     forwardedLength.setSize(end+1);
   3471 
   3472     for (loc=start; loc<=end; loc++) {
   3473         forwardedLength.setElementAt(0, loc);
   3474     }
   3475 
   3476     for (loc = start; loc<=end; loc++) {
   3477         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3478         opType = URX_TYPE(op);
   3479 
   3480         // The loop is advancing linearly through the pattern.
   3481         // If the op we are now at was the destination of a branch in the pattern,
   3482         // and that path has a longer maximum length than the current accumulated value,
   3483         // replace the current accumulated value.
   3484         if (forwardedLength.elementAti(loc) > currentLen) {
   3485             currentLen = forwardedLength.elementAti(loc);
   3486         }
   3487 
   3488         switch (opType) {
   3489             // Ops that don't change the total length matched
   3490         case URX_RESERVED_OP:
   3491         case URX_END:
   3492         case URX_STRING_LEN:
   3493         case URX_NOP:
   3494         case URX_START_CAPTURE:
   3495         case URX_END_CAPTURE:
   3496         case URX_BACKSLASH_B:
   3497         case URX_BACKSLASH_BU:
   3498         case URX_BACKSLASH_G:
   3499         case URX_BACKSLASH_Z:
   3500         case URX_CARET:
   3501         case URX_DOLLAR:
   3502         case URX_DOLLAR_M:
   3503         case URX_DOLLAR_D:
   3504         case URX_DOLLAR_MD:
   3505         case URX_RELOC_OPRND:
   3506         case URX_STO_INP_LOC:
   3507         case URX_CARET_M:
   3508         case URX_CARET_M_UNIX:
   3509 
   3510         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   3511         case URX_LD_SP:
   3512 
   3513         case URX_LB_END:
   3514         case URX_LB_CONT:
   3515         case URX_LBN_CONT:
   3516         case URX_LBN_END:
   3517             break;
   3518 
   3519 
   3520             // Ops that increase that cause an unbounded increase in the length
   3521             //   of a matched string, or that increase it a hard to characterize way.
   3522             //   Call the max length unbounded, and stop further checking.
   3523         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   3524         case URX_BACKREF_I:
   3525         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   3526             currentLen = INT32_MAX;
   3527             break;
   3528 
   3529 
   3530             // Ops that match a max of one character (possibly two 16 bit code units.)
   3531             //
   3532         case URX_STATIC_SETREF:
   3533         case URX_STAT_SETREF_N:
   3534         case URX_SETREF:
   3535         case URX_BACKSLASH_D:
   3536         case URX_BACKSLASH_H:
   3537         case URX_BACKSLASH_R:
   3538         case URX_BACKSLASH_V:
   3539         case URX_ONECHAR_I:
   3540         case URX_DOTANY_ALL:
   3541         case URX_DOTANY:
   3542         case URX_DOTANY_UNIX:
   3543             currentLen = safeIncrement(currentLen, 2);
   3544             break;
   3545 
   3546             // Single literal character.  Increase current max length by one or two,
   3547             //       depending on whether the char is in the supplementary range.
   3548         case URX_ONECHAR:
   3549             currentLen = safeIncrement(currentLen, 1);
   3550             if (URX_VAL(op) > 0x10000) {
   3551                 currentLen = safeIncrement(currentLen, 1);
   3552             }
   3553             break;
   3554 
   3555             // Jumps.
   3556             //
   3557         case URX_JMP:
   3558         case URX_JMPX:
   3559         case URX_JMP_SAV:
   3560         case URX_JMP_SAV_X:
   3561             {
   3562                 int32_t  jmpDest = URX_VAL(op);
   3563                 if (jmpDest < loc) {
   3564                     // Loop of some kind.  Max match length is unbounded.
   3565                     currentLen = INT32_MAX;
   3566                 } else {
   3567                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   3568                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
   3569                         forwardedLength.setElementAt(currentLen, jmpDest);
   3570                     }
   3571                     currentLen = 0;
   3572                 }
   3573             }
   3574             break;
   3575 
   3576         case URX_BACKTRACK:
   3577             // back-tracks are kind of like a branch, except that the max length was
   3578             //   propagated already, by the state save.
   3579             currentLen = forwardedLength.elementAti(loc+1);
   3580             break;
   3581 
   3582 
   3583         case URX_STATE_SAVE:
   3584             {
   3585                 // State Save, for forward jumps, propagate the current minimum.
   3586                 //               of the state save.
   3587                 //             For backwards jumps, they create a loop, maximum
   3588                 //               match length is unbounded.
   3589                 int32_t  jmpDest = URX_VAL(op);
   3590                 if (jmpDest > loc) {
   3591                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
   3592                         forwardedLength.setElementAt(currentLen, jmpDest);
   3593                     }
   3594                 } else {
   3595                     currentLen = INT32_MAX;
   3596                 }
   3597             }
   3598             break;
   3599 
   3600 
   3601 
   3602 
   3603         case URX_STRING:
   3604             {
   3605                 loc++;
   3606                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3607                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3608                 break;
   3609             }
   3610 
   3611         case URX_STRING_I:
   3612             // TODO:  This code assumes that any user string that matches will be no longer
   3613             //        than our compiled string, with case insensitive matching.
   3614             //        Our compiled string has been case-folded already.
   3615             //
   3616             //        Any matching user string will have no more code points than our
   3617             //        compiled (folded) string.  Folding may add code points, but
   3618             //        not remove them.
   3619             //
   3620             //        There is a potential problem if a supplemental code point
   3621             //        case-folds to a BMP code point.  In this case our compiled string
   3622             //        could be shorter (in code units) than a matching user string.
   3623             //
   3624             //        At this time (Unicode 6.1) there are no such characters, and this case
   3625             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
   3626             //        any problematic characters are added to Unicode.
   3627             //
   3628             //        If this happens, we can make a set of the BMP chars that the
   3629             //        troublesome supplementals fold to, scan our string, and bump the
   3630             //        currentLen one extra for each that is found.
   3631             //
   3632             {
   3633                 loc++;
   3634                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3635                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3636             }
   3637             break;
   3638 
   3639         case URX_CTR_INIT:
   3640         case URX_CTR_INIT_NG:
   3641             // For Loops, recursively call this function on the pattern for the loop body,
   3642             //   then multiply the result by the maximum loop count.
   3643             {
   3644                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
   3645                 if (loopEndLoc == loc+4) {
   3646                     // Loop has an empty body. No affect on max match length.
   3647                     // Continue processing with code after the loop end.
   3648                     loc = loopEndLoc;
   3649                     break;
   3650                 }
   3651 
   3652                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
   3653                 if (maxLoopCount == -1) {
   3654                     // Unbounded Loop. No upper bound on match length.
   3655                     currentLen = INT32_MAX;
   3656                     break;
   3657                 }
   3658 
   3659                 U_ASSERT(loopEndLoc >= loc+4);
   3660                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
   3661                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
   3662                 if (updatedLen >= INT32_MAX) {
   3663                     currentLen = INT32_MAX;
   3664                     break;
   3665                 }
   3666                 currentLen = (int32_t)updatedLen;
   3667                 loc = loopEndLoc;
   3668                 break;
   3669             }
   3670 
   3671         case URX_CTR_LOOP:
   3672         case URX_CTR_LOOP_NG:
   3673             // These opcodes will be skipped over by code for URX_CRT_INIT.
   3674             // We shouldn't encounter them here.
   3675             U_ASSERT(FALSE);
   3676             break;
   3677 
   3678         case URX_LOOP_SR_I:
   3679         case URX_LOOP_DOT_I:
   3680         case URX_LOOP_C:
   3681             // For anything to do with loops, make the match length unbounded.
   3682             currentLen = INT32_MAX;
   3683             break;
   3684 
   3685 
   3686 
   3687         case URX_LA_START:
   3688         case URX_LA_END:
   3689             // Look-ahead.  Just ignore, treat the look-ahead block as if
   3690             // it were normal pattern.  Gives a too-long match length,
   3691             //  but good enough for now.
   3692             break;
   3693 
   3694             // End of look-ahead ops should always be consumed by the processing at
   3695             //  the URX_LA_START op.
   3696             // U_ASSERT(FALSE);
   3697             // break;
   3698 
   3699         case URX_LB_START:
   3700             {
   3701                 // Look-behind.  Scan forward until the matching look-around end,
   3702                 //   without processing the look-behind block.
   3703                 int32_t  depth = 0;
   3704                 for (;;) {
   3705                     loc++;
   3706                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3707                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
   3708                         depth++;
   3709                     }
   3710                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
   3711                         if (depth == 0) {
   3712                             break;
   3713                         }
   3714                         depth--;
   3715                     }
   3716                     U_ASSERT(loc < end);
   3717                 }
   3718             }
   3719             break;
   3720 
   3721         default:
   3722             U_ASSERT(FALSE);
   3723         }
   3724 
   3725 
   3726         if (currentLen == INT32_MAX) {
   3727             //  The maximum length is unbounded.
   3728             //  Stop further processing of the pattern.
   3729             break;
   3730         }
   3731 
   3732     }
   3733     return currentLen;
   3734 
   3735 }
   3736 
   3737 
   3738 //------------------------------------------------------------------------------
   3739 //
   3740 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
   3741 //                Extra NOPs are inserted for some constructs during the initial
   3742 //                code generation to provide locations that may be patched later.
   3743 //                Many end up unneeded, and are removed by this function.
   3744 //
   3745 //                In order to minimize the number of passes through the pattern,
   3746 //                back-reference fixup is also performed here (adjusting
   3747 //                back-reference operands to point to the correct frame offsets).
   3748 //
   3749 //------------------------------------------------------------------------------
   3750 void RegexCompile::stripNOPs() {
   3751 
   3752     if (U_FAILURE(*fStatus)) {
   3753         return;
   3754     }
   3755 
   3756     int32_t    end = fRXPat->fCompiledPat->size();
   3757     UVector32  deltas(end, *fStatus);
   3758 
   3759     // Make a first pass over the code, computing the amount that things
   3760     //   will be offset at each location in the original code.
   3761     int32_t   loc;
   3762     int32_t   d = 0;
   3763     for (loc=0; loc<end; loc++) {
   3764         deltas.addElement(d, *fStatus);
   3765         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3766         if (URX_TYPE(op) == URX_NOP) {
   3767             d++;
   3768         }
   3769     }
   3770 
   3771     UnicodeString caseStringBuffer;
   3772 
   3773     // Make a second pass over the code, removing the NOPs by moving following
   3774     //  code up, and patching operands that refer to code locations that
   3775     //  are being moved.  The array of offsets from the first step is used
   3776     //  to compute the new operand values.
   3777     int32_t src;
   3778     int32_t dst = 0;
   3779     for (src=0; src<end; src++) {
   3780         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
   3781         int32_t opType = URX_TYPE(op);
   3782         switch (opType) {
   3783         case URX_NOP:
   3784             break;
   3785 
   3786         case URX_STATE_SAVE:
   3787         case URX_JMP:
   3788         case URX_CTR_LOOP:
   3789         case URX_CTR_LOOP_NG:
   3790         case URX_RELOC_OPRND:
   3791         case URX_JMPX:
   3792         case URX_JMP_SAV:
   3793         case URX_JMP_SAV_X:
   3794             // These are instructions with operands that refer to code locations.
   3795             {
   3796                 int32_t  operandAddress = URX_VAL(op);
   3797                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
   3798                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
   3799                 op = buildOp(opType, fixedOperandAddress);
   3800                 fRXPat->fCompiledPat->setElementAt(op, dst);
   3801                 dst++;
   3802                 break;
   3803             }
   3804 
   3805         case URX_BACKREF:
   3806         case URX_BACKREF_I:
   3807             {
   3808                 int32_t where = URX_VAL(op);
   3809                 if (where > fRXPat->fGroupMap->size()) {
   3810                     error(U_REGEX_INVALID_BACK_REF);
   3811                     break;
   3812                 }
   3813                 where = fRXPat->fGroupMap->elementAti(where-1);
   3814                 op    = buildOp(opType, where);
   3815                 fRXPat->fCompiledPat->setElementAt(op, dst);
   3816                 dst++;
   3817 
   3818                 fRXPat->fNeedsAltInput = TRUE;
   3819                 break;
   3820             }
   3821         case URX_RESERVED_OP:
   3822         case URX_RESERVED_OP_N:
   3823         case URX_BACKTRACK:
   3824         case URX_END:
   3825         case URX_ONECHAR:
   3826         case URX_STRING:
   3827         case URX_STRING_LEN:
   3828         case URX_START_CAPTURE:
   3829         case URX_END_CAPTURE:
   3830         case URX_STATIC_SETREF:
   3831         case URX_STAT_SETREF_N:
   3832         case URX_SETREF:
   3833         case URX_DOTANY:
   3834         case URX_FAIL:
   3835         case URX_BACKSLASH_B:
   3836         case URX_BACKSLASH_BU:
   3837         case URX_BACKSLASH_G:
   3838         case URX_BACKSLASH_X:
   3839         case URX_BACKSLASH_Z:
   3840         case URX_DOTANY_ALL:
   3841         case URX_BACKSLASH_D:
   3842         case URX_CARET:
   3843         case URX_DOLLAR:
   3844         case URX_CTR_INIT:
   3845         case URX_CTR_INIT_NG:
   3846         case URX_DOTANY_UNIX:
   3847         case URX_STO_SP:
   3848         case URX_LD_SP:
   3849         case URX_STO_INP_LOC:
   3850         case URX_LA_START:
   3851         case URX_LA_END:
   3852         case URX_ONECHAR_I:
   3853         case URX_STRING_I:
   3854         case URX_DOLLAR_M:
   3855         case URX_CARET_M:
   3856         case URX_CARET_M_UNIX:
   3857         case URX_LB_START:
   3858         case URX_LB_CONT:
   3859         case URX_LB_END:
   3860         case URX_LBN_CONT:
   3861         case URX_LBN_END:
   3862         case URX_LOOP_SR_I:
   3863         case URX_LOOP_DOT_I:
   3864         case URX_LOOP_C:
   3865         case URX_DOLLAR_D:
   3866         case URX_DOLLAR_MD:
   3867         case URX_BACKSLASH_H:
   3868         case URX_BACKSLASH_R:
   3869         case URX_BACKSLASH_V:
   3870             // These instructions are unaltered by the relocation.
   3871             fRXPat->fCompiledPat->setElementAt(op, dst);
   3872             dst++;
   3873             break;
   3874 
   3875         default:
   3876             // Some op is unaccounted for.
   3877             U_ASSERT(FALSE);
   3878             error(U_REGEX_INTERNAL_ERROR);
   3879         }
   3880     }
   3881 
   3882     fRXPat->fCompiledPat->setSize(dst);
   3883 }
   3884 
   3885 
   3886 
   3887 
   3888 //------------------------------------------------------------------------------
   3889 //
   3890 //  Error         Report a rule parse error.
   3891 //                Only report it if no previous error has been recorded.
   3892 //
   3893 //------------------------------------------------------------------------------
   3894 void RegexCompile::error(UErrorCode e) {
   3895     if (U_SUCCESS(*fStatus)) {
   3896         *fStatus = e;
   3897         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
   3898         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
   3899         // int64_t. If the values of the latter are out of range for the former,
   3900         // set them to the appropriate "field not supported" values.
   3901         if (fLineNum > 0x7FFFFFFF) {
   3902             fParseErr->line   = 0;
   3903             fParseErr->offset = -1;
   3904         } else if (fCharNum > 0x7FFFFFFF) {
   3905             fParseErr->line   = (int32_t)fLineNum;
   3906             fParseErr->offset = -1;
   3907         } else {
   3908             fParseErr->line   = (int32_t)fLineNum;
   3909             fParseErr->offset = (int32_t)fCharNum;
   3910         }
   3911 
   3912         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
   3913 
   3914         // Fill in the context.
   3915         //   Note: extractBetween() pins supplied indicies to the string bounds.
   3916         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
   3917         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
   3918         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
   3919         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
   3920     }
   3921 }
   3922 
   3923 
   3924 //
   3925 //  Assorted Unicode character constants.
   3926 //     Numeric because there is no portable way to enter them as literals.
   3927 //     (Think EBCDIC).
   3928 //
   3929 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
   3930 static const UChar      chLF        = 0x0a;      // Line Feed
   3931 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
   3932 static const UChar      chDigit0    = 0x30;      // '0'
   3933 static const UChar      chDigit7    = 0x37;      // '9'
   3934 static const UChar      chColon     = 0x3A;      // ':'
   3935 static const UChar      chE         = 0x45;      // 'E'
   3936 static const UChar      chQ         = 0x51;      // 'Q'
   3937 //static const UChar      chN         = 0x4E;      // 'N'
   3938 static const UChar      chP         = 0x50;      // 'P'
   3939 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
   3940 //static const UChar      chLBracket  = 0x5b;      // '['
   3941 static const UChar      chRBracket  = 0x5d;      // ']'
   3942 static const UChar      chUp        = 0x5e;      // '^'
   3943 static const UChar      chLowerP    = 0x70;
   3944 static const UChar      chLBrace    = 0x7b;      // '{'
   3945 static const UChar      chRBrace    = 0x7d;      // '}'
   3946 static const UChar      chNEL       = 0x85;      //    NEL newline variant
   3947 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
   3948 
   3949 
   3950 //------------------------------------------------------------------------------
   3951 //
   3952 //  nextCharLL    Low Level Next Char from the regex pattern.
   3953 //                Get a char from the string, keep track of input position
   3954 //                     for error reporting.
   3955 //
   3956 //------------------------------------------------------------------------------
   3957 UChar32  RegexCompile::nextCharLL() {
   3958     UChar32       ch;
   3959 
   3960     if (fPeekChar != -1) {
   3961         ch = fPeekChar;
   3962         fPeekChar = -1;
   3963         return ch;
   3964     }
   3965 
   3966     // assume we're already in the right place
   3967     ch = UTEXT_NEXT32(fRXPat->fPattern);
   3968     if (ch == U_SENTINEL) {
   3969         return ch;
   3970     }
   3971 
   3972     if (ch == chCR ||
   3973         ch == chNEL ||
   3974         ch == chLS   ||
   3975         (ch == chLF && fLastChar != chCR)) {
   3976         // Character is starting a new line.  Bump up the line number, and
   3977         //  reset the column to 0.
   3978         fLineNum++;
   3979         fCharNum=0;
   3980     }
   3981     else {
   3982         // Character is not starting a new line.  Except in the case of a
   3983         //   LF following a CR, increment the column position.
   3984         if (ch != chLF) {
   3985             fCharNum++;
   3986         }
   3987     }
   3988     fLastChar = ch;
   3989     return ch;
   3990 }
   3991 
   3992 //------------------------------------------------------------------------------
   3993 //
   3994 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
   3995 //                 character without actually getting it.
   3996 //
   3997 //------------------------------------------------------------------------------
   3998 UChar32  RegexCompile::peekCharLL() {
   3999     if (fPeekChar == -1) {
   4000         fPeekChar = nextCharLL();
   4001     }
   4002     return fPeekChar;
   4003 }
   4004 
   4005 
   4006 //------------------------------------------------------------------------------
   4007 //
   4008 //   nextChar     for pattern scanning.  At this level, we handle stripping
   4009 //                out comments and processing some backslash character escapes.
   4010 //                The rest of the pattern grammar is handled at the next level up.
   4011 //
   4012 //------------------------------------------------------------------------------
   4013 void RegexCompile::nextChar(RegexPatternChar &c) {
   4014 
   4015     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4016     c.fChar    = nextCharLL();
   4017     c.fQuoted  = FALSE;
   4018 
   4019     if (fQuoteMode) {
   4020         c.fQuoted = TRUE;
   4021         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
   4022             c.fChar == (UChar32)-1) {
   4023             fQuoteMode = FALSE;  //  Exit quote mode,
   4024             nextCharLL();        // discard the E
   4025             nextChar(c);         // recurse to get the real next char
   4026         }
   4027     }
   4028     else if (fInBackslashQuote) {
   4029         // The current character immediately follows a '\'
   4030         // Don't check for any further escapes, just return it as-is.
   4031         // Don't set c.fQuoted, because that would prevent the state machine from
   4032         //    dispatching on the character.
   4033         fInBackslashQuote = FALSE;
   4034     }
   4035     else
   4036     {
   4037         // We are not in a \Q quoted region \E of the source.
   4038         //
   4039         if (fModeFlags & UREGEX_COMMENTS) {
   4040             //
   4041             // We are in free-spacing and comments mode.
   4042             //  Scan through any white space and comments, until we
   4043             //  reach a significant character or the end of inut.
   4044             for (;;) {
   4045                 if (c.fChar == (UChar32)-1) {
   4046                     break;     // End of Input
   4047                 }
   4048                 if  (c.fChar == chPound && fEOLComments == TRUE) {
   4049                     // Start of a comment.  Consume the rest of it, until EOF or a new line
   4050                     for (;;) {
   4051                         c.fChar = nextCharLL();
   4052                         if (c.fChar == (UChar32)-1 ||  // EOF
   4053                             c.fChar == chCR        ||
   4054                             c.fChar == chLF        ||
   4055                             c.fChar == chNEL       ||
   4056                             c.fChar == chLS)       {
   4057                             break;
   4058                         }
   4059                     }
   4060                 }
   4061                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
   4062                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
   4063                     break;
   4064                 }
   4065                 c.fChar = nextCharLL();
   4066             }
   4067         }
   4068 
   4069         //
   4070         //  check for backslash escaped characters.
   4071         //
   4072         if (c.fChar == chBackSlash) {
   4073             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4074             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
   4075                 //
   4076                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
   4077                 //   Includes \uxxxx, \n, \r, many others.
   4078                 //   Return the single equivalent character.
   4079                 //
   4080                 nextCharLL();                 // get & discard the peeked char.
   4081                 c.fQuoted = TRUE;
   4082 
   4083                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
   4084                     int32_t endIndex = (int32_t)pos;
   4085                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
   4086 
   4087                     if (endIndex == pos) {
   4088                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4089                     }
   4090                     fCharNum += endIndex - pos;
   4091                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
   4092                 } else {
   4093                     int32_t offset = 0;
   4094                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
   4095 
   4096                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
   4097                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
   4098 
   4099                     if (offset == 0) {
   4100                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4101                     } else if (context.lastOffset == offset) {
   4102                         UTEXT_PREVIOUS32(fRXPat->fPattern);
   4103                     } else if (context.lastOffset != offset-1) {
   4104                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
   4105                     }
   4106                     fCharNum += offset;
   4107                 }
   4108             }
   4109             else if (peekCharLL() == chDigit0) {
   4110                 //  Octal Escape, using Java Regexp Conventions
   4111                 //    which are \0 followed by 1-3 octal digits.
   4112                 //    Different from ICU Unescape handling of Octal, which does not
   4113                 //    require the leading 0.
   4114                 //  Java also has the convention of only consuming 2 octal digits if
   4115                 //    the three digit number would be > 0xff
   4116                 //
   4117                 c.fChar = 0;
   4118                 nextCharLL();    // Consume the initial 0.
   4119                 int index;
   4120                 for (index=0; index<3; index++) {
   4121                     int32_t ch = peekCharLL();
   4122                     if (ch<chDigit0 || ch>chDigit7) {
   4123                         if (index==0) {
   4124                            // \0 is not followed by any octal digits.
   4125                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4126                         }
   4127                         break;
   4128                     }
   4129                     c.fChar <<= 3;
   4130                     c.fChar += ch&7;
   4131                     if (c.fChar <= 255) {
   4132                         nextCharLL();
   4133                     } else {
   4134                         // The last digit made the number too big.  Forget we saw it.
   4135                         c.fChar >>= 3;
   4136                     }
   4137                 }
   4138                 c.fQuoted = TRUE;
   4139             }
   4140             else if (peekCharLL() == chQ) {
   4141                 //  "\Q"  enter quote mode, which will continue until "\E"
   4142                 fQuoteMode = TRUE;
   4143                 nextCharLL();       // discard the 'Q'.
   4144                 nextChar(c);        // recurse to get the real next char.
   4145             }
   4146             else
   4147             {
   4148                 // We are in a '\' escape that will be handled by the state table scanner.
   4149                 // Just return the backslash, but remember that the following char is to
   4150                 //  be taken literally.
   4151                 fInBackslashQuote = TRUE;
   4152             }
   4153         }
   4154     }
   4155 
   4156     // re-enable # to end-of-line comments, in case they were disabled.
   4157     // They are disabled by the parser upon seeing '(?', but this lasts for
   4158     //  the fetching of the next character only.
   4159     fEOLComments = TRUE;
   4160 
   4161     // putc(c.fChar, stdout);
   4162 }
   4163 
   4164 
   4165 
   4166 //------------------------------------------------------------------------------
   4167 //
   4168 //  scanNamedChar
   4169 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
   4170 //
   4171 //             The scan position will be at the 'N'.  On return
   4172 //             the scan position should be just after the '}'
   4173 //
   4174 //             Return the UChar32
   4175 //
   4176 //------------------------------------------------------------------------------
   4177 UChar32  RegexCompile::scanNamedChar() {
   4178     if (U_FAILURE(*fStatus)) {
   4179         return 0;
   4180     }
   4181 
   4182     nextChar(fC);
   4183     if (fC.fChar != chLBrace) {
   4184         error(U_REGEX_PROPERTY_SYNTAX);
   4185         return 0;
   4186     }
   4187 
   4188     UnicodeString  charName;
   4189     for (;;) {
   4190         nextChar(fC);
   4191         if (fC.fChar == chRBrace) {
   4192             break;
   4193         }
   4194         if (fC.fChar == -1) {
   4195             error(U_REGEX_PROPERTY_SYNTAX);
   4196             return 0;
   4197         }
   4198         charName.append(fC.fChar);
   4199     }
   4200 
   4201     char name[100];
   4202     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
   4203          (uint32_t)charName.length()>=sizeof(name)) {
   4204         // All Unicode character names have only invariant characters.
   4205         // The API to get a character, given a name, accepts only char *, forcing us to convert,
   4206         //   which requires this error check
   4207         error(U_REGEX_PROPERTY_SYNTAX);
   4208         return 0;
   4209     }
   4210     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
   4211 
   4212     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
   4213     if (U_FAILURE(*fStatus)) {
   4214         error(U_REGEX_PROPERTY_SYNTAX);
   4215     }
   4216 
   4217     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
   4218     return theChar;
   4219 }
   4220 
   4221 //------------------------------------------------------------------------------
   4222 //
   4223 //  scanProp   Construct a UnicodeSet from the text at the current scan
   4224 //             position, which will be of the form \p{whaterver}
   4225 //
   4226 //             The scan position will be at the 'p' or 'P'.  On return
   4227 //             the scan position should be just after the '}'
   4228 //
   4229 //             Return a UnicodeSet, constructed from the \P pattern,
   4230 //             or NULL if the pattern is invalid.
   4231 //
   4232 //------------------------------------------------------------------------------
   4233 UnicodeSet *RegexCompile::scanProp() {
   4234     UnicodeSet    *uset = NULL;
   4235 
   4236     if (U_FAILURE(*fStatus)) {
   4237         return NULL;
   4238     }
   4239     (void)chLowerP;   // Suppress compiler unused variable warning.
   4240     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
   4241     UBool negated = (fC.fChar == chP);
   4242 
   4243     UnicodeString propertyName;
   4244     nextChar(fC);
   4245     if (fC.fChar != chLBrace) {
   4246         error(U_REGEX_PROPERTY_SYNTAX);
   4247         return NULL;
   4248     }
   4249     for (;;) {
   4250         nextChar(fC);
   4251         if (fC.fChar == chRBrace) {
   4252             break;
   4253         }
   4254         if (fC.fChar == -1) {
   4255             // Hit the end of the input string without finding the closing '}'
   4256             error(U_REGEX_PROPERTY_SYNTAX);
   4257             return NULL;
   4258         }
   4259         propertyName.append(fC.fChar);
   4260     }
   4261     uset = createSetForProperty(propertyName, negated);
   4262     nextChar(fC);    // Move input scan to position following the closing '}'
   4263     return uset;
   4264 }
   4265 
   4266 //------------------------------------------------------------------------------
   4267 //
   4268 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
   4269 //             position, which is expected be of the form [:property expression:]
   4270 //
   4271 //             The scan position will be at the opening ':'.  On return
   4272 //             the scan position must be on the closing ']'
   4273 //
   4274 //             Return a UnicodeSet constructed from the pattern,
   4275 //             or NULL if this is not a valid POSIX-style set expression.
   4276 //             If not a property expression, restore the initial scan position
   4277 //                (to the opening ':')
   4278 //
   4279 //               Note:  the opening '[:' is not sufficient to guarantee that
   4280 //                      this is a [:property:] expression.
   4281 //                      [:'+=,] is a perfectly good ordinary set expression that
   4282 //                              happens to include ':' as one of its characters.
   4283 //
   4284 //------------------------------------------------------------------------------
   4285 UnicodeSet *RegexCompile::scanPosixProp() {
   4286     UnicodeSet    *uset = NULL;
   4287 
   4288     if (U_FAILURE(*fStatus)) {
   4289         return NULL;
   4290     }
   4291 
   4292     U_ASSERT(fC.fChar == chColon);
   4293 
   4294     // Save the scanner state.
   4295     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
   4296     int64_t     savedScanIndex        = fScanIndex;
   4297     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4298     UBool       savedQuoteMode        = fQuoteMode;
   4299     UBool       savedInBackslashQuote = fInBackslashQuote;
   4300     UBool       savedEOLComments      = fEOLComments;
   4301     int64_t     savedLineNum          = fLineNum;
   4302     int64_t     savedCharNum          = fCharNum;
   4303     UChar32     savedLastChar         = fLastChar;
   4304     UChar32     savedPeekChar         = fPeekChar;
   4305     RegexPatternChar savedfC          = fC;
   4306 
   4307     // Scan for a closing ].   A little tricky because there are some perverse
   4308     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
   4309     //   ending on the second closing ].
   4310 
   4311     UnicodeString propName;
   4312     UBool         negated  = FALSE;
   4313 
   4314     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
   4315     nextChar(fC);
   4316     if (fC.fChar == chUp) {
   4317        negated = TRUE;
   4318        nextChar(fC);
   4319     }
   4320 
   4321     // Scan for the closing ":]", collecting the property name along the way.
   4322     UBool  sawPropSetTerminator = FALSE;
   4323     for (;;) {
   4324         propName.append(fC.fChar);
   4325         nextChar(fC);
   4326         if (fC.fQuoted || fC.fChar == -1) {
   4327             // Escaped characters or end of input - either says this isn't a [:Property:]
   4328             break;
   4329         }
   4330         if (fC.fChar == chColon) {
   4331             nextChar(fC);
   4332             if (fC.fChar == chRBracket) {
   4333                 sawPropSetTerminator = TRUE;
   4334             }
   4335             break;
   4336         }
   4337     }
   4338 
   4339     if (sawPropSetTerminator) {
   4340         uset = createSetForProperty(propName, negated);
   4341     }
   4342     else
   4343     {
   4344         // No closing ":]".
   4345         //  Restore the original scan position.
   4346         //  The main scanner will retry the input as a normal set expression,
   4347         //    not a [:Property:] expression.
   4348         fScanIndex        = savedScanIndex;
   4349         fQuoteMode        = savedQuoteMode;
   4350         fInBackslashQuote = savedInBackslashQuote;
   4351         fEOLComments      = savedEOLComments;
   4352         fLineNum          = savedLineNum;
   4353         fCharNum          = savedCharNum;
   4354         fLastChar         = savedLastChar;
   4355         fPeekChar         = savedPeekChar;
   4356         fC                = savedfC;
   4357         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
   4358     }
   4359     return uset;
   4360 }
   4361 
   4362 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
   4363     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
   4364     addCategory(set, U_GC_CF_MASK, ec);
   4365 }
   4366 
   4367 //
   4368 //  Create a Unicode Set from a Unicode Property expression.
   4369 //     This is common code underlying both \p{...} ane [:...:] expressions.
   4370 //     Includes trying the Java "properties" that aren't supported as
   4371 //     normal ICU UnicodeSet properties
   4372 //
   4373 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
   4374 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
   4375 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
   4376     UnicodeString   setExpr;
   4377     UnicodeSet      *set;
   4378     uint32_t        usetFlags = 0;
   4379 
   4380     if (U_FAILURE(*fStatus)) {
   4381         return NULL;
   4382     }
   4383 
   4384     //
   4385     //  First try the property as we received it
   4386     //
   4387     if (negated) {
   4388         setExpr.append(negSetPrefix, -1);
   4389     } else {
   4390         setExpr.append(posSetPrefix, -1);
   4391     }
   4392     setExpr.append(propName);
   4393     setExpr.append(chRBrace);
   4394     setExpr.append(chRBracket);
   4395     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   4396         usetFlags |= USET_CASE_INSENSITIVE;
   4397     }
   4398     set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
   4399     if (U_SUCCESS(*fStatus)) {
   4400        return set;
   4401     }
   4402     delete set;
   4403     set = NULL;
   4404 
   4405     //
   4406     //  The property as it was didn't work.
   4407 
   4408     //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX
   4409     //     or standard Java, but many other regular expression packages do recognize it.
   4410 
   4411     if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
   4412         *fStatus = U_ZERO_ERROR;
   4413         set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
   4414         if (set == NULL) {
   4415             *fStatus = U_MEMORY_ALLOCATION_ERROR;
   4416             return set;
   4417         }
   4418         if (negated) {
   4419             set->complement();
   4420         }
   4421         return set;
   4422     }
   4423 
   4424 
   4425     //    Do Java fixes -
   4426     //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
   4427     //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
   4428     //
   4429     //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
   4430     //                        is accepted by Java.  The property part of the name is compared
   4431     //                        case-insenstively.  The spaces must be exactly as shown, either
   4432     //                        all there, or all omitted, with exactly one at each position
   4433     //                        if they are present.  From checking against JDK 1.6
   4434     //
   4435     //       This code should be removed when ICU properties support the Java  compatibility names
   4436     //          (ICU 4.0?)
   4437     //
   4438     UnicodeString mPropName = propName;
   4439     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
   4440         mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
   4441     }
   4442     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
   4443         mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
   4444         mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
   4445     }
   4446     else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
   4447         mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
   4448     }
   4449 
   4450     //    See if the property looks like a Java "InBlockName", which
   4451     //    we will recast as "Block=BlockName"
   4452     //
   4453     static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
   4454     static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
   4455     if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
   4456         setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
   4457         setExpr.append(BLOCK, -1);
   4458         setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
   4459         setExpr.append(chRBrace);
   4460         setExpr.append(chRBracket);
   4461         *fStatus = U_ZERO_ERROR;
   4462         set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
   4463         if (U_SUCCESS(*fStatus)) {
   4464             return set;
   4465         }
   4466         delete set;
   4467         set = NULL;
   4468     }
   4469 
   4470     if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
   4471         propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
   4472     {
   4473         UErrorCode localStatus = U_ZERO_ERROR;
   4474         //setExpr.remove();
   4475         set = new UnicodeSet();
   4476         //
   4477         //  Try the various Java specific properties.
   4478         //   These all begin with "java"
   4479         //
   4480         if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
   4481             addCategory(set, U_GC_CN_MASK, localStatus);
   4482             set->complement();
   4483         }
   4484         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
   4485             addCategory(set, U_GC_ND_MASK, localStatus);
   4486         }
   4487         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
   4488             addIdentifierIgnorable(set, localStatus);
   4489         }
   4490         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
   4491             set->add(0, 0x1F).add(0x7F, 0x9F);
   4492         }
   4493         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
   4494             addCategory(set, U_GC_L_MASK, localStatus);
   4495             addCategory(set, U_GC_SC_MASK, localStatus);
   4496             addCategory(set, U_GC_PC_MASK, localStatus);
   4497             addCategory(set, U_GC_ND_MASK, localStatus);
   4498             addCategory(set, U_GC_NL_MASK, localStatus);
   4499             addCategory(set, U_GC_MC_MASK, localStatus);
   4500             addCategory(set, U_GC_MN_MASK, localStatus);
   4501             addIdentifierIgnorable(set, localStatus);
   4502         }
   4503         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
   4504             addCategory(set, U_GC_L_MASK, localStatus);
   4505             addCategory(set, U_GC_NL_MASK, localStatus);
   4506             addCategory(set, U_GC_SC_MASK, localStatus);
   4507             addCategory(set, U_GC_PC_MASK, localStatus);
   4508         }
   4509         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
   4510             addCategory(set, U_GC_L_MASK, localStatus);
   4511         }
   4512         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
   4513             addCategory(set, U_GC_L_MASK, localStatus);
   4514             addCategory(set, U_GC_ND_MASK, localStatus);
   4515         }
   4516         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
   4517             addCategory(set, U_GC_LL_MASK, localStatus);
   4518         }
   4519         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
   4520             set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
   4521         }
   4522         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
   4523             addCategory(set, U_GC_Z_MASK, localStatus);
   4524         }
   4525         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
   4526             set->add(0x10000, UnicodeSet::MAX_VALUE);
   4527         }
   4528         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
   4529             addCategory(set, U_GC_LT_MASK, localStatus);
   4530         }
   4531         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
   4532             addCategory(set, U_GC_L_MASK, localStatus);
   4533             addCategory(set, U_GC_NL_MASK, localStatus);
   4534         }
   4535         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
   4536             addCategory(set, U_GC_L_MASK, localStatus);
   4537             addCategory(set, U_GC_PC_MASK, localStatus);
   4538             addCategory(set, U_GC_ND_MASK, localStatus);
   4539             addCategory(set, U_GC_NL_MASK, localStatus);
   4540             addCategory(set, U_GC_MC_MASK, localStatus);
   4541             addCategory(set, U_GC_MN_MASK, localStatus);
   4542             addIdentifierIgnorable(set, localStatus);
   4543         }
   4544         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
   4545             addCategory(set, U_GC_LU_MASK, localStatus);
   4546         }
   4547         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
   4548             set->add(0, UnicodeSet::MAX_VALUE);
   4549         }
   4550         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
   4551             addCategory(set, U_GC_Z_MASK, localStatus);
   4552             set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
   4553             set->add(9, 0x0d).add(0x1c, 0x1f);
   4554         }
   4555         else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
   4556             set->add(0, UnicodeSet::MAX_VALUE);
   4557         }
   4558 
   4559         if (U_SUCCESS(localStatus) && !set->isEmpty()) {
   4560             *fStatus = U_ZERO_ERROR;
   4561             if (usetFlags & USET_CASE_INSENSITIVE) {
   4562                 set->closeOver(USET_CASE_INSENSITIVE);
   4563             }
   4564             if (negated) {
   4565                 set->complement();
   4566             }
   4567             return set;
   4568         }
   4569         delete set;
   4570         set = NULL;
   4571     }
   4572     error(*fStatus);
   4573     return NULL;
   4574 }
   4575 
   4576 
   4577 
   4578 //
   4579 //  SetEval   Part of the evaluation of [set expressions].
   4580 //            Perform any pending (stacked) operations with precedence
   4581 //            equal or greater to that of the next operator encountered
   4582 //            in the expression.
   4583 //
   4584 void RegexCompile::setEval(int32_t nextOp) {
   4585     UnicodeSet *rightOperand = NULL;
   4586     UnicodeSet *leftOperand  = NULL;
   4587     for (;;) {
   4588         U_ASSERT(fSetOpStack.empty()==FALSE);
   4589         int32_t pendingSetOperation = fSetOpStack.peeki();
   4590         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
   4591             break;
   4592         }
   4593         fSetOpStack.popi();
   4594         U_ASSERT(fSetStack.empty() == FALSE);
   4595         rightOperand = (UnicodeSet *)fSetStack.peek();
   4596         switch (pendingSetOperation) {
   4597             case setNegation:
   4598                 rightOperand->complement();
   4599                 break;
   4600             case setCaseClose:
   4601                 // TODO: need a simple close function.  Ticket 6065
   4602                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
   4603                 rightOperand->removeAllStrings();
   4604                 break;
   4605             case setDifference1:
   4606             case setDifference2:
   4607                 fSetStack.pop();
   4608                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4609                 leftOperand->removeAll(*rightOperand);
   4610                 delete rightOperand;
   4611                 break;
   4612             case setIntersection1:
   4613             case setIntersection2:
   4614                 fSetStack.pop();
   4615                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4616                 leftOperand->retainAll(*rightOperand);
   4617                 delete rightOperand;
   4618                 break;
   4619             case setUnion:
   4620                 fSetStack.pop();
   4621                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4622                 leftOperand->addAll(*rightOperand);
   4623                 delete rightOperand;
   4624                 break;
   4625             default:
   4626                 U_ASSERT(FALSE);
   4627                 break;
   4628             }
   4629         }
   4630     }
   4631 
   4632 void RegexCompile::setPushOp(int32_t op) {
   4633     setEval(op);
   4634     fSetOpStack.push(op, *fStatus);
   4635     fSetStack.push(new UnicodeSet(), *fStatus);
   4636 }
   4637 
   4638 U_NAMESPACE_END
   4639 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   4640 
   4641