Home | History | Annotate | Download | only in i18n
      1 //
      2 //  file:  regexcmp.cpp
      3 //
      4 //  Copyright (C) 2002-2015 International Business Machines Corporation and others.
      5 //  All Rights Reserved.
      6 //
      7 //  This file contains the ICU regular expression compiler, which is responsible
      8 //  for processing a regular expression pattern into the compiled form that
      9 //  is used by the match finding engine.
     10 //
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     15 
     16 #include "unicode/ustring.h"
     17 #include "unicode/unistr.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/uchriter.h"
     21 #include "unicode/parsepos.h"
     22 #include "unicode/parseerr.h"
     23 #include "unicode/regex.h"
     24 #include "unicode/utf.h"
     25 #include "unicode/utf16.h"
     26 #include "patternprops.h"
     27 #include "putilimp.h"
     28 #include "cmemory.h"
     29 #include "cstring.h"
     30 #include "uvectr32.h"
     31 #include "uvectr64.h"
     32 #include "uassert.h"
     33 #include "uinvchar.h"
     34 
     35 #include "regeximp.h"
     36 #include "regexcst.h"   // Contains state table for the regex pattern parser.
     37                         //   generated by a Perl script.
     38 #include "regexcmp.h"
     39 #include "regexst.h"
     40 #include "regextxt.h"
     41 
     42 
     43 
     44 U_NAMESPACE_BEGIN
     45 
     46 
     47 //------------------------------------------------------------------------------
     48 //
     49 //  Constructor.
     50 //
     51 //------------------------------------------------------------------------------
     52 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
     53    fParenStack(status), fSetStack(status), fSetOpStack(status)
     54 {
     55     // Lazy init of all shared global sets (needed for init()'s empty text)
     56     RegexStaticSets::initGlobals(&status);
     57 
     58     fStatus           = &status;
     59 
     60     fRXPat            = rxp;
     61     fScanIndex        = 0;
     62     fLastChar         = -1;
     63     fPeekChar         = -1;
     64     fLineNum          = 1;
     65     fCharNum          = 0;
     66     fQuoteMode        = FALSE;
     67     fInBackslashQuote = FALSE;
     68     fModeFlags        = fRXPat->fFlags | 0x80000000;
     69     fEOLComments      = TRUE;
     70 
     71     fMatchOpenParen   = -1;
     72     fMatchCloseParen  = -1;
     73     fCaptureName      = NULL;
     74 
     75     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
     76         status = rxp->fDeferredStatus;
     77     }
     78 }
     79 
     80 static const UChar      chAmp       = 0x26;      // '&'
     81 static const UChar      chDash      = 0x2d;      // '-'
     82 
     83 
     84 //------------------------------------------------------------------------------
     85 //
     86 //  Destructor
     87 //
     88 //------------------------------------------------------------------------------
     89 RegexCompile::~RegexCompile() {
     90     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
     91                                  //   compilation stops with a syntax error.
     92 }
     93 
     94 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
     95     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
     96 }
     97 
     98 //------------------------------------------------------------------------------
     99 //
    100 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
    101 //                           The state tables are hand-written in the file regexcst.txt,
    102 //                           and converted to the form used here by a perl
    103 //                           script regexcst.pl
    104 //
    105 //------------------------------------------------------------------------------
    106 void    RegexCompile::compile(
    107                          const UnicodeString &pat,   // Source pat to be compiled.
    108                          UParseError &pp,            // Error position info
    109                          UErrorCode &e)              // Error Code
    110 {
    111     fRXPat->fPatternString = new UnicodeString(pat);
    112     UText patternText = UTEXT_INITIALIZER;
    113     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
    114 
    115     if (U_SUCCESS(e)) {
    116         compile(&patternText, pp, e);
    117         utext_close(&patternText);
    118     }
    119 }
    120 
    121 //
    122 //   compile, UText mode
    123 //     All the work is actually done here.
    124 //
    125 void    RegexCompile::compile(
    126                          UText *pat,                 // Source pat to be compiled.
    127                          UParseError &pp,            // Error position info
    128                          UErrorCode &e)              // Error Code
    129 {
    130     fStatus             = &e;
    131     fParseErr           = &pp;
    132     fStackPtr           = 0;
    133     fStack[fStackPtr]   = 0;
    134 
    135     if (U_FAILURE(*fStatus)) {
    136         return;
    137     }
    138 
    139     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
    140     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
    141 
    142     // Prepare the RegexPattern object to receive the compiled pattern.
    143     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
    144     if (U_FAILURE(*fStatus)) {
    145         return;
    146     }
    147     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
    148     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
    149 
    150 
    151     // Initialize the pattern scanning state machine
    152     fPatternLength = utext_nativeLength(pat);
    153     uint16_t                state = 1;
    154     const RegexTableEl      *tableEl;
    155 
    156     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
    157     if (fModeFlags & UREGEX_LITERAL) {
    158         fQuoteMode = TRUE;
    159     }
    160 
    161     nextChar(fC);                        // Fetch the first char from the pattern string.
    162 
    163     //
    164     // Main loop for the regex pattern parsing state machine.
    165     //   Runs once per state transition.
    166     //   Each time through optionally performs, depending on the state table,
    167     //      - an advance to the the next pattern char
    168     //      - an action to be performed.
    169     //      - pushing or popping a state to/from the local state return stack.
    170     //   file regexcst.txt is the source for the state table.  The logic behind
    171     //     recongizing the pattern syntax is there, not here.
    172     //
    173     for (;;) {
    174         //  Bail out if anything has gone wrong.
    175         //  Regex pattern parsing stops on the first error encountered.
    176         if (U_FAILURE(*fStatus)) {
    177             break;
    178         }
    179 
    180         U_ASSERT(state != 0);
    181 
    182         // Find the state table element that matches the input char from the pattern, or the
    183         //    class of the input character.  Start with the first table row for this
    184         //    state, then linearly scan forward until we find a row that matches the
    185         //    character.  The last row for each state always matches all characters, so
    186         //    the search will stop there, if not before.
    187         //
    188         tableEl = &gRuleParseStateTable[state];
    189         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
    190             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
    191 
    192         for (;;) {    // loop through table rows belonging to this state, looking for one
    193                       //   that matches the current input char.
    194             REGEX_SCAN_DEBUG_PRINTF(("."));
    195             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
    196                 // Table row specified an individual character, not a set, and
    197                 //   the input character is not quoted, and
    198                 //   the input character matched it.
    199                 break;
    200             }
    201             if (tableEl->fCharClass == 255) {
    202                 // Table row specified default, match anything character class.
    203                 break;
    204             }
    205             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
    206                 // Table row specified "quoted" and the char was quoted.
    207                 break;
    208             }
    209             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
    210                 // Table row specified eof and we hit eof on the input.
    211                 break;
    212             }
    213 
    214             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
    215                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
    216                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
    217                 U_ASSERT(tableEl->fCharClass <= 137);
    218                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
    219                     // Table row specified a character class, or set of characters,
    220                     //   and the current char matches it.
    221                     break;
    222                 }
    223             }
    224 
    225             // No match on this row, advance to the next  row for this state,
    226             tableEl++;
    227         }
    228         REGEX_SCAN_DEBUG_PRINTF(("\n"));
    229 
    230         //
    231         // We've found the row of the state table that matches the current input
    232         //   character from the rules string.
    233         // Perform any action specified  by this row in the state table.
    234         if (doParseActions(tableEl->fAction) == FALSE) {
    235             // Break out of the state machine loop if the
    236             //   the action signalled some kind of error, or
    237             //   the action was to exit, occurs on normal end-of-rules-input.
    238             break;
    239         }
    240 
    241         if (tableEl->fPushState != 0) {
    242             fStackPtr++;
    243             if (fStackPtr >= kStackSize) {
    244                 error(U_REGEX_INTERNAL_ERROR);
    245                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
    246                 fStackPtr--;
    247             }
    248             fStack[fStackPtr] = tableEl->fPushState;
    249         }
    250 
    251         //
    252         //  NextChar.  This is where characters are actually fetched from the pattern.
    253         //             Happens under control of the 'n' tag in the state table.
    254         //
    255         if (tableEl->fNextChar) {
    256             nextChar(fC);
    257         }
    258 
    259         // Get the next state from the table entry, or from the
    260         //   state stack if the next state was specified as "pop".
    261         if (tableEl->fNextState != 255) {
    262             state = tableEl->fNextState;
    263         } else {
    264             state = fStack[fStackPtr];
    265             fStackPtr--;
    266             if (fStackPtr < 0) {
    267                 // state stack underflow
    268                 // This will occur if the user pattern has mis-matched parentheses,
    269                 //   with extra close parens.
    270                 //
    271                 fStackPtr++;
    272                 error(U_REGEX_MISMATCHED_PAREN);
    273             }
    274         }
    275 
    276     }
    277 
    278     if (U_FAILURE(*fStatus)) {
    279         // Bail out if the pattern had errors.
    280         //   Set stack cleanup:  a successful compile would have left it empty,
    281         //   but errors can leave temporary sets hanging around.
    282         while (!fSetStack.empty()) {
    283             delete (UnicodeSet *)fSetStack.pop();
    284         }
    285         return;
    286     }
    287 
    288     //
    289     // The pattern has now been read and processed, and the compiled code generated.
    290     //
    291 
    292     //
    293     // The pattern's fFrameSize so far has accumulated the requirements for
    294     //   storage for capture parentheses, counters, etc. that are encountered
    295     //   in the pattern.  Add space for the two variables that are always
    296     //   present in the saved state:  the input string position (int64_t) and
    297     //   the position in the compiled pattern.
    298     //
    299     allocateStackData(RESTACKFRAME_HDRCOUNT);
    300 
    301     //
    302     // Optimization pass 1: NOPs, back-references, and case-folding
    303     //
    304     stripNOPs();
    305 
    306     //
    307     // Get bounds for the minimum and maximum length of a string that this
    308     //   pattern can match.  Used to avoid looking for matches in strings that
    309     //   are too short.
    310     //
    311     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
    312 
    313     //
    314     // Optimization pass 2: match start type
    315     //
    316     matchStartType();
    317 
    318     //
    319     // Set up fast latin-1 range sets
    320     //
    321     int32_t numSets = fRXPat->fSets->size();
    322     fRXPat->fSets8 = new Regex8BitSet[numSets];
    323     // Null pointer check.
    324     if (fRXPat->fSets8 == NULL) {
    325         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
    326         return;
    327     }
    328     int32_t i;
    329     for (i=0; i<numSets; i++) {
    330         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
    331         fRXPat->fSets8[i].init(s);
    332     }
    333 
    334 }
    335 
    336 
    337 
    338 
    339 
    340 //------------------------------------------------------------------------------
    341 //
    342 //  doParseAction        Do some action during regex pattern parsing.
    343 //                       Called by the parse state machine.
    344 //
    345 //                       Generation of the match engine PCode happens here, or
    346 //                       in functions called from the parse actions defined here.
    347 //
    348 //
    349 //------------------------------------------------------------------------------
    350 UBool RegexCompile::doParseActions(int32_t action)
    351 {
    352     UBool   returnVal = TRUE;
    353 
    354     switch ((Regex_PatternParseAction)action) {
    355 
    356     case doPatStart:
    357         // Start of pattern compiles to:
    358         //0   SAVE   2        Fall back to position of FAIL
    359         //1   jmp    3
    360         //2   FAIL            Stop if we ever reach here.
    361         //3   NOP             Dummy, so start of pattern looks the same as
    362         //                    the start of an ( grouping.
    363         //4   NOP             Resreved, will be replaced by a save if there are
    364         //                    OR | operators at the top level
    365         appendOp(URX_STATE_SAVE, 2);
    366         appendOp(URX_JMP,  3);
    367         appendOp(URX_FAIL, 0);
    368 
    369         // Standard open nonCapture paren action emits the two NOPs and
    370         //   sets up the paren stack frame.
    371         doParseActions(doOpenNonCaptureParen);
    372         break;
    373 
    374     case doPatFinish:
    375         // We've scanned to the end of the pattern
    376         //  The end of pattern compiles to:
    377         //        URX_END
    378         //    which will stop the runtime match engine.
    379         //  Encountering end of pattern also behaves like a close paren,
    380         //   and forces fixups of the State Save at the beginning of the compiled pattern
    381         //   and of any OR operations at the top level.
    382         //
    383         handleCloseParen();
    384         if (fParenStack.size() > 0) {
    385             // Missing close paren in pattern.
    386             error(U_REGEX_MISMATCHED_PAREN);
    387         }
    388 
    389         // add the END operation to the compiled pattern.
    390         appendOp(URX_END, 0);
    391 
    392         // Terminate the pattern compilation state machine.
    393         returnVal = FALSE;
    394         break;
    395 
    396 
    397 
    398     case doOrOperator:
    399         // Scanning a '|', as in (A|B)
    400         {
    401             // Generate code for any pending literals preceding the '|'
    402             fixLiterals(FALSE);
    403 
    404             // Insert a SAVE operation at the start of the pattern section preceding
    405             //   this OR at this level.  This SAVE will branch the match forward
    406             //   to the right hand side of the OR in the event that the left hand
    407             //   side fails to match and backtracks.  Locate the position for the
    408             //   save from the location on the top of the parentheses stack.
    409             int32_t savePosition = fParenStack.popi();
    410             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
    411             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
    412             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
    413             fRXPat->fCompiledPat->setElementAt(op, savePosition);
    414 
    415             // Append an JMP operation into the compiled pattern.  The operand for
    416             //  the JMP will eventually be the location following the ')' for the
    417             //  group.  This will be patched in later, when the ')' is encountered.
    418             appendOp(URX_JMP, 0);
    419 
    420             // Push the position of the newly added JMP op onto the parentheses stack.
    421             // This registers if for fixup when this block's close paren is encountered.
    422             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    423 
    424             // Append a NOP to the compiled pattern.  This is the slot reserved
    425             //   for a SAVE in the event that there is yet another '|' following
    426             //   this one.
    427             appendOp(URX_NOP, 0);
    428             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    429         }
    430         break;
    431 
    432 
    433     case doBeginNamedCapture:
    434         // Scanning (?<letter.
    435         //   The first letter of the name will come through again under doConinueNamedCapture.
    436         fCaptureName = new UnicodeString();
    437         if (fCaptureName == NULL) {
    438             error(U_MEMORY_ALLOCATION_ERROR);
    439         }
    440         break;
    441 
    442     case  doContinueNamedCapture:
    443         fCaptureName->append(fC.fChar);
    444         break;
    445 
    446     case doBadNamedCapture:
    447         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
    448         break;
    449 
    450     case doOpenCaptureParen:
    451         // Open Capturing Paren, possibly named.
    452         //   Compile to a
    453         //      - NOP, which later may be replaced by a save-state if the
    454         //         parenthesized group gets a * quantifier, followed by
    455         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
    456         //      - NOP, which may later be replaced by a save-state if there
    457         //             is an '|' alternation within the parens.
    458         //
    459         //    Each capture group gets three slots in the save stack frame:
    460         //         0: Capture Group start position (in input string being matched.)
    461         //         1: Capture Group end position.
    462         //         2: Start of Match-in-progress.
    463         //    The first two locations are for a completed capture group, and are
    464         //     referred to by back references and the like.
    465         //    The third location stores the capture start position when an START_CAPTURE is
    466         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
    467         //      END_CAPTURE is encountered.
    468         {
    469             fixLiterals();
    470             appendOp(URX_NOP, 0);
    471             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
    472             appendOp(URX_START_CAPTURE, varsLoc);
    473             appendOp(URX_NOP, 0);
    474 
    475             // On the Parentheses stack, start a new frame and add the postions
    476             //   of the two NOPs.  Depending on what follows in the pattern, the
    477             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    478             //   address of the end of the parenthesized group.
    479             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    480             fParenStack.push(capturing, *fStatus);                        // Frame type.
    481             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
    482             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    483 
    484             // Save the mapping from group number to stack frame variable position.
    485             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
    486 
    487             // If this is a named capture group, add the name->group number mapping.
    488             if (fCaptureName != NULL) {
    489                 int32_t groupNumber = fRXPat->fGroupMap->size();
    490                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
    491                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
    492                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
    493                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
    494                 }
    495             }
    496         }
    497         break;
    498 
    499     case doOpenNonCaptureParen:
    500         // Open non-caputuring (grouping only) Paren.
    501         //   Compile to a
    502         //      - NOP, which later may be replaced by a save-state if the
    503         //         parenthesized group gets a * quantifier, followed by
    504         //      - NOP, which may later be replaced by a save-state if there
    505         //             is an '|' alternation within the parens.
    506         {
    507             fixLiterals();
    508             appendOp(URX_NOP, 0);
    509             appendOp(URX_NOP, 0);
    510 
    511             // On the Parentheses stack, start a new frame and add the postions
    512             //   of the two NOPs.
    513             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    514             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
    515             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    516             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    517         }
    518          break;
    519 
    520 
    521     case doOpenAtomicParen:
    522         // Open Atomic Paren.  (?>
    523         //   Compile to a
    524         //      - NOP, which later may be replaced if the parenthesized group
    525         //         has a quantifier, followed by
    526         //      - STO_SP  save state stack position, so it can be restored at the ")"
    527         //      - NOP, which may later be replaced by a save-state if there
    528         //             is an '|' alternation within the parens.
    529         {
    530             fixLiterals();
    531             appendOp(URX_NOP, 0);
    532             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
    533             appendOp(URX_STO_SP, varLoc);
    534             appendOp(URX_NOP, 0);
    535 
    536             // On the Parentheses stack, start a new frame and add the postions
    537             //   of the two NOPs.  Depending on what follows in the pattern, the
    538             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    539             //   address of the end of the parenthesized group.
    540             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    541             fParenStack.push(atomic, *fStatus);                           // Frame type.
    542             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
    543             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
    544         }
    545         break;
    546 
    547 
    548     case doOpenLookAhead:
    549         // Positive Look-ahead   (?=  stuff  )
    550         //
    551         //   Note:   Addition of transparent input regions, with the need to
    552         //           restore the original regions when failing out of a lookahead
    553         //           block, complicated this sequence.  Some conbined opcodes
    554         //           might make sense - or might not, lookahead aren't that common.
    555         //
    556         //      Caution:  min match length optimization knows about this
    557         //               sequence; don't change without making updates there too.
    558         //
    559         // Compiles to
    560         //    1    START_LA     dataLoc     Saves SP, Input Pos
    561         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
    562         //    3    JMP          6           continue ...
    563         //
    564         //    4.   LA_END                   Look Ahead failed.  Restore regions.
    565         //    5.   BACKTRACK                and back track again.
    566         //
    567         //    6.   NOP              reserved for use by quantifiers on the block.
    568         //                          Look-ahead can't have quantifiers, but paren stack
    569         //                             compile time conventions require the slot anyhow.
    570         //    7.   NOP              may be replaced if there is are '|' ops in the block.
    571         //    8.     code for parenthesized stuff.
    572         //    9.   LA_END
    573         //
    574         //  Two data slots are reserved, for saving the stack ptr and the input position.
    575         {
    576             fixLiterals();
    577             int32_t dataLoc = allocateData(2);
    578             appendOp(URX_LA_START, dataLoc);
    579             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
    580             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
    581             appendOp(URX_LA_END, dataLoc);
    582             appendOp(URX_BACKTRACK, 0);
    583             appendOp(URX_NOP, 0);
    584             appendOp(URX_NOP, 0);
    585 
    586             // On the Parentheses stack, start a new frame and add the postions
    587             //   of the NOPs.
    588             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    589             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
    590             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    591             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    592         }
    593         break;
    594 
    595     case doOpenLookAheadNeg:
    596         // Negated Lookahead.   (?! stuff )
    597         // Compiles to
    598         //    1.    START_LA    dataloc
    599         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
    600         //                                //   which continues with the match.
    601         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
    602         //    4.       code for parenthesized stuff.
    603         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
    604         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
    605         //    7.    END_LA                // Restore match region, in case look-ahead was using
    606         //                                        an alternate (transparent) region.
    607         {
    608             fixLiterals();
    609             int32_t dataLoc = allocateData(2);
    610             appendOp(URX_LA_START, dataLoc);
    611             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
    612             appendOp(URX_NOP, 0);
    613 
    614             // On the Parentheses stack, start a new frame and add the postions
    615             //   of the StateSave and NOP.
    616             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    617             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
    618             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
    619             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    620 
    621             // Instructions #5 - #7 will be added when the ')' is encountered.
    622         }
    623         break;
    624 
    625     case doOpenLookBehind:
    626         {
    627             //   Compile a (?<= look-behind open paren.
    628             //
    629             //          Compiles to
    630             //              0       URX_LB_START     dataLoc
    631             //              1       URX_LB_CONT      dataLoc
    632             //              2                        MinMatchLen
    633             //              3                        MaxMatchLen
    634             //              4       URX_NOP          Standard '(' boilerplate.
    635             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
    636             //              6         <code for LookBehind expression>
    637             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
    638             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
    639             //
    640             //          Allocate a block of matcher data, to contain (when running a match)
    641             //              0:    Stack ptr on entry
    642             //              1:    Input Index on entry
    643             //              2:    Start index of match current match attempt.
    644             //              3:    Original Input String len.
    645 
    646             // Generate match code for any pending literals.
    647             fixLiterals();
    648 
    649             // Allocate data space
    650             int32_t dataLoc = allocateData(4);
    651 
    652             // Emit URX_LB_START
    653             appendOp(URX_LB_START, dataLoc);
    654 
    655             // Emit URX_LB_CONT
    656             appendOp(URX_LB_CONT, dataLoc);
    657             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
    658             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
    659 
    660             // Emit the NOPs
    661             appendOp(URX_NOP, 0);
    662             appendOp(URX_NOP, 0);
    663 
    664             // On the Parentheses stack, start a new frame and add the postions
    665             //   of the URX_LB_CONT and the NOP.
    666             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    667             fParenStack.push(lookBehind, *fStatus);                       // Frame type
    668             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    669             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    670 
    671             // The final two instructions will be added when the ')' is encountered.
    672         }
    673 
    674         break;
    675 
    676     case doOpenLookBehindNeg:
    677         {
    678             //   Compile a (?<! negated look-behind open paren.
    679             //
    680             //          Compiles to
    681             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
    682             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
    683             //              2                        MinMatchLen
    684             //              3                        MaxMatchLen
    685             //              4                        continueLoc (9)
    686             //              5       URX_NOP          Standard '(' boilerplate.
    687             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
    688             //              7         <code for LookBehind expression>
    689             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
    690             //              9       ...
    691             //
    692             //          Allocate a block of matcher data, to contain (when running a match)
    693             //              0:    Stack ptr on entry
    694             //              1:    Input Index on entry
    695             //              2:    Start index of match current match attempt.
    696             //              3:    Original Input String len.
    697 
    698             // Generate match code for any pending literals.
    699             fixLiterals();
    700 
    701             // Allocate data space
    702             int32_t dataLoc = allocateData(4);
    703 
    704             // Emit URX_LB_START
    705             appendOp(URX_LB_START, dataLoc);
    706 
    707             // Emit URX_LBN_CONT
    708             appendOp(URX_LBN_CONT, dataLoc);
    709             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
    710             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
    711             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
    712 
    713             // Emit the NOPs
    714             appendOp(URX_NOP, 0);
    715             appendOp(URX_NOP, 0);
    716 
    717             // On the Parentheses stack, start a new frame and add the postions
    718             //   of the URX_LB_CONT and the NOP.
    719             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    720             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
    721             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    722             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    723 
    724             // The final two instructions will be added when the ')' is encountered.
    725         }
    726         break;
    727 
    728     case doConditionalExpr:
    729         // Conditionals such as (?(1)a:b)
    730     case doPerlInline:
    731         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
    732         error(U_REGEX_UNIMPLEMENTED);
    733         break;
    734 
    735 
    736     case doCloseParen:
    737         handleCloseParen();
    738         if (fParenStack.size() <= 0) {
    739             //  Extra close paren, or missing open paren.
    740             error(U_REGEX_MISMATCHED_PAREN);
    741         }
    742         break;
    743 
    744     case doNOP:
    745         break;
    746 
    747 
    748     case doBadOpenParenType:
    749     case doRuleError:
    750         error(U_REGEX_RULE_SYNTAX);
    751         break;
    752 
    753 
    754     case doMismatchedParenErr:
    755         error(U_REGEX_MISMATCHED_PAREN);
    756         break;
    757 
    758     case doPlus:
    759         //  Normal '+'  compiles to
    760         //     1.   stuff to be repeated  (already built)
    761         //     2.   jmp-sav 1
    762         //     3.   ...
    763         //
    764         //  Or, if the item to be repeated can match a zero length string,
    765         //     1.   STO_INP_LOC  data-loc
    766         //     2.      body of stuff to be repeated
    767         //     3.   JMP_SAV_X    2
    768         //     4.   ...
    769 
    770         //
    771         //  Or, if the item to be repeated is simple
    772         //     1.   Item to be repeated.
    773         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
    774         //     3.   LOOP_C       stack location
    775         {
    776             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
    777             int32_t  frameLoc;
    778 
    779             // Check for simple constructs, which may get special optimized code.
    780             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    781                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
    782 
    783                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
    784                     // Emit optimized code for [char set]+
    785                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    786                     frameLoc = allocateStackData(1);
    787                     appendOp(URX_LOOP_C, frameLoc);
    788                     break;
    789                 }
    790 
    791                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    792                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    793                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    794                     // Emit Optimized code for .+ operations.
    795                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
    796                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    797                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
    798                         loopOpI |= 1;
    799                     }
    800                     if (fModeFlags & UREGEX_UNIX_LINES) {
    801                         loopOpI |= 2;
    802                     }
    803                     appendOp(loopOpI);
    804                     frameLoc = allocateStackData(1);
    805                     appendOp(URX_LOOP_C, frameLoc);
    806                     break;
    807                 }
    808 
    809             }
    810 
    811             // General case.
    812 
    813             // Check for minimum match length of zero, which requires
    814             //    extra loop-breaking code.
    815             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    816                 // Zero length match is possible.
    817                 // Emit the code sequence that can handle it.
    818                 insertOp(topLoc);
    819                 frameLoc = allocateStackData(1);
    820 
    821                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
    822                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
    823 
    824                 appendOp(URX_JMP_SAV_X, topLoc+1);
    825             } else {
    826                 // Simpler code when the repeated body must match something non-empty
    827                 appendOp(URX_JMP_SAV, topLoc);
    828             }
    829         }
    830         break;
    831 
    832     case doNGPlus:
    833         //  Non-greedy '+?'  compiles to
    834         //     1.   stuff to be repeated  (already built)
    835         //     2.   state-save  1
    836         //     3.   ...
    837         {
    838             int32_t topLoc      = blockTopLoc(FALSE);
    839             appendOp(URX_STATE_SAVE, topLoc);
    840         }
    841         break;
    842 
    843 
    844     case doOpt:
    845         // Normal (greedy) ? quantifier.
    846         //  Compiles to
    847         //     1. state save 3
    848         //     2.    body of optional block
    849         //     3. ...
    850         // Insert the state save into the compiled pattern, and we're done.
    851         {
    852             int32_t   saveStateLoc = blockTopLoc(TRUE);
    853             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
    854             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    855         }
    856         break;
    857 
    858     case doNGOpt:
    859         // Non-greedy ?? quantifier
    860         //   compiles to
    861         //    1.  jmp   4
    862         //    2.     body of optional block
    863         //    3   jmp   5
    864         //    4.  state save 2
    865         //    5    ...
    866         //  This code is less than ideal, with two jmps instead of one, because we can only
    867         //  insert one instruction at the top of the block being iterated.
    868         {
    869             int32_t  jmp1_loc = blockTopLoc(TRUE);
    870             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
    871 
    872             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
    873             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
    874 
    875             appendOp(URX_JMP, jmp2_loc+2);
    876 
    877             appendOp(URX_STATE_SAVE, jmp1_loc+1);
    878         }
    879         break;
    880 
    881 
    882     case doStar:
    883         // Normal (greedy) * quantifier.
    884         // Compiles to
    885         //       1.   STATE_SAVE   4
    886         //       2.      body of stuff being iterated over
    887         //       3.   JMP_SAV      2
    888         //       4.   ...
    889         //
    890         // Or, if the body is a simple [Set],
    891         //       1.   LOOP_SR_I    set number
    892         //       2.   LOOP_C       stack location
    893         //       ...
    894         //
    895         // Or if this is a .*
    896         //       1.   LOOP_DOT_I    (. matches all mode flag)
    897         //       2.   LOOP_C        stack location
    898         //
    899         // Or, if the body can match a zero-length string, to inhibit infinite loops,
    900         //       1.   STATE_SAVE   5
    901         //       2.   STO_INP_LOC  data-loc
    902         //       3.      body of stuff
    903         //       4.   JMP_SAV_X    2
    904         //       5.   ...
    905         {
    906             // location of item #1, the STATE_SAVE
    907             int32_t   topLoc = blockTopLoc(FALSE);
    908             int32_t   dataLoc = -1;
    909 
    910             // Check for simple *, where the construct being repeated
    911             //   compiled to single opcode, and might be optimizable.
    912             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    913                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
    914 
    915                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
    916                     // Emit optimized code for a [char set]*
    917                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    918                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    919                     dataLoc = allocateStackData(1);
    920                     appendOp(URX_LOOP_C, dataLoc);
    921                     break;
    922                 }
    923 
    924                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    925                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    926                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    927                     // Emit Optimized code for .* operations.
    928                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
    929                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    930                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
    931                         loopOpI |= 1;
    932                     }
    933                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
    934                         loopOpI |= 2;
    935                     }
    936                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    937                     dataLoc = allocateStackData(1);
    938                     appendOp(URX_LOOP_C, dataLoc);
    939                     break;
    940                 }
    941             }
    942 
    943             // Emit general case code for this *
    944             // The optimizations did not apply.
    945 
    946             int32_t   saveStateLoc = blockTopLoc(TRUE);
    947             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
    948 
    949             // Check for minimum match length of zero, which requires
    950             //    extra loop-breaking code.
    951             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    952                 insertOp(saveStateLoc);
    953                 dataLoc = allocateStackData(1);
    954 
    955                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
    956                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
    957                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
    958             }
    959 
    960             // Locate the position in the compiled pattern where the match will continue
    961             //   after completing the *.   (4 or 5 in the comment above)
    962             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
    963 
    964             // Put together the save state op and store it into the compiled code.
    965             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
    966             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    967 
    968             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
    969             appendOp(jmpOp);
    970         }
    971         break;
    972 
    973     case doNGStar:
    974         // Non-greedy *? quantifier
    975         // compiles to
    976         //     1.   JMP    3
    977         //     2.      body of stuff being iterated over
    978         //     3.   STATE_SAVE  2
    979         //     4    ...
    980         {
    981             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
    982             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
    983             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
    984             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
    985             appendOp(URX_STATE_SAVE, jmpLoc+1);
    986         }
    987         break;
    988 
    989 
    990     case doIntervalInit:
    991         // The '{' opening an interval quantifier was just scanned.
    992         // Init the counter varaiables that will accumulate the values as the digits
    993         //    are scanned.
    994         fIntervalLow = 0;
    995         fIntervalUpper = -1;
    996         break;
    997 
    998     case doIntevalLowerDigit:
    999         // Scanned a digit from the lower value of an {lower,upper} interval
   1000         {
   1001             int32_t digitValue = u_charDigitValue(fC.fChar);
   1002             U_ASSERT(digitValue >= 0);
   1003             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
   1004             if (val > INT32_MAX) {
   1005                 error(U_REGEX_NUMBER_TOO_BIG);
   1006             } else {
   1007                 fIntervalLow = (int32_t)val;
   1008             }
   1009         }
   1010         break;
   1011 
   1012     case doIntervalUpperDigit:
   1013         // Scanned a digit from the upper value of an {lower,upper} interval
   1014         {
   1015             if (fIntervalUpper < 0) {
   1016                 fIntervalUpper = 0;
   1017             }
   1018             int32_t digitValue = u_charDigitValue(fC.fChar);
   1019             U_ASSERT(digitValue >= 0);
   1020             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
   1021             if (val > INT32_MAX) {
   1022                 error(U_REGEX_NUMBER_TOO_BIG);
   1023             } else {
   1024                 fIntervalUpper = (int32_t)val;
   1025             }
   1026         }
   1027         break;
   1028 
   1029     case doIntervalSame:
   1030         // Scanned a single value interval like {27}.  Upper = Lower.
   1031         fIntervalUpper = fIntervalLow;
   1032         break;
   1033 
   1034     case doInterval:
   1035         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
   1036         if (compileInlineInterval() == FALSE) {
   1037             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1038         }
   1039         break;
   1040 
   1041     case doPossessiveInterval:
   1042         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
   1043         {
   1044             // Remember the loc for the top of the block being looped over.
   1045             //   (Can not reserve a slot in the compiled pattern at this time, because
   1046             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
   1047             //    once per block.)
   1048             int32_t topLoc = blockTopLoc(FALSE);
   1049 
   1050             // Produce normal looping code.
   1051             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1052 
   1053             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
   1054             //  just as if the loop was inclosed in atomic parentheses.
   1055 
   1056             // First the STO_SP before the start of the loop
   1057             insertOp(topLoc);
   1058 
   1059             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
   1060             int32_t  op     = buildOp(URX_STO_SP, varLoc);
   1061             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1062 
   1063             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
   1064             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
   1065             loopOp++;     // point LoopOp after the just-inserted STO_SP
   1066             fRXPat->fCompiledPat->push(loopOp, *fStatus);
   1067 
   1068             // Then the LD_SP after the end of the loop
   1069             appendOp(URX_LD_SP, varLoc);
   1070         }
   1071 
   1072         break;
   1073 
   1074     case doNGInterval:
   1075         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
   1076         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
   1077         break;
   1078 
   1079     case doIntervalError:
   1080         error(U_REGEX_BAD_INTERVAL);
   1081         break;
   1082 
   1083     case doLiteralChar:
   1084         // We've just scanned a "normal" character from the pattern,
   1085         literalChar(fC.fChar);
   1086         break;
   1087 
   1088 
   1089     case doEscapedLiteralChar:
   1090         // We've just scanned an backslashed escaped character with  no
   1091         //   special meaning.  It represents itself.
   1092         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1093             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1094             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1095                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1096              }
   1097         literalChar(fC.fChar);
   1098         break;
   1099 
   1100 
   1101     case doDotAny:
   1102         // scanned a ".",  match any single character.
   1103         {
   1104             fixLiterals(FALSE);
   1105             if (fModeFlags & UREGEX_DOTALL) {
   1106                 appendOp(URX_DOTANY_ALL, 0);
   1107             } else if (fModeFlags & UREGEX_UNIX_LINES) {
   1108                 appendOp(URX_DOTANY_UNIX, 0);
   1109             } else {
   1110                 appendOp(URX_DOTANY, 0);
   1111             }
   1112         }
   1113         break;
   1114 
   1115     case doCaret:
   1116         {
   1117             fixLiterals(FALSE);
   1118             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1119                 appendOp(URX_CARET, 0);
   1120             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1121                 appendOp(URX_CARET_M, 0);
   1122             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1123                 appendOp(URX_CARET, 0);   // Only testing true start of input.
   1124             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1125                 appendOp(URX_CARET_M_UNIX, 0);
   1126             }
   1127         }
   1128         break;
   1129 
   1130     case doDollar:
   1131         {
   1132             fixLiterals(FALSE);
   1133             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1134                 appendOp(URX_DOLLAR, 0);
   1135             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1136                 appendOp(URX_DOLLAR_M, 0);
   1137             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1138                 appendOp(URX_DOLLAR_D, 0);
   1139             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1140                 appendOp(URX_DOLLAR_MD, 0);
   1141             }
   1142         }
   1143         break;
   1144 
   1145     case doBackslashA:
   1146         fixLiterals(FALSE);
   1147         appendOp(URX_CARET, 0);
   1148         break;
   1149 
   1150     case doBackslashB:
   1151         {
   1152             #if  UCONFIG_NO_BREAK_ITERATION==1
   1153             if (fModeFlags & UREGEX_UWORD) {
   1154                 error(U_UNSUPPORTED_ERROR);
   1155             }
   1156             #endif
   1157             fixLiterals(FALSE);
   1158             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1159             appendOp(op, 1);
   1160         }
   1161         break;
   1162 
   1163     case doBackslashb:
   1164         {
   1165             #if  UCONFIG_NO_BREAK_ITERATION==1
   1166             if (fModeFlags & UREGEX_UWORD) {
   1167                 error(U_UNSUPPORTED_ERROR);
   1168             }
   1169             #endif
   1170             fixLiterals(FALSE);
   1171             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1172             appendOp(op, 0);
   1173         }
   1174         break;
   1175 
   1176     case doBackslashD:
   1177         fixLiterals(FALSE);
   1178         appendOp(URX_BACKSLASH_D, 1);
   1179         break;
   1180 
   1181     case doBackslashd:
   1182         fixLiterals(FALSE);
   1183         appendOp(URX_BACKSLASH_D, 0);
   1184         break;
   1185 
   1186     case doBackslashG:
   1187         fixLiterals(FALSE);
   1188         appendOp(URX_BACKSLASH_G, 0);
   1189         break;
   1190 
   1191     case doBackslashH:
   1192         fixLiterals(FALSE);
   1193         appendOp(URX_BACKSLASH_H, 1);
   1194         break;
   1195 
   1196     case doBackslashh:
   1197         fixLiterals(FALSE);
   1198         appendOp(URX_BACKSLASH_H, 0);
   1199         break;
   1200 
   1201     case doBackslashR:
   1202         fixLiterals(FALSE);
   1203         appendOp(URX_BACKSLASH_R, 0);
   1204         break;
   1205 
   1206     case doBackslashS:
   1207         fixLiterals(FALSE);
   1208         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
   1209         break;
   1210 
   1211     case doBackslashs:
   1212         fixLiterals(FALSE);
   1213         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
   1214         break;
   1215 
   1216     case doBackslashV:
   1217         fixLiterals(FALSE);
   1218         appendOp(URX_BACKSLASH_V, 1);
   1219         break;
   1220 
   1221     case doBackslashv:
   1222         fixLiterals(FALSE);
   1223         appendOp(URX_BACKSLASH_V, 0);
   1224         break;
   1225 
   1226     case doBackslashW:
   1227         fixLiterals(FALSE);
   1228         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
   1229         break;
   1230 
   1231     case doBackslashw:
   1232         fixLiterals(FALSE);
   1233         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
   1234         break;
   1235 
   1236     case doBackslashX:
   1237         fixLiterals(FALSE);
   1238         appendOp(URX_BACKSLASH_X, 0);
   1239         break;
   1240 
   1241 
   1242     case doBackslashZ:
   1243         fixLiterals(FALSE);
   1244         appendOp(URX_DOLLAR, 0);
   1245         break;
   1246 
   1247     case doBackslashz:
   1248         fixLiterals(FALSE);
   1249         appendOp(URX_BACKSLASH_Z, 0);
   1250         break;
   1251 
   1252     case doEscapeError:
   1253         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1254         break;
   1255 
   1256     case doExit:
   1257         fixLiterals(FALSE);
   1258         returnVal = FALSE;
   1259         break;
   1260 
   1261     case doProperty:
   1262         {
   1263             fixLiterals(FALSE);
   1264             UnicodeSet *theSet = scanProp();
   1265             compileSet(theSet);
   1266         }
   1267         break;
   1268 
   1269     case doNamedChar:
   1270         {
   1271             UChar32 c = scanNamedChar();
   1272             literalChar(c);
   1273         }
   1274         break;
   1275 
   1276 
   1277     case doBackRef:
   1278         // BackReference.  Somewhat unusual in that the front-end can not completely parse
   1279         //                 the regular expression, because the number of digits to be consumed
   1280         //                 depends on the number of capture groups that have been defined.  So
   1281         //                 we have to do it here instead.
   1282         {
   1283             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
   1284             int32_t  groupNum = 0;
   1285             UChar32  c        = fC.fChar;
   1286 
   1287             for (;;) {
   1288                 // Loop once per digit, for max allowed number of digits in a back reference.
   1289                 int32_t digit = u_charDigitValue(c);
   1290                 groupNum = groupNum * 10 + digit;
   1291                 if (groupNum >= numCaptureGroups) {
   1292                     break;
   1293                 }
   1294                 c = peekCharLL();
   1295                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
   1296                     break;
   1297                 }
   1298                 nextCharLL();
   1299             }
   1300 
   1301             // Scan of the back reference in the source regexp is complete.  Now generate
   1302             //  the compiled code for it.
   1303             // Because capture groups can be forward-referenced by back-references,
   1304             //  we fill the operand with the capture group number.  At the end
   1305             //  of compilation, it will be changed to the variable's location.
   1306             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
   1307                                      //    and shouldn't enter this code path at all.
   1308             fixLiterals(FALSE);
   1309             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1310                 appendOp(URX_BACKREF_I, groupNum);
   1311             } else {
   1312                 appendOp(URX_BACKREF, groupNum);
   1313             }
   1314         }
   1315         break;
   1316 
   1317     case doBeginNamedBackRef:
   1318         U_ASSERT(fCaptureName == NULL);
   1319         fCaptureName = new UnicodeString;
   1320         if (fCaptureName == NULL) {
   1321             error(U_MEMORY_ALLOCATION_ERROR);
   1322         }
   1323         break;
   1324 
   1325     case doContinueNamedBackRef:
   1326         fCaptureName->append(fC.fChar);
   1327         break;
   1328 
   1329     case doCompleteNamedBackRef:
   1330         {
   1331         int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
   1332         if (groupNumber == 0) {
   1333             // Group name has not been defined.
   1334             //   Could be a forward reference. If we choose to support them at some
   1335             //   future time, extra mechanism will be required at this point.
   1336             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   1337         } else {
   1338             // Given the number, handle identically to a \n numbered back reference.
   1339             // See comments above, under doBackRef
   1340             fixLiterals(FALSE);
   1341             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1342                 appendOp(URX_BACKREF_I, groupNumber);
   1343             } else {
   1344                 appendOp(URX_BACKREF, groupNumber);
   1345             }
   1346         }
   1347         delete fCaptureName;
   1348         fCaptureName = NULL;
   1349         break;
   1350         }
   1351 
   1352     case doPossessivePlus:
   1353         // Possessive ++ quantifier.
   1354         // Compiles to
   1355         //       1.   STO_SP
   1356         //       2.      body of stuff being iterated over
   1357         //       3.   STATE_SAVE 5
   1358         //       4.   JMP        2
   1359         //       5.   LD_SP
   1360         //       6.   ...
   1361         //
   1362         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
   1363         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
   1364         //
   1365         {
   1366             // Emit the STO_SP
   1367             int32_t   topLoc = blockTopLoc(TRUE);
   1368             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
   1369             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1370             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1371 
   1372             // Emit the STATE_SAVE
   1373             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
   1374 
   1375             // Emit the JMP
   1376             appendOp(URX_JMP, topLoc+1);
   1377 
   1378             // Emit the LD_SP
   1379             appendOp(URX_LD_SP, stoLoc);
   1380         }
   1381         break;
   1382 
   1383     case doPossessiveStar:
   1384         // Possessive *+ quantifier.
   1385         // Compiles to
   1386         //       1.   STO_SP       loc
   1387         //       2.   STATE_SAVE   5
   1388         //       3.      body of stuff being iterated over
   1389         //       4.   JMP          2
   1390         //       5.   LD_SP        loc
   1391         //       6    ...
   1392         // TODO:  do something to cut back the state stack each time through the loop.
   1393         {
   1394             // Reserve two slots at the top of the block.
   1395             int32_t   topLoc = blockTopLoc(TRUE);
   1396             insertOp(topLoc);
   1397 
   1398             // emit   STO_SP     loc
   1399             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
   1400             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1401             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1402 
   1403             // Emit the SAVE_STATE   5
   1404             int32_t L7 = fRXPat->fCompiledPat->size()+1;
   1405             op = buildOp(URX_STATE_SAVE, L7);
   1406             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1407 
   1408             // Append the JMP operation.
   1409             appendOp(URX_JMP, topLoc+1);
   1410 
   1411             // Emit the LD_SP       loc
   1412             appendOp(URX_LD_SP, stoLoc);
   1413         }
   1414         break;
   1415 
   1416     case doPossessiveOpt:
   1417         // Possessive  ?+ quantifier.
   1418         //  Compiles to
   1419         //     1. STO_SP      loc
   1420         //     2. SAVE_STATE  5
   1421         //     3.    body of optional block
   1422         //     4. LD_SP       loc
   1423         //     5. ...
   1424         //
   1425         {
   1426             // Reserve two slots at the top of the block.
   1427             int32_t   topLoc = blockTopLoc(TRUE);
   1428             insertOp(topLoc);
   1429 
   1430             // Emit the STO_SP
   1431             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
   1432             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1433             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1434 
   1435             // Emit the SAVE_STATE
   1436             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
   1437             op = buildOp(URX_STATE_SAVE, continueLoc);
   1438             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1439 
   1440             // Emit the LD_SP
   1441             appendOp(URX_LD_SP, stoLoc);
   1442         }
   1443         break;
   1444 
   1445 
   1446     case doBeginMatchMode:
   1447         fNewModeFlags = fModeFlags;
   1448         fSetModeFlag  = TRUE;
   1449         break;
   1450 
   1451     case doMatchMode:   //  (?i)    and similar
   1452         {
   1453             int32_t  bit = 0;
   1454             switch (fC.fChar) {
   1455             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
   1456             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
   1457             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
   1458             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
   1459             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
   1460             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
   1461             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
   1462             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
   1463             default:
   1464                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
   1465                                    // by the scanner.
   1466             }
   1467             if (fSetModeFlag) {
   1468                 fNewModeFlags |= bit;
   1469             } else {
   1470                 fNewModeFlags &= ~bit;
   1471             }
   1472         }
   1473         break;
   1474 
   1475     case doSetMatchMode:
   1476         // Emit code to match any pending literals, using the not-yet changed match mode.
   1477         fixLiterals();
   1478 
   1479         // We've got a (?i) or similar.  The match mode is being changed, but
   1480         //   the change is not scoped to a parenthesized block.
   1481         U_ASSERT(fNewModeFlags < 0);
   1482         fModeFlags = fNewModeFlags;
   1483 
   1484         break;
   1485 
   1486 
   1487     case doMatchModeParen:
   1488         // We've got a (?i: or similar.  Begin a parenthesized block, save old
   1489         //   mode flags so they can be restored at the close of the block.
   1490         //
   1491         //   Compile to a
   1492         //      - NOP, which later may be replaced by a save-state if the
   1493         //         parenthesized group gets a * quantifier, followed by
   1494         //      - NOP, which may later be replaced by a save-state if there
   1495         //             is an '|' alternation within the parens.
   1496         {
   1497             fixLiterals(FALSE);
   1498             appendOp(URX_NOP, 0);
   1499             appendOp(URX_NOP, 0);
   1500 
   1501             // On the Parentheses stack, start a new frame and add the postions
   1502             //   of the two NOPs (a normal non-capturing () frame, except for the
   1503             //   saving of the orignal mode flags.)
   1504             fParenStack.push(fModeFlags, *fStatus);
   1505             fParenStack.push(flags, *fStatus);                            // Frame Marker
   1506             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
   1507             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
   1508 
   1509             // Set the current mode flags to the new values.
   1510             U_ASSERT(fNewModeFlags < 0);
   1511             fModeFlags = fNewModeFlags;
   1512         }
   1513         break;
   1514 
   1515     case doBadModeFlag:
   1516         error(U_REGEX_INVALID_FLAG);
   1517         break;
   1518 
   1519     case doSuppressComments:
   1520         // We have just scanned a '(?'.  We now need to prevent the character scanner from
   1521         // treating a '#' as a to-the-end-of-line comment.
   1522         //   (This Perl compatibility just gets uglier and uglier to do...)
   1523         fEOLComments = FALSE;
   1524         break;
   1525 
   1526 
   1527     case doSetAddAmp:
   1528         {
   1529           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1530           set->add(chAmp);
   1531         }
   1532         break;
   1533 
   1534     case doSetAddDash:
   1535         {
   1536           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1537           set->add(chDash);
   1538         }
   1539         break;
   1540 
   1541      case doSetBackslash_s:
   1542         {
   1543          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1544          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
   1545          break;
   1546         }
   1547 
   1548      case doSetBackslash_S:
   1549         {
   1550             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1551             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
   1552             SSet.complement();
   1553             set->addAll(SSet);
   1554             break;
   1555         }
   1556 
   1557     case doSetBackslash_d:
   1558         {
   1559             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1560             // TODO - make a static set, ticket 6058.
   1561             addCategory(set, U_GC_ND_MASK, *fStatus);
   1562             break;
   1563         }
   1564 
   1565     case doSetBackslash_D:
   1566         {
   1567             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1568             UnicodeSet digits;
   1569             // TODO - make a static set, ticket 6058.
   1570             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   1571             digits.complement();
   1572             set->addAll(digits);
   1573             break;
   1574         }
   1575 
   1576     case doSetBackslash_h:
   1577         {
   1578             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1579             UnicodeSet h;
   1580             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   1581             h.add((UChar32)9);   // Tab
   1582             set->addAll(h);
   1583             break;
   1584         }
   1585 
   1586     case doSetBackslash_H:
   1587         {
   1588             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1589             UnicodeSet h;
   1590             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   1591             h.add((UChar32)9);   // Tab
   1592             h.complement();
   1593             set->addAll(h);
   1594             break;
   1595         }
   1596 
   1597     case doSetBackslash_v:
   1598         {
   1599             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1600             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
   1601             set->add((UChar32)0x85);
   1602             set->add((UChar32)0x2028, (UChar32)0x2029);
   1603             break;
   1604         }
   1605 
   1606     case doSetBackslash_V:
   1607         {
   1608             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1609             UnicodeSet v;
   1610             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
   1611             v.add((UChar32)0x85);
   1612             v.add((UChar32)0x2028, (UChar32)0x2029);
   1613             v.complement();
   1614             set->addAll(v);
   1615             break;
   1616         }
   1617 
   1618     case doSetBackslash_w:
   1619         {
   1620             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1621             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
   1622             break;
   1623         }
   1624 
   1625     case doSetBackslash_W:
   1626         {
   1627             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1628             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
   1629             SSet.complement();
   1630             set->addAll(SSet);
   1631             break;
   1632         }
   1633 
   1634     case doSetBegin:
   1635         fixLiterals(FALSE);
   1636         fSetStack.push(new UnicodeSet(), *fStatus);
   1637         fSetOpStack.push(setStart, *fStatus);
   1638         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1639             fSetOpStack.push(setCaseClose, *fStatus);
   1640         }
   1641         break;
   1642 
   1643     case doSetBeginDifference1:
   1644         //  We have scanned something like [[abc]-[
   1645         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
   1646         //  Push a Difference operator, which will cause the new set to be subtracted from what
   1647         //    went before once it is created.
   1648         setPushOp(setDifference1);
   1649         fSetOpStack.push(setStart, *fStatus);
   1650         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1651             fSetOpStack.push(setCaseClose, *fStatus);
   1652         }
   1653         break;
   1654 
   1655     case doSetBeginIntersection1:
   1656         //  We have scanned something like  [[abc]&[
   1657         //   Need both the '&' operator and the open '[' operator.
   1658         setPushOp(setIntersection1);
   1659         fSetOpStack.push(setStart, *fStatus);
   1660         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1661             fSetOpStack.push(setCaseClose, *fStatus);
   1662         }
   1663         break;
   1664 
   1665     case doSetBeginUnion:
   1666         //  We have scanned something like  [[abc][
   1667         //     Need to handle the union operation explicitly [[abc] | [
   1668         setPushOp(setUnion);
   1669         fSetOpStack.push(setStart, *fStatus);
   1670         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1671             fSetOpStack.push(setCaseClose, *fStatus);
   1672         }
   1673         break;
   1674 
   1675     case doSetDifference2:
   1676         // We have scanned something like [abc--
   1677         //   Consider this to unambiguously be a set difference operator.
   1678         setPushOp(setDifference2);
   1679         break;
   1680 
   1681     case doSetEnd:
   1682         // Have encountered the ']' that closes a set.
   1683         //    Force the evaluation of any pending operations within this set,
   1684         //    leave the completed set on the top of the set stack.
   1685         setEval(setEnd);
   1686         U_ASSERT(fSetOpStack.peeki()==setStart);
   1687         fSetOpStack.popi();
   1688         break;
   1689 
   1690     case doSetFinish:
   1691         {
   1692         // Finished a complete set expression, including all nested sets.
   1693         //   The close bracket has already triggered clearing out pending set operators,
   1694         //    the operator stack should be empty and the operand stack should have just
   1695         //    one entry, the result set.
   1696         U_ASSERT(fSetOpStack.empty());
   1697         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
   1698         U_ASSERT(fSetStack.empty());
   1699         compileSet(theSet);
   1700         break;
   1701         }
   1702 
   1703     case doSetIntersection2:
   1704         // Have scanned something like [abc&&
   1705         setPushOp(setIntersection2);
   1706         break;
   1707 
   1708     case doSetLiteral:
   1709         // Union the just-scanned literal character into the set being built.
   1710         //    This operation is the highest precedence set operation, so we can always do
   1711         //    it immediately, without waiting to see what follows.  It is necessary to perform
   1712         //    any pending '-' or '&' operation first, because these have the same precedence
   1713         //    as union-ing in a literal'
   1714         {
   1715             setEval(setUnion);
   1716             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1717             s->add(fC.fChar);
   1718             fLastSetLiteral = fC.fChar;
   1719             break;
   1720         }
   1721 
   1722     case doSetLiteralEscaped:
   1723         // A back-slash escaped literal character was encountered.
   1724         // Processing is the same as with setLiteral, above, with the addition of
   1725         //  the optional check for errors on escaped ASCII letters.
   1726         {
   1727             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1728                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1729                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1730                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1731             }
   1732             setEval(setUnion);
   1733             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1734             s->add(fC.fChar);
   1735             fLastSetLiteral = fC.fChar;
   1736             break;
   1737         }
   1738 
   1739         case doSetNamedChar:
   1740         // Scanning a \N{UNICODE CHARACTER NAME}
   1741         //  Aside from the source of the character, the processing is identical to doSetLiteral,
   1742         //    above.
   1743         {
   1744             UChar32  c = scanNamedChar();
   1745             setEval(setUnion);
   1746             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1747             s->add(c);
   1748             fLastSetLiteral = c;
   1749             break;
   1750         }
   1751 
   1752     case doSetNamedRange:
   1753         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
   1754         // The left character is already in the set, and is saved in fLastSetLiteral.
   1755         // The right side needs to be picked up, the scan is at the 'N'.
   1756         // Lower Limit > Upper limit being an error matches both Java
   1757         //        and ICU UnicodeSet behavior.
   1758         {
   1759             UChar32  c = scanNamedChar();
   1760             if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
   1761                 error(U_REGEX_INVALID_RANGE);
   1762             }
   1763             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1764             s->add(fLastSetLiteral, c);
   1765             fLastSetLiteral = c;
   1766             break;
   1767         }
   1768 
   1769 
   1770     case  doSetNegate:
   1771         // Scanned a '^' at the start of a set.
   1772         // Push the negation operator onto the set op stack.
   1773         // A twist for case-insensitive matching:
   1774         //   the case closure operation must happen _before_ negation.
   1775         //   But the case closure operation will already be on the stack if it's required.
   1776         //   This requires checking for case closure, and swapping the stack order
   1777         //    if it is present.
   1778         {
   1779             int32_t  tosOp = fSetOpStack.peeki();
   1780             if (tosOp == setCaseClose) {
   1781                 fSetOpStack.popi();
   1782                 fSetOpStack.push(setNegation, *fStatus);
   1783                 fSetOpStack.push(setCaseClose, *fStatus);
   1784             } else {
   1785                 fSetOpStack.push(setNegation, *fStatus);
   1786             }
   1787         }
   1788         break;
   1789 
   1790     case doSetNoCloseError:
   1791         error(U_REGEX_MISSING_CLOSE_BRACKET);
   1792         break;
   1793 
   1794     case doSetOpError:
   1795         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
   1796         break;
   1797 
   1798     case doSetPosixProp:
   1799         {
   1800             UnicodeSet *s = scanPosixProp();
   1801             if (s != NULL) {
   1802                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
   1803                 tos->addAll(*s);
   1804                 delete s;
   1805             }  // else error.  scanProp() reported the error status already.
   1806         }
   1807         break;
   1808 
   1809     case doSetProp:
   1810         //  Scanned a \p \P within [brackets].
   1811         {
   1812             UnicodeSet *s = scanProp();
   1813             if (s != NULL) {
   1814                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
   1815                 tos->addAll(*s);
   1816                 delete s;
   1817             }  // else error.  scanProp() reported the error status already.
   1818         }
   1819         break;
   1820 
   1821 
   1822     case doSetRange:
   1823         // We have scanned literal-literal.  Add the range to the set.
   1824         // The left character is already in the set, and is saved in fLastSetLiteral.
   1825         // The right side is the current character.
   1826         // Lower Limit > Upper limit being an error matches both Java
   1827         //        and ICU UnicodeSet behavior.
   1828         {
   1829         if (fLastSetLiteral > fC.fChar) {
   1830             error(U_REGEX_INVALID_RANGE);
   1831         }
   1832         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1833         s->add(fLastSetLiteral, fC.fChar);
   1834         break;
   1835         }
   1836 
   1837     default:
   1838         U_ASSERT(FALSE);
   1839         error(U_REGEX_INTERNAL_ERROR);
   1840         break;
   1841     }
   1842 
   1843     if (U_FAILURE(*fStatus)) {
   1844         returnVal = FALSE;
   1845     }
   1846 
   1847     return returnVal;
   1848 }
   1849 
   1850 
   1851 
   1852 //------------------------------------------------------------------------------
   1853 //
   1854 //   literalChar           We've encountered a literal character from the pattern,
   1855 //                             or an escape sequence that reduces to a character.
   1856 //                         Add it to the string containing all literal chars/strings from
   1857 //                             the pattern.
   1858 //
   1859 //------------------------------------------------------------------------------
   1860 void RegexCompile::literalChar(UChar32 c)  {
   1861     fLiteralChars.append(c);
   1862 }
   1863 
   1864 
   1865 //------------------------------------------------------------------------------
   1866 //
   1867 //    fixLiterals           When compiling something that can follow a literal
   1868 //                          string in a pattern, emit the code to match the
   1869 //                          accumulated literal string.
   1870 //
   1871 //                          Optionally, split the last char of the string off into
   1872 //                          a single "ONE_CHAR" operation, so that quantifiers can
   1873 //                          apply to that char alone.  Example:   abc*
   1874 //                          The * must apply to the 'c' only.
   1875 //
   1876 //------------------------------------------------------------------------------
   1877 void    RegexCompile::fixLiterals(UBool split) {
   1878 
   1879     // If no literal characters have been scanned but not yet had code generated
   1880     //   for them, nothing needs to be done.
   1881     if (fLiteralChars.length() == 0) {
   1882         return;
   1883     }
   1884 
   1885     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
   1886     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
   1887 
   1888     // Split:  We need to  ensure that the last item in the compiled pattern
   1889     //     refers only to the last literal scanned in the pattern, so that
   1890     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
   1891     //     Split before case folding for case insensitive matches.
   1892 
   1893     if (split) {
   1894         fLiteralChars.truncate(indexOfLastCodePoint);
   1895         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
   1896                               //  Note that the truncated literal string may be empty, in which case
   1897                               //  nothing will be emitted.
   1898 
   1899         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
   1900         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
   1901         return;
   1902     }
   1903 
   1904     // If we are doing case-insensitive matching, case fold the string.  This may expand
   1905     //   the string, e.g. the German sharp-s turns into "ss"
   1906     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1907         fLiteralChars.foldCase();
   1908         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
   1909         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
   1910     }
   1911 
   1912     if (indexOfLastCodePoint == 0) {
   1913         // Single character, emit a URX_ONECHAR op to match it.
   1914         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
   1915                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
   1916             appendOp(URX_ONECHAR_I, lastCodePoint);
   1917         } else {
   1918             appendOp(URX_ONECHAR, lastCodePoint);
   1919         }
   1920     } else {
   1921         // Two or more chars, emit a URX_STRING to match them.
   1922         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
   1923             error(U_REGEX_PATTERN_TOO_BIG);
   1924         }
   1925         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1926             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
   1927         } else {
   1928             // TODO here:  add optimization to split case sensitive strings of length two
   1929             //             into two single char ops, for efficiency.
   1930             appendOp(URX_STRING, fRXPat->fLiteralText.length());
   1931         }
   1932         appendOp(URX_STRING_LEN, fLiteralChars.length());
   1933 
   1934         // Add this string into the accumulated strings of the compiled pattern.
   1935         fRXPat->fLiteralText.append(fLiteralChars);
   1936     }
   1937 
   1938     fLiteralChars.remove();
   1939 }
   1940 
   1941 
   1942 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
   1943     if (U_FAILURE(*fStatus)) {
   1944         return 0;
   1945     }
   1946     if (type < 0 || type > 255) {
   1947         U_ASSERT(FALSE);
   1948         error(U_REGEX_INTERNAL_ERROR);
   1949         type = URX_RESERVED_OP;
   1950     }
   1951     if (val > 0x00ffffff) {
   1952         U_ASSERT(FALSE);
   1953         error(U_REGEX_INTERNAL_ERROR);
   1954         val = 0;
   1955     }
   1956     if (val < 0) {
   1957         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
   1958             U_ASSERT(FALSE);
   1959             error(U_REGEX_INTERNAL_ERROR);
   1960             return -1;
   1961         }
   1962         if (URX_TYPE(val) != 0xff) {
   1963             U_ASSERT(FALSE);
   1964             error(U_REGEX_INTERNAL_ERROR);
   1965             return -1;
   1966         }
   1967         type = URX_RESERVED_OP_N;
   1968     }
   1969     return (type << 24) | val;
   1970 }
   1971 
   1972 
   1973 //------------------------------------------------------------------------------
   1974 //
   1975 //   appendOp()             Append a new instruction onto the compiled pattern
   1976 //                          Includes error checking, limiting the size of the
   1977 //                          pattern to lengths that can be represented in the
   1978 //                          24 bit operand field of an instruction.
   1979 //
   1980 //------------------------------------------------------------------------------
   1981 void RegexCompile::appendOp(int32_t op) {
   1982     if (U_FAILURE(*fStatus)) {
   1983         return;
   1984     }
   1985     fRXPat->fCompiledPat->addElement(op, *fStatus);
   1986     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
   1987         error(U_REGEX_PATTERN_TOO_BIG);
   1988     }
   1989 }
   1990 
   1991 void RegexCompile::appendOp(int32_t type, int32_t val) {
   1992     appendOp(buildOp(type, val));
   1993 }
   1994 
   1995 
   1996 //------------------------------------------------------------------------------
   1997 //
   1998 //   insertOp()             Insert a slot for a new opcode into the already
   1999 //                          compiled pattern code.
   2000 //
   2001 //                          Fill the slot with a NOP.  Our caller will replace it
   2002 //                          with what they really wanted.
   2003 //
   2004 //------------------------------------------------------------------------------
   2005 void   RegexCompile::insertOp(int32_t where) {
   2006     UVector64 *code = fRXPat->fCompiledPat;
   2007     U_ASSERT(where>0 && where < code->size());
   2008 
   2009     int32_t  nop = buildOp(URX_NOP, 0);
   2010     code->insertElementAt(nop, where, *fStatus);
   2011 
   2012     // Walk through the pattern, looking for any ops with targets that
   2013     //  were moved down by the insert.  Fix them.
   2014     int32_t loc;
   2015     for (loc=0; loc<code->size(); loc++) {
   2016         int32_t op = (int32_t)code->elementAti(loc);
   2017         int32_t opType = URX_TYPE(op);
   2018         int32_t opValue = URX_VAL(op);
   2019         if ((opType == URX_JMP         ||
   2020             opType == URX_JMPX         ||
   2021             opType == URX_STATE_SAVE   ||
   2022             opType == URX_CTR_LOOP     ||
   2023             opType == URX_CTR_LOOP_NG  ||
   2024             opType == URX_JMP_SAV      ||
   2025             opType == URX_JMP_SAV_X    ||
   2026             opType == URX_RELOC_OPRND)    && opValue > where) {
   2027             // Target location for this opcode is after the insertion point and
   2028             //   needs to be incremented to adjust for the insertion.
   2029             opValue++;
   2030             op = buildOp(opType, opValue);
   2031             code->setElementAt(op, loc);
   2032         }
   2033     }
   2034 
   2035     // Now fix up the parentheses stack.  All positive values in it are locations in
   2036     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
   2037     for (loc=0; loc<fParenStack.size(); loc++) {
   2038         int32_t x = fParenStack.elementAti(loc);
   2039         U_ASSERT(x < code->size());
   2040         if (x>where) {
   2041             x++;
   2042             fParenStack.setElementAt(x, loc);
   2043         }
   2044     }
   2045 
   2046     if (fMatchCloseParen > where) {
   2047         fMatchCloseParen++;
   2048     }
   2049     if (fMatchOpenParen > where) {
   2050         fMatchOpenParen++;
   2051     }
   2052 }
   2053 
   2054 
   2055 //------------------------------------------------------------------------------
   2056 //
   2057 //   allocateData()        Allocate storage in the matcher's static data area.
   2058 //                         Return the index for the newly allocated data.
   2059 //                         The storage won't actually exist until we are running a match
   2060 //                         operation, but the storage indexes are inserted into various
   2061 //                         opcodes while compiling the pattern.
   2062 //
   2063 //------------------------------------------------------------------------------
   2064 int32_t RegexCompile::allocateData(int32_t size) {
   2065     if (U_FAILURE(*fStatus)) {
   2066         return 0;
   2067     }
   2068     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
   2069         error(U_REGEX_INTERNAL_ERROR);
   2070         return 0;
   2071     }
   2072     int32_t dataIndex = fRXPat->fDataSize;
   2073     fRXPat->fDataSize += size;
   2074     if (fRXPat->fDataSize >= 0x00fffff0) {
   2075         error(U_REGEX_INTERNAL_ERROR);
   2076     }
   2077     return dataIndex;
   2078 }
   2079 
   2080 
   2081 //------------------------------------------------------------------------------
   2082 //
   2083 //   allocateStackData()   Allocate space in the back-tracking stack frame.
   2084 //                         Return the index for the newly allocated data.
   2085 //                         The frame indexes are inserted into various
   2086 //                         opcodes while compiling the pattern, meaning that frame
   2087 //                         size must be restricted to the size that will fit
   2088 //                         as an operand (24 bits).
   2089 //
   2090 //------------------------------------------------------------------------------
   2091 int32_t RegexCompile::allocateStackData(int32_t size) {
   2092     if (U_FAILURE(*fStatus)) {
   2093         return 0;
   2094     }
   2095     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
   2096         error(U_REGEX_INTERNAL_ERROR);
   2097         return 0;
   2098     }
   2099     int32_t dataIndex = fRXPat->fFrameSize;
   2100     fRXPat->fFrameSize += size;
   2101     if (fRXPat->fFrameSize >= 0x00fffff0) {
   2102         error(U_REGEX_PATTERN_TOO_BIG);
   2103     }
   2104     return dataIndex;
   2105 }
   2106 
   2107 
   2108 //------------------------------------------------------------------------------
   2109 //
   2110 //   blockTopLoc()          Find or create a location in the compiled pattern
   2111 //                          at the start of the operation or block that has
   2112 //                          just been compiled.  Needed when a quantifier (* or
   2113 //                          whatever) appears, and we need to add an operation
   2114 //                          at the start of the thing being quantified.
   2115 //
   2116 //                          (Parenthesized Blocks) have a slot with a NOP that
   2117 //                          is reserved for this purpose.  .* or similar don't
   2118 //                          and a slot needs to be added.
   2119 //
   2120 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
   2121 //                                         at the returned location.
   2122 //                                 FALSE - just return the address,
   2123 //                                         do not reserve a location there.
   2124 //
   2125 //------------------------------------------------------------------------------
   2126 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
   2127     int32_t   theLoc;
   2128     fixLiterals(TRUE);  // Emit code for any pending literals.
   2129                         //   If last item was a string, emit separate op for the its last char.
   2130     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
   2131     {
   2132         // The item just processed is a parenthesized block.
   2133         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
   2134         U_ASSERT(theLoc > 0);
   2135         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
   2136     }
   2137     else {
   2138         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
   2139         // No slot for STATE_SAVE was pre-reserved in the compiled code.
   2140         // We need to make space now.
   2141         theLoc = fRXPat->fCompiledPat->size()-1;
   2142         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
   2143         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
   2144             // Strings take two opcode, we want the position of the first one.
   2145             // We can have a string at this point if a single character case-folded to two.
   2146             theLoc--;
   2147         }
   2148         if (reserveLoc) {
   2149             int32_t  nop = buildOp(URX_NOP, 0);
   2150             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
   2151         }
   2152     }
   2153     return theLoc;
   2154 }
   2155 
   2156 
   2157 
   2158 //------------------------------------------------------------------------------
   2159 //
   2160 //    handleCloseParen      When compiling a close paren, we need to go back
   2161 //                          and fix up any JMP or SAVE operations within the
   2162 //                          parenthesized block that need to target the end
   2163 //                          of the block.  The locations of these are kept on
   2164 //                          the paretheses stack.
   2165 //
   2166 //                          This function is called both when encountering a
   2167 //                          real ) and at the end of the pattern.
   2168 //
   2169 //------------------------------------------------------------------------------
   2170 void  RegexCompile::handleCloseParen() {
   2171     int32_t   patIdx;
   2172     int32_t   patOp;
   2173     if (fParenStack.size() <= 0) {
   2174         error(U_REGEX_MISMATCHED_PAREN);
   2175         return;
   2176     }
   2177 
   2178     // Emit code for any pending literals.
   2179     fixLiterals(FALSE);
   2180 
   2181     // Fixup any operations within the just-closed parenthesized group
   2182     //    that need to reference the end of the (block).
   2183     //    (The first one popped from the stack is an unused slot for
   2184     //     alternation (OR) state save, but applying the fixup to it does no harm.)
   2185     for (;;) {
   2186         patIdx = fParenStack.popi();
   2187         if (patIdx < 0) {
   2188             // value < 0 flags the start of the frame on the paren stack.
   2189             break;
   2190         }
   2191         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
   2192         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
   2193         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
   2194         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
   2195         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
   2196         fMatchOpenParen     = patIdx;
   2197     }
   2198 
   2199     //  At the close of any parenthesized block, restore the match mode flags  to
   2200     //  the value they had at the open paren.  Saved value is
   2201     //  at the top of the paren stack.
   2202     fModeFlags = fParenStack.popi();
   2203     U_ASSERT(fModeFlags < 0);
   2204 
   2205     // DO any additional fixups, depending on the specific kind of
   2206     // parentesized grouping this is
   2207 
   2208     switch (patIdx) {
   2209     case plain:
   2210     case flags:
   2211         // No additional fixups required.
   2212         //   (Grouping-only parentheses)
   2213         break;
   2214     case capturing:
   2215         // Capturing Parentheses.
   2216         //   Insert a End Capture op into the pattern.
   2217         //   The frame offset of the variables for this cg is obtained from the
   2218         //       start capture op and put it into the end-capture op.
   2219         {
   2220             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
   2221             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
   2222 
   2223             int32_t   frameVarLocation = URX_VAL(captureOp);
   2224             appendOp(URX_END_CAPTURE, frameVarLocation);
   2225         }
   2226         break;
   2227     case atomic:
   2228         // Atomic Parenthesis.
   2229         //   Insert a LD_SP operation to restore the state stack to the position
   2230         //   it was when the atomic parens were entered.
   2231         {
   2232             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
   2233             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
   2234             int32_t   stoLoc = URX_VAL(stoOp);
   2235             appendOp(URX_LD_SP, stoLoc);
   2236         }
   2237         break;
   2238 
   2239     case lookAhead:
   2240         {
   2241             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
   2242             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2243             int32_t dataLoc  = URX_VAL(startOp);
   2244             appendOp(URX_LA_END, dataLoc);
   2245         }
   2246         break;
   2247 
   2248     case negLookAhead:
   2249         {
   2250             // See comment at doOpenLookAheadNeg
   2251             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
   2252             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2253             int32_t dataLoc  = URX_VAL(startOp);
   2254             appendOp(URX_LA_END, dataLoc);
   2255             appendOp(URX_BACKTRACK, 0);
   2256             appendOp(URX_LA_END, dataLoc);
   2257 
   2258             // Patch the URX_SAVE near the top of the block.
   2259             // The destination of the SAVE is the final LA_END that was just added.
   2260             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
   2261             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
   2262             int32_t dest     = fRXPat->fCompiledPat->size()-1;
   2263             saveOp           = buildOp(URX_STATE_SAVE, dest);
   2264             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
   2265         }
   2266         break;
   2267 
   2268     case lookBehind:
   2269         {
   2270             // See comment at doOpenLookBehind.
   2271 
   2272             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
   2273             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
   2274             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2275             int32_t dataLoc  = URX_VAL(startOp);
   2276             appendOp(URX_LB_END, dataLoc);
   2277             appendOp(URX_LA_END, dataLoc);
   2278 
   2279             // Determine the min and max bounds for the length of the
   2280             //  string that the pattern can match.
   2281             //  An unbounded upper limit is an error.
   2282             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2283             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2284             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2285             if (URX_TYPE(maxML) != 0) {
   2286                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2287                 break;
   2288             }
   2289             if (maxML == INT32_MAX) {
   2290                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2291                 break;
   2292             }
   2293             U_ASSERT(minML <= maxML);
   2294 
   2295             // Insert the min and max match len bounds into the URX_LB_CONT op that
   2296             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2297             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
   2298             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
   2299 
   2300         }
   2301         break;
   2302 
   2303 
   2304 
   2305     case lookBehindN:
   2306         {
   2307             // See comment at doOpenLookBehindNeg.
   2308 
   2309             // Append the URX_LBN_END to the compiled pattern.
   2310             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
   2311             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2312             int32_t dataLoc  = URX_VAL(startOp);
   2313             appendOp(URX_LBN_END, dataLoc);
   2314 
   2315             // Determine the min and max bounds for the length of the
   2316             //  string that the pattern can match.
   2317             //  An unbounded upper limit is an error.
   2318             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2319             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2320             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2321             if (URX_TYPE(maxML) != 0) {
   2322                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2323                 break;
   2324             }
   2325             if (maxML == INT32_MAX) {
   2326                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2327                 break;
   2328             }
   2329             U_ASSERT(minML <= maxML);
   2330 
   2331             // Insert the min and max match len bounds into the URX_LB_CONT op that
   2332             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2333             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
   2334             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
   2335 
   2336             // Insert the pattern location to continue at after a successful match
   2337             //  as the last operand of the URX_LBN_CONT
   2338             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
   2339             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
   2340         }
   2341         break;
   2342 
   2343 
   2344 
   2345     default:
   2346         U_ASSERT(FALSE);
   2347     }
   2348 
   2349     // remember the next location in the compiled pattern.
   2350     // The compilation of Quantifiers will look at this to see whether its looping
   2351     //   over a parenthesized block or a single item
   2352     fMatchCloseParen = fRXPat->fCompiledPat->size();
   2353 }
   2354 
   2355 
   2356 
   2357 //------------------------------------------------------------------------------
   2358 //
   2359 //   compileSet       Compile the pattern operations for a reference to a
   2360 //                    UnicodeSet.
   2361 //
   2362 //------------------------------------------------------------------------------
   2363 void        RegexCompile::compileSet(UnicodeSet *theSet)
   2364 {
   2365     if (theSet == NULL) {
   2366         return;
   2367     }
   2368     //  Remove any strings from the set.
   2369     //  There shoudn't be any, but just in case.
   2370     //     (Case Closure can add them; if we had a simple case closure avaialble that
   2371     //      ignored strings, that would be better.)
   2372     theSet->removeAllStrings();
   2373     int32_t  setSize = theSet->size();
   2374 
   2375     switch (setSize) {
   2376     case 0:
   2377         {
   2378             // Set of no elements.   Always fails to match.
   2379             appendOp(URX_BACKTRACK, 0);
   2380             delete theSet;
   2381         }
   2382         break;
   2383 
   2384     case 1:
   2385         {
   2386             // The set contains only a single code point.  Put it into
   2387             //   the compiled pattern as a single char operation rather
   2388             //   than a set, and discard the set itself.
   2389             literalChar(theSet->charAt(0));
   2390             delete theSet;
   2391         }
   2392         break;
   2393 
   2394     default:
   2395         {
   2396             //  The set contains two or more chars.  (the normal case)
   2397             //  Put it into the compiled pattern as a set.
   2398             int32_t setNumber = fRXPat->fSets->size();
   2399             fRXPat->fSets->addElement(theSet, *fStatus);
   2400             appendOp(URX_SETREF, setNumber);
   2401         }
   2402     }
   2403 }
   2404 
   2405 
   2406 //------------------------------------------------------------------------------
   2407 //
   2408 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
   2409 //                      Except for the specific opcodes used, the code is the same
   2410 //                      for all three types (greedy, non-greedy, possessive) of
   2411 //                      intervals.  The opcodes are supplied as parameters.
   2412 //                      (There are two sets of opcodes - greedy & possessive use the
   2413 //                      same ones, while non-greedy has it's own.)
   2414 //
   2415 //                      The code for interval loops has this form:
   2416 //                         0  CTR_INIT   counter loc (in stack frame)
   2417 //                         1             5  patt address of CTR_LOOP at bottom of block
   2418 //                         2             min count
   2419 //                         3             max count   (-1 for unbounded)
   2420 //                         4  ...        block to be iterated over
   2421 //                         5  CTR_LOOP
   2422 //
   2423 //                       In
   2424 //------------------------------------------------------------------------------
   2425 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
   2426 {
   2427     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
   2428     //   four slots in the compiled code.  Reserve them.
   2429     int32_t   topOfBlock = blockTopLoc(TRUE);
   2430     insertOp(topOfBlock);
   2431     insertOp(topOfBlock);
   2432     insertOp(topOfBlock);
   2433 
   2434     // The operands for the CTR_INIT opcode include the index in the matcher data
   2435     //   of the counter.  Allocate it now. There are two data items
   2436     //        counterLoc   -->  Loop counter
   2437     //               +1    -->  Input index (for breaking non-progressing loops)
   2438     //                          (Only present if unbounded upper limit on loop)
   2439     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
   2440     int32_t   counterLoc = allocateStackData(dataSize);
   2441 
   2442     int32_t   op = buildOp(InitOp, counterLoc);
   2443     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
   2444 
   2445     // The second operand of CTR_INIT is the location following the end of the loop.
   2446     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
   2447     //   compilation of something later on causes the code to grow and the target
   2448     //   position to move.
   2449     int32_t loopEnd = fRXPat->fCompiledPat->size();
   2450     op = buildOp(URX_RELOC_OPRND, loopEnd);
   2451     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
   2452 
   2453     // Followed by the min and max counts.
   2454     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
   2455     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
   2456 
   2457     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
   2458     //   Goes at end of the block being looped over, so just append to the code so far.
   2459     appendOp(LoopOp, topOfBlock);
   2460 
   2461     if ((fIntervalLow & 0xff000000) != 0 ||
   2462         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
   2463             error(U_REGEX_NUMBER_TOO_BIG);
   2464         }
   2465 
   2466     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
   2467         error(U_REGEX_MAX_LT_MIN);
   2468     }
   2469 }
   2470 
   2471 
   2472 
   2473 UBool RegexCompile::compileInlineInterval() {
   2474     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
   2475         // Too big to inline.  Fail, which will cause looping code to be generated.
   2476         //   (Upper < Lower picks up unbounded upper and errors, both.)
   2477         return FALSE;
   2478     }
   2479 
   2480     int32_t   topOfBlock = blockTopLoc(FALSE);
   2481     if (fIntervalUpper == 0) {
   2482         // Pathological case.  Attempt no matches, as if the block doesn't exist.
   2483         // Discard the generated code for the block.
   2484         // If the block included parens, discard the info pertaining to them as well.
   2485         fRXPat->fCompiledPat->setSize(topOfBlock);
   2486         if (fMatchOpenParen >= topOfBlock) {
   2487             fMatchOpenParen = -1;
   2488         }
   2489         if (fMatchCloseParen >= topOfBlock) {
   2490             fMatchCloseParen = -1;
   2491         }
   2492         return TRUE;
   2493     }
   2494 
   2495     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
   2496         // The thing being repeated is not a single op, but some
   2497         //   more complex block.  Do it as a loop, not inlines.
   2498         //   Note that things "repeated" a max of once are handled as inline, because
   2499         //     the one copy of the code already generated is just fine.
   2500         return FALSE;
   2501     }
   2502 
   2503     // Pick up the opcode that is to be repeated
   2504     //
   2505     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
   2506 
   2507     // Compute the pattern location where the inline sequence
   2508     //   will end, and set up the state save op that will be needed.
   2509     //
   2510     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
   2511                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
   2512     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
   2513     if (fIntervalLow == 0) {
   2514         insertOp(topOfBlock);
   2515         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
   2516     }
   2517 
   2518 
   2519 
   2520     //  Loop, emitting the op for the thing being repeated each time.
   2521     //    Loop starts at 1 because one instance of the op already exists in the pattern,
   2522     //    it was put there when it was originally encountered.
   2523     int32_t i;
   2524     for (i=1; i<fIntervalUpper; i++ ) {
   2525         if (i >= fIntervalLow) {
   2526             appendOp(saveOp);
   2527         }
   2528         appendOp(op);
   2529     }
   2530     return TRUE;
   2531 }
   2532 
   2533 
   2534 
   2535 //------------------------------------------------------------------------------
   2536 //
   2537 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
   2538 //                         set of characters that could potentially begin a case-insensitive
   2539 //                         match of a string beginning with that character, using full Unicode
   2540 //                         case insensitive matching.
   2541 //
   2542 //          This is used in optimizing find().
   2543 //
   2544 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
   2545 //          misses cases like this:
   2546 //             A string from the pattern begins with 'ss' (although all we know
   2547 //                 in this context is that it begins with 's')
   2548 //             The pattern could match a string beginning with a German sharp-s
   2549 //
   2550 //           To the ordinary case closure for a character c, we add all other
   2551 //           characters cx where the case closure of cx incudes a string form that begins
   2552 //           with the original character c.
   2553 //
   2554 //           This function could be made smarter. The full pattern string is available
   2555 //           and it would be possible to verify that the extra characters being added
   2556 //           to the starting set fully match, rather than having just a first-char of the
   2557 //           folded form match.
   2558 //
   2559 //------------------------------------------------------------------------------
   2560 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
   2561 
   2562 // Machine Generated below.
   2563 // It may need updating with new versions of Unicode.
   2564 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
   2565 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
   2566 
   2567 // Machine Generated Data. Do not hand edit.
   2568     static const UChar32 RECaseFixCodePoints[] = {
   2569         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
   2570         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
   2571         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
   2572         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
   2573         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
   2574 
   2575     static const int16_t RECaseFixStringOffsets[] = {
   2576         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
   2577         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
   2578         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
   2579         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
   2580         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
   2581 
   2582     static const int16_t RECaseFixCounts[] = {
   2583         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
   2584         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
   2585         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
   2586         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
   2587         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
   2588 
   2589     static const UChar RECaseFixData[] = {
   2590         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
   2591         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
   2592         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
   2593         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
   2594         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
   2595         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
   2596         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
   2597         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
   2598         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
   2599         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
   2600         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
   2601 
   2602 // End of machine generated data.
   2603 
   2604     if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   2605         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
   2606         starterChars->set(caseFoldedC, caseFoldedC);
   2607 
   2608         int32_t i;
   2609         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
   2610             // Simple linear search through the sorted list of interesting code points.
   2611         }
   2612 
   2613         if (RECaseFixCodePoints[i] == c) {
   2614             int32_t dataIndex = RECaseFixStringOffsets[i];
   2615             int32_t numCharsToAdd = RECaseFixCounts[i];
   2616             UChar32 cpToAdd = 0;
   2617             for (int32_t j=0; j<numCharsToAdd; j++) {
   2618                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
   2619                 starterChars->add(cpToAdd);
   2620             }
   2621         }
   2622 
   2623         starterChars->closeOver(USET_CASE_INSENSITIVE);
   2624         starterChars->removeAllStrings();
   2625     } else {
   2626         // Not a cased character. Just return it alone.
   2627         starterChars->set(c, c);
   2628     }
   2629 }
   2630 
   2631 
   2632 
   2633 
   2634 //------------------------------------------------------------------------------
   2635 //
   2636 //   matchStartType    Determine how a match can start.
   2637 //                     Used to optimize find() operations.
   2638 //
   2639 //                     Operation is very similar to minMatchLength().  Walk the compiled
   2640 //                     pattern, keeping an on-going minimum-match-length.  For any
   2641 //                     op where the min match coming in is zero, add that ops possible
   2642 //                     starting matches to the possible starts for the overall pattern.
   2643 //
   2644 //------------------------------------------------------------------------------
   2645 void   RegexCompile::matchStartType() {
   2646     if (U_FAILURE(*fStatus)) {
   2647         return;
   2648     }
   2649 
   2650 
   2651     int32_t    loc;                    // Location in the pattern of the current op being processed.
   2652     int32_t    op;                     // The op being processed
   2653     int32_t    opType;                 // The opcode type of the op
   2654     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
   2655     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
   2656 
   2657     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
   2658                                        //   could have advanced the position in a match.
   2659                                        //   (Maximum match length so far == 0)
   2660 
   2661     // forwardedLength is a vector holding minimum-match-length values that
   2662     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   2663     //   It must be one longer than the pattern being checked because some  ops
   2664     //   will jmp to a end-of-block+1 location from within a block, and we must
   2665     //   count those when checking the block.
   2666     int32_t end = fRXPat->fCompiledPat->size();
   2667     UVector32  forwardedLength(end+1, *fStatus);
   2668     forwardedLength.setSize(end+1);
   2669     for (loc=3; loc<end; loc++) {
   2670         forwardedLength.setElementAt(INT32_MAX, loc);
   2671     }
   2672 
   2673     for (loc = 3; loc<end; loc++) {
   2674         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   2675         opType = URX_TYPE(op);
   2676 
   2677         // The loop is advancing linearly through the pattern.
   2678         // If the op we are now at was the destination of a branch in the pattern,
   2679         // and that path has a shorter minimum length than the current accumulated value,
   2680         // replace the current accumulated value.
   2681         if (forwardedLength.elementAti(loc) < currentLen) {
   2682             currentLen = forwardedLength.elementAti(loc);
   2683             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   2684         }
   2685 
   2686         switch (opType) {
   2687             // Ops that don't change the total length matched
   2688         case URX_RESERVED_OP:
   2689         case URX_END:
   2690         case URX_FAIL:
   2691         case URX_STRING_LEN:
   2692         case URX_NOP:
   2693         case URX_START_CAPTURE:
   2694         case URX_END_CAPTURE:
   2695         case URX_BACKSLASH_B:
   2696         case URX_BACKSLASH_BU:
   2697         case URX_BACKSLASH_G:
   2698         case URX_BACKSLASH_Z:
   2699         case URX_DOLLAR:
   2700         case URX_DOLLAR_M:
   2701         case URX_DOLLAR_D:
   2702         case URX_DOLLAR_MD:
   2703         case URX_RELOC_OPRND:
   2704         case URX_STO_INP_LOC:
   2705         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   2706         case URX_BACKREF_I:
   2707 
   2708         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   2709         case URX_LD_SP:
   2710             break;
   2711 
   2712         case URX_CARET:
   2713             if (atStart) {
   2714                 fRXPat->fStartType = START_START;
   2715             }
   2716             break;
   2717 
   2718         case URX_CARET_M:
   2719         case URX_CARET_M_UNIX:
   2720             if (atStart) {
   2721                 fRXPat->fStartType = START_LINE;
   2722             }
   2723             break;
   2724 
   2725         case URX_ONECHAR:
   2726             if (currentLen == 0) {
   2727                 // This character could appear at the start of a match.
   2728                 //   Add it to the set of possible starting characters.
   2729                 fRXPat->fInitialChars->add(URX_VAL(op));
   2730                 numInitialStrings += 2;
   2731             }
   2732             currentLen++;
   2733             atStart = FALSE;
   2734             break;
   2735 
   2736 
   2737         case URX_SETREF:
   2738             if (currentLen == 0) {
   2739                 int32_t  sn = URX_VAL(op);
   2740                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2741                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
   2742                 fRXPat->fInitialChars->addAll(*s);
   2743                 numInitialStrings += 2;
   2744             }
   2745             currentLen++;
   2746             atStart = FALSE;
   2747             break;
   2748 
   2749         case URX_LOOP_SR_I:
   2750             // [Set]*, like a SETREF, above, in what it can match,
   2751             //  but may not match at all, so currentLen is not incremented.
   2752             if (currentLen == 0) {
   2753                 int32_t  sn = URX_VAL(op);
   2754                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2755                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
   2756                 fRXPat->fInitialChars->addAll(*s);
   2757                 numInitialStrings += 2;
   2758             }
   2759             atStart = FALSE;
   2760             break;
   2761 
   2762         case URX_LOOP_DOT_I:
   2763             if (currentLen == 0) {
   2764                 // .* at the start of a pattern.
   2765                 //    Any character can begin the match.
   2766                 fRXPat->fInitialChars->clear();
   2767                 fRXPat->fInitialChars->complement();
   2768                 numInitialStrings += 2;
   2769             }
   2770             atStart = FALSE;
   2771             break;
   2772 
   2773 
   2774         case URX_STATIC_SETREF:
   2775             if (currentLen == 0) {
   2776                 int32_t  sn = URX_VAL(op);
   2777                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
   2778                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
   2779                 fRXPat->fInitialChars->addAll(*s);
   2780                 numInitialStrings += 2;
   2781             }
   2782             currentLen++;
   2783             atStart = FALSE;
   2784             break;
   2785 
   2786 
   2787 
   2788         case URX_STAT_SETREF_N:
   2789             if (currentLen == 0) {
   2790                 int32_t  sn = URX_VAL(op);
   2791                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
   2792                 UnicodeSet sc(*s);
   2793                 sc.complement();
   2794                 fRXPat->fInitialChars->addAll(sc);
   2795                 numInitialStrings += 2;
   2796             }
   2797             currentLen++;
   2798             atStart = FALSE;
   2799             break;
   2800 
   2801 
   2802 
   2803         case URX_BACKSLASH_D:
   2804             // Digit Char
   2805              if (currentLen == 0) {
   2806                  UnicodeSet s;
   2807                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   2808                  if (URX_VAL(op) != 0) {
   2809                      s.complement();
   2810                  }
   2811                  fRXPat->fInitialChars->addAll(s);
   2812                  numInitialStrings += 2;
   2813             }
   2814             currentLen++;
   2815             atStart = FALSE;
   2816             break;
   2817 
   2818 
   2819         case URX_BACKSLASH_H:
   2820             // Horiz white space
   2821             if (currentLen == 0) {
   2822                 UnicodeSet s;
   2823                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   2824                 s.add((UChar32)9);   // Tab
   2825                 if (URX_VAL(op) != 0) {
   2826                     s.complement();
   2827                 }
   2828                 fRXPat->fInitialChars->addAll(s);
   2829                 numInitialStrings += 2;
   2830             }
   2831             currentLen++;
   2832             atStart = FALSE;
   2833             break;
   2834 
   2835 
   2836         case URX_BACKSLASH_R:       // Any line ending sequence
   2837         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
   2838             if (currentLen == 0) {
   2839                 UnicodeSet s;
   2840                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
   2841                 s.add((UChar32)0x85);
   2842                 s.add((UChar32)0x2028, (UChar32)0x2029);
   2843                 if (URX_VAL(op) != 0) {
   2844                      // Complement option applies to URX_BACKSLASH_V only.
   2845                      s.complement();
   2846                 }
   2847                 fRXPat->fInitialChars->addAll(s);
   2848                 numInitialStrings += 2;
   2849             }
   2850             currentLen++;
   2851             atStart = FALSE;
   2852             break;
   2853 
   2854 
   2855 
   2856         case URX_ONECHAR_I:
   2857             // Case Insensitive Single Character.
   2858             if (currentLen == 0) {
   2859                 UChar32  c = URX_VAL(op);
   2860                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   2861                     UnicodeSet starters(c, c);
   2862                     starters.closeOver(USET_CASE_INSENSITIVE);
   2863                     // findCaseInsensitiveStarters(c, &starters);
   2864                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
   2865                     //   The expanded folding can't match the pattern.
   2866                     fRXPat->fInitialChars->addAll(starters);
   2867                 } else {
   2868                     // Char has no case variants.  Just add it as-is to the
   2869                     //   set of possible starting chars.
   2870                     fRXPat->fInitialChars->add(c);
   2871                 }
   2872                 numInitialStrings += 2;
   2873             }
   2874             currentLen++;
   2875             atStart = FALSE;
   2876             break;
   2877 
   2878 
   2879         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   2880         case URX_DOTANY_ALL:    // . matches one or two.
   2881         case URX_DOTANY:
   2882         case URX_DOTANY_UNIX:
   2883             if (currentLen == 0) {
   2884                 // These constructs are all bad news when they appear at the start
   2885                 //   of a match.  Any character can begin the match.
   2886                 fRXPat->fInitialChars->clear();
   2887                 fRXPat->fInitialChars->complement();
   2888                 numInitialStrings += 2;
   2889             }
   2890             currentLen++;
   2891             atStart = FALSE;
   2892             break;
   2893 
   2894 
   2895         case URX_JMPX:
   2896             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
   2897         case URX_JMP:
   2898             {
   2899                 int32_t  jmpDest = URX_VAL(op);
   2900                 if (jmpDest < loc) {
   2901                     // Loop of some kind.  Can safely ignore, the worst that will happen
   2902                     //  is that we understate the true minimum length
   2903                     currentLen = forwardedLength.elementAti(loc+1);
   2904 
   2905                 } else {
   2906                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   2907                     U_ASSERT(jmpDest <= end+1);
   2908                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
   2909                         forwardedLength.setElementAt(currentLen, jmpDest);
   2910                     }
   2911                 }
   2912             }
   2913             atStart = FALSE;
   2914             break;
   2915 
   2916         case URX_JMP_SAV:
   2917         case URX_JMP_SAV_X:
   2918             // Combo of state save to the next loc, + jmp backwards.
   2919             //   Net effect on min. length computation is nothing.
   2920             atStart = FALSE;
   2921             break;
   2922 
   2923         case URX_BACKTRACK:
   2924             // Fails are kind of like a branch, except that the min length was
   2925             //   propagated already, by the state save.
   2926             currentLen = forwardedLength.elementAti(loc+1);
   2927             atStart = FALSE;
   2928             break;
   2929 
   2930 
   2931         case URX_STATE_SAVE:
   2932             {
   2933                 // State Save, for forward jumps, propagate the current minimum.
   2934                 //             of the state save.
   2935                 int32_t  jmpDest = URX_VAL(op);
   2936                 if (jmpDest > loc) {
   2937                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
   2938                         forwardedLength.setElementAt(currentLen, jmpDest);
   2939                     }
   2940                 }
   2941             }
   2942             atStart = FALSE;
   2943             break;
   2944 
   2945 
   2946 
   2947 
   2948         case URX_STRING:
   2949             {
   2950                 loc++;
   2951                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   2952                 int32_t stringLen   = URX_VAL(stringLenOp);
   2953                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   2954                 U_ASSERT(stringLenOp >= 2);
   2955                 if (currentLen == 0) {
   2956                     // Add the starting character of this string to the set of possible starting
   2957                     //   characters for this pattern.
   2958                     int32_t stringStartIdx = URX_VAL(op);
   2959                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   2960                     fRXPat->fInitialChars->add(c);
   2961 
   2962                     // Remember this string.  After the entire pattern has been checked,
   2963                     //  if nothing else is identified that can start a match, we'll use it.
   2964                     numInitialStrings++;
   2965                     fRXPat->fInitialStringIdx = stringStartIdx;
   2966                     fRXPat->fInitialStringLen = stringLen;
   2967                 }
   2968 
   2969                 currentLen += stringLen;
   2970                 atStart = FALSE;
   2971             }
   2972             break;
   2973 
   2974         case URX_STRING_I:
   2975             {
   2976                 // Case-insensitive string.  Unlike exact-match strings, we won't
   2977                 //   attempt a string search for possible match positions.  But we
   2978                 //   do update the set of possible starting characters.
   2979                 loc++;
   2980                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   2981                 int32_t stringLen   = URX_VAL(stringLenOp);
   2982                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   2983                 U_ASSERT(stringLenOp >= 2);
   2984                 if (currentLen == 0) {
   2985                     // Add the starting character of this string to the set of possible starting
   2986                     //   characters for this pattern.
   2987                     int32_t stringStartIdx = URX_VAL(op);
   2988                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   2989                     UnicodeSet s;
   2990                     findCaseInsensitiveStarters(c, &s);
   2991                     fRXPat->fInitialChars->addAll(s);
   2992                     numInitialStrings += 2;  // Matching on an initial string not possible.
   2993                 }
   2994                 currentLen += stringLen;
   2995                 atStart = FALSE;
   2996             }
   2997             break;
   2998 
   2999         case URX_CTR_INIT:
   3000         case URX_CTR_INIT_NG:
   3001             {
   3002                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
   3003                 //   so location must be updated accordingly.
   3004                 // Loop Init Ops.
   3005                 //   If the min loop count == 0
   3006                 //      move loc forwards to the end of the loop, skipping over the body.
   3007                 //   If the min count is > 0,
   3008                 //      continue normal processing of the body of the loop.
   3009                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
   3010                         loopEndLoc   = URX_VAL(loopEndLoc);
   3011                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
   3012                 if (minLoopCount == 0) {
   3013                     // Min Loop Count of 0, treat like a forward branch and
   3014                     //   move the current minimum length up to the target
   3015                     //   (end of loop) location.
   3016                     U_ASSERT(loopEndLoc <= end+1);
   3017                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
   3018                         forwardedLength.setElementAt(currentLen, loopEndLoc);
   3019                     }
   3020                 }
   3021                 loc+=3;  // Skips over operands of CTR_INIT
   3022             }
   3023             atStart = FALSE;
   3024             break;
   3025 
   3026 
   3027         case URX_CTR_LOOP:
   3028         case URX_CTR_LOOP_NG:
   3029             // Loop ops.
   3030             //  The jump is conditional, backwards only.
   3031             atStart = FALSE;
   3032             break;
   3033 
   3034         case URX_LOOP_C:
   3035             // More loop ops.  These state-save to themselves.
   3036             //   don't change the minimum match
   3037             atStart = FALSE;
   3038             break;
   3039 
   3040 
   3041         case URX_LA_START:
   3042         case URX_LB_START:
   3043             {
   3044                 // Look-around.  Scan forward until the matching look-ahead end,
   3045                 //   without processing the look-around block.  This is overly pessimistic.
   3046 
   3047                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
   3048                 //   lookahead contains two LA_END instructions, so count goes up by two
   3049                 //   for each LA_START.
   3050                 int32_t  depth = (opType == URX_LA_START? 2: 1);
   3051                 for (;;) {
   3052                     loc++;
   3053                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3054                     if (URX_TYPE(op) == URX_LA_START) {
   3055                         depth+=2;
   3056                     }
   3057                     if (URX_TYPE(op) == URX_LB_START) {
   3058                         depth++;
   3059                     }
   3060                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
   3061                         depth--;
   3062                         if (depth == 0) {
   3063                             break;
   3064                         }
   3065                     }
   3066                     if (URX_TYPE(op) == URX_STATE_SAVE) {
   3067                         // Need this because neg lookahead blocks will FAIL to outside
   3068                         //   of the block.
   3069                         int32_t  jmpDest = URX_VAL(op);
   3070                         if (jmpDest > loc) {
   3071                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3072                                 forwardedLength.setElementAt(currentLen, jmpDest);
   3073                             }
   3074                         }
   3075                     }
   3076                     U_ASSERT(loc <= end);
   3077                 }
   3078             }
   3079             break;
   3080 
   3081         case URX_LA_END:
   3082         case URX_LB_CONT:
   3083         case URX_LB_END:
   3084         case URX_LBN_CONT:
   3085         case URX_LBN_END:
   3086             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
   3087                                  //  consumed by the scan in URX_LA_START and LB_START
   3088 
   3089             break;
   3090 
   3091         default:
   3092             U_ASSERT(FALSE);
   3093             }
   3094 
   3095         }
   3096 
   3097 
   3098     // We have finished walking through the ops.  Check whether some forward jump
   3099     //   propagated a shorter length to location end+1.
   3100     if (forwardedLength.elementAti(end+1) < currentLen) {
   3101         currentLen = forwardedLength.elementAti(end+1);
   3102     }
   3103 
   3104 
   3105     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
   3106 
   3107 
   3108     // Sort out what we should check for when looking for candidate match start positions.
   3109     // In order of preference,
   3110     //     1.   Start of input text buffer.
   3111     //     2.   A literal string.
   3112     //     3.   Start of line in multi-line mode.
   3113     //     4.   A single literal character.
   3114     //     5.   A character from a set of characters.
   3115     //
   3116     if (fRXPat->fStartType == START_START) {
   3117         // Match only at the start of an input text string.
   3118         //    start type is already set.  We're done.
   3119     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
   3120         // Match beginning only with a literal string.
   3121         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
   3122         U_ASSERT(fRXPat->fInitialChars->contains(c));
   3123         fRXPat->fStartType   = START_STRING;
   3124         fRXPat->fInitialChar = c;
   3125     } else if (fRXPat->fStartType == START_LINE) {
   3126         // Match at start of line in Multi-Line mode.
   3127         // Nothing to do here; everything is already set.
   3128     } else if (fRXPat->fMinMatchLen == 0) {
   3129         // Zero length match possible.  We could start anywhere.
   3130         fRXPat->fStartType = START_NO_INFO;
   3131     } else if (fRXPat->fInitialChars->size() == 1) {
   3132         // All matches begin with the same char.
   3133         fRXPat->fStartType   = START_CHAR;
   3134         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
   3135         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
   3136     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
   3137         fRXPat->fMinMatchLen > 0) {
   3138         // Matches start with a set of character smaller than the set of all chars.
   3139         fRXPat->fStartType = START_SET;
   3140     } else {
   3141         // Matches can start with anything
   3142         fRXPat->fStartType = START_NO_INFO;
   3143     }
   3144 
   3145     return;
   3146 }
   3147 
   3148 
   3149 
   3150 //------------------------------------------------------------------------------
   3151 //
   3152 //   minMatchLength    Calculate the length of the shortest string that could
   3153 //                     match the specified pattern.
   3154 //                     Length is in 16 bit code units, not code points.
   3155 //
   3156 //                     The calculated length may not be exact.  The returned
   3157 //                     value may be shorter than the actual minimum; it must
   3158 //                     never be longer.
   3159 //
   3160 //                     start and end are the range of p-code operations to be
   3161 //                     examined.  The endpoints are included in the range.
   3162 //
   3163 //------------------------------------------------------------------------------
   3164 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
   3165     if (U_FAILURE(*fStatus)) {
   3166         return 0;
   3167     }
   3168 
   3169     U_ASSERT(start <= end);
   3170     U_ASSERT(end < fRXPat->fCompiledPat->size());
   3171 
   3172 
   3173     int32_t    loc;
   3174     int32_t    op;
   3175     int32_t    opType;
   3176     int32_t    currentLen = 0;
   3177 
   3178 
   3179     // forwardedLength is a vector holding minimum-match-length values that
   3180     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   3181     //   It must be one longer than the pattern being checked because some  ops
   3182     //   will jmp to a end-of-block+1 location from within a block, and we must
   3183     //   count those when checking the block.
   3184     UVector32  forwardedLength(end+2, *fStatus);
   3185     forwardedLength.setSize(end+2);
   3186     for (loc=start; loc<=end+1; loc++) {
   3187         forwardedLength.setElementAt(INT32_MAX, loc);
   3188     }
   3189 
   3190     for (loc = start; loc<=end; loc++) {
   3191         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3192         opType = URX_TYPE(op);
   3193 
   3194         // The loop is advancing linearly through the pattern.
   3195         // If the op we are now at was the destination of a branch in the pattern,
   3196         // and that path has a shorter minimum length than the current accumulated value,
   3197         // replace the current accumulated value.
   3198         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
   3199                                                                //   no-match-possible cases.
   3200         if (forwardedLength.elementAti(loc) < currentLen) {
   3201             currentLen = forwardedLength.elementAti(loc);
   3202             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   3203         }
   3204 
   3205         switch (opType) {
   3206             // Ops that don't change the total length matched
   3207         case URX_RESERVED_OP:
   3208         case URX_END:
   3209         case URX_STRING_LEN:
   3210         case URX_NOP:
   3211         case URX_START_CAPTURE:
   3212         case URX_END_CAPTURE:
   3213         case URX_BACKSLASH_B:
   3214         case URX_BACKSLASH_BU:
   3215         case URX_BACKSLASH_G:
   3216         case URX_BACKSLASH_Z:
   3217         case URX_CARET:
   3218         case URX_DOLLAR:
   3219         case URX_DOLLAR_M:
   3220         case URX_DOLLAR_D:
   3221         case URX_DOLLAR_MD:
   3222         case URX_RELOC_OPRND:
   3223         case URX_STO_INP_LOC:
   3224         case URX_CARET_M:
   3225         case URX_CARET_M_UNIX:
   3226         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   3227         case URX_BACKREF_I:
   3228 
   3229         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   3230         case URX_LD_SP:
   3231 
   3232         case URX_JMP_SAV:
   3233         case URX_JMP_SAV_X:
   3234             break;
   3235 
   3236 
   3237             // Ops that match a minimum of one character (one or two 16 bit code units.)
   3238             //
   3239         case URX_ONECHAR:
   3240         case URX_STATIC_SETREF:
   3241         case URX_STAT_SETREF_N:
   3242         case URX_SETREF:
   3243         case URX_BACKSLASH_D:
   3244         case URX_BACKSLASH_H:
   3245         case URX_BACKSLASH_R:
   3246         case URX_BACKSLASH_V:
   3247         case URX_ONECHAR_I:
   3248         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   3249         case URX_DOTANY_ALL:    // . matches one or two.
   3250         case URX_DOTANY:
   3251         case URX_DOTANY_UNIX:
   3252             currentLen++;
   3253             break;
   3254 
   3255 
   3256         case URX_JMPX:
   3257             loc++;              // URX_JMPX has an extra operand, ignored here,
   3258                                 //   otherwise processed identically to URX_JMP.
   3259         case URX_JMP:
   3260             {
   3261                 int32_t  jmpDest = URX_VAL(op);
   3262                 if (jmpDest < loc) {
   3263                     // Loop of some kind.  Can safely ignore, the worst that will happen
   3264                     //  is that we understate the true minimum length
   3265                     currentLen = forwardedLength.elementAti(loc+1);
   3266                 } else {
   3267                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   3268                     U_ASSERT(jmpDest <= end+1);
   3269                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
   3270                         forwardedLength.setElementAt(currentLen, jmpDest);
   3271                     }
   3272                 }
   3273             }
   3274             break;
   3275 
   3276         case URX_BACKTRACK:
   3277             {
   3278                 // Back-tracks are kind of like a branch, except that the min length was
   3279                 //   propagated already, by the state save.
   3280                 currentLen = forwardedLength.elementAti(loc+1);
   3281             }
   3282             break;
   3283 
   3284 
   3285         case URX_STATE_SAVE:
   3286             {
   3287                 // State Save, for forward jumps, propagate the current minimum.
   3288                 //             of the state save.
   3289                 int32_t  jmpDest = URX_VAL(op);
   3290                 if (jmpDest > loc) {
   3291                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3292                         forwardedLength.setElementAt(currentLen, jmpDest);
   3293                     }
   3294                 }
   3295             }
   3296             break;
   3297 
   3298 
   3299         case URX_STRING:
   3300             {
   3301                 loc++;
   3302                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3303                 currentLen += URX_VAL(stringLenOp);
   3304             }
   3305             break;
   3306 
   3307 
   3308         case URX_STRING_I:
   3309             {
   3310                 loc++;
   3311                 // TODO: with full case folding, matching input text may be shorter than
   3312                 //       the string we have here.  More smarts could put some bounds on it.
   3313                 //       Assume a min length of one for now.  A min length of zero causes
   3314                 //        optimization failures for a pattern like "string"+
   3315                 // currentLen += URX_VAL(stringLenOp);
   3316                 currentLen += 1;
   3317             }
   3318             break;
   3319 
   3320         case URX_CTR_INIT:
   3321         case URX_CTR_INIT_NG:
   3322             {
   3323                 // Loop Init Ops.
   3324                 //   If the min loop count == 0
   3325                 //      move loc forwards to the end of the loop, skipping over the body.
   3326                 //   If the min count is > 0,
   3327                 //      continue normal processing of the body of the loop.
   3328                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
   3329                         loopEndLoc   = URX_VAL(loopEndLoc);
   3330                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
   3331                 if (minLoopCount == 0) {
   3332                     loc = loopEndLoc;
   3333                 } else {
   3334                     loc+=3;  // Skips over operands of CTR_INIT
   3335                 }
   3336             }
   3337             break;
   3338 
   3339 
   3340         case URX_CTR_LOOP:
   3341         case URX_CTR_LOOP_NG:
   3342             // Loop ops.
   3343             //  The jump is conditional, backwards only.
   3344             break;
   3345 
   3346         case URX_LOOP_SR_I:
   3347         case URX_LOOP_DOT_I:
   3348         case URX_LOOP_C:
   3349             // More loop ops.  These state-save to themselves.
   3350             //   don't change the minimum match - could match nothing at all.
   3351             break;
   3352 
   3353 
   3354         case URX_LA_START:
   3355         case URX_LB_START:
   3356             {
   3357                 // Look-around.  Scan forward until the matching look-ahead end,
   3358                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
   3359                 //   it assumes that the look-ahead match might be zero-length.
   3360                 //   TODO:  Positive lookahead could recursively do the block, then continue
   3361                 //          with the longer of the block or the value coming in.  Ticket 6060
   3362                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
   3363                 for (;;) {
   3364                     loc++;
   3365                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3366                     if (URX_TYPE(op) == URX_LA_START) {
   3367                         // The boilerplate for look-ahead includes two LA_END insturctions,
   3368                         //    Depth will be decremented by each one when it is seen.
   3369                         depth += 2;
   3370                     }
   3371                     if (URX_TYPE(op) == URX_LB_START) {
   3372                         depth++;
   3373                     }
   3374                     if (URX_TYPE(op) == URX_LA_END) {
   3375                         depth--;
   3376                         if (depth == 0) {
   3377                             break;
   3378                         }
   3379                     }
   3380                     if (URX_TYPE(op)==URX_LBN_END) {
   3381                         depth--;
   3382                         if (depth == 0) {
   3383                             break;
   3384                         }
   3385                     }
   3386                     if (URX_TYPE(op) == URX_STATE_SAVE) {
   3387                         // Need this because neg lookahead blocks will FAIL to outside
   3388                         //   of the block.
   3389                         int32_t  jmpDest = URX_VAL(op);
   3390                         if (jmpDest > loc) {
   3391                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3392                                 forwardedLength.setElementAt(currentLen, jmpDest);
   3393                             }
   3394                         }
   3395                     }
   3396                     U_ASSERT(loc <= end);
   3397                 }
   3398             }
   3399             break;
   3400 
   3401         case URX_LA_END:
   3402         case URX_LB_CONT:
   3403         case URX_LB_END:
   3404         case URX_LBN_CONT:
   3405         case URX_LBN_END:
   3406             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
   3407             //   range being sized, which happens when measuring size of look-behind blocks.
   3408             break;
   3409 
   3410         default:
   3411             U_ASSERT(FALSE);
   3412             }
   3413 
   3414         }
   3415 
   3416     // We have finished walking through the ops.  Check whether some forward jump
   3417     //   propagated a shorter length to location end+1.
   3418     if (forwardedLength.elementAti(end+1) < currentLen) {
   3419         currentLen = forwardedLength.elementAti(end+1);
   3420         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   3421     }
   3422 
   3423     return currentLen;
   3424 }
   3425 
   3426 // Increment with overflow check.
   3427 // val and delta will both be positive.
   3428 
   3429 static int32_t safeIncrement(int32_t val, int32_t delta) {
   3430     if (INT32_MAX - val > delta) {
   3431         return val + delta;
   3432     } else {
   3433         return INT32_MAX;
   3434     }
   3435 }
   3436 
   3437 
   3438 //------------------------------------------------------------------------------
   3439 //
   3440 //   maxMatchLength    Calculate the length of the longest string that could
   3441 //                     match the specified pattern.
   3442 //                     Length is in 16 bit code units, not code points.
   3443 //
   3444 //                     The calculated length may not be exact.  The returned
   3445 //                     value may be longer than the actual maximum; it must
   3446 //                     never be shorter.
   3447 //
   3448 //------------------------------------------------------------------------------
   3449 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
   3450     if (U_FAILURE(*fStatus)) {
   3451         return 0;
   3452     }
   3453     U_ASSERT(start <= end);
   3454     U_ASSERT(end < fRXPat->fCompiledPat->size());
   3455 
   3456 
   3457     int32_t    loc;
   3458     int32_t    op;
   3459     int32_t    opType;
   3460     int32_t    currentLen = 0;
   3461     UVector32  forwardedLength(end+1, *fStatus);
   3462     forwardedLength.setSize(end+1);
   3463 
   3464     for (loc=start; loc<=end; loc++) {
   3465         forwardedLength.setElementAt(0, loc);
   3466     }
   3467 
   3468     for (loc = start; loc<=end; loc++) {
   3469         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3470         opType = URX_TYPE(op);
   3471 
   3472         // The loop is advancing linearly through the pattern.
   3473         // If the op we are now at was the destination of a branch in the pattern,
   3474         // and that path has a longer maximum length than the current accumulated value,
   3475         // replace the current accumulated value.
   3476         if (forwardedLength.elementAti(loc) > currentLen) {
   3477             currentLen = forwardedLength.elementAti(loc);
   3478         }
   3479 
   3480         switch (opType) {
   3481             // Ops that don't change the total length matched
   3482         case URX_RESERVED_OP:
   3483         case URX_END:
   3484         case URX_STRING_LEN:
   3485         case URX_NOP:
   3486         case URX_START_CAPTURE:
   3487         case URX_END_CAPTURE:
   3488         case URX_BACKSLASH_B:
   3489         case URX_BACKSLASH_BU:
   3490         case URX_BACKSLASH_G:
   3491         case URX_BACKSLASH_Z:
   3492         case URX_CARET:
   3493         case URX_DOLLAR:
   3494         case URX_DOLLAR_M:
   3495         case URX_DOLLAR_D:
   3496         case URX_DOLLAR_MD:
   3497         case URX_RELOC_OPRND:
   3498         case URX_STO_INP_LOC:
   3499         case URX_CARET_M:
   3500         case URX_CARET_M_UNIX:
   3501 
   3502         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   3503         case URX_LD_SP:
   3504 
   3505         case URX_LB_END:
   3506         case URX_LB_CONT:
   3507         case URX_LBN_CONT:
   3508         case URX_LBN_END:
   3509             break;
   3510 
   3511 
   3512             // Ops that increase that cause an unbounded increase in the length
   3513             //   of a matched string, or that increase it a hard to characterize way.
   3514             //   Call the max length unbounded, and stop further checking.
   3515         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   3516         case URX_BACKREF_I:
   3517         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   3518             currentLen = INT32_MAX;
   3519             break;
   3520 
   3521 
   3522             // Ops that match a max of one character (possibly two 16 bit code units.)
   3523             //
   3524         case URX_STATIC_SETREF:
   3525         case URX_STAT_SETREF_N:
   3526         case URX_SETREF:
   3527         case URX_BACKSLASH_D:
   3528         case URX_BACKSLASH_H:
   3529         case URX_BACKSLASH_R:
   3530         case URX_BACKSLASH_V:
   3531         case URX_ONECHAR_I:
   3532         case URX_DOTANY_ALL:
   3533         case URX_DOTANY:
   3534         case URX_DOTANY_UNIX:
   3535             currentLen = safeIncrement(currentLen, 2);
   3536             break;
   3537 
   3538             // Single literal character.  Increase current max length by one or two,
   3539             //       depending on whether the char is in the supplementary range.
   3540         case URX_ONECHAR:
   3541             currentLen = safeIncrement(currentLen, 1);
   3542             if (URX_VAL(op) > 0x10000) {
   3543                 currentLen = safeIncrement(currentLen, 1);
   3544             }
   3545             break;
   3546 
   3547             // Jumps.
   3548             //
   3549         case URX_JMP:
   3550         case URX_JMPX:
   3551         case URX_JMP_SAV:
   3552         case URX_JMP_SAV_X:
   3553             {
   3554                 int32_t  jmpDest = URX_VAL(op);
   3555                 if (jmpDest < loc) {
   3556                     // Loop of some kind.  Max match length is unbounded.
   3557                     currentLen = INT32_MAX;
   3558                 } else {
   3559                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   3560                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
   3561                         forwardedLength.setElementAt(currentLen, jmpDest);
   3562                     }
   3563                     currentLen = 0;
   3564                 }
   3565             }
   3566             break;
   3567 
   3568         case URX_BACKTRACK:
   3569             // back-tracks are kind of like a branch, except that the max length was
   3570             //   propagated already, by the state save.
   3571             currentLen = forwardedLength.elementAti(loc+1);
   3572             break;
   3573 
   3574 
   3575         case URX_STATE_SAVE:
   3576             {
   3577                 // State Save, for forward jumps, propagate the current minimum.
   3578                 //               of the state save.
   3579                 //             For backwards jumps, they create a loop, maximum
   3580                 //               match length is unbounded.
   3581                 int32_t  jmpDest = URX_VAL(op);
   3582                 if (jmpDest > loc) {
   3583                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
   3584                         forwardedLength.setElementAt(currentLen, jmpDest);
   3585                     }
   3586                 } else {
   3587                     currentLen = INT32_MAX;
   3588                 }
   3589             }
   3590             break;
   3591 
   3592 
   3593 
   3594 
   3595         case URX_STRING:
   3596             {
   3597                 loc++;
   3598                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3599                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3600                 break;
   3601             }
   3602 
   3603         case URX_STRING_I:
   3604             // TODO:  This code assumes that any user string that matches will be no longer
   3605             //        than our compiled string, with case insensitive matching.
   3606             //        Our compiled string has been case-folded already.
   3607             //
   3608             //        Any matching user string will have no more code points than our
   3609             //        compiled (folded) string.  Folding may add code points, but
   3610             //        not remove them.
   3611             //
   3612             //        There is a potential problem if a supplemental code point
   3613             //        case-folds to a BMP code point.  In this case our compiled string
   3614             //        could be shorter (in code units) than a matching user string.
   3615             //
   3616             //        At this time (Unicode 6.1) there are no such characters, and this case
   3617             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
   3618             //        any problematic characters are added to Unicode.
   3619             //
   3620             //        If this happens, we can make a set of the BMP chars that the
   3621             //        troublesome supplementals fold to, scan our string, and bump the
   3622             //        currentLen one extra for each that is found.
   3623             //
   3624             {
   3625                 loc++;
   3626                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3627                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3628             }
   3629             break;
   3630 
   3631         case URX_CTR_INIT:
   3632         case URX_CTR_INIT_NG:
   3633             // For Loops, recursively call this function on the pattern for the loop body,
   3634             //   then multiply the result by the maximum loop count.
   3635             {
   3636                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
   3637                 if (loopEndLoc == loc+4) {
   3638                     // Loop has an empty body. No affect on max match length.
   3639                     // Continue processing with code after the loop end.
   3640                     loc = loopEndLoc;
   3641                     break;
   3642                 }
   3643 
   3644                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
   3645                 if (maxLoopCount == -1) {
   3646                     // Unbounded Loop. No upper bound on match length.
   3647                     currentLen = INT32_MAX;
   3648                     break;
   3649                 }
   3650 
   3651                 U_ASSERT(loopEndLoc >= loc+4);
   3652                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
   3653                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
   3654                 if (updatedLen >= INT32_MAX) {
   3655                     currentLen = INT32_MAX;
   3656                     break;
   3657                 }
   3658                 currentLen = (int32_t)updatedLen;
   3659                 loc = loopEndLoc;
   3660                 break;
   3661             }
   3662 
   3663         case URX_CTR_LOOP:
   3664         case URX_CTR_LOOP_NG:
   3665             // These opcodes will be skipped over by code for URX_CRT_INIT.
   3666             // We shouldn't encounter them here.
   3667             U_ASSERT(FALSE);
   3668             break;
   3669 
   3670         case URX_LOOP_SR_I:
   3671         case URX_LOOP_DOT_I:
   3672         case URX_LOOP_C:
   3673             // For anything to do with loops, make the match length unbounded.
   3674             currentLen = INT32_MAX;
   3675             break;
   3676 
   3677 
   3678 
   3679         case URX_LA_START:
   3680         case URX_LA_END:
   3681             // Look-ahead.  Just ignore, treat the look-ahead block as if
   3682             // it were normal pattern.  Gives a too-long match length,
   3683             //  but good enough for now.
   3684             break;
   3685 
   3686             // End of look-ahead ops should always be consumed by the processing at
   3687             //  the URX_LA_START op.
   3688             // U_ASSERT(FALSE);
   3689             // break;
   3690 
   3691         case URX_LB_START:
   3692             {
   3693                 // Look-behind.  Scan forward until the matching look-around end,
   3694                 //   without processing the look-behind block.
   3695                 int32_t  depth = 0;
   3696                 for (;;) {
   3697                     loc++;
   3698                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3699                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
   3700                         depth++;
   3701                     }
   3702                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
   3703                         if (depth == 0) {
   3704                             break;
   3705                         }
   3706                         depth--;
   3707                     }
   3708                     U_ASSERT(loc < end);
   3709                 }
   3710             }
   3711             break;
   3712 
   3713         default:
   3714             U_ASSERT(FALSE);
   3715         }
   3716 
   3717 
   3718         if (currentLen == INT32_MAX) {
   3719             //  The maximum length is unbounded.
   3720             //  Stop further processing of the pattern.
   3721             break;
   3722         }
   3723 
   3724     }
   3725     return currentLen;
   3726 
   3727 }
   3728 
   3729 
   3730 //------------------------------------------------------------------------------
   3731 //
   3732 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
   3733 //                Extra NOPs are inserted for some constructs during the initial
   3734 //                code generation to provide locations that may be patched later.
   3735 //                Many end up unneeded, and are removed by this function.
   3736 //
   3737 //                In order to minimize the number of passes through the pattern,
   3738 //                back-reference fixup is also performed here (adjusting
   3739 //                back-reference operands to point to the correct frame offsets).
   3740 //
   3741 //------------------------------------------------------------------------------
   3742 void RegexCompile::stripNOPs() {
   3743 
   3744     if (U_FAILURE(*fStatus)) {
   3745         return;
   3746     }
   3747 
   3748     int32_t    end = fRXPat->fCompiledPat->size();
   3749     UVector32  deltas(end, *fStatus);
   3750 
   3751     // Make a first pass over the code, computing the amount that things
   3752     //   will be offset at each location in the original code.
   3753     int32_t   loc;
   3754     int32_t   d = 0;
   3755     for (loc=0; loc<end; loc++) {
   3756         deltas.addElement(d, *fStatus);
   3757         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
   3758         if (URX_TYPE(op) == URX_NOP) {
   3759             d++;
   3760         }
   3761     }
   3762 
   3763     UnicodeString caseStringBuffer;
   3764 
   3765     // Make a second pass over the code, removing the NOPs by moving following
   3766     //  code up, and patching operands that refer to code locations that
   3767     //  are being moved.  The array of offsets from the first step is used
   3768     //  to compute the new operand values.
   3769     int32_t src;
   3770     int32_t dst = 0;
   3771     for (src=0; src<end; src++) {
   3772         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
   3773         int32_t opType = URX_TYPE(op);
   3774         switch (opType) {
   3775         case URX_NOP:
   3776             break;
   3777 
   3778         case URX_STATE_SAVE:
   3779         case URX_JMP:
   3780         case URX_CTR_LOOP:
   3781         case URX_CTR_LOOP_NG:
   3782         case URX_RELOC_OPRND:
   3783         case URX_JMPX:
   3784         case URX_JMP_SAV:
   3785         case URX_JMP_SAV_X:
   3786             // These are instructions with operands that refer to code locations.
   3787             {
   3788                 int32_t  operandAddress = URX_VAL(op);
   3789                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
   3790                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
   3791                 op = buildOp(opType, fixedOperandAddress);
   3792                 fRXPat->fCompiledPat->setElementAt(op, dst);
   3793                 dst++;
   3794                 break;
   3795             }
   3796 
   3797         case URX_BACKREF:
   3798         case URX_BACKREF_I:
   3799             {
   3800                 int32_t where = URX_VAL(op);
   3801                 if (where > fRXPat->fGroupMap->size()) {
   3802                     error(U_REGEX_INVALID_BACK_REF);
   3803                     break;
   3804                 }
   3805                 where = fRXPat->fGroupMap->elementAti(where-1);
   3806                 op    = buildOp(opType, where);
   3807                 fRXPat->fCompiledPat->setElementAt(op, dst);
   3808                 dst++;
   3809 
   3810                 fRXPat->fNeedsAltInput = TRUE;
   3811                 break;
   3812             }
   3813         case URX_RESERVED_OP:
   3814         case URX_RESERVED_OP_N:
   3815         case URX_BACKTRACK:
   3816         case URX_END:
   3817         case URX_ONECHAR:
   3818         case URX_STRING:
   3819         case URX_STRING_LEN:
   3820         case URX_START_CAPTURE:
   3821         case URX_END_CAPTURE:
   3822         case URX_STATIC_SETREF:
   3823         case URX_STAT_SETREF_N:
   3824         case URX_SETREF:
   3825         case URX_DOTANY:
   3826         case URX_FAIL:
   3827         case URX_BACKSLASH_B:
   3828         case URX_BACKSLASH_BU:
   3829         case URX_BACKSLASH_G:
   3830         case URX_BACKSLASH_X:
   3831         case URX_BACKSLASH_Z:
   3832         case URX_DOTANY_ALL:
   3833         case URX_BACKSLASH_D:
   3834         case URX_CARET:
   3835         case URX_DOLLAR:
   3836         case URX_CTR_INIT:
   3837         case URX_CTR_INIT_NG:
   3838         case URX_DOTANY_UNIX:
   3839         case URX_STO_SP:
   3840         case URX_LD_SP:
   3841         case URX_STO_INP_LOC:
   3842         case URX_LA_START:
   3843         case URX_LA_END:
   3844         case URX_ONECHAR_I:
   3845         case URX_STRING_I:
   3846         case URX_DOLLAR_M:
   3847         case URX_CARET_M:
   3848         case URX_CARET_M_UNIX:
   3849         case URX_LB_START:
   3850         case URX_LB_CONT:
   3851         case URX_LB_END:
   3852         case URX_LBN_CONT:
   3853         case URX_LBN_END:
   3854         case URX_LOOP_SR_I:
   3855         case URX_LOOP_DOT_I:
   3856         case URX_LOOP_C:
   3857         case URX_DOLLAR_D:
   3858         case URX_DOLLAR_MD:
   3859         case URX_BACKSLASH_H:
   3860         case URX_BACKSLASH_R:
   3861         case URX_BACKSLASH_V:
   3862             // These instructions are unaltered by the relocation.
   3863             fRXPat->fCompiledPat->setElementAt(op, dst);
   3864             dst++;
   3865             break;
   3866 
   3867         default:
   3868             // Some op is unaccounted for.
   3869             U_ASSERT(FALSE);
   3870             error(U_REGEX_INTERNAL_ERROR);
   3871         }
   3872     }
   3873 
   3874     fRXPat->fCompiledPat->setSize(dst);
   3875 }
   3876 
   3877 
   3878 
   3879 
   3880 //------------------------------------------------------------------------------
   3881 //
   3882 //  Error         Report a rule parse error.
   3883 //                Only report it if no previous error has been recorded.
   3884 //
   3885 //------------------------------------------------------------------------------
   3886 void RegexCompile::error(UErrorCode e) {
   3887     if (U_SUCCESS(*fStatus)) {
   3888         *fStatus = e;
   3889         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
   3890         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
   3891         // int64_t. If the values of the latter are out of range for the former,
   3892         // set them to the appropriate "field not supported" values.
   3893         if (fLineNum > 0x7FFFFFFF) {
   3894             fParseErr->line   = 0;
   3895             fParseErr->offset = -1;
   3896         } else if (fCharNum > 0x7FFFFFFF) {
   3897             fParseErr->line   = (int32_t)fLineNum;
   3898             fParseErr->offset = -1;
   3899         } else {
   3900             fParseErr->line   = (int32_t)fLineNum;
   3901             fParseErr->offset = (int32_t)fCharNum;
   3902         }
   3903 
   3904         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
   3905 
   3906         // Fill in the context.
   3907         //   Note: extractBetween() pins supplied indicies to the string bounds.
   3908         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
   3909         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
   3910         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
   3911         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
   3912     }
   3913 }
   3914 
   3915 
   3916 //
   3917 //  Assorted Unicode character constants.
   3918 //     Numeric because there is no portable way to enter them as literals.
   3919 //     (Think EBCDIC).
   3920 //
   3921 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
   3922 static const UChar      chLF        = 0x0a;      // Line Feed
   3923 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
   3924 static const UChar      chDigit0    = 0x30;      // '0'
   3925 static const UChar      chDigit7    = 0x37;      // '9'
   3926 static const UChar      chColon     = 0x3A;      // ':'
   3927 static const UChar      chE         = 0x45;      // 'E'
   3928 static const UChar      chQ         = 0x51;      // 'Q'
   3929 //static const UChar      chN         = 0x4E;      // 'N'
   3930 static const UChar      chP         = 0x50;      // 'P'
   3931 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
   3932 //static const UChar      chLBracket  = 0x5b;      // '['
   3933 static const UChar      chRBracket  = 0x5d;      // ']'
   3934 static const UChar      chUp        = 0x5e;      // '^'
   3935 static const UChar      chLowerP    = 0x70;
   3936 static const UChar      chLBrace    = 0x7b;      // '{'
   3937 static const UChar      chRBrace    = 0x7d;      // '}'
   3938 static const UChar      chNEL       = 0x85;      //    NEL newline variant
   3939 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
   3940 
   3941 
   3942 //------------------------------------------------------------------------------
   3943 //
   3944 //  nextCharLL    Low Level Next Char from the regex pattern.
   3945 //                Get a char from the string, keep track of input position
   3946 //                     for error reporting.
   3947 //
   3948 //------------------------------------------------------------------------------
   3949 UChar32  RegexCompile::nextCharLL() {
   3950     UChar32       ch;
   3951 
   3952     if (fPeekChar != -1) {
   3953         ch = fPeekChar;
   3954         fPeekChar = -1;
   3955         return ch;
   3956     }
   3957 
   3958     // assume we're already in the right place
   3959     ch = UTEXT_NEXT32(fRXPat->fPattern);
   3960     if (ch == U_SENTINEL) {
   3961         return ch;
   3962     }
   3963 
   3964     if (ch == chCR ||
   3965         ch == chNEL ||
   3966         ch == chLS   ||
   3967         (ch == chLF && fLastChar != chCR)) {
   3968         // Character is starting a new line.  Bump up the line number, and
   3969         //  reset the column to 0.
   3970         fLineNum++;
   3971         fCharNum=0;
   3972     }
   3973     else {
   3974         // Character is not starting a new line.  Except in the case of a
   3975         //   LF following a CR, increment the column position.
   3976         if (ch != chLF) {
   3977             fCharNum++;
   3978         }
   3979     }
   3980     fLastChar = ch;
   3981     return ch;
   3982 }
   3983 
   3984 //------------------------------------------------------------------------------
   3985 //
   3986 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
   3987 //                 character without actually getting it.
   3988 //
   3989 //------------------------------------------------------------------------------
   3990 UChar32  RegexCompile::peekCharLL() {
   3991     if (fPeekChar == -1) {
   3992         fPeekChar = nextCharLL();
   3993     }
   3994     return fPeekChar;
   3995 }
   3996 
   3997 
   3998 //------------------------------------------------------------------------------
   3999 //
   4000 //   nextChar     for pattern scanning.  At this level, we handle stripping
   4001 //                out comments and processing some backslash character escapes.
   4002 //                The rest of the pattern grammar is handled at the next level up.
   4003 //
   4004 //------------------------------------------------------------------------------
   4005 void RegexCompile::nextChar(RegexPatternChar &c) {
   4006 
   4007     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4008     c.fChar    = nextCharLL();
   4009     c.fQuoted  = FALSE;
   4010 
   4011     if (fQuoteMode) {
   4012         c.fQuoted = TRUE;
   4013         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
   4014             c.fChar == (UChar32)-1) {
   4015             fQuoteMode = FALSE;  //  Exit quote mode,
   4016             nextCharLL();        // discard the E
   4017             nextChar(c);         // recurse to get the real next char
   4018         }
   4019     }
   4020     else if (fInBackslashQuote) {
   4021         // The current character immediately follows a '\'
   4022         // Don't check for any further escapes, just return it as-is.
   4023         // Don't set c.fQuoted, because that would prevent the state machine from
   4024         //    dispatching on the character.
   4025         fInBackslashQuote = FALSE;
   4026     }
   4027     else
   4028     {
   4029         // We are not in a \Q quoted region \E of the source.
   4030         //
   4031         if (fModeFlags & UREGEX_COMMENTS) {
   4032             //
   4033             // We are in free-spacing and comments mode.
   4034             //  Scan through any white space and comments, until we
   4035             //  reach a significant character or the end of inut.
   4036             for (;;) {
   4037                 if (c.fChar == (UChar32)-1) {
   4038                     break;     // End of Input
   4039                 }
   4040                 if  (c.fChar == chPound && fEOLComments == TRUE) {
   4041                     // Start of a comment.  Consume the rest of it, until EOF or a new line
   4042                     for (;;) {
   4043                         c.fChar = nextCharLL();
   4044                         if (c.fChar == (UChar32)-1 ||  // EOF
   4045                             c.fChar == chCR        ||
   4046                             c.fChar == chLF        ||
   4047                             c.fChar == chNEL       ||
   4048                             c.fChar == chLS)       {
   4049                             break;
   4050                         }
   4051                     }
   4052                 }
   4053                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
   4054                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
   4055                     break;
   4056                 }
   4057                 c.fChar = nextCharLL();
   4058             }
   4059         }
   4060 
   4061         //
   4062         //  check for backslash escaped characters.
   4063         //
   4064         if (c.fChar == chBackSlash) {
   4065             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4066             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
   4067                 //
   4068                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
   4069                 //   Includes \uxxxx, \n, \r, many others.
   4070                 //   Return the single equivalent character.
   4071                 //
   4072                 nextCharLL();                 // get & discard the peeked char.
   4073                 c.fQuoted = TRUE;
   4074 
   4075                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
   4076                     int32_t endIndex = (int32_t)pos;
   4077                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
   4078 
   4079                     if (endIndex == pos) {
   4080                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4081                     }
   4082                     fCharNum += endIndex - pos;
   4083                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
   4084                 } else {
   4085                     int32_t offset = 0;
   4086                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
   4087 
   4088                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
   4089                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
   4090 
   4091                     if (offset == 0) {
   4092                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4093                     } else if (context.lastOffset == offset) {
   4094                         UTEXT_PREVIOUS32(fRXPat->fPattern);
   4095                     } else if (context.lastOffset != offset-1) {
   4096                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
   4097                     }
   4098                     fCharNum += offset;
   4099                 }
   4100             }
   4101             else if (peekCharLL() == chDigit0) {
   4102                 //  Octal Escape, using Java Regexp Conventions
   4103                 //    which are \0 followed by 1-3 octal digits.
   4104                 //    Different from ICU Unescape handling of Octal, which does not
   4105                 //    require the leading 0.
   4106                 //  Java also has the convention of only consuming 2 octal digits if
   4107                 //    the three digit number would be > 0xff
   4108                 //
   4109                 c.fChar = 0;
   4110                 nextCharLL();    // Consume the initial 0.
   4111                 int index;
   4112                 for (index=0; index<3; index++) {
   4113                     int32_t ch = peekCharLL();
   4114                     if (ch<chDigit0 || ch>chDigit7) {
   4115                         if (index==0) {
   4116                            // \0 is not followed by any octal digits.
   4117                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4118                         }
   4119                         break;
   4120                     }
   4121                     c.fChar <<= 3;
   4122                     c.fChar += ch&7;
   4123                     if (c.fChar <= 255) {
   4124                         nextCharLL();
   4125                     } else {
   4126                         // The last digit made the number too big.  Forget we saw it.
   4127                         c.fChar >>= 3;
   4128                     }
   4129                 }
   4130                 c.fQuoted = TRUE;
   4131             }
   4132             else if (peekCharLL() == chQ) {
   4133                 //  "\Q"  enter quote mode, which will continue until "\E"
   4134                 fQuoteMode = TRUE;
   4135                 nextCharLL();       // discard the 'Q'.
   4136                 nextChar(c);        // recurse to get the real next char.
   4137             }
   4138             else
   4139             {
   4140                 // We are in a '\' escape that will be handled by the state table scanner.
   4141                 // Just return the backslash, but remember that the following char is to
   4142                 //  be taken literally.
   4143                 fInBackslashQuote = TRUE;
   4144             }
   4145         }
   4146     }
   4147 
   4148     // re-enable # to end-of-line comments, in case they were disabled.
   4149     // They are disabled by the parser upon seeing '(?', but this lasts for
   4150     //  the fetching of the next character only.
   4151     fEOLComments = TRUE;
   4152 
   4153     // putc(c.fChar, stdout);
   4154 }
   4155 
   4156 
   4157 
   4158 //------------------------------------------------------------------------------
   4159 //
   4160 //  scanNamedChar
   4161 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
   4162 //
   4163 //             The scan position will be at the 'N'.  On return
   4164 //             the scan position should be just after the '}'
   4165 //
   4166 //             Return the UChar32
   4167 //
   4168 //------------------------------------------------------------------------------
   4169 UChar32  RegexCompile::scanNamedChar() {
   4170     if (U_FAILURE(*fStatus)) {
   4171         return 0;
   4172     }
   4173 
   4174     nextChar(fC);
   4175     if (fC.fChar != chLBrace) {
   4176         error(U_REGEX_PROPERTY_SYNTAX);
   4177         return 0;
   4178     }
   4179 
   4180     UnicodeString  charName;
   4181     for (;;) {
   4182         nextChar(fC);
   4183         if (fC.fChar == chRBrace) {
   4184             break;
   4185         }
   4186         if (fC.fChar == -1) {
   4187             error(U_REGEX_PROPERTY_SYNTAX);
   4188             return 0;
   4189         }
   4190         charName.append(fC.fChar);
   4191     }
   4192 
   4193     char name[100];
   4194     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
   4195          (uint32_t)charName.length()>=sizeof(name)) {
   4196         // All Unicode character names have only invariant characters.
   4197         // The API to get a character, given a name, accepts only char *, forcing us to convert,
   4198         //   which requires this error check
   4199         error(U_REGEX_PROPERTY_SYNTAX);
   4200         return 0;
   4201     }
   4202     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
   4203 
   4204     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
   4205     if (U_FAILURE(*fStatus)) {
   4206         error(U_REGEX_PROPERTY_SYNTAX);
   4207     }
   4208 
   4209     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
   4210     return theChar;
   4211 }
   4212 
   4213 //------------------------------------------------------------------------------
   4214 //
   4215 //  scanProp   Construct a UnicodeSet from the text at the current scan
   4216 //             position, which will be of the form \p{whaterver}
   4217 //
   4218 //             The scan position will be at the 'p' or 'P'.  On return
   4219 //             the scan position should be just after the '}'
   4220 //
   4221 //             Return a UnicodeSet, constructed from the \P pattern,
   4222 //             or NULL if the pattern is invalid.
   4223 //
   4224 //------------------------------------------------------------------------------
   4225 UnicodeSet *RegexCompile::scanProp() {
   4226     UnicodeSet    *uset = NULL;
   4227 
   4228     if (U_FAILURE(*fStatus)) {
   4229         return NULL;
   4230     }
   4231     (void)chLowerP;   // Suppress compiler unused variable warning.
   4232     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
   4233     UBool negated = (fC.fChar == chP);
   4234 
   4235     UnicodeString propertyName;
   4236     nextChar(fC);
   4237     if (fC.fChar != chLBrace) {
   4238         error(U_REGEX_PROPERTY_SYNTAX);
   4239         return NULL;
   4240     }
   4241     for (;;) {
   4242         nextChar(fC);
   4243         if (fC.fChar == chRBrace) {
   4244             break;
   4245         }
   4246         if (fC.fChar == -1) {
   4247             // Hit the end of the input string without finding the closing '}'
   4248             error(U_REGEX_PROPERTY_SYNTAX);
   4249             return NULL;
   4250         }
   4251         propertyName.append(fC.fChar);
   4252     }
   4253     uset = createSetForProperty(propertyName, negated);
   4254     nextChar(fC);    // Move input scan to position following the closing '}'
   4255     return uset;
   4256 }
   4257 
   4258 //------------------------------------------------------------------------------
   4259 //
   4260 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
   4261 //             position, which is expected be of the form [:property expression:]
   4262 //
   4263 //             The scan position will be at the opening ':'.  On return
   4264 //             the scan position must be on the closing ']'
   4265 //
   4266 //             Return a UnicodeSet constructed from the pattern,
   4267 //             or NULL if this is not a valid POSIX-style set expression.
   4268 //             If not a property expression, restore the initial scan position
   4269 //                (to the opening ':')
   4270 //
   4271 //               Note:  the opening '[:' is not sufficient to guarantee that
   4272 //                      this is a [:property:] expression.
   4273 //                      [:'+=,] is a perfectly good ordinary set expression that
   4274 //                              happens to include ':' as one of its characters.
   4275 //
   4276 //------------------------------------------------------------------------------
   4277 UnicodeSet *RegexCompile::scanPosixProp() {
   4278     UnicodeSet    *uset = NULL;
   4279 
   4280     if (U_FAILURE(*fStatus)) {
   4281         return NULL;
   4282     }
   4283 
   4284     U_ASSERT(fC.fChar == chColon);
   4285 
   4286     // Save the scanner state.
   4287     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
   4288     int64_t     savedScanIndex        = fScanIndex;
   4289     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4290     UBool       savedQuoteMode        = fQuoteMode;
   4291     UBool       savedInBackslashQuote = fInBackslashQuote;
   4292     UBool       savedEOLComments      = fEOLComments;
   4293     int64_t     savedLineNum          = fLineNum;
   4294     int64_t     savedCharNum          = fCharNum;
   4295     UChar32     savedLastChar         = fLastChar;
   4296     UChar32     savedPeekChar         = fPeekChar;
   4297     RegexPatternChar savedfC          = fC;
   4298 
   4299     // Scan for a closing ].   A little tricky because there are some perverse
   4300     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
   4301     //   ending on the second closing ].
   4302 
   4303     UnicodeString propName;
   4304     UBool         negated  = FALSE;
   4305 
   4306     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
   4307     nextChar(fC);
   4308     if (fC.fChar == chUp) {
   4309        negated = TRUE;
   4310        nextChar(fC);
   4311     }
   4312 
   4313     // Scan for the closing ":]", collecting the property name along the way.
   4314     UBool  sawPropSetTerminator = FALSE;
   4315     for (;;) {
   4316         propName.append(fC.fChar);
   4317         nextChar(fC);
   4318         if (fC.fQuoted || fC.fChar == -1) {
   4319             // Escaped characters or end of input - either says this isn't a [:Property:]
   4320             break;
   4321         }
   4322         if (fC.fChar == chColon) {
   4323             nextChar(fC);
   4324             if (fC.fChar == chRBracket) {
   4325                 sawPropSetTerminator = TRUE;
   4326             }
   4327             break;
   4328         }
   4329     }
   4330 
   4331     if (sawPropSetTerminator) {
   4332         uset = createSetForProperty(propName, negated);
   4333     }
   4334     else
   4335     {
   4336         // No closing ":]".
   4337         //  Restore the original scan position.
   4338         //  The main scanner will retry the input as a normal set expression,
   4339         //    not a [:Property:] expression.
   4340         fScanIndex        = savedScanIndex;
   4341         fQuoteMode        = savedQuoteMode;
   4342         fInBackslashQuote = savedInBackslashQuote;
   4343         fEOLComments      = savedEOLComments;
   4344         fLineNum          = savedLineNum;
   4345         fCharNum          = savedCharNum;
   4346         fLastChar         = savedLastChar;
   4347         fPeekChar         = savedPeekChar;
   4348         fC                = savedfC;
   4349         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
   4350     }
   4351     return uset;
   4352 }
   4353 
   4354 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
   4355     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
   4356     addCategory(set, U_GC_CF_MASK, ec);
   4357 }
   4358 
   4359 //
   4360 //  Create a Unicode Set from a Unicode Property expression.
   4361 //     This is common code underlying both \p{...} ane [:...:] expressions.
   4362 //     Includes trying the Java "properties" that aren't supported as
   4363 //     normal ICU UnicodeSet properties
   4364 //
   4365 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
   4366 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
   4367 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
   4368     UnicodeString   setExpr;
   4369     UnicodeSet      *set;
   4370     uint32_t        usetFlags = 0;
   4371 
   4372     if (U_FAILURE(*fStatus)) {
   4373         return NULL;
   4374     }
   4375 
   4376     //
   4377     //  First try the property as we received it
   4378     //
   4379     if (negated) {
   4380         setExpr.append(negSetPrefix, -1);
   4381     } else {
   4382         setExpr.append(posSetPrefix, -1);
   4383     }
   4384     setExpr.append(propName);
   4385     setExpr.append(chRBrace);
   4386     setExpr.append(chRBracket);
   4387     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   4388         usetFlags |= USET_CASE_INSENSITIVE;
   4389     }
   4390     set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
   4391     if (U_SUCCESS(*fStatus)) {
   4392        return set;
   4393     }
   4394     delete set;
   4395     set = NULL;
   4396 
   4397     //
   4398     //  The property as it was didn't work.
   4399 
   4400     //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX
   4401     //     or standard Java, but many other regular expression packages do recognize it.
   4402 
   4403     if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
   4404         *fStatus = U_ZERO_ERROR;
   4405         set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
   4406         if (set == NULL) {
   4407             *fStatus = U_MEMORY_ALLOCATION_ERROR;
   4408             return set;
   4409         }
   4410         if (negated) {
   4411             set->complement();
   4412         }
   4413         return set;
   4414     }
   4415 
   4416 
   4417     //    Do Java fixes -
   4418     //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
   4419     //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
   4420     //
   4421     //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
   4422     //                        is accepted by Java.  The property part of the name is compared
   4423     //                        case-insenstively.  The spaces must be exactly as shown, either
   4424     //                        all there, or all omitted, with exactly one at each position
   4425     //                        if they are present.  From checking against JDK 1.6
   4426     //
   4427     //       This code should be removed when ICU properties support the Java  compatibility names
   4428     //          (ICU 4.0?)
   4429     //
   4430     UnicodeString mPropName = propName;
   4431     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
   4432         mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
   4433     }
   4434     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
   4435         mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
   4436         mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
   4437     }
   4438     else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
   4439         mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
   4440     }
   4441 
   4442     //    See if the property looks like a Java "InBlockName", which
   4443     //    we will recast as "Block=BlockName"
   4444     //
   4445     static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
   4446     static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
   4447     if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
   4448         setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
   4449         setExpr.append(BLOCK, -1);
   4450         setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
   4451         setExpr.append(chRBrace);
   4452         setExpr.append(chRBracket);
   4453         *fStatus = U_ZERO_ERROR;
   4454         set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
   4455         if (U_SUCCESS(*fStatus)) {
   4456             return set;
   4457         }
   4458         delete set;
   4459         set = NULL;
   4460     }
   4461 
   4462     if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
   4463         propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
   4464     {
   4465         UErrorCode localStatus = U_ZERO_ERROR;
   4466         //setExpr.remove();
   4467         set = new UnicodeSet();
   4468         //
   4469         //  Try the various Java specific properties.
   4470         //   These all begin with "java"
   4471         //
   4472         if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
   4473             addCategory(set, U_GC_CN_MASK, localStatus);
   4474             set->complement();
   4475         }
   4476         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
   4477             addCategory(set, U_GC_ND_MASK, localStatus);
   4478         }
   4479         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
   4480             addIdentifierIgnorable(set, localStatus);
   4481         }
   4482         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
   4483             set->add(0, 0x1F).add(0x7F, 0x9F);
   4484         }
   4485         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
   4486             addCategory(set, U_GC_L_MASK, localStatus);
   4487             addCategory(set, U_GC_SC_MASK, localStatus);
   4488             addCategory(set, U_GC_PC_MASK, localStatus);
   4489             addCategory(set, U_GC_ND_MASK, localStatus);
   4490             addCategory(set, U_GC_NL_MASK, localStatus);
   4491             addCategory(set, U_GC_MC_MASK, localStatus);
   4492             addCategory(set, U_GC_MN_MASK, localStatus);
   4493             addIdentifierIgnorable(set, localStatus);
   4494         }
   4495         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
   4496             addCategory(set, U_GC_L_MASK, localStatus);
   4497             addCategory(set, U_GC_NL_MASK, localStatus);
   4498             addCategory(set, U_GC_SC_MASK, localStatus);
   4499             addCategory(set, U_GC_PC_MASK, localStatus);
   4500         }
   4501         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
   4502             addCategory(set, U_GC_L_MASK, localStatus);
   4503         }
   4504         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
   4505             addCategory(set, U_GC_L_MASK, localStatus);
   4506             addCategory(set, U_GC_ND_MASK, localStatus);
   4507         }
   4508         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
   4509             addCategory(set, U_GC_LL_MASK, localStatus);
   4510         }
   4511         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
   4512             set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
   4513         }
   4514         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
   4515             addCategory(set, U_GC_Z_MASK, localStatus);
   4516         }
   4517         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
   4518             set->add(0x10000, UnicodeSet::MAX_VALUE);
   4519         }
   4520         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
   4521             addCategory(set, U_GC_LT_MASK, localStatus);
   4522         }
   4523         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
   4524             addCategory(set, U_GC_L_MASK, localStatus);
   4525             addCategory(set, U_GC_NL_MASK, localStatus);
   4526         }
   4527         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
   4528             addCategory(set, U_GC_L_MASK, localStatus);
   4529             addCategory(set, U_GC_PC_MASK, localStatus);
   4530             addCategory(set, U_GC_ND_MASK, localStatus);
   4531             addCategory(set, U_GC_NL_MASK, localStatus);
   4532             addCategory(set, U_GC_MC_MASK, localStatus);
   4533             addCategory(set, U_GC_MN_MASK, localStatus);
   4534             addIdentifierIgnorable(set, localStatus);
   4535         }
   4536         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
   4537             addCategory(set, U_GC_LU_MASK, localStatus);
   4538         }
   4539         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
   4540             set->add(0, UnicodeSet::MAX_VALUE);
   4541         }
   4542         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
   4543             addCategory(set, U_GC_Z_MASK, localStatus);
   4544             set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
   4545             set->add(9, 0x0d).add(0x1c, 0x1f);
   4546         }
   4547         else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
   4548             set->add(0, UnicodeSet::MAX_VALUE);
   4549         }
   4550 
   4551         if (U_SUCCESS(localStatus) && !set->isEmpty()) {
   4552             *fStatus = U_ZERO_ERROR;
   4553             if (usetFlags & USET_CASE_INSENSITIVE) {
   4554                 set->closeOver(USET_CASE_INSENSITIVE);
   4555             }
   4556             if (negated) {
   4557                 set->complement();
   4558             }
   4559             return set;
   4560         }
   4561         delete set;
   4562         set = NULL;
   4563     }
   4564     error(*fStatus);
   4565     return NULL;
   4566 }
   4567 
   4568 
   4569 
   4570 //
   4571 //  SetEval   Part of the evaluation of [set expressions].
   4572 //            Perform any pending (stacked) operations with precedence
   4573 //            equal or greater to that of the next operator encountered
   4574 //            in the expression.
   4575 //
   4576 void RegexCompile::setEval(int32_t nextOp) {
   4577     UnicodeSet *rightOperand = NULL;
   4578     UnicodeSet *leftOperand  = NULL;
   4579     for (;;) {
   4580         U_ASSERT(fSetOpStack.empty()==FALSE);
   4581         int32_t pendingSetOperation = fSetOpStack.peeki();
   4582         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
   4583             break;
   4584         }
   4585         fSetOpStack.popi();
   4586         U_ASSERT(fSetStack.empty() == FALSE);
   4587         rightOperand = (UnicodeSet *)fSetStack.peek();
   4588         switch (pendingSetOperation) {
   4589             case setNegation:
   4590                 rightOperand->complement();
   4591                 break;
   4592             case setCaseClose:
   4593                 // TODO: need a simple close function.  Ticket 6065
   4594                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
   4595                 rightOperand->removeAllStrings();
   4596                 break;
   4597             case setDifference1:
   4598             case setDifference2:
   4599                 fSetStack.pop();
   4600                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4601                 leftOperand->removeAll(*rightOperand);
   4602                 delete rightOperand;
   4603                 break;
   4604             case setIntersection1:
   4605             case setIntersection2:
   4606                 fSetStack.pop();
   4607                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4608                 leftOperand->retainAll(*rightOperand);
   4609                 delete rightOperand;
   4610                 break;
   4611             case setUnion:
   4612                 fSetStack.pop();
   4613                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4614                 leftOperand->addAll(*rightOperand);
   4615                 delete rightOperand;
   4616                 break;
   4617             default:
   4618                 U_ASSERT(FALSE);
   4619                 break;
   4620             }
   4621         }
   4622     }
   4623 
   4624 void RegexCompile::setPushOp(int32_t op) {
   4625     setEval(op);
   4626     fSetOpStack.push(op, *fStatus);
   4627     fSetStack.push(new UnicodeSet(), *fStatus);
   4628 }
   4629 
   4630 U_NAMESPACE_END
   4631 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   4632 
   4633