Home | History | Annotate | Download | only in i18n
      1 
      2 //
      3 //  file:  regexcmp.cpp
      4 //
      5 //  Copyright (C) 2002-2008 International Business Machines Corporation and others.
      6 //  All Rights Reserved.
      7 //
      8 //  This file contains the ICU regular expression compiler, which is responsible
      9 //  for processing a regular expression pattern into the compiled form that
     10 //  is used by the match finding engine.
     11 //
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     16 
     17 #include "unicode/unistr.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/uchriter.h"
     21 #include "unicode/parsepos.h"
     22 #include "unicode/parseerr.h"
     23 #include "unicode/regex.h"
     24 #include "../common/util.h"
     25 #include "cmemory.h"
     26 #include "cstring.h"
     27 #include "uvectr32.h"
     28 #include "uassert.h"
     29 #include "ucln_in.h"
     30 #include "uinvchar.h"
     31 
     32 #include "regeximp.h"
     33 #include "regexcst.h"   // Contains state table for the regex pattern parser.
     34                         //   generated by a Perl script.
     35 #include "regexcmp.h"
     36 #include "regexst.h"
     37 
     38 
     39 
     40 U_NAMESPACE_BEGIN
     41 
     42 
     43 //------------------------------------------------------------------------------
     44 //
     45 //  Constructor.
     46 //
     47 //------------------------------------------------------------------------------
     48 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
     49    fParenStack(status), fSetStack(status), fSetOpStack(status)
     50 {
     51     fStatus           = &status;
     52 
     53     fRXPat            = rxp;
     54     fScanIndex        = 0;
     55     fNextIndex        = 0;
     56     fPeekChar         = -1;
     57     fLineNum          = 1;
     58     fCharNum          = 0;
     59     fQuoteMode        = FALSE;
     60     fInBackslashQuote = FALSE;
     61     fModeFlags        = fRXPat->fFlags | 0x80000000;
     62     fEOLComments      = TRUE;
     63 
     64     fMatchOpenParen   = -1;
     65     fMatchCloseParen  = -1;
     66     fStringOpStart    = -1;
     67 
     68     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
     69         status = rxp->fDeferredStatus;
     70     }
     71 }
     72 
     73 static const UChar      chAmp       = 0x26;      // '&'
     74 static const UChar      chDash      = 0x2d;      // '-'
     75 
     76 
     77 //------------------------------------------------------------------------------
     78 //
     79 //  Destructor
     80 //
     81 //------------------------------------------------------------------------------
     82 RegexCompile::~RegexCompile() {
     83 }
     84 
     85 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
     86     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
     87 }
     88 
     89 //------------------------------------------------------------------------------
     90 //
     91 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
     92 //                           The state tables are hand-written in the file regexcst.txt,
     93 //                           and converted to the form used here by a perl
     94 //                           script regexcst.pl
     95 //
     96 //------------------------------------------------------------------------------
     97 void    RegexCompile::compile(
     98                          const UnicodeString &pat,   // Source pat to be compiled.
     99                          UParseError &pp,            // Error position info
    100                          UErrorCode &e)              // Error Code
    101 {
    102     fStatus             = &e;
    103     fParseErr           = &pp;
    104     fStackPtr           = 0;
    105     fStack[fStackPtr]   = 0;
    106 
    107     if (U_FAILURE(*fStatus)) {
    108         return;
    109     }
    110 
    111     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
    112     U_ASSERT(fRXPat->fPattern.length() == 0);
    113 
    114     // Prepare the RegexPattern object to receive the compiled pattern.
    115     fRXPat->fPattern        = pat;
    116     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
    117     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
    118 
    119 
    120     // Initialize the pattern scanning state machine
    121     fPatternLength = pat.length();
    122     uint16_t                state = 1;
    123     const RegexTableEl      *tableEl;
    124     nextChar(fC);                        // Fetch the first char from the pattern string.
    125 
    126     //
    127     // Main loop for the regex pattern parsing state machine.
    128     //   Runs once per state transition.
    129     //   Each time through optionally performs, depending on the state table,
    130     //      - an advance to the the next pattern char
    131     //      - an action to be performed.
    132     //      - pushing or popping a state to/from the local state return stack.
    133     //   file regexcst.txt is the source for the state table.  The logic behind
    134     //     recongizing the pattern syntax is there, not here.
    135     //
    136     for (;;) {
    137         //  Bail out if anything has gone wrong.
    138         //  Regex pattern parsing stops on the first error encountered.
    139         if (U_FAILURE(*fStatus)) {
    140             break;
    141         }
    142 
    143         U_ASSERT(state != 0);
    144 
    145         // Find the state table element that matches the input char from the pattern, or the
    146         //    class of the input character.  Start with the first table row for this
    147         //    state, then linearly scan forward until we find a row that matches the
    148         //    character.  The last row for each state always matches all characters, so
    149         //    the search will stop there, if not before.
    150         //
    151         tableEl = &gRuleParseStateTable[state];
    152         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
    153             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
    154 
    155         for (;;) {    // loop through table rows belonging to this state, looking for one
    156                       //   that matches the current input char.
    157             REGEX_SCAN_DEBUG_PRINTF(("."));
    158             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
    159                 // Table row specified an individual character, not a set, and
    160                 //   the input character is not quoted, and
    161                 //   the input character matched it.
    162                 break;
    163             }
    164             if (tableEl->fCharClass == 255) {
    165                 // Table row specified default, match anything character class.
    166                 break;
    167             }
    168             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
    169                 // Table row specified "quoted" and the char was quoted.
    170                 break;
    171             }
    172             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
    173                 // Table row specified eof and we hit eof on the input.
    174                 break;
    175             }
    176 
    177             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
    178                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
    179                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
    180                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
    181                     // Table row specified a character class, or set of characters,
    182                     //   and the current char matches it.
    183                     break;
    184                 }
    185             }
    186 
    187             // No match on this row, advance to the next  row for this state,
    188             tableEl++;
    189         }
    190         REGEX_SCAN_DEBUG_PRINTF(("\n"));
    191 
    192         //
    193         // We've found the row of the state table that matches the current input
    194         //   character from the rules string.
    195         // Perform any action specified  by this row in the state table.
    196         if (doParseActions(tableEl->fAction) == FALSE) {
    197             // Break out of the state machine loop if the
    198             //   the action signalled some kind of error, or
    199             //   the action was to exit, occurs on normal end-of-rules-input.
    200             break;
    201         }
    202 
    203         if (tableEl->fPushState != 0) {
    204             fStackPtr++;
    205             if (fStackPtr >= kStackSize) {
    206                 error(U_REGEX_INTERNAL_ERROR);
    207                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
    208                 fStackPtr--;
    209             }
    210             fStack[fStackPtr] = tableEl->fPushState;
    211         }
    212 
    213         //
    214         //  NextChar.  This is where characters are actually fetched from the pattern.
    215         //             Happens under control of the 'n' tag in the state table.
    216         //
    217         if (tableEl->fNextChar) {
    218             nextChar(fC);
    219         }
    220 
    221         // Get the next state from the table entry, or from the
    222         //   state stack if the next state was specified as "pop".
    223         if (tableEl->fNextState != 255) {
    224             state = tableEl->fNextState;
    225         } else {
    226             state = fStack[fStackPtr];
    227             fStackPtr--;
    228             if (fStackPtr < 0) {
    229                 // state stack underflow
    230                 // This will occur if the user pattern has mis-matched parentheses,
    231                 //   with extra close parens.
    232                 //
    233                 fStackPtr++;
    234                 error(U_REGEX_MISMATCHED_PAREN);
    235             }
    236         }
    237 
    238     }
    239 
    240     if (U_FAILURE(*fStatus)) {
    241         // Bail out if the pattern had errors.
    242         //   Set stack cleanup:  a successful compile would have left it empty,
    243         //   but errors can leave temporary sets hanging around.
    244         while (!fSetStack.empty()) {
    245             delete (UnicodeSet *)fSetStack.pop();
    246         }
    247         return;
    248     }
    249 
    250     //
    251     // The pattern has now been read and processed, and the compiled code generated.
    252     //
    253 
    254     // Back-reference fixup
    255     //
    256     int32_t loc;
    257     for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
    258         int32_t op = fRXPat->fCompiledPat->elementAti(loc);
    259         int32_t opType = URX_TYPE(op);
    260         if (opType == URX_BACKREF || opType == URX_BACKREF_I) {
    261             int32_t where = URX_VAL(op);
    262             if (where > fRXPat->fGroupMap->size()) {
    263                 error(U_REGEX_INVALID_BACK_REF);
    264                 break;
    265             }
    266             where = fRXPat->fGroupMap->elementAti(where-1);
    267             op    = URX_BUILD(opType, where);
    268             fRXPat->fCompiledPat->setElementAt(op, loc);
    269         }
    270     }
    271 
    272 
    273     //
    274     // Compute the number of digits requried for the largest capture group number.
    275     //
    276     fRXPat->fMaxCaptureDigits = 1;
    277     int32_t  n = 10;
    278     for (;;) {
    279         if (n > fRXPat->fGroupMap->size()) {
    280             break;
    281         }
    282         fRXPat->fMaxCaptureDigits++;
    283         n *= 10;
    284     }
    285 
    286     //
    287     // The pattern's fFrameSize so far has accumulated the requirements for
    288     //   storage for capture parentheses, counters, etc. that are encountered
    289     //   in the pattern.  Add space for the two variables that are always
    290     //   present in the saved state:  the input string position and the
    291     //   position in the compiled pattern.
    292     //
    293     fRXPat->fFrameSize+=2;
    294 
    295     //
    296     // Get bounds for the minimum and maximum length of a string that this
    297     //   pattern can match.  Used to avoid looking for matches in strings that
    298     //   are too short.
    299     //
    300     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
    301 
    302     //
    303     // Optimization passes
    304     //
    305     matchStartType();
    306     stripNOPs();
    307 
    308     //
    309     // Set up fast latin-1 range sets
    310     //
    311     int32_t numSets = fRXPat->fSets->size();
    312     fRXPat->fSets8 = new Regex8BitSet[numSets];
    313     // Null pointer check.
    314     if (fRXPat->fSets8 == NULL) {
    315         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
    316         return;
    317     }
    318     int32_t i;
    319     for (i=0; i<numSets; i++) {
    320         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
    321         fRXPat->fSets8[i].init(s);
    322     }
    323 
    324 }
    325 
    326 
    327 
    328 
    329 
    330 //------------------------------------------------------------------------------
    331 //
    332 //  doParseAction        Do some action during regex pattern parsing.
    333 //                       Called by the parse state machine.
    334 //
    335 //                       Generation of the match engine PCode happens here, or
    336 //                       in functions called from the parse actions defined here.
    337 //
    338 //
    339 //------------------------------------------------------------------------------
    340 UBool RegexCompile::doParseActions(int32_t action)
    341 {
    342     UBool   returnVal = TRUE;
    343 
    344     switch ((Regex_PatternParseAction)action) {
    345 
    346     case doPatStart:
    347         // Start of pattern compiles to:
    348         //0   SAVE   2        Fall back to position of FAIL
    349         //1   jmp    3
    350         //2   FAIL            Stop if we ever reach here.
    351         //3   NOP             Dummy, so start of pattern looks the same as
    352         //                    the start of an ( grouping.
    353         //4   NOP             Resreved, will be replaced by a save if there are
    354         //                    OR | operators at the top level
    355         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
    356         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP,  3), *fStatus);
    357         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
    358 
    359         // Standard open nonCapture paren action emits the two NOPs and
    360         //   sets up the paren stack frame.
    361         doParseActions(doOpenNonCaptureParen);
    362         break;
    363 
    364     case doPatFinish:
    365         // We've scanned to the end of the pattern
    366         //  The end of pattern compiles to:
    367         //        URX_END
    368         //    which will stop the runtime match engine.
    369         //  Encountering end of pattern also behaves like a close paren,
    370         //   and forces fixups of the State Save at the beginning of the compiled pattern
    371         //   and of any OR operations at the top level.
    372         //
    373         handleCloseParen();
    374         if (fParenStack.size() > 0) {
    375             // Missing close paren in pattern.
    376             error(U_REGEX_MISMATCHED_PAREN);
    377         }
    378 
    379         // add the END operation to the compiled pattern.
    380         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
    381 
    382         // Terminate the pattern compilation state machine.
    383         returnVal = FALSE;
    384         break;
    385 
    386 
    387 
    388     case doOrOperator:
    389         // Scanning a '|', as in (A|B)
    390         {
    391             // Insert a SAVE operation at the start of the pattern section preceding
    392             //   this OR at this level.  This SAVE will branch the match forward
    393             //   to the right hand side of the OR in the event that the left hand
    394             //   side fails to match and backtracks.  Locate the position for the
    395             //   save from the location on the top of the parentheses stack.
    396             int32_t savePosition = fParenStack.popi();
    397             int32_t op = fRXPat->fCompiledPat->elementAti(savePosition);
    398             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
    399             op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
    400             fRXPat->fCompiledPat->setElementAt(op, savePosition);
    401 
    402             // Append an JMP operation into the compiled pattern.  The operand for
    403             //  the JMP will eventually be the location following the ')' for the
    404             //  group.  This will be patched in later, when the ')' is encountered.
    405             op = URX_BUILD(URX_JMP, 0);
    406             fRXPat->fCompiledPat->addElement(op, *fStatus);
    407 
    408             // Push the position of the newly added JMP op onto the parentheses stack.
    409             // This registers if for fixup when this block's close paren is encountered.
    410             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    411 
    412             // Append a NOP to the compiled pattern.  This is the slot reserved
    413             //   for a SAVE in the event that there is yet another '|' following
    414             //   this one.
    415             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
    416             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    417         }
    418         break;
    419 
    420 
    421     case doOpenCaptureParen:
    422         // Open Paren.
    423         //   Compile to a
    424         //      - NOP, which later may be replaced by a save-state if the
    425         //         parenthesized group gets a * quantifier, followed by
    426         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
    427         //      - NOP, which may later be replaced by a save-state if there
    428         //             is an '|' alternation within the parens.
    429         //
    430         //    Each capture group gets three slots in the save stack frame:
    431         //         0:   Capture Group start position (in input string being matched.)
    432         //         1:   Capture Group end   positino.
    433         //         2:   Start of Match-in-progress.
    434         //    The first two locations are for a completed capture group, and are
    435         //     referred to by back references and the like.
    436         //    The third location stores the capture start position when an START_CAPTURE is
    437         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
    438         //      END_CAPure is encountered.
    439         {
    440             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
    441             int32_t  varsLoc    = fRXPat->fFrameSize;    // Reserve three slots in match stack frame.
    442             fRXPat->fFrameSize += 3;
    443             int32_t  cop        = URX_BUILD(URX_START_CAPTURE, varsLoc);
    444             fRXPat->fCompiledPat->addElement(cop, *fStatus);
    445             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
    446 
    447             // On the Parentheses stack, start a new frame and add the postions
    448             //   of the two NOPs.  Depending on what follows in the pattern, the
    449             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    450             //   address of the end of the parenthesized group.
    451             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    452             fParenStack.push(capturing, *fStatus);                        // Frame type.
    453             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
    454             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    455 
    456             // Save the mapping from group number to stack frame variable position.
    457             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
    458         }
    459          break;
    460 
    461     case doOpenNonCaptureParen:
    462         // Open non-caputuring (grouping only) Paren.
    463         //   Compile to a
    464         //      - NOP, which later may be replaced by a save-state if the
    465         //         parenthesized group gets a * quantifier, followed by
    466         //      - NOP, which may later be replaced by a save-state if there
    467         //             is an '|' alternation within the parens.
    468         {
    469             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
    470             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
    471 
    472             // On the Parentheses stack, start a new frame and add the postions
    473             //   of the two NOPs.
    474             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    475             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
    476             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    477             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    478         }
    479          break;
    480 
    481 
    482     case doOpenAtomicParen:
    483         // Open Atomic Paren.  (?>
    484         //   Compile to a
    485         //      - NOP, which later may be replaced if the parenthesized group
    486         //         has a quantifier, followed by
    487         //      - STO_SP  save state stack position, so it can be restored at the ")"
    488         //      - NOP, which may later be replaced by a save-state if there
    489         //             is an '|' alternation within the parens.
    490         {
    491             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
    492             int32_t  varLoc    = fRXPat->fDataSize;    // Reserve a data location for saving the
    493             fRXPat->fDataSize += 1;                    //  state stack ptr.
    494             int32_t  stoOp     = URX_BUILD(URX_STO_SP, varLoc);
    495             fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
    496             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
    497 
    498             // On the Parentheses stack, start a new frame and add the postions
    499             //   of the two NOPs.  Depending on what follows in the pattern, the
    500             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    501             //   address of the end of the parenthesized group.
    502             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    503             fParenStack.push(atomic, *fStatus);                           // Frame type.
    504             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
    505             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
    506         }
    507         break;
    508 
    509 
    510     case doOpenLookAhead:
    511         // Positive Look-ahead   (?=  stuff  )
    512         //
    513         //   Note:   Addition of transparent input regions, with the need to
    514         //           restore the original regions when failing out of a lookahead
    515         //           block, complicated this sequence.  Some conbined opcodes
    516         //           might make sense - or might not, lookahead aren't that common.
    517         //
    518         //      Caution:  min match length optimization knows about this
    519         //               sequence; don't change without making updates there too.
    520         //
    521         // Compiles to
    522         //    1    START_LA     dataLoc     Saves SP, Input Pos
    523         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
    524         //    3    JMP          6           continue ...
    525         //
    526         //    4.   LA_END                   Look Ahead failed.  Restore regions.
    527         //    5.   BACKTRACK                and back track again.
    528         //
    529         //    6.   NOP              reserved for use by quantifiers on the block.
    530         //                          Look-ahead can't have quantifiers, but paren stack
    531         //                             compile time conventions require the slot anyhow.
    532         //    7.   NOP              may be replaced if there is are '|' ops in the block.
    533         //    8.     code for parenthesized stuff.
    534         //    9.   LA_END
    535         //
    536         //  Two data slots are reserved, for saving the stack ptr and the input position.
    537         {
    538             int32_t dataLoc = fRXPat->fDataSize;
    539             fRXPat->fDataSize += 2;
    540             int32_t op = URX_BUILD(URX_LA_START, dataLoc);
    541             fRXPat->fCompiledPat->addElement(op, *fStatus);
    542 
    543             op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
    544             fRXPat->fCompiledPat->addElement(op, *fStatus);
    545 
    546             op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
    547             fRXPat->fCompiledPat->addElement(op, *fStatus);
    548 
    549             op = URX_BUILD(URX_LA_END, dataLoc);
    550             fRXPat->fCompiledPat->addElement(op, *fStatus);
    551 
    552             op = URX_BUILD(URX_BACKTRACK, 0);
    553             fRXPat->fCompiledPat->addElement(op, *fStatus);
    554 
    555             op = URX_BUILD(URX_NOP, 0);
    556             fRXPat->fCompiledPat->addElement(op, *fStatus);
    557             fRXPat->fCompiledPat->addElement(op, *fStatus);
    558 
    559             // On the Parentheses stack, start a new frame and add the postions
    560             //   of the NOPs.
    561             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    562             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
    563             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    564             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    565         }
    566         break;
    567 
    568     case doOpenLookAheadNeg:
    569         // Negated Lookahead.   (?! stuff )
    570         // Compiles to
    571         //    1.    START_LA    dataloc
    572         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
    573         //                                //   which continues with the match.
    574         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
    575         //    4.       code for parenthesized stuff.
    576         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
    577         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
    578         //    7.    END_LA                // Restore match region, in case look-ahead was using
    579         //                                        an alternate (transparent) region.
    580         {
    581             int32_t dataLoc = fRXPat->fDataSize;
    582             fRXPat->fDataSize += 2;
    583             int32_t op = URX_BUILD(URX_LA_START, dataLoc);
    584             fRXPat->fCompiledPat->addElement(op, *fStatus);
    585 
    586             op = URX_BUILD(URX_STATE_SAVE, 0);    // dest address will be patched later.
    587             fRXPat->fCompiledPat->addElement(op, *fStatus);
    588 
    589             op = URX_BUILD(URX_NOP, 0);
    590             fRXPat->fCompiledPat->addElement(op, *fStatus);
    591 
    592             // On the Parentheses stack, start a new frame and add the postions
    593             //   of the StateSave and NOP.
    594             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    595             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
    596             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
    597             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    598 
    599             // BEGIN android-changed
    600             // Apply important regex bug fixing for ticket#7224
    601             // Instructions #5 - #7 will be added when the ')' is encountered.
    602             // END android-changed
    603         }
    604         break;
    605 
    606     case doOpenLookBehind:
    607         {
    608             //   Compile a (?<= look-behind open paren.
    609             //
    610             //          Compiles to
    611             //              0       URX_LB_START     dataLoc
    612             //              1       URX_LB_CONT      dataLoc
    613             //              2                        MinMatchLen
    614             //              3                        MaxMatchLen
    615             //              4       URX_NOP          Standard '(' boilerplate.
    616             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
    617             //              6         <code for LookBehind expression>
    618             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
    619             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
    620             //
    621             //          Allocate a block of matcher data, to contain (when running a match)
    622             //              0:    Stack ptr on entry
    623             //              1:    Input Index on entry
    624             //              2:    Start index of match current match attempt.
    625             //              3:    Original Input String len.
    626 
    627             // Allocate data space
    628             int32_t dataLoc = fRXPat->fDataSize;
    629             fRXPat->fDataSize += 4;
    630 
    631             // Emit URX_LB_START
    632             int32_t op = URX_BUILD(URX_LB_START, dataLoc);
    633             fRXPat->fCompiledPat->addElement(op, *fStatus);
    634 
    635             // Emit URX_LB_CONT
    636             op = URX_BUILD(URX_LB_CONT, dataLoc);
    637             fRXPat->fCompiledPat->addElement(op, *fStatus);
    638             fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MinMatchLength.  To be filled later.
    639             fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MaxMatchLength.  To be filled later.
    640 
    641             // Emit the NOP
    642             op = URX_BUILD(URX_NOP, 0);
    643             fRXPat->fCompiledPat->addElement(op, *fStatus);
    644             fRXPat->fCompiledPat->addElement(op, *fStatus);
    645 
    646             // On the Parentheses stack, start a new frame and add the postions
    647             //   of the URX_LB_CONT and the NOP.
    648             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    649             fParenStack.push(lookBehind, *fStatus);                       // Frame type
    650             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    651             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    652 
    653             // The final two instructions will be added when the ')' is encountered.
    654         }
    655 
    656         break;
    657 
    658     case doOpenLookBehindNeg:
    659         {
    660             //   Compile a (?<! negated look-behind open paren.
    661             //
    662             //          Compiles to
    663             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
    664             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
    665             //              2                        MinMatchLen
    666             //              3                        MaxMatchLen
    667             //              4                        continueLoc (9)
    668             //              5       URX_NOP          Standard '(' boilerplate.
    669             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
    670             //              7         <code for LookBehind expression>
    671             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
    672             //              9       ...
    673             //
    674             //          Allocate a block of matcher data, to contain (when running a match)
    675             //              0:    Stack ptr on entry
    676             //              1:    Input Index on entry
    677             //              2:    Start index of match current match attempt.
    678             //              3:    Original Input String len.
    679 
    680             // Allocate data space
    681             int32_t dataLoc = fRXPat->fDataSize;
    682             fRXPat->fDataSize += 4;
    683 
    684             // Emit URX_LB_START
    685             int32_t op = URX_BUILD(URX_LB_START, dataLoc);
    686             fRXPat->fCompiledPat->addElement(op, *fStatus);
    687 
    688             // Emit URX_LBN_CONT
    689             op = URX_BUILD(URX_LBN_CONT, dataLoc);
    690             fRXPat->fCompiledPat->addElement(op, *fStatus);
    691             fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MinMatchLength.  To be filled later.
    692             fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MaxMatchLength.  To be filled later.
    693             fRXPat->fCompiledPat->addElement(0,  *fStatus);    // Continue Loc.    To be filled later.
    694 
    695             // Emit the NOP
    696             op = URX_BUILD(URX_NOP, 0);
    697             fRXPat->fCompiledPat->addElement(op, *fStatus);
    698             fRXPat->fCompiledPat->addElement(op, *fStatus);
    699 
    700             // On the Parentheses stack, start a new frame and add the postions
    701             //   of the URX_LB_CONT and the NOP.
    702             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    703             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
    704             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    705             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    706 
    707             // The final two instructions will be added when the ')' is encountered.
    708         }
    709         break;
    710 
    711     case doConditionalExpr:
    712         // Conditionals such as (?(1)a:b)
    713     case doPerlInline:
    714         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
    715         error(U_REGEX_UNIMPLEMENTED);
    716         break;
    717 
    718 
    719     case doCloseParen:
    720         handleCloseParen();
    721         if (fParenStack.size() <= 0) {
    722             //  Extra close paren, or missing open paren.
    723             error(U_REGEX_MISMATCHED_PAREN);
    724         }
    725         break;
    726 
    727     case doNOP:
    728         break;
    729 
    730 
    731     case doBadOpenParenType:
    732     case doRuleError:
    733         error(U_REGEX_RULE_SYNTAX);
    734         break;
    735 
    736 
    737     case doMismatchedParenErr:
    738         error(U_REGEX_MISMATCHED_PAREN);
    739         break;
    740 
    741     case doPlus:
    742         //  Normal '+'  compiles to
    743         //     1.   stuff to be repeated  (already built)
    744         //     2.   jmp-sav 1
    745         //     3.   ...
    746         //
    747         //  Or, if the item to be repeated can match a zero length string,
    748         //     1.   STO_INP_LOC  data-loc
    749         //     2.      body of stuff to be repeated
    750         //     3.   JMP_SAV_X    2
    751         //     4.   ...
    752 
    753         //
    754         //  Or, if the item to be repeated is simple
    755         //     1.   Item to be repeated.
    756         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
    757         //     3.   LOOP_C       stack location
    758         {
    759             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
    760             int32_t  frameLoc;
    761 
    762             // Check for simple constructs, which may get special optimized code.
    763             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    764                 int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc);
    765 
    766                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
    767                     // Emit optimized code for [char set]+
    768                     int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    769                     fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
    770                     frameLoc = fRXPat->fFrameSize;
    771                     fRXPat->fFrameSize++;
    772                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
    773                     fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
    774                     break;
    775                 }
    776 
    777                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    778                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    779                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    780                     // Emit Optimized code for .+ operations.
    781                     int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
    782                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    783                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
    784                         loopOpI |= 1;
    785                     }
    786                     if (fModeFlags & UREGEX_UNIX_LINES) {
    787                         loopOpI |= 2;
    788                     }
    789                     fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
    790                     frameLoc = fRXPat->fFrameSize;
    791                     fRXPat->fFrameSize++;
    792                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
    793                     fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
    794                     break;
    795                 }
    796 
    797             }
    798 
    799             // General case.
    800 
    801             // Check for minimum match length of zero, which requires
    802             //    extra loop-breaking code.
    803             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    804                 // Zero length match is possible.
    805                 // Emit the code sequence that can handle it.
    806                 insertOp(topLoc);
    807                 frameLoc =  fRXPat->fFrameSize;
    808                 fRXPat->fFrameSize++;
    809 
    810                 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
    811                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
    812 
    813                 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1);
    814                 fRXPat->fCompiledPat->addElement(op, *fStatus);
    815             } else {
    816                 // Simpler code when the repeated body must match something non-empty
    817                 int32_t  jmpOp  = URX_BUILD(URX_JMP_SAV, topLoc);
    818                 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
    819             }
    820         }
    821         break;
    822 
    823     case doNGPlus:
    824         //  Non-greedy '+?'  compiles to
    825         //     1.   stuff to be repeated  (already built)
    826         //     2.   state-save  1
    827         //     3.   ...
    828         {
    829             int32_t topLoc      = blockTopLoc(FALSE);
    830             int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
    831             fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
    832         }
    833         break;
    834 
    835 
    836     case doOpt:
    837         // Normal (greedy) ? quantifier.
    838         //  Compiles to
    839         //     1. state save 3
    840         //     2.    body of optional block
    841         //     3. ...
    842         // Insert the state save into the compiled pattern, and we're done.
    843         {
    844             int32_t   saveStateLoc = blockTopLoc(TRUE);
    845             int32_t   saveStateOp  = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
    846             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    847         }
    848         break;
    849 
    850     case doNGOpt:
    851         // Non-greedy ?? quantifier
    852         //   compiles to
    853         //    1.  jmp   4
    854         //    2.     body of optional block
    855         //    3   jmp   5
    856         //    4.  state save 2
    857         //    5    ...
    858         //  This code is less than ideal, with two jmps instead of one, because we can only
    859         //  insert one instruction at the top of the block being iterated.
    860         {
    861             int32_t  jmp1_loc = blockTopLoc(TRUE);
    862             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
    863 
    864             int32_t  jmp1_op  = URX_BUILD(URX_JMP, jmp2_loc+1);
    865             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
    866 
    867             int32_t  jmp2_op  = URX_BUILD(URX_JMP, jmp2_loc+2);
    868             fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);
    869 
    870             int32_t  save_op  = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
    871             fRXPat->fCompiledPat->addElement(save_op, *fStatus);
    872         }
    873         break;
    874 
    875 
    876     case doStar:
    877         // Normal (greedy) * quantifier.
    878         // Compiles to
    879         //       1.   STATE_SAVE   4
    880         //       2.      body of stuff being iterated over
    881         //       3.   JMP_SAV      2
    882         //       4.   ...
    883         //
    884         // Or, if the body is a simple [Set],
    885         //       1.   LOOP_SR_I    set number
    886         //       2.   LOOP_C       stack location
    887         //       ...
    888         //
    889         // Or if this is a .*
    890         //       1.   LOOP_DOT_I    (. matches all mode flag)
    891         //       2.   LOOP_C        stack location
    892         //
    893         // Or, if the body can match a zero-length string, to inhibit infinite loops,
    894         //       1.   STATE_SAVE   5
    895         //       2.   STO_INP_LOC  data-loc
    896         //       3.      body of stuff
    897         //       4.   JMP_SAV_X    2
    898         //       5.   ...
    899         {
    900             // location of item #1, the STATE_SAVE
    901             int32_t   topLoc = blockTopLoc(FALSE);
    902             int32_t   dataLoc = -1;
    903 
    904             // Check for simple *, where the construct being repeated
    905             //   compiled to single opcode, and might be optimizable.
    906             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    907                 int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc);
    908 
    909                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
    910                     // Emit optimized code for a [char set]*
    911                     int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    912                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    913                     dataLoc = fRXPat->fFrameSize;
    914                     fRXPat->fFrameSize++;
    915                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
    916                     fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
    917                     break;
    918                 }
    919 
    920                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    921                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    922                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    923                     // Emit Optimized code for .* operations.
    924                     int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
    925                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    926                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
    927                         loopOpI |= 1;
    928                     }
    929                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
    930                         loopOpI |= 2;
    931                     }
    932                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    933                     dataLoc = fRXPat->fFrameSize;
    934                     fRXPat->fFrameSize++;
    935                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
    936                     fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
    937                     break;
    938                 }
    939             }
    940 
    941             // Emit general case code for this *
    942             // The optimizations did not apply.
    943 
    944             int32_t   saveStateLoc = blockTopLoc(TRUE);
    945             int32_t   jmpOp        = URX_BUILD(URX_JMP_SAV, saveStateLoc+1);
    946 
    947             // Check for minimum match length of zero, which requires
    948             //    extra loop-breaking code.
    949             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    950                 insertOp(saveStateLoc);
    951                 dataLoc =  fRXPat->fFrameSize;
    952                 fRXPat->fFrameSize++;
    953 
    954                 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
    955                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
    956                 jmpOp      = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2);
    957             }
    958 
    959             // Locate the position in the compiled pattern where the match will continue
    960             //   after completing the *.   (4 or 5 in the comment above)
    961             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
    962 
    963             // Put together the save state op store it into the compiled code.
    964             int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
    965             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    966 
    967             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
    968             fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
    969         }
    970         break;
    971 
    972     case doNGStar:
    973         // Non-greedy *? quantifier
    974         // compiles to
    975         //     1.   JMP    3
    976         //     2.      body of stuff being iterated over
    977         //     3.   STATE_SAVE  2
    978         //     4    ...
    979         {
    980             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
    981             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
    982             int32_t     jmpOp   = URX_BUILD(URX_JMP, saveLoc);
    983             int32_t     stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
    984             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
    985             fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
    986         }
    987         break;
    988 
    989 
    990     case doIntervalInit:
    991         // The '{' opening an interval quantifier was just scanned.
    992         // Init the counter varaiables that will accumulate the values as the digits
    993         //    are scanned.
    994         fIntervalLow = 0;
    995         fIntervalUpper = -1;
    996         break;
    997 
    998     case doIntevalLowerDigit:
    999         // Scanned a digit from the lower value of an {lower,upper} interval
   1000         {
   1001             int32_t digitValue = u_charDigitValue(fC.fChar);
   1002             U_ASSERT(digitValue >= 0);
   1003             fIntervalLow = fIntervalLow*10 + digitValue;
   1004             if (fIntervalLow < 0) {
   1005                 error(U_REGEX_NUMBER_TOO_BIG);
   1006             }
   1007         }
   1008         break;
   1009 
   1010     case doIntervalUpperDigit:
   1011         // Scanned a digit from the upper value of an {lower,upper} interval
   1012         {
   1013             if (fIntervalUpper < 0) {
   1014                 fIntervalUpper = 0;
   1015             }
   1016             int32_t digitValue = u_charDigitValue(fC.fChar);
   1017             U_ASSERT(digitValue >= 0);
   1018             fIntervalUpper = fIntervalUpper*10 + digitValue;
   1019             if (fIntervalUpper < 0) {
   1020                 error(U_REGEX_NUMBER_TOO_BIG);
   1021             }
   1022         }
   1023         break;
   1024 
   1025     case doIntervalSame:
   1026         // Scanned a single value interval like {27}.  Upper = Lower.
   1027         fIntervalUpper = fIntervalLow;
   1028         break;
   1029 
   1030     case doInterval:
   1031         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
   1032         if (compileInlineInterval() == FALSE) {
   1033             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1034         }
   1035         break;
   1036 
   1037     case doPossessiveInterval:
   1038         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
   1039         {
   1040             // Remember the loc for the top of the block being looped over.
   1041             //   (Can not reserve a slot in the compiled pattern at this time, because
   1042             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
   1043             //    once per block.)
   1044             int32_t topLoc = blockTopLoc(FALSE);
   1045 
   1046             // Produce normal looping code.
   1047             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1048 
   1049             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
   1050             //  just as if the loop was inclosed in atomic parentheses.
   1051 
   1052             // First the STO_SP before the start of the loop
   1053             insertOp(topLoc);
   1054             int32_t  varLoc    = fRXPat->fDataSize;    // Reserve a data location for saving the
   1055             fRXPat->fDataSize += 1;                    //  state stack ptr.
   1056             int32_t  op        = URX_BUILD(URX_STO_SP, varLoc);
   1057             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1058 
   1059             int32_t loopOp = fRXPat->fCompiledPat->popi();
   1060             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
   1061             loopOp++;     // point LoopOp after the just-inserted STO_SP
   1062             fRXPat->fCompiledPat->push(loopOp, *fStatus);
   1063 
   1064             // Then the LD_SP after the end of the loop
   1065             op = URX_BUILD(URX_LD_SP, varLoc);
   1066             fRXPat->fCompiledPat->addElement(op, *fStatus);
   1067         }
   1068 
   1069         break;
   1070 
   1071     case doNGInterval:
   1072         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
   1073         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
   1074         break;
   1075 
   1076     case doIntervalError:
   1077         error(U_REGEX_BAD_INTERVAL);
   1078         break;
   1079 
   1080     case doLiteralChar:
   1081         // We've just scanned a "normal" character from the pattern,
   1082         literalChar(fC.fChar);
   1083         break;
   1084 
   1085 
   1086     case doEscapedLiteralChar:
   1087         // We've just scanned an backslashed escaped character with  no
   1088         //   special meaning.  It represents itself.
   1089         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1090             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1091             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1092                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1093              }
   1094         literalChar(fC.fChar);
   1095         break;
   1096 
   1097 
   1098     case doDotAny:
   1099         // scanned a ".",  match any single character.
   1100         {
   1101             int32_t   op;
   1102             if (fModeFlags & UREGEX_DOTALL) {
   1103                 op = URX_BUILD(URX_DOTANY_ALL, 0);
   1104             } else if (fModeFlags & UREGEX_UNIX_LINES) {
   1105                 op = URX_BUILD(URX_DOTANY_UNIX, 0);
   1106             } else {
   1107                 op = URX_BUILD(URX_DOTANY, 0);
   1108             }
   1109             fRXPat->fCompiledPat->addElement(op, *fStatus);
   1110         }
   1111         break;
   1112 
   1113     case doCaret:
   1114         {
   1115             int32_t op = 0;
   1116             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1117                 op = URX_CARET;
   1118             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1119                 op = URX_CARET_M;
   1120             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1121                 op = URX_CARET;   // Only testing true start of input.
   1122             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1123                 op = URX_CARET_M_UNIX;
   1124             }
   1125             fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
   1126         }
   1127         break;
   1128 
   1129     case doDollar:
   1130         {
   1131             int32_t op = 0;
   1132             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1133                 op = URX_DOLLAR;
   1134             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1135                 op = URX_DOLLAR_M;
   1136             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1137                 op = URX_DOLLAR_D;
   1138             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1139                 op = URX_DOLLAR_MD;
   1140             }
   1141             fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
   1142         }
   1143         break;
   1144 
   1145     case doBackslashA:
   1146         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
   1147         break;
   1148 
   1149     case doBackslashB:
   1150         {
   1151             #if  UCONFIG_NO_BREAK_ITERATION==1
   1152             if (fModeFlags & UREGEX_UWORD) {
   1153                 error(U_UNSUPPORTED_ERROR);
   1154             }
   1155             #endif
   1156             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1157             fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
   1158         }
   1159         break;
   1160 
   1161     case doBackslashb:
   1162         {
   1163             #if  UCONFIG_NO_BREAK_ITERATION==1
   1164             if (fModeFlags & UREGEX_UWORD) {
   1165                 error(U_UNSUPPORTED_ERROR);
   1166             }
   1167             #endif
   1168             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1169             fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
   1170         }
   1171         break;
   1172 
   1173     case doBackslashD:
   1174         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
   1175         break;
   1176 
   1177     case doBackslashd:
   1178         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
   1179         break;
   1180 
   1181     case doBackslashG:
   1182         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
   1183         break;
   1184 
   1185     case doBackslashS:
   1186         fRXPat->fCompiledPat->addElement(
   1187             URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
   1188         break;
   1189 
   1190     case doBackslashs:
   1191         fRXPat->fCompiledPat->addElement(
   1192             URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
   1193         break;
   1194 
   1195     case doBackslashW:
   1196         fRXPat->fCompiledPat->addElement(
   1197             URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
   1198         break;
   1199 
   1200     case doBackslashw:
   1201         fRXPat->fCompiledPat->addElement(
   1202             URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
   1203         break;
   1204 
   1205     case doBackslashX:
   1206         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
   1207         break;
   1208 
   1209 
   1210     case doBackslashZ:
   1211         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
   1212         break;
   1213 
   1214     case doBackslashz:
   1215         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
   1216         break;
   1217 
   1218     case doEscapeError:
   1219         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1220         break;
   1221 
   1222     case doExit:
   1223         returnVal = FALSE;
   1224         break;
   1225 
   1226     case doProperty:
   1227         {
   1228             UnicodeSet *theSet = scanProp();
   1229             compileSet(theSet);
   1230         }
   1231         break;
   1232 
   1233     case doNamedChar:
   1234         {
   1235             UChar32 c = scanNamedChar();
   1236             literalChar(c);
   1237         }
   1238         break;
   1239 
   1240 
   1241     case doBackRef:
   1242         // BackReference.  Somewhat unusual in that the front-end can not completely parse
   1243         //                 the regular expression, because the number of digits to be consumed
   1244         //                 depends on the number of capture groups that have been defined.  So
   1245         //                 we have to do it here instead.
   1246         {
   1247             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
   1248             int32_t  groupNum = 0;
   1249             UChar32  c        = fC.fChar;
   1250 
   1251             for (;;) {
   1252                 // Loop once per digit, for max allowed number of digits in a back reference.
   1253                 int32_t digit = u_charDigitValue(c);
   1254                 groupNum = groupNum * 10 + digit;
   1255                 if (groupNum >= numCaptureGroups) {
   1256                     break;
   1257                 }
   1258                 c = peekCharLL();
   1259                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
   1260                     break;
   1261                 }
   1262                 nextCharLL();
   1263             }
   1264 
   1265             // Scan of the back reference in the source regexp is complete.  Now generate
   1266             //  the compiled code for it.
   1267             // Because capture groups can be forward-referenced by back-references,
   1268             //  we fill the operand with the capture group number.  At the end
   1269             //  of compilation, it will be changed to the variable's location.
   1270             U_ASSERT(groupNum > 0);
   1271             int32_t  op;
   1272             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1273                 op = URX_BUILD(URX_BACKREF_I, groupNum);
   1274             } else {
   1275                 op = URX_BUILD(URX_BACKREF, groupNum);
   1276             }
   1277             fRXPat->fCompiledPat->addElement(op, *fStatus);
   1278         }
   1279         break;
   1280 
   1281 
   1282     case doPossessivePlus:
   1283         // Possessive ++ quantifier.
   1284         // Compiles to
   1285         //       1.   STO_SP
   1286         //       2.      body of stuff being iterated over
   1287         //       3.   STATE_SAVE 5
   1288         //       4.   JMP        2
   1289         //       5.   LD_SP
   1290         //       6.   ...
   1291         //
   1292         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
   1293         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
   1294         //
   1295         {
   1296             // Emit the STO_SP
   1297             int32_t   topLoc = blockTopLoc(TRUE);
   1298             int32_t   stoLoc = fRXPat->fDataSize;
   1299             fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
   1300             int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
   1301             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1302 
   1303             // Emit the STATE_SAVE
   1304             op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
   1305             fRXPat->fCompiledPat->addElement(op, *fStatus);
   1306 
   1307             // Emit the JMP
   1308             op = URX_BUILD(URX_JMP, topLoc+1);
   1309             fRXPat->fCompiledPat->addElement(op, *fStatus);
   1310 
   1311             // Emit the LD_SP
   1312             op = URX_BUILD(URX_LD_SP, stoLoc);
   1313             fRXPat->fCompiledPat->addElement(op, *fStatus);
   1314         }
   1315         break;
   1316 
   1317     case doPossessiveStar:
   1318         // Possessive *+ quantifier.
   1319         // Compiles to
   1320         //       1.   STO_SP       loc
   1321         //       2.   STATE_SAVE   5
   1322         //       3.      body of stuff being iterated over
   1323         //       4.   JMP          2
   1324         //       5.   LD_SP        loc
   1325         //       6    ...
   1326         // TODO:  do something to cut back the state stack each time through the loop.
   1327         {
   1328             // Reserve two slots at the top of the block.
   1329             int32_t   topLoc = blockTopLoc(TRUE);
   1330             insertOp(topLoc);
   1331 
   1332             // emit   STO_SP     loc
   1333             int32_t   stoLoc = fRXPat->fDataSize;
   1334             fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
   1335             int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
   1336             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1337 
   1338             // Emit the SAVE_STATE   5
   1339             int32_t L7 = fRXPat->fCompiledPat->size()+1;
   1340             op = URX_BUILD(URX_STATE_SAVE, L7);
   1341             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1342 
   1343             // Append the JMP operation.
   1344             op = URX_BUILD(URX_JMP, topLoc+1);
   1345             fRXPat->fCompiledPat->addElement(op, *fStatus);
   1346 
   1347             // Emit the LD_SP       loc
   1348             op = URX_BUILD(URX_LD_SP, stoLoc);
   1349             fRXPat->fCompiledPat->addElement(op, *fStatus);
   1350         }
   1351         break;
   1352 
   1353     case doPossessiveOpt:
   1354         // Possessive  ?+ quantifier.
   1355         //  Compiles to
   1356         //     1. STO_SP      loc
   1357         //     2. SAVE_STATE  5
   1358         //     3.    body of optional block
   1359         //     4. LD_SP       loc
   1360         //     5. ...
   1361         //
   1362         {
   1363             // Reserve two slots at the top of the block.
   1364             int32_t   topLoc = blockTopLoc(TRUE);
   1365             insertOp(topLoc);
   1366 
   1367             // Emit the STO_SP
   1368             int32_t   stoLoc = fRXPat->fDataSize;
   1369             fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
   1370             int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
   1371             fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1372 
   1373             // Emit the SAVE_STATE
   1374             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
   1375             op = URX_BUILD(URX_STATE_SAVE, continueLoc);
   1376             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1377 
   1378             // Emit the LD_SP
   1379             op = URX_BUILD(URX_LD_SP, stoLoc);
   1380             fRXPat->fCompiledPat->addElement(op, *fStatus);
   1381         }
   1382         break;
   1383 
   1384 
   1385     case doBeginMatchMode:
   1386         fNewModeFlags = fModeFlags;
   1387         fSetModeFlag  = TRUE;
   1388         break;
   1389 
   1390     case doMatchMode:   //  (?i)    and similar
   1391         {
   1392             int32_t  bit = 0;
   1393             switch (fC.fChar) {
   1394             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
   1395             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
   1396             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
   1397             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
   1398             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
   1399             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
   1400             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
   1401             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
   1402             default:
   1403                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
   1404                                    // by the scanner.
   1405             }
   1406             if (fSetModeFlag) {
   1407                 fNewModeFlags |= bit;
   1408             } else {
   1409                 fNewModeFlags &= ~bit;
   1410             }
   1411         }
   1412         break;
   1413 
   1414     case doSetMatchMode:
   1415         // We've got a (?i) or similar.  The match mode is being changed, but
   1416         //   the change is not scoped to a parenthesized block.
   1417         U_ASSERT(fNewModeFlags < 0);
   1418         fModeFlags = fNewModeFlags;
   1419 
   1420         // Prevent any string from spanning across the change of match mode.
   1421         //   Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"
   1422         fixLiterals();
   1423         break;
   1424 
   1425 
   1426     case doMatchModeParen:
   1427         // We've got a (?i: or similar.  Begin a parenthesized block, save old
   1428         //   mode flags so they can be restored at the close of the block.
   1429         //
   1430         //   Compile to a
   1431         //      - NOP, which later may be replaced by a save-state if the
   1432         //         parenthesized group gets a * quantifier, followed by
   1433         //      - NOP, which may later be replaced by a save-state if there
   1434         //             is an '|' alternation within the parens.
   1435         {
   1436             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
   1437             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
   1438 
   1439             // On the Parentheses stack, start a new frame and add the postions
   1440             //   of the two NOPs (a normal non-capturing () frame, except for the
   1441             //   saving of the orignal mode flags.)
   1442             fParenStack.push(fModeFlags, *fStatus);
   1443             fParenStack.push(flags, *fStatus);                            // Frame Marker
   1444             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
   1445             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
   1446 
   1447             // Set the current mode flags to the new values.
   1448             U_ASSERT(fNewModeFlags < 0);
   1449             fModeFlags = fNewModeFlags;
   1450         }
   1451         break;
   1452 
   1453     case doBadModeFlag:
   1454         error(U_REGEX_INVALID_FLAG);
   1455         break;
   1456 
   1457     case doSuppressComments:
   1458         // We have just scanned a '(?'.  We now need to prevent the character scanner from
   1459         // treating a '#' as a to-the-end-of-line comment.
   1460         //   (This Perl compatibility just gets uglier and uglier to do...)
   1461         fEOLComments = FALSE;
   1462         break;
   1463 
   1464 
   1465     case doSetAddAmp:
   1466         {
   1467           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1468           set->add(chAmp);
   1469         }
   1470         break;
   1471 
   1472     case doSetAddDash:
   1473         {
   1474           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1475           set->add(chDash);
   1476         }
   1477         break;
   1478 
   1479      case doSetBackslash_s:
   1480         {
   1481          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1482          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
   1483          break;
   1484         }
   1485 
   1486      case doSetBackslash_S:
   1487         {
   1488             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1489             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
   1490             SSet.complement();
   1491             set->addAll(SSet);
   1492             break;
   1493         }
   1494 
   1495     case doSetBackslash_d:
   1496         {
   1497             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1498             // TODO - make a static set, ticket 6058.
   1499             addCategory(set, U_GC_ND_MASK, *fStatus);
   1500             break;
   1501         }
   1502 
   1503     case doSetBackslash_D:
   1504         {
   1505             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1506             UnicodeSet digits;
   1507             // TODO - make a static set, ticket 6058.
   1508             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   1509             digits.complement();
   1510             set->addAll(digits);
   1511             break;
   1512         }
   1513 
   1514     case doSetBackslash_w:
   1515         {
   1516             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1517             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
   1518             break;
   1519         }
   1520 
   1521     case doSetBackslash_W:
   1522         {
   1523             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
   1524             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
   1525             SSet.complement();
   1526             set->addAll(SSet);
   1527             break;
   1528         }
   1529 
   1530     case doSetBegin:
   1531         fSetStack.push(new UnicodeSet(), *fStatus);
   1532         fSetOpStack.push(setStart, *fStatus);
   1533         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1534             fSetOpStack.push(setCaseClose, *fStatus);
   1535         }
   1536         break;
   1537 
   1538     case doSetBeginDifference1:
   1539         //  We have scanned something like [[abc]-[
   1540         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
   1541         //  Push a Difference operator, which will cause the new set to be subtracted from what
   1542         //    went before once it is created.
   1543         setPushOp(setDifference1);
   1544         fSetOpStack.push(setStart, *fStatus);
   1545         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1546             fSetOpStack.push(setCaseClose, *fStatus);
   1547         }
   1548         break;
   1549 
   1550     case doSetBeginIntersection1:
   1551         //  We have scanned something like  [[abc]&[
   1552         //   Need both the '&' operator and the open '[' operator.
   1553         setPushOp(setIntersection1);
   1554         fSetOpStack.push(setStart, *fStatus);
   1555         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1556             fSetOpStack.push(setCaseClose, *fStatus);
   1557         }
   1558         break;
   1559 
   1560     case doSetBeginUnion:
   1561         //  We have scanned something like  [[abc][
   1562         //     Need to handle the union operation explicitly [[abc] | [
   1563         setPushOp(setUnion);
   1564         fSetOpStack.push(setStart, *fStatus);
   1565         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1566             fSetOpStack.push(setCaseClose, *fStatus);
   1567         }
   1568         break;
   1569 
   1570     case doSetDifference2:
   1571         // We have scanned something like [abc--
   1572         //   Consider this to unambiguously be a set difference operator.
   1573         setPushOp(setDifference2);
   1574         break;
   1575 
   1576     case doSetEnd:
   1577         // Have encountered the ']' that closes a set.
   1578         //    Force the evaluation of any pending operations within this set,
   1579         //    leave the completed set on the top of the set stack.
   1580         setEval(setEnd);
   1581         U_ASSERT(fSetOpStack.peeki()==setStart);
   1582         fSetOpStack.popi();
   1583         break;
   1584 
   1585     case doSetFinish:
   1586         {
   1587         // Finished a complete set expression, including all nested sets.
   1588         //   The close bracket has already triggered clearing out pending set operators,
   1589         //    the operator stack should be empty and the operand stack should have just
   1590         //    one entry, the result set.
   1591         U_ASSERT(fSetOpStack.empty());
   1592         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
   1593         U_ASSERT(fSetStack.empty());
   1594         compileSet(theSet);
   1595         break;
   1596         }
   1597 
   1598     case doSetIntersection2:
   1599         // Have scanned something like [abc&&
   1600         setPushOp(setIntersection2);
   1601         break;
   1602 
   1603     case doSetLiteral:
   1604         // Union the just-scanned literal character into the set being built.
   1605         //    This operation is the highest precedence set operation, so we can always do
   1606         //    it immediately, without waiting to see what follows.  It is necessary to perform
   1607         //    any pending '-' or '&' operation first, because these have the same precedence
   1608         //    as union-ing in a literal'
   1609         {
   1610             setEval(setUnion);
   1611             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1612             s->add(fC.fChar);
   1613             fLastSetLiteral = fC.fChar;
   1614             break;
   1615         }
   1616 
   1617     case doSetLiteralEscaped:
   1618         // A back-slash escaped literal character was encountered.
   1619         // Processing is the same as with setLiteral, above, with the addition of
   1620         //  the optional check for errors on escaped ASCII letters.
   1621         {
   1622             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1623                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1624                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1625                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1626             }
   1627             setEval(setUnion);
   1628             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1629             s->add(fC.fChar);
   1630             fLastSetLiteral = fC.fChar;
   1631             break;
   1632         }
   1633 
   1634         case doSetNamedChar:
   1635         // Scanning a \N{UNICODE CHARACTER NAME}
   1636         //  Aside from the source of the character, the processing is identical to doSetLiteral,
   1637         //    above.
   1638         {
   1639             UChar32  c = scanNamedChar();
   1640             setEval(setUnion);
   1641             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1642             s->add(c);
   1643             fLastSetLiteral = c;
   1644             break;
   1645         }
   1646 
   1647     case doSetNamedRange:
   1648         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
   1649         // The left character is already in the set, and is saved in fLastSetLiteral.
   1650         // The right side needs to be picked up, the scan is at the 'N'.
   1651         // Lower Limit > Upper limit being an error matches both Java
   1652         //        and ICU UnicodeSet behavior.
   1653         {
   1654             UChar32  c = scanNamedChar();
   1655             if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
   1656                 error(U_REGEX_INVALID_RANGE);
   1657             }
   1658             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1659             s->add(fLastSetLiteral, c);
   1660             fLastSetLiteral = c;
   1661             break;
   1662         }
   1663 
   1664 
   1665         case  doSetNegate:
   1666         // Scanned a '^' at the start of a set.
   1667         // Push the negation operator onto the set op stack.
   1668         // A twist for case-insensitive matching:
   1669         //   the case closure operation must happen _before_ negation.
   1670         //   But the case closure operation will already be on the stack if it's required.
   1671         //   This requires checking for case closure, and swapping the stack order
   1672         //    if it is present.
   1673         {
   1674             int32_t  tosOp = fSetOpStack.peeki();
   1675             if (tosOp == setCaseClose) {
   1676                 fSetOpStack.popi();
   1677                 fSetOpStack.push(setNegation, *fStatus);
   1678                 fSetOpStack.push(setCaseClose, *fStatus);
   1679             } else {
   1680                 fSetOpStack.push(setNegation, *fStatus);
   1681             }
   1682         }
   1683         break;
   1684 
   1685     case doSetNoCloseError:
   1686         error(U_REGEX_MISSING_CLOSE_BRACKET);
   1687         break;
   1688 
   1689     case doSetOpError:
   1690         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
   1691         break;
   1692 
   1693     case doSetPosixProp:
   1694         {
   1695             UnicodeSet *s = scanPosixProp();
   1696             if (s != NULL) {
   1697                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
   1698                 tos->addAll(*s);
   1699                 delete s;
   1700             }  // else error.  scanProp() reported the error status already.
   1701         }
   1702         break;
   1703 
   1704     case doSetProp:
   1705         //  Scanned a \p \P within [brackets].
   1706         {
   1707             UnicodeSet *s = scanProp();
   1708             if (s != NULL) {
   1709                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
   1710                 tos->addAll(*s);
   1711                 delete s;
   1712             }  // else error.  scanProp() reported the error status already.
   1713         }
   1714         break;
   1715 
   1716 
   1717     case doSetRange:
   1718         // We have scanned literal-literal.  Add the range to the set.
   1719         // The left character is already in the set, and is saved in fLastSetLiteral.
   1720         // The right side is the current character.
   1721         // Lower Limit > Upper limit being an error matches both Java
   1722         //        and ICU UnicodeSet behavior.
   1723         {
   1724         if (fLastSetLiteral > fC.fChar) {
   1725             error(U_REGEX_INVALID_RANGE);
   1726         }
   1727         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
   1728         s->add(fLastSetLiteral, fC.fChar);
   1729         break;
   1730         }
   1731 
   1732 
   1733     default:
   1734         U_ASSERT(FALSE);
   1735         error(U_REGEX_INTERNAL_ERROR);
   1736         break;
   1737     }
   1738 
   1739     if (U_FAILURE(*fStatus)) {
   1740         returnVal = FALSE;
   1741     }
   1742 
   1743     return returnVal;
   1744 }
   1745 
   1746 
   1747 
   1748 //------------------------------------------------------------------------------
   1749 //
   1750 //   literalChar           We've encountered a literal character from the pattern,
   1751 //                             or an escape sequence that reduces to a character.
   1752 //                         Add it to the string containing all literal chars/strings from
   1753 //                             the pattern.
   1754 //                         If we are in a pattern string already, add the new char to it.
   1755 //                         If we aren't in a pattern string, begin one now.
   1756 //
   1757 //------------------------------------------------------------------------------
   1758 void RegexCompile::literalChar(UChar32 c)  {
   1759     int32_t           op;            // An operation in the compiled pattern.
   1760     int32_t           opType;
   1761     int32_t           patternLoc;   // A position in the compiled pattern.
   1762     int32_t           stringLen;
   1763 
   1764 
   1765     // If the last thing compiled into the pattern was not a literal char,
   1766     //   force this new literal char to begin a new string, and not append to the previous.
   1767     op     = fRXPat->fCompiledPat->lastElementi();
   1768     opType = URX_TYPE(op);
   1769     if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR || opType == URX_ONECHAR_I)) {
   1770         fixLiterals();
   1771     }
   1772 
   1773     if (fStringOpStart == -1) {
   1774         // First char of a string in the pattern.
   1775         // Emit a OneChar op into the compiled pattern.
   1776         emitONE_CHAR(c);
   1777 
   1778         // Also add it to the string pool, in case we get a second adjacent literal
   1779         //   and want to change form ONE_CHAR to STRING
   1780         fStringOpStart = fRXPat->fLiteralText.length();
   1781         fRXPat->fLiteralText.append(c);
   1782         return;
   1783     }
   1784 
   1785     // We are adding onto an existing string
   1786     fRXPat->fLiteralText.append(c);
   1787 
   1788     op     = fRXPat->fCompiledPat->lastElementi();
   1789     opType = URX_TYPE(op);
   1790     U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
   1791 
   1792     // If the most recently emitted op is a URX_ONECHAR,
   1793     if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) {
   1794         if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) {
   1795             // The most recently emitted op is a ONECHAR that was the first half
   1796             //   of a surrogate pair.  Update the ONECHAR's operand to be the
   1797             //   supplementary code point resulting from both halves of the pair.
   1798             c = U16_GET_SUPPLEMENTARY(URX_VAL(op), c);
   1799             op = URX_BUILD(opType, c);
   1800             patternLoc = fRXPat->fCompiledPat->size() - 1;
   1801             fRXPat->fCompiledPat->setElementAt(op, patternLoc);
   1802             return;
   1803         }
   1804 
   1805         // The most recently emitted op is a ONECHAR.
   1806         //  We've now received another adjacent char.  Change the ONECHAR op
   1807         //   to a string op.
   1808         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1809             op     = URX_BUILD(URX_STRING_I, fStringOpStart);
   1810         } else {
   1811             op     = URX_BUILD(URX_STRING, fStringOpStart);
   1812         }
   1813         patternLoc = fRXPat->fCompiledPat->size() - 1;
   1814         fRXPat->fCompiledPat->setElementAt(op, patternLoc);
   1815         op         = URX_BUILD(URX_STRING_LEN, 0);
   1816         fRXPat->fCompiledPat->addElement(op, *fStatus);
   1817     }
   1818 
   1819     // The pattern contains a URX_SRING / URX_STRING_LEN.  Update the
   1820     //  string length to reflect the new char we just added to the string.
   1821     stringLen  = fRXPat->fLiteralText.length() - fStringOpStart;
   1822     op         = URX_BUILD(URX_STRING_LEN, stringLen);
   1823     patternLoc = fRXPat->fCompiledPat->size() - 1;
   1824     fRXPat->fCompiledPat->setElementAt(op, patternLoc);
   1825 }
   1826 
   1827 
   1828 
   1829 //------------------------------------------------------------------------------
   1830 //
   1831 //    emitONE_CHAR         emit a ONE_CHAR op into the generated code.
   1832 //                         Choose cased or uncased version, depending on the
   1833 //                         match mode and whether the character itself is cased.
   1834 //
   1835 //------------------------------------------------------------------------------
   1836 void RegexCompile::emitONE_CHAR(UChar32  c) {
   1837     int32_t op;
   1838     if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
   1839         u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   1840         // We have a cased character, and are in case insensitive matching mode.
   1841         c  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
   1842         op = URX_BUILD(URX_ONECHAR_I, c);
   1843     } else {
   1844         // Uncased char, or case sensitive match mode.
   1845         //  Either way, just generate a literal compare of the char.
   1846         op = URX_BUILD(URX_ONECHAR, c);
   1847     }
   1848     fRXPat->fCompiledPat->addElement(op, *fStatus);
   1849 }
   1850 
   1851 
   1852 //------------------------------------------------------------------------------
   1853 //
   1854 //    fixLiterals           When compiling something that can follow a literal
   1855 //                          string in a pattern, we need to "fix" any preceding
   1856 //                          string, which will cause any subsequent literals to
   1857 //                          begin a new string, rather than appending to the
   1858 //                          old one.
   1859 //
   1860 //                          Optionally, split the last char of the string off into
   1861 //                          a single "ONE_CHAR" operation, so that quantifiers can
   1862 //                          apply to that char alone.  Example:   abc*
   1863 //                          The * must apply to the 'c' only.
   1864 //
   1865 //------------------------------------------------------------------------------
   1866 void    RegexCompile::fixLiterals(UBool split) {
   1867     int32_t  stringStart = fStringOpStart;    // start index of the current literal string
   1868     int32_t  op;                              // An op from/for the compiled pattern.
   1869     int32_t  opType;                          // An opcode type from the compiled pattern.
   1870     int32_t  stringLastCharIdx;
   1871     UChar32  lastChar;
   1872     int32_t  stringNextToLastCharIdx;
   1873     UChar32  nextToLastChar;
   1874     int32_t  stringLen;
   1875 
   1876     fStringOpStart = -1;
   1877     if (!split) {
   1878         return;
   1879     }
   1880 
   1881     // Split:  We need to  ensure that the last item in the compiled pattern does
   1882     //   not refer to a literal string of more than one char.  If it does,
   1883     //   separate the last char from the rest of the string.
   1884 
   1885     // If the last operation from the compiled pattern is not a string,
   1886     //   nothing needs to be done
   1887     op     = fRXPat->fCompiledPat->lastElementi();
   1888     opType = URX_TYPE(op);
   1889     if (opType != URX_STRING_LEN) {
   1890         return;
   1891     }
   1892     stringLen = URX_VAL(op);
   1893 
   1894     //
   1895     // Find the position of the last code point in the string  (might be a surrogate pair)
   1896     //
   1897     stringLastCharIdx = fRXPat->fLiteralText.length();
   1898     stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
   1899     lastChar          = fRXPat->fLiteralText.char32At(stringLastCharIdx);
   1900 
   1901     // The string should always be at least two code points long, meaning that there
   1902     //   should be something before the last char position that we just found.
   1903     U_ASSERT(stringLastCharIdx > stringStart);
   1904     stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
   1905     U_ASSERT(stringNextToLastCharIdx >= stringStart);
   1906     nextToLastChar          = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx);
   1907 
   1908     if (stringNextToLastCharIdx > stringStart) {
   1909         // The length of string remaining after removing one char is two or more.
   1910         // Leave the string in the compiled pattern, shorten it by one char,
   1911         //   and append a URX_ONECHAR op for the last char.
   1912         stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx);
   1913         op = URX_BUILD(URX_STRING_LEN, stringLen);
   1914         fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
   1915         emitONE_CHAR(lastChar);
   1916     } else {
   1917         // The original string consisted of exactly two characters.  Replace
   1918         // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
   1919         // of URX_ONECHARs.
   1920         fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2);
   1921         emitONE_CHAR(nextToLastChar);
   1922         emitONE_CHAR(lastChar);
   1923     }
   1924 }
   1925 
   1926 
   1927 
   1928 
   1929 
   1930 
   1931 //------------------------------------------------------------------------------
   1932 //
   1933 //   insertOp()             Insert a slot for a new opcode into the already
   1934 //                          compiled pattern code.
   1935 //
   1936 //                          Fill the slot with a NOP.  Our caller will replace it
   1937 //                          with what they really wanted.
   1938 //
   1939 //------------------------------------------------------------------------------
   1940 void   RegexCompile::insertOp(int32_t where) {
   1941     UVector32 *code = fRXPat->fCompiledPat;
   1942     U_ASSERT(where>0 && where < code->size());
   1943 
   1944     int32_t  nop = URX_BUILD(URX_NOP, 0);
   1945     code->insertElementAt(nop, where, *fStatus);
   1946 
   1947     // Walk through the pattern, looking for any ops with targets that
   1948     //  were moved down by the insert.  Fix them.
   1949     int32_t loc;
   1950     for (loc=0; loc<code->size(); loc++) {
   1951         int32_t op = code->elementAti(loc);
   1952         int32_t opType = URX_TYPE(op);
   1953         int32_t opValue = URX_VAL(op);
   1954         if ((opType == URX_JMP         ||
   1955             opType == URX_JMPX         ||
   1956             opType == URX_STATE_SAVE   ||
   1957             opType == URX_CTR_LOOP     ||
   1958             opType == URX_CTR_LOOP_NG  ||
   1959             opType == URX_JMP_SAV      ||
   1960             opType == URX_RELOC_OPRND)    && opValue > where) {
   1961             // Target location for this opcode is after the insertion point and
   1962             //   needs to be incremented to adjust for the insertion.
   1963             opValue++;
   1964             op = URX_BUILD(opType, opValue);
   1965             code->setElementAt(op, loc);
   1966         }
   1967     }
   1968 
   1969     // Now fix up the parentheses stack.  All positive values in it are locations in
   1970     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
   1971     for (loc=0; loc<fParenStack.size(); loc++) {
   1972         int32_t x = fParenStack.elementAti(loc);
   1973         U_ASSERT(x < code->size());
   1974         if (x>where) {
   1975             x++;
   1976             fParenStack.setElementAt(x, loc);
   1977         }
   1978     }
   1979 
   1980     if (fMatchCloseParen > where) {
   1981         fMatchCloseParen++;
   1982     }
   1983     if (fMatchOpenParen > where) {
   1984         fMatchOpenParen++;
   1985     }
   1986 }
   1987 
   1988 
   1989 
   1990 //------------------------------------------------------------------------------
   1991 //
   1992 //   blockTopLoc()          Find or create a location in the compiled pattern
   1993 //                          at the start of the operation or block that has
   1994 //                          just been compiled.  Needed when a quantifier (* or
   1995 //                          whatever) appears, and we need to add an operation
   1996 //                          at the start of the thing being quantified.
   1997 //
   1998 //                          (Parenthesized Blocks) have a slot with a NOP that
   1999 //                          is reserved for this purpose.  .* or similar don't
   2000 //                          and a slot needs to be added.
   2001 //
   2002 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
   2003 //                                         at the returned location.
   2004 //                                 FALSE - just return the address,
   2005 //                                         do not reserve a location there.
   2006 //
   2007 //------------------------------------------------------------------------------
   2008 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
   2009     int32_t   theLoc;
   2010     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
   2011     {
   2012         // The item just processed is a parenthesized block.
   2013         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
   2014         U_ASSERT(theLoc > 0);
   2015         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
   2016     }
   2017     else {
   2018         // Item just compiled is a single thing, a ".", or a single char, or a set reference.
   2019         // No slot for STATE_SAVE was pre-reserved in the compiled code.
   2020         // We need to make space now.
   2021         fixLiterals(TRUE);  // If last item was a string, separate the last char.
   2022         theLoc = fRXPat->fCompiledPat->size()-1;
   2023         if (reserveLoc) {
   2024             /*int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);*/
   2025             int32_t  nop = URX_BUILD(URX_NOP, 0);
   2026             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
   2027         }
   2028     }
   2029     return theLoc;
   2030 }
   2031 
   2032 
   2033 
   2034 //------------------------------------------------------------------------------
   2035 //
   2036 //    handleCloseParen      When compiling a close paren, we need to go back
   2037 //                          and fix up any JMP or SAVE operations within the
   2038 //                          parenthesized block that need to target the end
   2039 //                          of the block.  The locations of these are kept on
   2040 //                          the paretheses stack.
   2041 //
   2042 //                          This function is called both when encountering a
   2043 //                          real ) and at the end of the pattern.
   2044 //
   2045 //------------------------------------------------------------------------------
   2046 void  RegexCompile::handleCloseParen() {
   2047     int32_t   patIdx;
   2048     int32_t   patOp;
   2049     if (fParenStack.size() <= 0) {
   2050         error(U_REGEX_MISMATCHED_PAREN);
   2051         return;
   2052     }
   2053 
   2054     // Force any literal chars that may follow the close paren to start a new string,
   2055     //   and not attach to any preceding it.
   2056     fixLiterals(FALSE);
   2057 
   2058     // Fixup any operations within the just-closed parenthesized group
   2059     //    that need to reference the end of the (block).
   2060     //    (The first one popped from the stack is an unused slot for
   2061     //     alternation (OR) state save, but applying the fixup to it does no harm.)
   2062     for (;;) {
   2063         patIdx = fParenStack.popi();
   2064         if (patIdx < 0) {
   2065             // value < 0 flags the start of the frame on the paren stack.
   2066             break;
   2067         }
   2068         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
   2069         patOp = fRXPat->fCompiledPat->elementAti(patIdx);
   2070         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
   2071         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
   2072         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
   2073         fMatchOpenParen     = patIdx;
   2074     }
   2075 
   2076     //  At the close of any parenthesized block, restore the match mode flags  to
   2077     //  the value they had at the open paren.  Saved value is
   2078     //  at the top of the paren stack.
   2079     fModeFlags = fParenStack.popi();
   2080     U_ASSERT(fModeFlags < 0);
   2081 
   2082     // DO any additional fixups, depending on the specific kind of
   2083     // parentesized grouping this is
   2084 
   2085     switch (patIdx) {
   2086     case plain:
   2087     case flags:
   2088         // No additional fixups required.
   2089         //   (Grouping-only parentheses)
   2090         break;
   2091     case capturing:
   2092         // Capturing Parentheses.
   2093         //   Insert a End Capture op into the pattern.
   2094         //   The frame offset of the variables for this cg is obtained from the
   2095         //       start capture op and put it into the end-capture op.
   2096         {
   2097             int32_t   captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
   2098             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
   2099 
   2100             int32_t   frameVarLocation = URX_VAL(captureOp);
   2101             int32_t   endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
   2102             fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
   2103         }
   2104         break;
   2105     case atomic:
   2106         // Atomic Parenthesis.
   2107         //   Insert a LD_SP operation to restore the state stack to the position
   2108         //   it was when the atomic parens were entered.
   2109         {
   2110             int32_t   stoOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
   2111             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
   2112             int32_t   stoLoc = URX_VAL(stoOp);
   2113             int32_t   ldOp   = URX_BUILD(URX_LD_SP, stoLoc);
   2114             fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
   2115         }
   2116         break;
   2117 
   2118     case lookAhead:
   2119         {
   2120             int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
   2121             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2122             int32_t dataLoc  = URX_VAL(startOp);
   2123             int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
   2124             fRXPat->fCompiledPat->addElement(op, *fStatus);
   2125         }
   2126         break;
   2127 
   2128     case negLookAhead:
   2129         {
   2130             // See comment at doOpenLookAheadNeg
   2131             int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
   2132             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2133             int32_t dataLoc  = URX_VAL(startOp);
   2134             int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
   2135             fRXPat->fCompiledPat->addElement(op, *fStatus);
   2136             op               = URX_BUILD(URX_BACKTRACK, 0);
   2137             fRXPat->fCompiledPat->addElement(op, *fStatus);
   2138             // BEGIN android-changed
   2139             // Apply important regex bug fixing for ticket#7224
   2140             op               = URX_BUILD(URX_LA_END, dataLoc);
   2141             // END android-changed
   2142             fRXPat->fCompiledPat->addElement(op, *fStatus);
   2143 
   2144             // Patch the URX_SAVE near the top of the block.
   2145             // The destination of the SAVE is the final LA_END that was just added.
   2146             int32_t saveOp   = fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
   2147             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
   2148             int32_t dest     = fRXPat->fCompiledPat->size()-1;
   2149             saveOp           = URX_BUILD(URX_STATE_SAVE, dest);
   2150             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
   2151         }
   2152         break;
   2153 
   2154     case lookBehind:
   2155         {
   2156             // See comment at doOpenLookBehind.
   2157 
   2158             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
   2159             int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
   2160             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2161             int32_t dataLoc  = URX_VAL(startOp);
   2162             int32_t op       = URX_BUILD(URX_LB_END, dataLoc);
   2163             fRXPat->fCompiledPat->addElement(op, *fStatus);
   2164                     op       = URX_BUILD(URX_LA_END, dataLoc);
   2165             fRXPat->fCompiledPat->addElement(op, *fStatus);
   2166 
   2167             // Determine the min and max bounds for the length of the
   2168             //  string that the pattern can match.
   2169             //  An unbounded upper limit is an error.
   2170             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2171             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2172             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2173             if (maxML == INT32_MAX) {
   2174                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2175                 break;
   2176             }
   2177             U_ASSERT(minML <= maxML);
   2178 
   2179             // Insert the min and max match len bounds into the URX_LB_CONT op that
   2180             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2181             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
   2182             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
   2183 
   2184         }
   2185         break;
   2186 
   2187 
   2188 
   2189     case lookBehindN:
   2190         {
   2191             // See comment at doOpenLookBehindNeg.
   2192 
   2193             // Append the URX_LBN_END to the compiled pattern.
   2194             int32_t  startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
   2195             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2196             int32_t dataLoc  = URX_VAL(startOp);
   2197             int32_t op       = URX_BUILD(URX_LBN_END, dataLoc);
   2198             fRXPat->fCompiledPat->addElement(op, *fStatus);
   2199 
   2200             // Determine the min and max bounds for the length of the
   2201             //  string that the pattern can match.
   2202             //  An unbounded upper limit is an error.
   2203             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2204             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2205             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2206             if (maxML == INT32_MAX) {
   2207                 error(U_REGEX_LOOK_BEHIND_LIMIT);
   2208                 break;
   2209             }
   2210             U_ASSERT(minML <= maxML);
   2211 
   2212             // Insert the min and max match len bounds into the URX_LB_CONT op that
   2213             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2214             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
   2215             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
   2216 
   2217             // Insert the pattern location to continue at after a successful match
   2218             //  as the last operand of the URX_LBN_CONT
   2219             op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
   2220             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
   2221         }
   2222         break;
   2223 
   2224 
   2225 
   2226     default:
   2227         U_ASSERT(FALSE);
   2228     }
   2229 
   2230     // remember the next location in the compiled pattern.
   2231     // The compilation of Quantifiers will look at this to see whether its looping
   2232     //   over a parenthesized block or a single item
   2233     fMatchCloseParen = fRXPat->fCompiledPat->size();
   2234 }
   2235 
   2236 
   2237 
   2238 //------------------------------------------------------------------------------
   2239 //
   2240 //   compileSet       Compile the pattern operations for a reference to a
   2241 //                    UnicodeSet.
   2242 //
   2243 //------------------------------------------------------------------------------
   2244 void        RegexCompile::compileSet(UnicodeSet *theSet)
   2245 {
   2246     if (theSet == NULL) {
   2247         return;
   2248     }
   2249     //  Remove any strings from the set.
   2250     //  There shoudn't be any, but just in case.
   2251     //     (Case Closure can add them; if we had a simple case closure avaialble that
   2252     //      ignored strings, that would be better.)
   2253     theSet->removeAllStrings();
   2254     int32_t  setSize = theSet->size();
   2255     UChar32  firstSetChar = theSet->charAt(0);
   2256 
   2257     switch (setSize) {
   2258     case 0:
   2259         {
   2260             // Set of no elements.   Always fails to match.
   2261             fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
   2262             delete theSet;
   2263         }
   2264         break;
   2265 
   2266     case 1:
   2267         {
   2268             // The set contains only a single code point.  Put it into
   2269             //   the compiled pattern as a single char operation rather
   2270             //   than a set, and discard the set itself.
   2271             literalChar(firstSetChar);
   2272             delete theSet;
   2273         }
   2274         break;
   2275 
   2276     default:
   2277         {
   2278             //  The set contains two or more chars.  (the normal case)
   2279             //  Put it into the compiled pattern as a set.
   2280             int32_t setNumber = fRXPat->fSets->size();
   2281             fRXPat->fSets->addElement(theSet, *fStatus);
   2282             int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
   2283             fRXPat->fCompiledPat->addElement(setOp, *fStatus);
   2284         }
   2285     }
   2286 }
   2287 
   2288 
   2289 //------------------------------------------------------------------------------
   2290 //
   2291 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
   2292 //                      Except for the specific opcodes used, the code is the same
   2293 //                      for all three types (greedy, non-greedy, possessive) of
   2294 //                      intervals.  The opcodes are supplied as parameters.
   2295 //
   2296 //                      The code for interval loops has this form:
   2297 //                         0  CTR_INIT   counter loc (in stack frame)
   2298 //                         1             5  patt address of CTR_LOOP at bottom of block
   2299 //                         2             min count
   2300 //                         3             max count   (-1 for unbounded)
   2301 //                         4  ...        block to be iterated over
   2302 //                         5  CTR_LOOP
   2303 //
   2304 //                       In
   2305 //------------------------------------------------------------------------------
   2306 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
   2307 {
   2308     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
   2309     //   four slots in the compiled code.  Reserve them.
   2310     int32_t   topOfBlock = blockTopLoc(TRUE);
   2311     insertOp(topOfBlock);
   2312     insertOp(topOfBlock);
   2313     insertOp(topOfBlock);
   2314 
   2315     // The operands for the CTR_INIT opcode include the index in the matcher data
   2316     //   of the counter.  Allocate it now.
   2317     int32_t   counterLoc = fRXPat->fFrameSize;
   2318     fRXPat->fFrameSize++;
   2319 
   2320     int32_t   op = URX_BUILD(InitOp, counterLoc);
   2321     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
   2322 
   2323     // The second operand of CTR_INIT is the location following the end of the loop.
   2324     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
   2325     //   compilation of something later on causes the code to grow and the target
   2326     //   position to move.
   2327     int32_t loopEnd = fRXPat->fCompiledPat->size();
   2328     op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
   2329     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
   2330 
   2331     // Followed by the min and max counts.
   2332     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
   2333     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
   2334 
   2335     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
   2336     //   Goes at end of the block being looped over, so just append to the code so far.
   2337     op = URX_BUILD(LoopOp, topOfBlock);
   2338     fRXPat->fCompiledPat->addElement(op, *fStatus);
   2339 
   2340     if ((fIntervalLow & 0xff000000) != 0 ||
   2341         fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0) {
   2342             error(U_REGEX_NUMBER_TOO_BIG);
   2343         }
   2344 
   2345     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
   2346         error(U_REGEX_MAX_LT_MIN);
   2347     }
   2348 }
   2349 
   2350 
   2351 
   2352 UBool RegexCompile::compileInlineInterval() {
   2353     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
   2354         // Too big to inline.  Fail, which will cause looping code to be generated.
   2355         //   (Upper < Lower picks up unbounded upper and errors, both.)
   2356         return FALSE;
   2357     }
   2358 
   2359     int32_t   topOfBlock = blockTopLoc(FALSE);
   2360     if (fIntervalUpper == 0) {
   2361         // Pathological case.  Attempt no matches, as if the block doesn't exist.
   2362         fRXPat->fCompiledPat->setSize(topOfBlock);
   2363         return TRUE;
   2364     }
   2365 
   2366     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
   2367         // The thing being repeated is not a single op, but some
   2368         //   more complex block.  Do it as a loop, not inlines.
   2369         //   Note that things "repeated" a max of once are handled as inline, because
   2370         //     the one copy of the code already generated is just fine.
   2371         return FALSE;
   2372     }
   2373 
   2374     // Pick up the opcode that is to be repeated
   2375     //
   2376     int32_t op = fRXPat->fCompiledPat->elementAti(topOfBlock);
   2377 
   2378     // Compute the pattern location where the inline sequence
   2379     //   will end, and set up the state save op that will be needed.
   2380     //
   2381     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
   2382                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
   2383     int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc);
   2384     if (fIntervalLow == 0) {
   2385         insertOp(topOfBlock);
   2386         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
   2387     }
   2388 
   2389 
   2390 
   2391     //  Loop, emitting the op for the thing being repeated each time.
   2392     //    Loop starts at 1 because one instance of the op already exists in the pattern,
   2393     //    it was put there when it was originally encountered.
   2394     int32_t i;
   2395     for (i=1; i<fIntervalUpper; i++ ) {
   2396         if (i == fIntervalLow) {
   2397             fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
   2398         }
   2399         if (i > fIntervalLow) {
   2400             fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
   2401         }
   2402         fRXPat->fCompiledPat->addElement(op, *fStatus);
   2403     }
   2404     return TRUE;
   2405 }
   2406 
   2407 
   2408 
   2409 //------------------------------------------------------------------------------
   2410 //
   2411 //   matchStartType    Determine how a match can start.
   2412 //                     Used to optimize find() operations.
   2413 //
   2414 //                     Operation is very similar to minMatchLength().  Walk the compiled
   2415 //                     pattern, keeping an on-going minimum-match-length.  For any
   2416 //                     op where the min match coming in is zero, add that ops possible
   2417 //                     starting matches to the possible starts for the overall pattern.
   2418 //
   2419 //------------------------------------------------------------------------------
   2420 void   RegexCompile::matchStartType() {
   2421     if (U_FAILURE(*fStatus)) {
   2422         return;
   2423     }
   2424 
   2425 
   2426     int32_t    loc;                    // Location in the pattern of the current op being processed.
   2427     int32_t    op;                     // The op being processed
   2428     int32_t    opType;                 // The opcode type of the op
   2429     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
   2430     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
   2431 
   2432     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
   2433                                        //   could have advanced the position in a match.
   2434                                        //   (Maximum match length so far == 0)
   2435 
   2436     // forwardedLength is a vector holding minimum-match-length values that
   2437     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   2438     //   It must be one longer than the pattern being checked because some  ops
   2439     //   will jmp to a end-of-block+1 location from within a block, and we must
   2440     //   count those when checking the block.
   2441     int32_t end = fRXPat->fCompiledPat->size();
   2442     UVector32  forwardedLength(end+1, *fStatus);
   2443     forwardedLength.setSize(end+1);
   2444     for (loc=3; loc<end; loc++) {
   2445         forwardedLength.setElementAt(INT32_MAX, loc);
   2446     }
   2447 
   2448     for (loc = 3; loc<end; loc++) {
   2449         op = fRXPat->fCompiledPat->elementAti(loc);
   2450         opType = URX_TYPE(op);
   2451 
   2452         // The loop is advancing linearly through the pattern.
   2453         // If the op we are now at was the destination of a branch in the pattern,
   2454         // and that path has a shorter minimum length than the current accumulated value,
   2455         // replace the current accumulated value.
   2456         if (forwardedLength.elementAti(loc) < currentLen) {
   2457             currentLen = forwardedLength.elementAti(loc);
   2458             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   2459         }
   2460 
   2461         switch (opType) {
   2462             // Ops that don't change the total length matched
   2463         case URX_RESERVED_OP:
   2464         case URX_END:
   2465         case URX_FAIL:
   2466         case URX_STRING_LEN:
   2467         case URX_NOP:
   2468         case URX_START_CAPTURE:
   2469         case URX_END_CAPTURE:
   2470         case URX_BACKSLASH_B:
   2471         case URX_BACKSLASH_BU:
   2472         case URX_BACKSLASH_G:
   2473         case URX_BACKSLASH_Z:
   2474         case URX_DOLLAR:
   2475         case URX_DOLLAR_M:
   2476         case URX_DOLLAR_D:
   2477         case URX_DOLLAR_MD:
   2478         case URX_RELOC_OPRND:
   2479         case URX_STO_INP_LOC:
   2480         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   2481         case URX_BACKREF_I:
   2482 
   2483         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   2484         case URX_LD_SP:
   2485             break;
   2486 
   2487         case URX_CARET:
   2488             if (atStart) {
   2489                 fRXPat->fStartType = START_START;
   2490             }
   2491             break;
   2492 
   2493         case URX_CARET_M:
   2494         case URX_CARET_M_UNIX:
   2495             if (atStart) {
   2496                 fRXPat->fStartType = START_LINE;
   2497             }
   2498             break;
   2499 
   2500         case URX_ONECHAR:
   2501             if (currentLen == 0) {
   2502                 // This character could appear at the start of a match.
   2503                 //   Add it to the set of possible starting characters.
   2504                 fRXPat->fInitialChars->add(URX_VAL(op));
   2505                 numInitialStrings += 2;
   2506             }
   2507             currentLen++;
   2508             atStart = FALSE;
   2509             break;
   2510 
   2511 
   2512         case URX_SETREF:
   2513             if (currentLen == 0) {
   2514                 int32_t  sn = URX_VAL(op);
   2515                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2516                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
   2517                 fRXPat->fInitialChars->addAll(*s);
   2518                 numInitialStrings += 2;
   2519             }
   2520             currentLen++;
   2521             atStart = FALSE;
   2522             break;
   2523 
   2524         case URX_LOOP_SR_I:
   2525             // [Set]*, like a SETREF, above, in what it can match,
   2526             //  but may not match at all, so currentLen is not incremented.
   2527             if (currentLen == 0) {
   2528                 int32_t  sn = URX_VAL(op);
   2529                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2530                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
   2531                 fRXPat->fInitialChars->addAll(*s);
   2532                 numInitialStrings += 2;
   2533             }
   2534             atStart = FALSE;
   2535             break;
   2536 
   2537         case URX_LOOP_DOT_I:
   2538             if (currentLen == 0) {
   2539                 // .* at the start of a pattern.
   2540                 //    Any character can begin the match.
   2541                 fRXPat->fInitialChars->clear();
   2542                 fRXPat->fInitialChars->complement();
   2543                 numInitialStrings += 2;
   2544             }
   2545             atStart = FALSE;
   2546             break;
   2547 
   2548 
   2549         case URX_STATIC_SETREF:
   2550             if (currentLen == 0) {
   2551                 int32_t  sn = URX_VAL(op);
   2552                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
   2553                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
   2554                 fRXPat->fInitialChars->addAll(*s);
   2555                 numInitialStrings += 2;
   2556             }
   2557             currentLen++;
   2558             atStart = FALSE;
   2559             break;
   2560 
   2561 
   2562 
   2563         case URX_STAT_SETREF_N:
   2564             if (currentLen == 0) {
   2565                 int32_t  sn = URX_VAL(op);
   2566                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
   2567                 UnicodeSet sc(*s);
   2568                 sc.complement();
   2569                 fRXPat->fInitialChars->addAll(sc);
   2570                 numInitialStrings += 2;
   2571             }
   2572             currentLen++;
   2573             atStart = FALSE;
   2574             break;
   2575 
   2576 
   2577 
   2578         case URX_BACKSLASH_D:
   2579             // Digit Char
   2580              if (currentLen == 0) {
   2581                  UnicodeSet s;
   2582                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   2583                  if (URX_VAL(op) != 0) {
   2584                      s.complement();
   2585                  }
   2586                  fRXPat->fInitialChars->addAll(s);
   2587                  numInitialStrings += 2;
   2588             }
   2589             currentLen++;
   2590             atStart = FALSE;
   2591             break;
   2592 
   2593 
   2594         case URX_ONECHAR_I:
   2595             // Case Insensitive Single Character.
   2596             if (currentLen == 0) {
   2597                 UChar32  c = URX_VAL(op);
   2598                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   2599                     // character may have distinct cased forms.  Add all of them
   2600                     //   to the set of possible starting match chars.
   2601                     UnicodeSet s(c, c);
   2602                     s.closeOver(USET_CASE_INSENSITIVE);
   2603                     fRXPat->fInitialChars->addAll(s);
   2604                 } else {
   2605                     // Char has no case variants.  Just add it as-is to the
   2606                     //   set of possible starting chars.
   2607                     fRXPat->fInitialChars->add(c);
   2608                 }
   2609                 numInitialStrings += 2;
   2610             }
   2611             currentLen++;
   2612             atStart = FALSE;
   2613             break;
   2614 
   2615 
   2616         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   2617         case URX_DOTANY_ALL:    // . matches one or two.
   2618         case URX_DOTANY:
   2619         case URX_DOTANY_UNIX:
   2620             if (currentLen == 0) {
   2621                 // These constructs are all bad news when they appear at the start
   2622                 //   of a match.  Any character can begin the match.
   2623                 fRXPat->fInitialChars->clear();
   2624                 fRXPat->fInitialChars->complement();
   2625                 numInitialStrings += 2;
   2626             }
   2627             currentLen++;
   2628             atStart = FALSE;
   2629             break;
   2630 
   2631 
   2632         case URX_JMPX:
   2633             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
   2634         case URX_JMP:
   2635             {
   2636                 int32_t  jmpDest = URX_VAL(op);
   2637                 if (jmpDest < loc) {
   2638                     // Loop of some kind.  Can safely ignore, the worst that will happen
   2639                     //  is that we understate the true minimum length
   2640                     currentLen = forwardedLength.elementAti(loc+1);
   2641 
   2642                 } else {
   2643                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   2644                     U_ASSERT(jmpDest <= end+1);
   2645                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
   2646                         forwardedLength.setElementAt(currentLen, jmpDest);
   2647                     }
   2648                 }
   2649             }
   2650             atStart = FALSE;
   2651             break;
   2652 
   2653         case URX_JMP_SAV:
   2654         case URX_JMP_SAV_X:
   2655             // Combo of state save to the next loc, + jmp backwards.
   2656             //   Net effect on min. length computation is nothing.
   2657             atStart = FALSE;
   2658             break;
   2659 
   2660         case URX_BACKTRACK:
   2661             // Fails are kind of like a branch, except that the min length was
   2662             //   propagated already, by the state save.
   2663             currentLen = forwardedLength.elementAti(loc+1);
   2664             atStart = FALSE;
   2665             break;
   2666 
   2667 
   2668         case URX_STATE_SAVE:
   2669             {
   2670                 // State Save, for forward jumps, propagate the current minimum.
   2671                 //             of the state save.
   2672                 int32_t  jmpDest = URX_VAL(op);
   2673                 if (jmpDest > loc) {
   2674                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
   2675                         forwardedLength.setElementAt(currentLen, jmpDest);
   2676                     }
   2677                 }
   2678             }
   2679             atStart = FALSE;
   2680             break;
   2681 
   2682 
   2683 
   2684 
   2685         case URX_STRING:
   2686             {
   2687                 loc++;
   2688                 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
   2689                 int32_t stringLen   = URX_VAL(stringLenOp);
   2690                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   2691                 U_ASSERT(stringLenOp >= 2);
   2692                 if (currentLen == 0) {
   2693                     // Add the starting character of this string to the set of possible starting
   2694                     //   characters for this pattern.
   2695                     int32_t stringStartIdx = URX_VAL(op);
   2696                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   2697                     fRXPat->fInitialChars->add(c);
   2698 
   2699                     // Remember this string.  After the entire pattern has been checked,
   2700                     //  if nothing else is identified that can start a match, we'll use it.
   2701                     numInitialStrings++;
   2702                     fRXPat->fInitialStringIdx = stringStartIdx;
   2703                     fRXPat->fInitialStringLen = stringLen;
   2704                 }
   2705 
   2706                 currentLen += stringLen;
   2707                 atStart = FALSE;
   2708             }
   2709             break;
   2710 
   2711         case URX_STRING_I:
   2712             {
   2713                 // Case-insensitive string.  Unlike exact-match strings, we won't
   2714                 //   attempt a string search for possible match positions.  But we
   2715                 //   do update the set of possible starting characters.
   2716                 loc++;
   2717                 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
   2718                 int32_t stringLen   = URX_VAL(stringLenOp);
   2719                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   2720                 U_ASSERT(stringLenOp >= 2);
   2721                 if (currentLen == 0) {
   2722                     // Add the starting character of this string to the set of possible starting
   2723                     //   characters for this pattern.
   2724                     int32_t stringStartIdx = URX_VAL(op);
   2725                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   2726                     UnicodeSet s(c, c);
   2727                     s.closeOver(USET_CASE_INSENSITIVE);
   2728                     fRXPat->fInitialChars->addAll(s);
   2729                     numInitialStrings += 2;  // Matching on an initial string not possible.
   2730                 }
   2731                 currentLen += stringLen;
   2732                 atStart = FALSE;
   2733             }
   2734             break;
   2735 
   2736         case URX_CTR_INIT:
   2737         case URX_CTR_INIT_NG:
   2738             {
   2739                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
   2740                 //   so location must be updated accordingly.
   2741                 // Loop Init Ops.
   2742                 //   If the min loop count == 0
   2743                 //      move loc forwards to the end of the loop, skipping over the body.
   2744                 //   If the min count is > 0,
   2745                 //      continue normal processing of the body of the loop.
   2746                 int32_t loopEndLoc   = fRXPat->fCompiledPat->elementAti(loc+1);
   2747                         loopEndLoc   = URX_VAL(loopEndLoc);
   2748                 int32_t minLoopCount = fRXPat->fCompiledPat->elementAti(loc+2);
   2749                 if (minLoopCount == 0) {
   2750                     // Min Loop Count of 0, treat like a forward branch and
   2751                     //   move the current minimum length up to the target
   2752                     //   (end of loop) location.
   2753                     U_ASSERT(loopEndLoc <= end+1);
   2754                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
   2755                         forwardedLength.setElementAt(currentLen, loopEndLoc);
   2756                     }
   2757                 }
   2758                 loc+=3;  // Skips over operands of CTR_INIT
   2759             }
   2760             atStart = FALSE;
   2761             break;
   2762 
   2763 
   2764         case URX_CTR_LOOP:
   2765         case URX_CTR_LOOP_NG:
   2766             // Loop ops.
   2767             //  The jump is conditional, backwards only.
   2768             atStart = FALSE;
   2769             break;
   2770 
   2771         case URX_LOOP_C:
   2772             // More loop ops.  These state-save to themselves.
   2773             //   don't change the minimum match
   2774             atStart = FALSE;
   2775             break;
   2776 
   2777 
   2778         case URX_LA_START:
   2779         case URX_LB_START:
   2780             {
   2781                 // Look-around.  Scan forward until the matching look-ahead end,
   2782                 //   without processing the look-around block.  This is overly pessimistic.
   2783 
   2784                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
   2785                 //   lookahead contains two LA_END instructions, so count goes up by two
   2786                 //   for each LA_START.
   2787                 int32_t  depth = (opType == URX_LA_START? 2: 1);
   2788                 for (;;) {
   2789                     loc++;
   2790                     op = fRXPat->fCompiledPat->elementAti(loc);
   2791                     if (URX_TYPE(op) == URX_LA_START) {
   2792                         depth+=2;
   2793                     }
   2794                     if (URX_TYPE(op) == URX_LB_START) {
   2795                         depth++;
   2796                     }
   2797                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
   2798                         depth--;
   2799                         if (depth == 0) {
   2800                             break;
   2801                         }
   2802                     }
   2803                     if (URX_TYPE(op) == URX_STATE_SAVE) {
   2804                         // Need this because neg lookahead blocks will FAIL to outside
   2805                         //   of the block.
   2806                         int32_t  jmpDest = URX_VAL(op);
   2807                         if (jmpDest > loc) {
   2808                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
   2809                                 forwardedLength.setElementAt(currentLen, jmpDest);
   2810                             }
   2811                         }
   2812                     }
   2813                     U_ASSERT(loc <= end);
   2814                 }
   2815             }
   2816             break;
   2817 
   2818         case URX_LA_END:
   2819         case URX_LB_CONT:
   2820         case URX_LB_END:
   2821         case URX_LBN_CONT:
   2822         case URX_LBN_END:
   2823             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
   2824                                  //  consumed by the scan in URX_LA_START and LB_START
   2825 
   2826             break;
   2827 
   2828         default:
   2829             U_ASSERT(FALSE);
   2830             }
   2831 
   2832         }
   2833 
   2834 
   2835     // We have finished walking through the ops.  Check whether some forward jump
   2836     //   propagated a shorter length to location end+1.
   2837     if (forwardedLength.elementAti(end+1) < currentLen) {
   2838         currentLen = forwardedLength.elementAti(end+1);
   2839     }
   2840 
   2841 
   2842     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
   2843 
   2844 
   2845     // Sort out what we should check for when looking for candidate match start positions.
   2846     // In order of preference,
   2847     //     1.   Start of input text buffer.
   2848     //     2.   A literal string.
   2849     //     3.   Start of line in multi-line mode.
   2850     //     4.   A single literal character.
   2851     //     5.   A character from a set of characters.
   2852     //
   2853     if (fRXPat->fStartType == START_START) {
   2854         // Match only at the start of an input text string.
   2855         //    start type is already set.  We're done.
   2856     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
   2857         // Match beginning only with a literal string.
   2858         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
   2859         U_ASSERT(fRXPat->fInitialChars->contains(c));
   2860         fRXPat->fStartType   = START_STRING;
   2861         fRXPat->fInitialChar = c;
   2862     } else if (fRXPat->fStartType == START_LINE) {
   2863         // Match at start of line in Multi-Line mode.
   2864         // Nothing to do here; everything is already set.
   2865     } else if (fRXPat->fMinMatchLen == 0) {
   2866         // Zero length match possible.  We could start anywhere.
   2867         fRXPat->fStartType = START_NO_INFO;
   2868     } else if (fRXPat->fInitialChars->size() == 1) {
   2869         // All matches begin with the same char.
   2870         fRXPat->fStartType   = START_CHAR;
   2871         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
   2872         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
   2873     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
   2874         fRXPat->fMinMatchLen > 0) {
   2875         // Matches start with a set of character smaller than the set of all chars.
   2876         fRXPat->fStartType = START_SET;
   2877     } else {
   2878         // Matches can start with anything
   2879         fRXPat->fStartType = START_NO_INFO;
   2880     }
   2881 
   2882     return;
   2883 }
   2884 
   2885 
   2886 
   2887 //------------------------------------------------------------------------------
   2888 //
   2889 //   minMatchLength    Calculate the length of the shortest string that could
   2890 //                     match the specified pattern.
   2891 //                     Length is in 16 bit code units, not code points.
   2892 //
   2893 //                     The calculated length may not be exact.  The returned
   2894 //                     value may be shorter than the actual minimum; it must
   2895 //                     never be longer.
   2896 //
   2897 //                     start and end are the range of p-code operations to be
   2898 //                     examined.  The endpoints are included in the range.
   2899 //
   2900 //------------------------------------------------------------------------------
   2901 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
   2902     if (U_FAILURE(*fStatus)) {
   2903         return 0;
   2904     }
   2905 
   2906     U_ASSERT(start <= end);
   2907     U_ASSERT(end < fRXPat->fCompiledPat->size());
   2908 
   2909 
   2910     int32_t    loc;
   2911     int32_t    op;
   2912     int32_t    opType;
   2913     int32_t    currentLen = 0;
   2914 
   2915 
   2916     // forwardedLength is a vector holding minimum-match-length values that
   2917     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   2918     //   It must be one longer than the pattern being checked because some  ops
   2919     //   will jmp to a end-of-block+1 location from within a block, and we must
   2920     //   count those when checking the block.
   2921     UVector32  forwardedLength(end+2, *fStatus);
   2922     forwardedLength.setSize(end+2);
   2923     for (loc=start; loc<=end+1; loc++) {
   2924         forwardedLength.setElementAt(INT32_MAX, loc);
   2925     }
   2926 
   2927     for (loc = start; loc<=end; loc++) {
   2928         op = fRXPat->fCompiledPat->elementAti(loc);
   2929         opType = URX_TYPE(op);
   2930 
   2931         // The loop is advancing linearly through the pattern.
   2932         // If the op we are now at was the destination of a branch in the pattern,
   2933         // and that path has a shorter minimum length than the current accumulated value,
   2934         // replace the current accumulated value.
   2935         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
   2936                                                                //   no-match-possible cases.
   2937         if (forwardedLength.elementAti(loc) < currentLen) {
   2938             currentLen = forwardedLength.elementAti(loc);
   2939             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   2940         }
   2941 
   2942         switch (opType) {
   2943             // Ops that don't change the total length matched
   2944         case URX_RESERVED_OP:
   2945         case URX_END:
   2946         case URX_STRING_LEN:
   2947         case URX_NOP:
   2948         case URX_START_CAPTURE:
   2949         case URX_END_CAPTURE:
   2950         case URX_BACKSLASH_B:
   2951         case URX_BACKSLASH_BU:
   2952         case URX_BACKSLASH_G:
   2953         case URX_BACKSLASH_Z:
   2954         case URX_CARET:
   2955         case URX_DOLLAR:
   2956         case URX_DOLLAR_M:
   2957         case URX_DOLLAR_D:
   2958         case URX_DOLLAR_MD:
   2959         case URX_RELOC_OPRND:
   2960         case URX_STO_INP_LOC:
   2961         case URX_CARET_M:
   2962         case URX_CARET_M_UNIX:
   2963         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   2964         case URX_BACKREF_I:
   2965 
   2966         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   2967         case URX_LD_SP:
   2968 
   2969         case URX_JMP_SAV:
   2970         case URX_JMP_SAV_X:
   2971             break;
   2972 
   2973 
   2974             // Ops that match a minimum of one character (one or two 16 bit code units.)
   2975             //
   2976         case URX_ONECHAR:
   2977         case URX_STATIC_SETREF:
   2978         case URX_STAT_SETREF_N:
   2979         case URX_SETREF:
   2980         case URX_BACKSLASH_D:
   2981         case URX_ONECHAR_I:
   2982         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   2983         case URX_DOTANY_ALL:    // . matches one or two.
   2984         case URX_DOTANY:
   2985         case URX_DOTANY_UNIX:
   2986             currentLen++;
   2987             break;
   2988 
   2989 
   2990         case URX_JMPX:
   2991             loc++;              // URX_JMPX has an extra operand, ignored here,
   2992                                 //   otherwise processed identically to URX_JMP.
   2993         case URX_JMP:
   2994             {
   2995                 int32_t  jmpDest = URX_VAL(op);
   2996                 if (jmpDest < loc) {
   2997                     // Loop of some kind.  Can safely ignore, the worst that will happen
   2998                     //  is that we understate the true minimum length
   2999                     currentLen = forwardedLength.elementAti(loc+1);
   3000                 } else {
   3001                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   3002                     U_ASSERT(jmpDest <= end+1);
   3003                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
   3004                         forwardedLength.setElementAt(currentLen, jmpDest);
   3005                     }
   3006                 }
   3007             }
   3008             break;
   3009 
   3010         case URX_BACKTRACK:
   3011             {
   3012                 // Back-tracks are kind of like a branch, except that the min length was
   3013                 //   propagated already, by the state save.
   3014                 currentLen = forwardedLength.elementAti(loc+1);
   3015             }
   3016             break;
   3017 
   3018 
   3019         case URX_STATE_SAVE:
   3020             {
   3021                 // State Save, for forward jumps, propagate the current minimum.
   3022                 //             of the state save.
   3023                 int32_t  jmpDest = URX_VAL(op);
   3024                 if (jmpDest > loc) {
   3025                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3026                         forwardedLength.setElementAt(currentLen, jmpDest);
   3027                     }
   3028                 }
   3029             }
   3030             break;
   3031 
   3032 
   3033         case URX_STRING:
   3034         case URX_STRING_I:
   3035             {
   3036                 loc++;
   3037                 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
   3038                 currentLen += URX_VAL(stringLenOp);
   3039             }
   3040             break;
   3041 
   3042 
   3043         case URX_CTR_INIT:
   3044         case URX_CTR_INIT_NG:
   3045             {
   3046                 // Loop Init Ops.
   3047                 //   If the min loop count == 0
   3048                 //      move loc forwards to the end of the loop, skipping over the body.
   3049                 //   If the min count is > 0,
   3050                 //      continue normal processing of the body of the loop.
   3051                 int32_t loopEndLoc   = fRXPat->fCompiledPat->elementAti(loc+1);
   3052                         loopEndLoc   = URX_VAL(loopEndLoc);
   3053                 int32_t minLoopCount = fRXPat->fCompiledPat->elementAti(loc+2);
   3054                 if (minLoopCount == 0) {
   3055                     loc = loopEndLoc;
   3056                 } else {
   3057                     loc+=3;  // Skips over operands of CTR_INIT
   3058                 }
   3059             }
   3060             break;
   3061 
   3062 
   3063         case URX_CTR_LOOP:
   3064         case URX_CTR_LOOP_NG:
   3065             // Loop ops.
   3066             //  The jump is conditional, backwards only.
   3067             break;
   3068 
   3069         case URX_LOOP_SR_I:
   3070         case URX_LOOP_DOT_I:
   3071         case URX_LOOP_C:
   3072             // More loop ops.  These state-save to themselves.
   3073             //   don't change the minimum match - could match nothing at all.
   3074             break;
   3075 
   3076 
   3077         case URX_LA_START:
   3078         case URX_LB_START:
   3079             {
   3080                 // Look-around.  Scan forward until the matching look-ahead end,
   3081                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
   3082                 //   it assumes that the look-ahead match might be zero-length.
   3083                 //   TODO:  Positive lookahead could recursively do the block, then continue
   3084                 //          with the longer of the block or the value coming in.  Ticket 6060
   3085                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
   3086                 for (;;) {
   3087                     loc++;
   3088                     op = fRXPat->fCompiledPat->elementAti(loc);
   3089                     if (URX_TYPE(op) == URX_LA_START) {
   3090                         // The boilerplate for look-ahead includes two LA_END insturctions,
   3091                         //    Depth will be decremented by each one when it is seen.
   3092                         depth += 2;
   3093                     }
   3094                     if (URX_TYPE(op) == URX_LB_START) {
   3095                         depth++;
   3096                     }
   3097                     if (URX_TYPE(op) == URX_LA_END) {
   3098                         depth--;
   3099                         if (depth == 0) {
   3100                             break;
   3101                         }
   3102                     }
   3103                     if (URX_TYPE(op)==URX_LBN_END) {
   3104                         depth--;
   3105                         if (depth == 0) {
   3106                             break;
   3107                         }
   3108                     }
   3109                     if (URX_TYPE(op) == URX_STATE_SAVE) {
   3110                         // Need this because neg lookahead blocks will FAIL to outside
   3111                         //   of the block.
   3112                         int32_t  jmpDest = URX_VAL(op);
   3113                         if (jmpDest > loc) {
   3114                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3115                                 forwardedLength.setElementAt(currentLen, jmpDest);
   3116                             }
   3117                         }
   3118                     }
   3119                     U_ASSERT(loc <= end);
   3120                 }
   3121             }
   3122             break;
   3123 
   3124         case URX_LA_END:
   3125         case URX_LB_CONT:
   3126         case URX_LB_END:
   3127         case URX_LBN_CONT:
   3128         case URX_LBN_END:
   3129             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
   3130             //   range being sized, which happens when measuring size of look-behind blocks.
   3131             break;
   3132 
   3133         default:
   3134             U_ASSERT(FALSE);
   3135             }
   3136 
   3137         }
   3138 
   3139     // We have finished walking through the ops.  Check whether some forward jump
   3140     //   propagated a shorter length to location end+1.
   3141     if (forwardedLength.elementAti(end+1) < currentLen) {
   3142         currentLen = forwardedLength.elementAti(end+1);
   3143         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   3144     }
   3145 
   3146     return currentLen;
   3147 }
   3148 
   3149 
   3150 
   3151 //------------------------------------------------------------------------------
   3152 //
   3153 //   maxMatchLength    Calculate the length of the longest string that could
   3154 //                     match the specified pattern.
   3155 //                     Length is in 16 bit code units, not code points.
   3156 //
   3157 //                     The calculated length may not be exact.  The returned
   3158 //                     value may be longer than the actual maximum; it must
   3159 //                     never be shorter.
   3160 //
   3161 //------------------------------------------------------------------------------
   3162 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
   3163     if (U_FAILURE(*fStatus)) {
   3164         return 0;
   3165     }
   3166     U_ASSERT(start <= end);
   3167     U_ASSERT(end < fRXPat->fCompiledPat->size());
   3168 
   3169 
   3170     int32_t    loc;
   3171     int32_t    op;
   3172     int32_t    opType;
   3173     int32_t    currentLen = 0;
   3174     UVector32  forwardedLength(end+1, *fStatus);
   3175     forwardedLength.setSize(end+1);
   3176 
   3177     for (loc=start; loc<=end; loc++) {
   3178         forwardedLength.setElementAt(0, loc);
   3179     }
   3180 
   3181     for (loc = start; loc<=end; loc++) {
   3182         op = fRXPat->fCompiledPat->elementAti(loc);
   3183         opType = URX_TYPE(op);
   3184 
   3185         // The loop is advancing linearly through the pattern.
   3186         // If the op we are now at was the destination of a branch in the pattern,
   3187         // and that path has a longer maximum length than the current accumulated value,
   3188         // replace the current accumulated value.
   3189         if (forwardedLength.elementAti(loc) > currentLen) {
   3190             currentLen = forwardedLength.elementAti(loc);
   3191         }
   3192 
   3193         switch (opType) {
   3194             // Ops that don't change the total length matched
   3195         case URX_RESERVED_OP:
   3196         case URX_END:
   3197         case URX_STRING_LEN:
   3198         case URX_NOP:
   3199         case URX_START_CAPTURE:
   3200         case URX_END_CAPTURE:
   3201         case URX_BACKSLASH_B:
   3202         case URX_BACKSLASH_BU:
   3203         case URX_BACKSLASH_G:
   3204         case URX_BACKSLASH_Z:
   3205         case URX_CARET:
   3206         case URX_DOLLAR:
   3207         case URX_DOLLAR_M:
   3208         case URX_DOLLAR_D:
   3209         case URX_DOLLAR_MD:
   3210         case URX_RELOC_OPRND:
   3211         case URX_STO_INP_LOC:
   3212         case URX_CARET_M:
   3213         case URX_CARET_M_UNIX:
   3214 
   3215         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   3216         case URX_LD_SP:
   3217 
   3218         case URX_LB_END:
   3219         case URX_LB_CONT:
   3220         case URX_LBN_CONT:
   3221         case URX_LBN_END:
   3222             break;
   3223 
   3224 
   3225             // Ops that increase that cause an unbounded increase in the length
   3226             //   of a matched string, or that increase it a hard to characterize way.
   3227             //   Call the max length unbounded, and stop further checking.
   3228         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   3229         case URX_BACKREF_I:
   3230         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
   3231             currentLen = INT32_MAX;
   3232             break;
   3233 
   3234 
   3235             // Ops that match a max of one character (possibly two 16 bit code units.)
   3236             //
   3237         case URX_STATIC_SETREF:
   3238         case URX_STAT_SETREF_N:
   3239         case URX_SETREF:
   3240         case URX_BACKSLASH_D:
   3241         case URX_ONECHAR_I:
   3242         case URX_DOTANY_ALL:
   3243         case URX_DOTANY:
   3244         case URX_DOTANY_UNIX:
   3245             currentLen+=2;
   3246             break;
   3247 
   3248             // Single literal character.  Increase current max length by one or two,
   3249             //       depending on whether the char is in the supplementary range.
   3250         case URX_ONECHAR:
   3251             currentLen++;
   3252             if (URX_VAL(op) > 0x10000) {
   3253                 currentLen++;
   3254             }
   3255             break;
   3256 
   3257             // Jumps.
   3258             //
   3259         case URX_JMP:
   3260         case URX_JMPX:
   3261         case URX_JMP_SAV:
   3262         case URX_JMP_SAV_X:
   3263             {
   3264                 int32_t  jmpDest = URX_VAL(op);
   3265                 if (jmpDest < loc) {
   3266                     // Loop of some kind.  Max match length is unbounded.
   3267                     currentLen = INT32_MAX;
   3268                 } else {
   3269                     // Forward jump.  Propagate the current min length to the target loc of the jump.
   3270                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
   3271                         forwardedLength.setElementAt(currentLen, jmpDest);
   3272                     }
   3273                     currentLen = 0;
   3274                 }
   3275             }
   3276             break;
   3277 
   3278         case URX_BACKTRACK:
   3279             // back-tracks are kind of like a branch, except that the max length was
   3280             //   propagated already, by the state save.
   3281             currentLen = forwardedLength.elementAti(loc+1);
   3282             break;
   3283 
   3284 
   3285         case URX_STATE_SAVE:
   3286             {
   3287                 // State Save, for forward jumps, propagate the current minimum.
   3288                 //               of the state save.
   3289                 //             For backwards jumps, they create a loop, maximum
   3290                 //               match length is unbounded.
   3291                 int32_t  jmpDest = URX_VAL(op);
   3292                 if (jmpDest > loc) {
   3293                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
   3294                         forwardedLength.setElementAt(currentLen, jmpDest);
   3295                     }
   3296                 } else {
   3297                     currentLen = INT32_MAX;
   3298                 }
   3299             }
   3300             break;
   3301 
   3302 
   3303 
   3304 
   3305         case URX_STRING:
   3306         case URX_STRING_I:
   3307             {
   3308                 loc++;
   3309                 int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
   3310                 currentLen += URX_VAL(stringLenOp);
   3311             }
   3312             break;
   3313 
   3314 
   3315         case URX_CTR_INIT:
   3316         case URX_CTR_INIT_NG:
   3317         case URX_CTR_LOOP:
   3318         case URX_CTR_LOOP_NG:
   3319         case URX_LOOP_SR_I:
   3320         case URX_LOOP_DOT_I:
   3321         case URX_LOOP_C:
   3322             // For anything to do with loops, make the match length unbounded.
   3323             //   Note:  INIT instructions are multi-word.  Can ignore because
   3324             //          INT32_MAX length will stop the per-instruction loop.
   3325             currentLen = INT32_MAX;
   3326             break;
   3327 
   3328 
   3329 
   3330         case URX_LA_START:
   3331         case URX_LA_END:
   3332             // Look-ahead.  Just ignore, treat the look-ahead block as if
   3333             // it were normal pattern.  Gives a too-long match length,
   3334             //  but good enough for now.
   3335             break;
   3336 
   3337             // End of look-ahead ops should always be consumed by the processing at
   3338             //  the URX_LA_START op.
   3339             // U_ASSERT(FALSE);
   3340             // break;
   3341 
   3342         case URX_LB_START:
   3343             {
   3344                 // Look-behind.  Scan forward until the matching look-around end,
   3345                 //   without processing the look-behind block.
   3346                 int32_t  depth = 0;
   3347                 for (;;) {
   3348                     loc++;
   3349                     op = fRXPat->fCompiledPat->elementAti(loc);
   3350                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
   3351                         depth++;
   3352                     }
   3353                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
   3354                         if (depth == 0) {
   3355                             break;
   3356                         }
   3357                         depth--;
   3358                     }
   3359                     U_ASSERT(loc < end);
   3360                 }
   3361             }
   3362             break;
   3363 
   3364         default:
   3365             U_ASSERT(FALSE);
   3366         }
   3367 
   3368 
   3369         if (currentLen == INT32_MAX) {
   3370             //  The maximum length is unbounded.
   3371             //  Stop further processing of the pattern.
   3372             break;
   3373         }
   3374 
   3375     }
   3376     return currentLen;
   3377 
   3378 }
   3379 
   3380 
   3381 //------------------------------------------------------------------------------
   3382 //
   3383 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
   3384 //                Extra NOPs are inserted for some constructs during the initial
   3385 //                code generation to provide locations that may be patched later.
   3386 //                Many end up unneeded, and are removed by this function.
   3387 //
   3388 //------------------------------------------------------------------------------
   3389 void RegexCompile::stripNOPs() {
   3390 
   3391     if (U_FAILURE(*fStatus)) {
   3392         return;
   3393     }
   3394 
   3395     int32_t    end = fRXPat->fCompiledPat->size();
   3396     UVector32  deltas(end, *fStatus);
   3397 
   3398     // Make a first pass over the code, computing the amount that things
   3399     //   will be offset at each location in the original code.
   3400     int32_t   loc;
   3401     int32_t   d = 0;
   3402     for (loc=0; loc<end; loc++) {
   3403         deltas.addElement(d, *fStatus);
   3404         int32_t op = fRXPat->fCompiledPat->elementAti(loc);
   3405         if (URX_TYPE(op) == URX_NOP) {
   3406             d++;
   3407         }
   3408     }
   3409 
   3410     // Make a second pass over the code, removing the NOPs by moving following
   3411     //  code up, and patching operands that refer to code locations that
   3412     //  are being moved.  The array of offsets from the first step is used
   3413     //  to compute the new operand values.
   3414     int32_t src;
   3415     int32_t dst = 0;
   3416     for (src=0; src<end; src++) {
   3417         int32_t op = fRXPat->fCompiledPat->elementAti(src);
   3418         int32_t opType = URX_TYPE(op);
   3419         switch (opType) {
   3420         case URX_NOP:
   3421             break;
   3422 
   3423         case URX_STATE_SAVE:
   3424         case URX_JMP:
   3425         case URX_CTR_LOOP:
   3426         case URX_CTR_LOOP_NG:
   3427         case URX_RELOC_OPRND:
   3428         case URX_JMPX:
   3429         case URX_JMP_SAV:
   3430         case URX_JMP_SAV_X:
   3431             // These are instructions with operands that refer to code locations.
   3432             {
   3433                 int32_t  operandAddress = URX_VAL(op);
   3434                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
   3435                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
   3436                 op = URX_BUILD(opType, fixedOperandAddress);
   3437                 fRXPat->fCompiledPat->setElementAt(op, dst);
   3438                 dst++;
   3439                 break;
   3440             }
   3441 
   3442         case URX_RESERVED_OP:
   3443         case URX_RESERVED_OP_N:
   3444         case URX_BACKTRACK:
   3445         case URX_END:
   3446         case URX_ONECHAR:
   3447         case URX_STRING:
   3448         case URX_STRING_LEN:
   3449         case URX_START_CAPTURE:
   3450         case URX_END_CAPTURE:
   3451         case URX_STATIC_SETREF:
   3452         case URX_STAT_SETREF_N:
   3453         case URX_SETREF:
   3454         case URX_DOTANY:
   3455         case URX_FAIL:
   3456         case URX_BACKSLASH_B:
   3457         case URX_BACKSLASH_BU:
   3458         case URX_BACKSLASH_G:
   3459         case URX_BACKSLASH_X:
   3460         case URX_BACKSLASH_Z:
   3461         case URX_DOTANY_ALL:
   3462         case URX_BACKSLASH_D:
   3463         case URX_CARET:
   3464         case URX_DOLLAR:
   3465         case URX_CTR_INIT:
   3466         case URX_CTR_INIT_NG:
   3467         case URX_DOTANY_UNIX:
   3468         case URX_STO_SP:
   3469         case URX_LD_SP:
   3470         case URX_BACKREF:
   3471         case URX_STO_INP_LOC:
   3472         case URX_LA_START:
   3473         case URX_LA_END:
   3474         case URX_ONECHAR_I:
   3475         case URX_STRING_I:
   3476         case URX_BACKREF_I:
   3477         case URX_DOLLAR_M:
   3478         case URX_CARET_M:
   3479         case URX_CARET_M_UNIX:
   3480         case URX_LB_START:
   3481         case URX_LB_CONT:
   3482         case URX_LB_END:
   3483         case URX_LBN_CONT:
   3484         case URX_LBN_END:
   3485         case URX_LOOP_SR_I:
   3486         case URX_LOOP_DOT_I:
   3487         case URX_LOOP_C:
   3488         case URX_DOLLAR_D:
   3489         case URX_DOLLAR_MD:
   3490             // These instructions are unaltered by the relocation.
   3491             fRXPat->fCompiledPat->setElementAt(op, dst);
   3492             dst++;
   3493             break;
   3494 
   3495         default:
   3496             // Some op is unaccounted for.
   3497             U_ASSERT(FALSE);
   3498             error(U_REGEX_INTERNAL_ERROR);
   3499         }
   3500     }
   3501 
   3502     fRXPat->fCompiledPat->setSize(dst);
   3503 }
   3504 
   3505 
   3506 
   3507 
   3508 //------------------------------------------------------------------------------
   3509 //
   3510 //  Error         Report a rule parse error.
   3511 //                Only report it if no previous error has been recorded.
   3512 //
   3513 //------------------------------------------------------------------------------
   3514 void RegexCompile::error(UErrorCode e) {
   3515     if (U_SUCCESS(*fStatus)) {
   3516         *fStatus = e;
   3517         fParseErr->line   = fLineNum;
   3518         fParseErr->offset = fCharNum;
   3519 
   3520         // Fill in the context.
   3521         //   Note: extractBetween() pins supplied indicies to the string bounds.
   3522         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
   3523         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
   3524         fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex,
   3525             fParseErr->preContext,  0);
   3526         fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1,
   3527             fParseErr->postContext, 0);
   3528     }
   3529 }
   3530 
   3531 
   3532 //
   3533 //  Assorted Unicode character constants.
   3534 //     Numeric because there is no portable way to enter them as literals.
   3535 //     (Think EBCDIC).
   3536 //
   3537 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
   3538 static const UChar      chLF        = 0x0a;      // Line Feed
   3539 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
   3540 static const UChar      chDigit0    = 0x30;      // '0'
   3541 static const UChar      chDigit7    = 0x37;      // '9'
   3542 static const UChar      chColon     = 0x3A;      // ':'
   3543 static const UChar      chE         = 0x45;      // 'E'
   3544 static const UChar      chQ         = 0x51;      // 'Q'
   3545 static const UChar      chN         = 0x4E;      // 'N'
   3546 static const UChar      chP         = 0x50;      // 'P'
   3547 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
   3548 static const UChar      chLBracket  = 0x5b;      // '['
   3549 static const UChar      chRBracket  = 0x5d;      // ']'
   3550 static const UChar      chUp        = 0x5e;      // '^'
   3551 static const UChar      chLowerP    = 0x70;
   3552 static const UChar      chLBrace    = 0x7b;      // '{'
   3553 static const UChar      chRBrace    = 0x7d;      // '}'
   3554 static const UChar      chNEL       = 0x85;      //    NEL newline variant
   3555 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
   3556 
   3557 
   3558 //------------------------------------------------------------------------------
   3559 //
   3560 //  nextCharLL    Low Level Next Char from the regex pattern.
   3561 //                Get a char from the string, keep track of input position
   3562 //                     for error reporting.
   3563 //
   3564 //------------------------------------------------------------------------------
   3565 UChar32  RegexCompile::nextCharLL() {
   3566     UChar32       ch;
   3567     UnicodeString &pattern = fRXPat->fPattern;
   3568 
   3569     if (fPeekChar != -1) {
   3570         ch = fPeekChar;
   3571         fPeekChar = -1;
   3572         return ch;
   3573     }
   3574     if (fPatternLength==0 || fNextIndex >= fPatternLength) {
   3575         return (UChar32)-1;
   3576     }
   3577     ch         = pattern.char32At(fNextIndex);
   3578     fNextIndex = pattern.moveIndex32(fNextIndex, 1);
   3579 
   3580     if (ch == chCR ||
   3581         ch == chNEL ||
   3582         ch == chLS   ||
   3583         ch == chLF && fLastChar != chCR) {
   3584         // Character is starting a new line.  Bump up the line number, and
   3585         //  reset the column to 0.
   3586         fLineNum++;
   3587         fCharNum=0;
   3588     }
   3589     else {
   3590         // Character is not starting a new line.  Except in the case of a
   3591         //   LF following a CR, increment the column position.
   3592         if (ch != chLF) {
   3593             fCharNum++;
   3594         }
   3595     }
   3596     fLastChar = ch;
   3597     return ch;
   3598 }
   3599 
   3600 //------------------------------------------------------------------------------
   3601 //
   3602 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
   3603 //                 character without actually getting it.
   3604 //
   3605 //------------------------------------------------------------------------------
   3606 UChar32  RegexCompile::peekCharLL() {
   3607     if (fPeekChar == -1) {
   3608         fPeekChar = nextCharLL();
   3609     }
   3610     return fPeekChar;
   3611 }
   3612 
   3613 
   3614 //------------------------------------------------------------------------------
   3615 //
   3616 //   nextChar     for pattern scanning.  At this level, we handle stripping
   3617 //                out comments and processing some backslash character escapes.
   3618 //                The rest of the pattern grammar is handled at the next level up.
   3619 //
   3620 //------------------------------------------------------------------------------
   3621 void RegexCompile::nextChar(RegexPatternChar &c) {
   3622 
   3623     fScanIndex = fNextIndex;
   3624     c.fChar    = nextCharLL();
   3625     c.fQuoted  = FALSE;
   3626 
   3627     if (fQuoteMode) {
   3628         c.fQuoted = TRUE;
   3629         if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) {
   3630             fQuoteMode = FALSE;  //  Exit quote mode,
   3631             nextCharLL();       // discard the E
   3632             nextChar(c);        // recurse to get the real next char
   3633         }
   3634     }
   3635     else if (fInBackslashQuote) {
   3636         // The current character immediately follows a '\'
   3637         // Don't check for any further escapes, just return it as-is.
   3638         // Don't set c.fQuoted, because that would prevent the state machine from
   3639         //    dispatching on the character.
   3640         fInBackslashQuote = FALSE;
   3641     }
   3642     else
   3643     {
   3644         // We are not in a \Q quoted region \E of the source.
   3645         //
   3646         if (fModeFlags & UREGEX_COMMENTS) {
   3647             //
   3648             // We are in free-spacing and comments mode.
   3649             //  Scan through any white space and comments, until we
   3650             //  reach a significant character or the end of inut.
   3651             for (;;) {
   3652                 if (c.fChar == (UChar32)-1) {
   3653                     break;     // End of Input
   3654                 }
   3655                 if  (c.fChar == chPound && fEOLComments == TRUE) {
   3656                     // Start of a comment.  Consume the rest of it, until EOF or a new line
   3657                     for (;;) {
   3658                         c.fChar = nextCharLL();
   3659                         if (c.fChar == (UChar32)-1 ||  // EOF
   3660                             c.fChar == chCR        ||
   3661                             c.fChar == chLF        ||
   3662                             c.fChar == chNEL       ||
   3663                             c.fChar == chLS)       {
   3664                             break;
   3665                         }
   3666                     }
   3667                 }
   3668                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
   3669                 if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
   3670                     break;
   3671                 }
   3672                 c.fChar = nextCharLL();
   3673             }
   3674         }
   3675 
   3676         //
   3677         //  check for backslash escaped characters.
   3678         //
   3679         if (c.fChar == chBackSlash) {
   3680             int32_t startX = fNextIndex;  // start and end positions of the
   3681             int32_t endX   = fNextIndex;  //   sequence following the '\'
   3682             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
   3683                 //
   3684                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
   3685                 //   Includes \uxxxx, \n, \r, many others.
   3686                 //   Return the single equivalent character.
   3687                 //
   3688                 nextCharLL();                 // get & discard the peeked char.
   3689                 c.fQuoted = TRUE;
   3690                 c.fChar = fRXPat->fPattern.unescapeAt(endX);
   3691                 if (startX == endX) {
   3692                     error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   3693                 }
   3694                 fCharNum += endX - startX;
   3695                 fNextIndex = endX;
   3696             }
   3697             else if (peekCharLL() == chDigit0) {
   3698                 //  Octal Escape, using Java Regexp Conventions
   3699                 //    which are \0 followed by 1-3 octal digits.
   3700                 //    Different from ICU Unescape handling of Octal, which does not
   3701                 //    require the leading 0.
   3702                 //  Java also has the convention of only consuning 2 octal digits if
   3703                 //    the three digit number would be > 0xff
   3704                 //
   3705                 c.fChar = 0;
   3706                 nextCharLL();    // Consume the initial 0.
   3707                 int index;
   3708                 for (index=0; index<3; index++) {
   3709                     int32_t ch = peekCharLL();
   3710                     if (ch<chDigit0 || ch>chDigit7) {
   3711                         if (index==0) {
   3712                            // \0 is not followed by any octal digits.
   3713                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   3714                         }
   3715                         break;
   3716                     }
   3717                     c.fChar <<= 3;
   3718                     c.fChar += ch&7;
   3719                     if (c.fChar <= 255) {
   3720                         nextCharLL();
   3721                     } else {
   3722                         // The last digit made the number too big.  Forget we saw it.
   3723                         c.fChar >>= 3;
   3724                     }
   3725                 }
   3726                 c.fQuoted = TRUE;
   3727             }
   3728             else if (peekCharLL() == chQ) {
   3729                 //  "\Q"  enter quote mode, which will continue until "\E"
   3730                 fQuoteMode = TRUE;
   3731                 nextCharLL();       // discard the 'Q'.
   3732                 nextChar(c);        // recurse to get the real next char.
   3733             }
   3734             else
   3735             {
   3736                 // We are in a '\' escape that will be handled by the state table scanner.
   3737                 // Just return the backslash, but remember that the following char is to
   3738                 //  be taken literally.
   3739                 fInBackslashQuote = TRUE;
   3740             }
   3741         }
   3742     }
   3743 
   3744     // re-enable # to end-of-line comments, in case they were disabled.
   3745     // They are disabled by the parser upon seeing '(?', but this lasts for
   3746     //  the fetching of the next character only.
   3747     fEOLComments = TRUE;
   3748 
   3749     // putc(c.fChar, stdout);
   3750 }
   3751 
   3752 
   3753 
   3754 //------------------------------------------------------------------------------
   3755 //
   3756 //  scanNamedChar
   3757  //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
   3758 //
   3759 //             The scan position will be at the 'N'.  On return
   3760 //             the scan position should be just after the '}'
   3761 //
   3762 //             Return the UChar32
   3763 //
   3764 //------------------------------------------------------------------------------
   3765 UChar32  RegexCompile::scanNamedChar() {
   3766     if (U_FAILURE(*fStatus)) {
   3767         return 0;
   3768     }
   3769 
   3770     nextChar(fC);
   3771     if (fC.fChar != chLBrace) {
   3772         error(U_REGEX_PROPERTY_SYNTAX);
   3773         return 0;
   3774     }
   3775 
   3776     UnicodeString  charName;
   3777     for (;;) {
   3778         nextChar(fC);
   3779         if (fC.fChar == chRBrace) {
   3780             break;
   3781         }
   3782         if (fC.fChar == -1) {
   3783             error(U_REGEX_PROPERTY_SYNTAX);
   3784             return 0;
   3785         }
   3786         charName.append(fC.fChar);
   3787     }
   3788 
   3789     char name[100];
   3790     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
   3791          (uint32_t)charName.length()>=sizeof(name)) {
   3792         // All Unicode character names have only invariant characters.
   3793         // The API to get a character, given a name, accepts only char *, forcing us to convert,
   3794         //   which requires this error check
   3795         error(U_REGEX_PROPERTY_SYNTAX);
   3796         return 0;
   3797     }
   3798     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
   3799 
   3800     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
   3801     if (U_FAILURE(*fStatus)) {
   3802         error(U_REGEX_PROPERTY_SYNTAX);
   3803     }
   3804 
   3805     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
   3806     return theChar;
   3807 }
   3808 
   3809 //------------------------------------------------------------------------------
   3810 //
   3811 //  scanProp   Construct a UnicodeSet from the text at the current scan
   3812 //             position, which will be of the form \p{whaterver}
   3813 //
   3814 //             The scan position will be at the 'p' or 'P'.  On return
   3815 //             the scan position should be just after the '}'
   3816 //
   3817 //             Return a UnicodeSet, constructed from the \P pattern,
   3818 //             or NULL if the pattern is invalid.
   3819 //
   3820 //------------------------------------------------------------------------------
   3821 UnicodeSet *RegexCompile::scanProp() {
   3822     UnicodeSet    *uset = NULL;
   3823 
   3824     if (U_FAILURE(*fStatus)) {
   3825         return NULL;
   3826     }
   3827     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
   3828     UBool negated = (fC.fChar == chP);
   3829 
   3830     UnicodeString propertyName;
   3831     nextChar(fC);
   3832     if (fC.fChar != chLBrace) {
   3833         error(U_REGEX_PROPERTY_SYNTAX);
   3834         return NULL;
   3835     }
   3836     for (;;) {
   3837         nextChar(fC);
   3838         if (fC.fChar == chRBrace) {
   3839             break;
   3840         }
   3841         if (fC.fChar == -1) {
   3842             // Hit the end of the input string without finding the closing '}'
   3843             error(U_REGEX_PROPERTY_SYNTAX);
   3844             return NULL;
   3845         }
   3846         propertyName.append(fC.fChar);
   3847     }
   3848     uset = createSetForProperty(propertyName, negated);
   3849     nextChar(fC);    // Move input scan to position following the closing '}'
   3850     return uset;
   3851 }
   3852 
   3853 //------------------------------------------------------------------------------
   3854 //
   3855 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
   3856 //             position, which is expected be of the form [:property expression:]
   3857 //
   3858 //             The scan position will be at the opening ':'.  On return
   3859 //             the scan position must be on the closing ']'
   3860 //
   3861 //             Return a UnicodeSet constructed from the pattern,
   3862 //             or NULL if this is not a valid POSIX-style set expression.
   3863 //             If not a property expression, restore the initial scan position
   3864 //                (to the opening ':')
   3865 //
   3866 //               Note:  the opening '[:' is not sufficient to guarantee that
   3867 //                      this is a [:property:] expression.
   3868 //                      [:'+=,] is a perfectly good ordinary set expression that
   3869 //                              happens to include ':' as one of its characters.
   3870 //
   3871 //------------------------------------------------------------------------------
   3872 UnicodeSet *RegexCompile::scanPosixProp() {
   3873     UnicodeSet    *uset = NULL;
   3874 
   3875     if (U_FAILURE(*fStatus)) {
   3876         return NULL;
   3877     }
   3878 
   3879     U_ASSERT(fC.fChar == chColon);
   3880 
   3881     // Save the scanner state.
   3882     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
   3883     int32_t     savedScanIndex        = fScanIndex;
   3884     int32_t     savedNextIndex        = fNextIndex;
   3885     UBool       savedQuoteMode        = fQuoteMode;
   3886     UBool       savedInBackslashQuote = fInBackslashQuote;
   3887     UBool       savedEOLComments      = fEOLComments;
   3888     int32_t     savedLineNum          = fLineNum;
   3889     int32_t     savedCharNum          = fCharNum;
   3890     UChar32     savedLastChar         = fLastChar;
   3891     UChar32     savedPeekChar         = fPeekChar;
   3892     RegexPatternChar savedfC          = fC;
   3893 
   3894     // Scan for a closing ].   A little tricky because there are some perverse
   3895     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
   3896     //   ending on the second closing ].
   3897 
   3898     UnicodeString propName;
   3899     UBool         negated  = FALSE;
   3900 
   3901     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
   3902     nextChar(fC);
   3903     if (fC.fChar == chUp) {
   3904        negated = TRUE;
   3905        nextChar(fC);
   3906     }
   3907 
   3908     // Scan for the closing ":]", collecting the property name along the way.
   3909     UBool  sawPropSetTerminator = FALSE;
   3910     for (;;) {
   3911         propName.append(fC.fChar);
   3912         nextChar(fC);
   3913         if (fC.fQuoted || fC.fChar == -1) {
   3914             // Escaped characters or end of input - either says this isn't a [:Property:]
   3915             break;
   3916         }
   3917         if (fC.fChar == chColon) {
   3918             nextChar(fC);
   3919             if (fC.fChar == chRBracket) {
   3920                 sawPropSetTerminator = TRUE;
   3921             }
   3922             break;
   3923         }
   3924     }
   3925 
   3926     if (sawPropSetTerminator) {
   3927         uset = createSetForProperty(propName, negated);
   3928     }
   3929     else
   3930     {
   3931         // No closing ":]".
   3932         //  Restore the original scan position.
   3933         //  The main scanner will retry the input as a normal set expression,
   3934         //    not a [:Property:] expression.
   3935         fScanIndex        = savedScanIndex;
   3936         fNextIndex        = savedNextIndex;
   3937         fQuoteMode        = savedQuoteMode;
   3938         fInBackslashQuote = savedInBackslashQuote;
   3939         fEOLComments      = savedEOLComments;
   3940         fLineNum          = savedLineNum;
   3941         fCharNum          = savedCharNum;
   3942         fLastChar         = savedLastChar;
   3943         fPeekChar         = savedPeekChar;
   3944         fC                = savedfC;
   3945     }
   3946     return uset;
   3947 }
   3948 
   3949 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
   3950     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
   3951     addCategory(set, U_GC_CF_MASK, ec);
   3952 }
   3953 
   3954 //
   3955 //  Create a Unicode Set from a Unicode Property expression.
   3956 //     This is common code underlying both \p{...} ane [:...:] expressions.
   3957 //     Includes trying the Java "properties" that aren't supported as
   3958 //     normal ICU UnicodeSet properties
   3959 //
   3960 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
   3961 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
   3962 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
   3963     UnicodeString   setExpr;
   3964     UnicodeSet      *set;
   3965     uint32_t        usetFlags = 0;
   3966 
   3967     if (U_FAILURE(*fStatus)) {
   3968         return NULL;
   3969     }
   3970 
   3971     //
   3972     //  First try the property as we received it
   3973     //
   3974     if (negated) {
   3975         setExpr.append(negSetPrefix, -1);
   3976     } else {
   3977         setExpr.append(posSetPrefix, -1);
   3978     }
   3979     setExpr.append(propName);
   3980     setExpr.append(chRBrace);
   3981     setExpr.append(chRBracket);
   3982     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   3983         usetFlags |= USET_CASE_INSENSITIVE;
   3984     }
   3985     set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
   3986     if (U_SUCCESS(*fStatus)) {
   3987        return set;
   3988     }
   3989     delete set;
   3990     set = NULL;
   3991 
   3992     //
   3993     //  The property as it was didn't work.
   3994     //    Do emergency fixes -
   3995     //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
   3996     //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
   3997     //
   3998     //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
   3999     //                        is accepted by Java.  The property part of the name is compared
   4000     //                        case-insenstively.  The spaces must be exactly as shown, either
   4001     //                        all there, or all omitted, with exactly one at each position
   4002     //                        if they are present.  From checking against JDK 1.6
   4003     //
   4004     //       This code should be removed when ICU properties support the Java  compatibility names
   4005     //          (ICU 4.0?)
   4006     //
   4007     UnicodeString mPropName = propName;
   4008     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
   4009         mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
   4010     }
   4011     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
   4012         mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
   4013         mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
   4014     }
   4015     else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
   4016         mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
   4017     }
   4018 
   4019     //    See if the property looks like a Java "InBlockName", which
   4020     //    we will recast as "Block=BlockName"
   4021     //
   4022     static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
   4023     static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
   4024     if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
   4025         setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
   4026         setExpr.append(BLOCK, -1);
   4027         setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
   4028         setExpr.append(chRBrace);
   4029         setExpr.append(chRBracket);
   4030         *fStatus = U_ZERO_ERROR;
   4031         set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
   4032         if (U_SUCCESS(*fStatus)) {
   4033             return set;
   4034         }
   4035         delete set;
   4036         set = NULL;
   4037     }
   4038 
   4039     if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
   4040         propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
   4041     {
   4042         UErrorCode localStatus = U_ZERO_ERROR;
   4043         //setExpr.remove();
   4044         set = new UnicodeSet();
   4045         //
   4046         //  Try the various Java specific properties.
   4047         //   These all begin with "java"
   4048         //
   4049         if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
   4050             addCategory(set, U_GC_CN_MASK, localStatus);
   4051             set->complement();
   4052         }
   4053         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
   4054             addCategory(set, U_GC_ND_MASK, localStatus);
   4055         }
   4056         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
   4057             addIdentifierIgnorable(set, localStatus);
   4058         }
   4059         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
   4060             set->add(0, 0x1F).add(0x7F, 0x9F);
   4061         }
   4062         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
   4063             addCategory(set, U_GC_L_MASK, localStatus);
   4064             addCategory(set, U_GC_SC_MASK, localStatus);
   4065             addCategory(set, U_GC_PC_MASK, localStatus);
   4066             addCategory(set, U_GC_ND_MASK, localStatus);
   4067             addCategory(set, U_GC_NL_MASK, localStatus);
   4068             addCategory(set, U_GC_MC_MASK, localStatus);
   4069             addCategory(set, U_GC_MN_MASK, localStatus);
   4070             addIdentifierIgnorable(set, localStatus);
   4071         }
   4072         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
   4073             addCategory(set, U_GC_L_MASK, localStatus);
   4074             addCategory(set, U_GC_NL_MASK, localStatus);
   4075             addCategory(set, U_GC_SC_MASK, localStatus);
   4076             addCategory(set, U_GC_PC_MASK, localStatus);
   4077         }
   4078         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
   4079             addCategory(set, U_GC_L_MASK, localStatus);
   4080         }
   4081         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
   4082             addCategory(set, U_GC_L_MASK, localStatus);
   4083             addCategory(set, U_GC_ND_MASK, localStatus);
   4084         }
   4085         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
   4086             addCategory(set, U_GC_LL_MASK, localStatus);
   4087         }
   4088         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
   4089             set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
   4090         }
   4091         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
   4092             addCategory(set, U_GC_Z_MASK, localStatus);
   4093         }
   4094         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
   4095             set->add(0x10000, UnicodeSet::MAX_VALUE);
   4096         }
   4097         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
   4098             addCategory(set, U_GC_LT_MASK, localStatus);
   4099         }
   4100         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
   4101             addCategory(set, U_GC_L_MASK, localStatus);
   4102             addCategory(set, U_GC_NL_MASK, localStatus);
   4103         }
   4104         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
   4105             addCategory(set, U_GC_L_MASK, localStatus);
   4106             addCategory(set, U_GC_PC_MASK, localStatus);
   4107             addCategory(set, U_GC_ND_MASK, localStatus);
   4108             addCategory(set, U_GC_NL_MASK, localStatus);
   4109             addCategory(set, U_GC_MC_MASK, localStatus);
   4110             addCategory(set, U_GC_MN_MASK, localStatus);
   4111             addIdentifierIgnorable(set, localStatus);
   4112         }
   4113         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
   4114             addCategory(set, U_GC_LU_MASK, localStatus);
   4115         }
   4116         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
   4117             set->add(0, UnicodeSet::MAX_VALUE);
   4118         }
   4119         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
   4120             addCategory(set, U_GC_Z_MASK, localStatus);
   4121             set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
   4122             set->add(9, 0x0d).add(0x1c, 0x1f);
   4123         }
   4124         else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
   4125             set->add(0, UnicodeSet::MAX_VALUE);
   4126         }
   4127 
   4128         if (U_SUCCESS(localStatus) && !set->isEmpty()) {
   4129             *fStatus = U_ZERO_ERROR;
   4130             if (usetFlags & USET_CASE_INSENSITIVE) {
   4131                 set->closeOver(USET_CASE_INSENSITIVE);
   4132             }
   4133             if (negated) {
   4134                 set->complement();
   4135             }
   4136             return set;
   4137         }
   4138         delete set;
   4139         set = NULL;
   4140     }
   4141     error(*fStatus);
   4142     return NULL;
   4143 }
   4144 
   4145 
   4146 
   4147 //
   4148 //  SetEval   Part of the evaluation of [set expressions].
   4149 //            Perform any pending (stacked) operations with precedence
   4150 //            equal or greater to that of the next operator encountered
   4151 //            in the expression.
   4152 //
   4153 void RegexCompile::setEval(int32_t nextOp) {
   4154     UnicodeSet *rightOperand = NULL;
   4155     UnicodeSet *leftOperand  = NULL;
   4156     for (;;) {
   4157         U_ASSERT(fSetOpStack.empty()==FALSE);
   4158         int32_t pendingSetOperation = fSetOpStack.peeki();
   4159         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
   4160             break;
   4161         }
   4162         fSetOpStack.popi();
   4163         U_ASSERT(fSetStack.empty() == FALSE);
   4164         rightOperand = (UnicodeSet *)fSetStack.peek();
   4165         switch (pendingSetOperation) {
   4166             case setNegation:
   4167                 rightOperand->complement();
   4168                 break;
   4169             case setCaseClose:
   4170                 // TODO: need a simple close function.  Ticket 6065
   4171                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
   4172                 rightOperand->removeAllStrings();
   4173                 break;
   4174             case setDifference1:
   4175             case setDifference2:
   4176                 fSetStack.pop();
   4177                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4178                 leftOperand->removeAll(*rightOperand);
   4179                 delete rightOperand;
   4180                 break;
   4181             case setIntersection1:
   4182             case setIntersection2:
   4183                 fSetStack.pop();
   4184                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4185                 leftOperand->retainAll(*rightOperand);
   4186                 delete rightOperand;
   4187                 break;
   4188             case setUnion:
   4189                 fSetStack.pop();
   4190                 leftOperand = (UnicodeSet *)fSetStack.peek();
   4191                 leftOperand->addAll(*rightOperand);
   4192                 delete rightOperand;
   4193                 break;
   4194             default:
   4195                 U_ASSERT(FALSE);
   4196                 break;
   4197             }
   4198         }
   4199     }
   4200 
   4201 void RegexCompile::setPushOp(int32_t op) {
   4202     setEval(op);
   4203     fSetOpStack.push(op, *fStatus);
   4204     fSetStack.push(new UnicodeSet(), *fStatus);
   4205 }
   4206 
   4207 U_NAMESPACE_END
   4208 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
   4209 
   4210