Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  regexcmp.h
      5 //
      6 //  Copyright (C) 2002-2016, International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains declarations for the class RegexCompile
     10 //
     11 //  This class is internal to the regular expression implementation.
     12 //  For the public Regular Expression API, see the file "unicode/regex.h"
     13 //
     14 
     15 
     16 #ifndef RBBISCAN_H
     17 #define RBBISCAN_H
     18 
     19 #include "unicode/utypes.h"
     20 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     21 
     22 #include "unicode/parseerr.h"
     23 #include "unicode/uniset.h"
     24 #include "unicode/uobject.h"
     25 #include "unicode/utext.h"
     26 #include "uhash.h"
     27 #include "uvector.h"
     28 #include "uvectr32.h"
     29 
     30 
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 
     35 //--------------------------------------------------------------------------------
     36 //
     37 //  class RegexCompile    Contains the regular expression compiler.
     38 //
     39 //--------------------------------------------------------------------------------
     40 struct  RegexTableEl;
     41 class   RegexPattern;
     42 
     43 
     44 class U_I18N_API RegexCompile : public UMemory {
     45 public:
     46 
     47     enum {
     48         kStackSize = 100            // The size of the state stack for
     49     };                              //   pattern parsing.  Corresponds roughly
     50                                     //   to the depth of parentheses nesting
     51                                     //   that is allowed in the rules.
     52 
     53     struct RegexPatternChar {
     54         UChar32             fChar;
     55         UBool               fQuoted;
     56     };
     57 
     58     RegexCompile(RegexPattern *rp, UErrorCode &e);
     59 
     60     void       compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
     61     void       compile(UText *pat, UParseError &pp, UErrorCode &e);
     62 
     63 
     64     virtual    ~RegexCompile();
     65 
     66     void        nextChar(RegexPatternChar &c);      // Get the next char from the input stream.
     67 
     68     static void cleanup();                       // Memory cleanup
     69 
     70 
     71 
     72     // Categories of parentheses in pattern.
     73     //   The category is saved in the compile-time parentheses stack frame, and
     74     //   determines the code to be generated when the matching close ) is encountered.
     75     enum EParenClass {
     76         plain        = -1,               // No special handling
     77         capturing    = -2,
     78         atomic       = -3,
     79         lookAhead    = -4,
     80         negLookAhead = -5,
     81         flags        = -6,
     82         lookBehind   = -7,
     83         lookBehindN  = -8
     84     };
     85 
     86 private:
     87 
     88 
     89     UBool       doParseActions(int32_t a);
     90     void        error(UErrorCode e);                   // error reporting convenience function.
     91 
     92     UChar32     nextCharLL();
     93     UChar32     peekCharLL();
     94     UnicodeSet  *scanProp();
     95     UnicodeSet  *scanPosixProp();
     96     void        handleCloseParen();
     97     int32_t     blockTopLoc(UBool reserve);          // Locate a position in the compiled pattern
     98                                                      //  at the top of the just completed block
     99                                                      //  or operation, and optionally ensure that
    100                                                      //  there is space to add an opcode there.
    101     void        compileSet(UnicodeSet *theSet);      // Generate the compiled pattern for
    102                                                      //   a reference to a UnicodeSet.
    103     void        compileInterval(int32_t InitOp,      // Generate the code for a {min,max} quantifier.
    104                                int32_t LoopOp);
    105     UBool       compileInlineInterval();             // Generate inline code for a {min,max} quantifier
    106     void        literalChar(UChar32 c);              // Compile a literal char
    107     void        fixLiterals(UBool split=FALSE);      // Generate code for pending literal characters.
    108     void        insertOp(int32_t where);             // Open up a slot for a new op in the
    109                                                      //   generated code at the specified location.
    110     void        appendOp(int32_t op);                // Append a new op to the compiled pattern.
    111     void        appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern.
    112     int32_t     buildOp(int32_t type, int32_t val);  // Construct a new pcode instruction.
    113     int32_t     allocateData(int32_t size);          // Allocate space in the matcher data area.
    114                                                      //   Return index of the newly allocated data.
    115     int32_t     allocateStackData(int32_t size);     // Allocate space in the match back-track stack frame.
    116                                                      //   Return offset index in the frame.
    117     int32_t     minMatchLength(int32_t start,
    118                                int32_t end);
    119     int32_t     maxMatchLength(int32_t start,
    120                                int32_t end);
    121     void        matchStartType();
    122     void        stripNOPs();
    123 
    124     void        setEval(int32_t op);
    125     void        setPushOp(int32_t op);
    126     UChar32     scanNamedChar();
    127     UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
    128 
    129 public:   // Public for testing only.
    130     static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars);
    131 private:
    132 
    133 
    134     UErrorCode                    *fStatus;
    135     RegexPattern                  *fRXPat;
    136     UParseError                   *fParseErr;
    137 
    138     //
    139     //  Data associated with low level character scanning
    140     //
    141     int64_t                       fScanIndex;        // Index of current character being processed
    142                                                      //   in the rule input string.
    143     UBool                         fQuoteMode;        // Scan is in a \Q...\E quoted region
    144     UBool                         fInBackslashQuote; // Scan is between a '\' and the following char.
    145     UBool                         fEOLComments;      // When scan is just after '(?',  inhibit #... to
    146                                                      //   end of line comments, in favor of (?#...) comments.
    147     int64_t                       fLineNum;          // Line number in input file.
    148     int64_t                       fCharNum;          // Char position within the line.
    149     UChar32                       fLastChar;         // Previous char, needed to count CR-LF
    150                                                      //   as a single line, not two.
    151     UChar32                       fPeekChar;         // Saved char, if we've scanned ahead.
    152 
    153 
    154     RegexPatternChar              fC;                // Current char for parse state machine
    155                                                      //   processing.
    156 
    157     //
    158     //   Data for the state machine that parses the regular expression.
    159     //
    160     RegexTableEl                  **fStateTable;     // State Transition Table for regex Rule
    161                                                      //   parsing.  index by p[state][char-class]
    162 
    163     uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
    164     int32_t                       fStackPtr;           //  and pops as specified in the state
    165                                                        //  transition rules.
    166 
    167     //
    168     //  Data associated with the generation of the pcode for the match engine
    169     //
    170     int32_t                       fModeFlags;        // Match Flags.  (Case Insensitive, etc.)
    171                                                      //   Always has high bit (31) set so that flag values
    172                                                      //   on the paren stack are distinguished from relocatable
    173                                                      //   pcode addresses.
    174     int32_t                       fNewModeFlags;     // New flags, while compiling (?i, holds state
    175                                                      //   until last flag is scanned.
    176     UBool                         fSetModeFlag;      // true for (?ismx, false for (?-ismx
    177 
    178     UnicodeString                 fLiteralChars;     // Literal chars or strings from the pattern are accumulated here.
    179                                                      //   Once completed, meaning that some non-literal pattern
    180                                                      //   construct is encountered, the appropriate opcodes
    181                                                      //   to match the literal will be generated, and this
    182                                                      //   string will be cleared.
    183 
    184     int64_t                       fPatternLength;    // Length of the input pattern string.
    185 
    186     UVector32                     fParenStack;       // parentheses stack.  Each frame consists of
    187                                                      //   the positions of compiled pattern operations
    188                                                      //   needing fixup, followed by negative value.  The
    189                                                      //   first entry in each frame is the position of the
    190                                                      //   spot reserved for use when a quantifier
    191                                                      //   needs to add a SAVE at the start of a (block)
    192                                                      //   The negative value (-1, -2,...) indicates
    193                                                      //   the kind of paren that opened the frame.  Some
    194                                                      //   need special handling on close.
    195 
    196 
    197     int32_t                       fMatchOpenParen;   // The position in the compiled pattern
    198                                                      //   of the slot reserved for a state save
    199                                                      //   at the start of the most recently processed
    200                                                      //   parenthesized block. Updated when processing
    201                                                      //   a close to the location for the corresponding open.
    202 
    203     int32_t                       fMatchCloseParen;  // The position in the pattern of the first
    204                                                      //   location after the most recently processed
    205                                                      //   parenthesized block.
    206 
    207     int32_t                       fIntervalLow;      // {lower, upper} interval quantifier values.
    208     int32_t                       fIntervalUpper;    // Placed here temporarily, when pattern is
    209                                                      //   initially scanned.  Each new interval
    210                                                      //   encountered overwrites these values.
    211                                                      //   -1 for the upper interval value means none
    212                                                      //   was specified (unlimited occurences.)
    213 
    214     int64_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a
    215                                                      //   pattern, valid while remainder of name is
    216                                                      //   scanned.
    217 
    218     UStack                        fSetStack;         // Stack of UnicodeSets, used while evaluating
    219                                                      //   (at compile time) set expressions within
    220                                                      //   the pattern.
    221     UStack                        fSetOpStack;       // Stack of pending set operators (&&, --, union)
    222 
    223     UChar32                       fLastSetLiteral;   // The last single code point added to a set.
    224                                                      //   needed when "-y" is scanned, and we need
    225                                                      //   to turn "x-y" into a range.
    226 
    227     UnicodeString                *fCaptureName;      // Named Capture, the group name is built up
    228                                                      //   in this string while being scanned.
    229 };
    230 
    231 // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions]
    232 //   The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.
    233 
    234 enum SetOperations {
    235     setStart         = 0 << 16 | 1,
    236     setEnd           = 1 << 16 | 2,
    237     setNegation      = 2 << 16 | 3,
    238     setCaseClose     = 2 << 16 | 9,
    239     setDifference2   = 3 << 16 | 4,    // '--' set difference operator
    240     setIntersection2 = 3 << 16 | 5,    // '&&' set intersection operator
    241     setUnion         = 4 << 16 | 6,    // implicit union of adjacent items
    242     setDifference1   = 4 << 16 | 7,    // '-', single dash difference op, for compatibility with old UnicodeSet.
    243     setIntersection1 = 4 << 16 | 8     // '&', single amp intersection op, for compatibility with old UnicodeSet.
    244     };
    245 
    246 U_NAMESPACE_END
    247 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
    248 #endif   // RBBISCAN_H
    249