1 // 2 // regexcmp.h 3 // 4 // Copyright (C) 2002-2015, International Business Machines Corporation and others. 5 // All Rights Reserved. 6 // 7 // This file contains declarations for the class RegexCompile 8 // 9 // This class is internal to the regular expression implementation. 10 // For the public Regular Expression API, see the file "unicode/regex.h" 11 // 12 13 14 #ifndef RBBISCAN_H 15 #define RBBISCAN_H 16 17 #include "unicode/utypes.h" 18 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 19 20 #include "unicode/uobject.h" 21 #include "unicode/uniset.h" 22 #include "unicode/parseerr.h" 23 #include "uhash.h" 24 #include "uvector.h" 25 #include "uvectr32.h" 26 27 28 29 U_NAMESPACE_BEGIN 30 31 32 //-------------------------------------------------------------------------------- 33 // 34 // class RegexCompile Contains the regular expression compiler. 35 // 36 //-------------------------------------------------------------------------------- 37 struct RegexTableEl; 38 class RegexPattern; 39 40 41 class U_I18N_API RegexCompile : public UMemory { 42 public: 43 44 enum { 45 kStackSize = 100 // The size of the state stack for 46 }; // pattern parsing. Corresponds roughly 47 // to the depth of parentheses nesting 48 // that is allowed in the rules. 49 50 struct RegexPatternChar { 51 UChar32 fChar; 52 UBool fQuoted; 53 }; 54 55 RegexCompile(RegexPattern *rp, UErrorCode &e); 56 57 void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); 58 void compile(UText *pat, UParseError &pp, UErrorCode &e); 59 60 61 virtual ~RegexCompile(); 62 63 void nextChar(RegexPatternChar &c); // Get the next char from the input stream. 64 65 static void cleanup(); // Memory cleanup 66 67 68 69 // Categories of parentheses in pattern. 70 // The category is saved in the compile-time parentheses stack frame, and 71 // determines the code to be generated when the matching close ) is encountered. 72 enum EParenClass { 73 plain = -1, // No special handling 74 capturing = -2, 75 atomic = -3, 76 lookAhead = -4, 77 negLookAhead = -5, 78 flags = -6, 79 lookBehind = -7, 80 lookBehindN = -8 81 }; 82 83 private: 84 85 86 UBool doParseActions(int32_t a); 87 void error(UErrorCode e); // error reporting convenience function. 88 89 UChar32 nextCharLL(); 90 UChar32 peekCharLL(); 91 UnicodeSet *scanProp(); 92 UnicodeSet *scanPosixProp(); 93 void handleCloseParen(); 94 int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern 95 // at the top of the just completed block 96 // or operation, and optionally ensure that 97 // there is space to add an opcode there. 98 void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for 99 // a reference to a UnicodeSet. 100 void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. 101 int32_t LoopOp); 102 UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier 103 void literalChar(UChar32 c); // Compile a literal char 104 void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters. 105 void insertOp(int32_t where); // Open up a slot for a new op in the 106 // generated code at the specified location. 107 void appendOp(int32_t op); // Append a new op to the compiled pattern. 108 void appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern. 109 int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode instruction. 110 int32_t allocateData(int32_t size); // Allocate space in the matcher data area. 111 // Return index of the newly allocated data. 112 int32_t allocateStackData(int32_t size); // Allocate space in the match back-track stack frame. 113 // Return offset index in the frame. 114 int32_t minMatchLength(int32_t start, 115 int32_t end); 116 int32_t maxMatchLength(int32_t start, 117 int32_t end); 118 void matchStartType(); 119 void stripNOPs(); 120 121 void setEval(int32_t op); 122 void setPushOp(int32_t op); 123 UChar32 scanNamedChar(); 124 UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); 125 126 public: // Public for testing only. 127 static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars); 128 private: 129 130 131 UErrorCode *fStatus; 132 RegexPattern *fRXPat; 133 UParseError *fParseErr; 134 135 // 136 // Data associated with low level character scanning 137 // 138 int64_t fScanIndex; // Index of current character being processed 139 // in the rule input string. 140 UBool fQuoteMode; // Scan is in a \Q...\E quoted region 141 UBool fInBackslashQuote; // Scan is between a '\' and the following char. 142 UBool fEOLComments; // When scan is just after '(?', inhibit #... to 143 // end of line comments, in favor of (?#...) comments. 144 int64_t fLineNum; // Line number in input file. 145 int64_t fCharNum; // Char position within the line. 146 UChar32 fLastChar; // Previous char, needed to count CR-LF 147 // as a single line, not two. 148 UChar32 fPeekChar; // Saved char, if we've scanned ahead. 149 150 151 RegexPatternChar fC; // Current char for parse state machine 152 // processing. 153 154 // 155 // Data for the state machine that parses the regular expression. 156 // 157 RegexTableEl **fStateTable; // State Transition Table for regex Rule 158 // parsing. index by p[state][char-class] 159 160 uint16_t fStack[kStackSize]; // State stack, holds state pushes 161 int32_t fStackPtr; // and pops as specified in the state 162 // transition rules. 163 164 // 165 // Data associated with the generation of the pcode for the match engine 166 // 167 int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) 168 // Always has high bit (31) set so that flag values 169 // on the paren stack are distinguished from relocatable 170 // pcode addresses. 171 int32_t fNewModeFlags; // New flags, while compiling (?i, holds state 172 // until last flag is scanned. 173 UBool fSetModeFlag; // true for (?ismx, false for (?-ismx 174 175 UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here. 176 // Once completed, meaning that some non-literal pattern 177 // construct is encountered, the appropriate opcodes 178 // to match the literal will be generated, and this 179 // string will be cleared. 180 181 int64_t fPatternLength; // Length of the input pattern string. 182 183 UVector32 fParenStack; // parentheses stack. Each frame consists of 184 // the positions of compiled pattern operations 185 // needing fixup, followed by negative value. The 186 // first entry in each frame is the position of the 187 // spot reserved for use when a quantifier 188 // needs to add a SAVE at the start of a (block) 189 // The negative value (-1, -2,...) indicates 190 // the kind of paren that opened the frame. Some 191 // need special handling on close. 192 193 194 int32_t fMatchOpenParen; // The position in the compiled pattern 195 // of the slot reserved for a state save 196 // at the start of the most recently processed 197 // parenthesized block. Updated when processing 198 // a close to the location for the corresponding open. 199 200 int32_t fMatchCloseParen; // The position in the pattern of the first 201 // location after the most recently processed 202 // parenthesized block. 203 204 int32_t fIntervalLow; // {lower, upper} interval quantifier values. 205 int32_t fIntervalUpper; // Placed here temporarily, when pattern is 206 // initially scanned. Each new interval 207 // encountered overwrites these values. 208 // -1 for the upper interval value means none 209 // was specified (unlimited occurences.) 210 211 int64_t fNameStartPos; // Starting position of a \N{NAME} name in a 212 // pattern, valid while remainder of name is 213 // scanned. 214 215 UStack fSetStack; // Stack of UnicodeSets, used while evaluating 216 // (at compile time) set expressions within 217 // the pattern. 218 UStack fSetOpStack; // Stack of pending set operators (&&, --, union) 219 220 UChar32 fLastSetLiteral; // The last single code point added to a set. 221 // needed when "-y" is scanned, and we need 222 // to turn "x-y" into a range. 223 224 UnicodeString *fCaptureName; // Named Capture, the group name is built up 225 // in this string while being scanned. 226 }; 227 228 // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions] 229 // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. 230 231 enum SetOperations { 232 setStart = 0 << 16 | 1, 233 setEnd = 1 << 16 | 2, 234 setNegation = 2 << 16 | 3, 235 setCaseClose = 2 << 16 | 9, 236 setDifference2 = 3 << 16 | 4, // '--' set difference operator 237 setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator 238 setUnion = 4 << 16 | 6, // implicit union of adjacent items 239 setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. 240 setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. 241 }; 242 243 U_NAMESPACE_END 244 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 245 #endif // RBBISCAN_H 246