1 // 2 // regexcmp.h 3 // 4 // Copyright (C) 2002-2012, International Business Machines Corporation and others. 5 // All Rights Reserved. 6 // 7 // This file contains declarations for the class RegexCompile 8 // 9 // This class is internal to the regular expression implementation. 10 // For the public Regular Expression API, see the file "unicode/regex.h" 11 // 12 13 14 #ifndef RBBISCAN_H 15 #define RBBISCAN_H 16 17 #include "unicode/utypes.h" 18 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 19 20 #include "unicode/uobject.h" 21 #include "unicode/uniset.h" 22 #include "unicode/parseerr.h" 23 #include "uhash.h" 24 #include "uvector.h" 25 26 27 28 U_NAMESPACE_BEGIN 29 30 31 //-------------------------------------------------------------------------------- 32 // 33 // class RegexCompile Contains the regular expression compiler. 34 // 35 //-------------------------------------------------------------------------------- 36 struct RegexTableEl; 37 class RegexPattern; 38 39 40 class RegexCompile : public UMemory { 41 public: 42 43 enum { 44 kStackSize = 100 // The size of the state stack for 45 }; // pattern parsing. Corresponds roughly 46 // to the depth of parentheses nesting 47 // that is allowed in the rules. 48 49 struct RegexPatternChar { 50 UChar32 fChar; 51 UBool fQuoted; 52 }; 53 54 RegexCompile(RegexPattern *rp, UErrorCode &e); 55 56 void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); 57 void compile(UText *pat, UParseError &pp, UErrorCode &e); 58 59 60 virtual ~RegexCompile(); 61 62 void nextChar(RegexPatternChar &c); // Get the next char from the input stream. 63 64 static void cleanup(); // Memory cleanup 65 66 67 68 // Categories of parentheses in pattern. 69 // The category is saved in the compile-time parentheses stack frame, and 70 // determines the code to be generated when the matching close ) is encountered. 71 enum EParenClass { 72 plain = -1, // No special handling 73 capturing = -2, 74 atomic = -3, 75 lookAhead = -4, 76 negLookAhead = -5, 77 flags = -6, 78 lookBehind = -7, 79 lookBehindN = -8 80 }; 81 82 private: 83 84 85 UBool doParseActions(int32_t a); 86 void error(UErrorCode e); // error reporting convenience function. 87 88 UChar32 nextCharLL(); 89 UChar32 peekCharLL(); 90 UnicodeSet *scanProp(); 91 UnicodeSet *scanPosixProp(); 92 void handleCloseParen(); 93 int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern 94 // at the top of the just completed block 95 // or operation, and optionally ensure that 96 // there is space to add an opcode there. 97 void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for 98 // a reference to a UnicodeSet. 99 void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. 100 int32_t LoopOp); 101 UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier 102 void literalChar(UChar32 c); // Compile a literal char 103 void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters. 104 void insertOp(int32_t where); // Open up a slot for a new op in the 105 // generated code at the specified location. 106 int32_t minMatchLength(int32_t start, 107 int32_t end); 108 int32_t maxMatchLength(int32_t start, 109 int32_t end); 110 void matchStartType(); 111 void stripNOPs(); 112 113 void setEval(int32_t op); 114 void setPushOp(int32_t op); 115 UChar32 scanNamedChar(); 116 UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); 117 118 119 UErrorCode *fStatus; 120 RegexPattern *fRXPat; 121 UParseError *fParseErr; 122 123 // 124 // Data associated with low level character scanning 125 // 126 int64_t fScanIndex; // Index of current character being processed 127 // in the rule input string. 128 UBool fQuoteMode; // Scan is in a \Q...\E quoted region 129 UBool fInBackslashQuote; // Scan is between a '\' and the following char. 130 UBool fEOLComments; // When scan is just after '(?', inhibit #... to 131 // end of line comments, in favor of (?#...) comments. 132 int64_t fLineNum; // Line number in input file. 133 int64_t fCharNum; // Char position within the line. 134 UChar32 fLastChar; // Previous char, needed to count CR-LF 135 // as a single line, not two. 136 UChar32 fPeekChar; // Saved char, if we've scanned ahead. 137 138 139 RegexPatternChar fC; // Current char for parse state machine 140 // processing. 141 142 // 143 // Data for the state machine that parses the regular expression. 144 // 145 RegexTableEl **fStateTable; // State Transition Table for regex Rule 146 // parsing. index by p[state][char-class] 147 148 uint16_t fStack[kStackSize]; // State stack, holds state pushes 149 int32_t fStackPtr; // and pops as specified in the state 150 // transition rules. 151 152 // 153 // Data associated with the generation of the pcode for the match engine 154 // 155 int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) 156 // Always has high bit (31) set so that flag values 157 // on the paren stack are distinguished from relocatable 158 // pcode addresses. 159 int32_t fNewModeFlags; // New flags, while compiling (?i, holds state 160 // until last flag is scanned. 161 UBool fSetModeFlag; // true for (?ismx, false for (?-ismx 162 163 UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here. 164 // Once completed, meaning that some non-literal pattern 165 // construct is encountered, the appropriate opcodes 166 // to match the literal will be generated, and this 167 // string will be cleared. 168 169 int64_t fPatternLength; // Length of the input pattern string. 170 171 UVector32 fParenStack; // parentheses stack. Each frame consists of 172 // the positions of compiled pattern operations 173 // needing fixup, followed by negative value. The 174 // first entry in each frame is the position of the 175 // spot reserved for use when a quantifier 176 // needs to add a SAVE at the start of a (block) 177 // The negative value (-1, -2,...) indicates 178 // the kind of paren that opened the frame. Some 179 // need special handling on close. 180 181 182 int32_t fMatchOpenParen; // The position in the compiled pattern 183 // of the slot reserved for a state save 184 // at the start of the most recently processed 185 // parenthesized block. 186 int32_t fMatchCloseParen; // The position in the pattern of the first 187 // location after the most recently processed 188 // parenthesized block. 189 190 int32_t fIntervalLow; // {lower, upper} interval quantifier values. 191 int32_t fIntervalUpper; // Placed here temporarily, when pattern is 192 // initially scanned. Each new interval 193 // encountered overwrites these values. 194 // -1 for the upper interval value means none 195 // was specified (unlimited occurences.) 196 197 int64_t fNameStartPos; // Starting position of a \N{NAME} name in a 198 // pattern, valid while remainder of name is 199 // scanned. 200 201 UStack fSetStack; // Stack of UnicodeSets, used while evaluating 202 // (at compile time) set expressions within 203 // the pattern. 204 UStack fSetOpStack; // Stack of pending set operators (&&, --, union) 205 206 UChar32 fLastSetLiteral; // The last single code point added to a set. 207 // needed when "-y" is scanned, and we need 208 // to turn "x-y" into a range. 209 }; 210 211 // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions] 212 // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. 213 214 enum SetOperations { 215 setStart = 0 << 16 | 1, 216 setEnd = 1 << 16 | 2, 217 setNegation = 2 << 16 | 3, 218 setCaseClose = 2 << 16 | 9, 219 setDifference2 = 3 << 16 | 4, // '--' set difference operator 220 setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator 221 setUnion = 4 << 16 | 6, // implicit union of adjacent items 222 setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. 223 setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. 224 }; 225 226 U_NAMESPACE_END 227 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 228 #endif // RBBISCAN_H 229