1 // 2 // regexcmp.h 3 // 4 // Copyright (C) 2002-2010, International Business Machines Corporation and others. 5 // All Rights Reserved. 6 // 7 // This file contains declarations for the class RegexCompile 8 // 9 // This class is internal to the regular expression implementation. 10 // For the public Regular Expression API, see the file "unicode/regex.h" 11 // 12 13 14 #ifndef RBBISCAN_H 15 #define RBBISCAN_H 16 17 #include "unicode/utypes.h" 18 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 19 20 #include "unicode/uobject.h" 21 #include "unicode/uniset.h" 22 #include "unicode/parseerr.h" 23 #include "uhash.h" 24 #include "uvector.h" 25 26 27 28 U_NAMESPACE_BEGIN 29 30 31 //-------------------------------------------------------------------------------- 32 // 33 // class RegexCompile Contains the regular expression compiler. 34 // 35 //-------------------------------------------------------------------------------- 36 struct RegexTableEl; 37 class RegexPattern; 38 39 40 class RegexCompile : public UMemory { 41 public: 42 43 enum { 44 kStackSize = 100 // The size of the state stack for 45 }; // pattern parsing. Corresponds roughly 46 // to the depth of parentheses nesting 47 // that is allowed in the rules. 48 49 struct RegexPatternChar { 50 UChar32 fChar; 51 UBool fQuoted; 52 }; 53 54 RegexCompile(RegexPattern *rp, UErrorCode &e); 55 56 void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); 57 void compile(UText *pat, UParseError &pp, UErrorCode &e); 58 59 60 virtual ~RegexCompile(); 61 62 void nextChar(RegexPatternChar &c); // Get the next char from the input stream. 63 64 static void cleanup(); // Memory cleanup 65 66 67 68 // Categories of parentheses in pattern. 69 // The category is saved in the compile-time parentheses stack frame, and 70 // determines the code to be generated when the matching close ) is encountered. 71 enum EParenClass { 72 plain = -1, // No special handling 73 capturing = -2, 74 atomic = -3, 75 lookAhead = -4, 76 negLookAhead = -5, 77 flags = -6, 78 lookBehind = -7, 79 lookBehindN = -8 80 }; 81 82 private: 83 84 85 UBool doParseActions(int32_t a); 86 void error(UErrorCode e); // error reporting convenience function. 87 88 UChar32 nextCharLL(); 89 UChar32 peekCharLL(); 90 UnicodeSet *scanProp(); 91 UnicodeSet *scanPosixProp(); 92 void handleCloseParen(); 93 int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern 94 // at the top of the just completed block 95 // or operation, and optionally ensure that 96 // there is space to add an opcode there. 97 void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for 98 // a reference to a UnicodeSet. 99 void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. 100 int32_t LoopOp); 101 UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier 102 void literalChar(UChar32 c); // Compile a literal char 103 void fixLiterals(UBool split=FALSE); // Fix literal strings. 104 void insertOp(int32_t where); // Open up a slot for a new op in the 105 // generated code at the specified location. 106 void emitONE_CHAR(UChar32 c); // Emit a ONE_CHAR op into the compiled code, 107 // taking case mode into account. 108 int32_t minMatchLength(int32_t start, 109 int32_t end); 110 int32_t maxMatchLength(int32_t start, 111 int32_t end); 112 void matchStartType(); 113 void stripNOPs(); 114 115 void setEval(int32_t op); 116 void setPushOp(int32_t op); 117 UChar32 scanNamedChar(); 118 UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); 119 120 121 UErrorCode *fStatus; 122 RegexPattern *fRXPat; 123 UParseError *fParseErr; 124 125 // 126 // Data associated with low level character scanning 127 // 128 int64_t fScanIndex; // Index of current character being processed 129 // in the rule input string. 130 UBool fQuoteMode; // Scan is in a \Q...\E quoted region 131 UBool fInBackslashQuote; // Scan is between a '\' and the following char. 132 UBool fEOLComments; // When scan is just after '(?', inhibit #... to 133 // end of line comments, in favor of (?#...) comments. 134 int64_t fLineNum; // Line number in input file. 135 int64_t fCharNum; // Char position within the line. 136 UChar32 fLastChar; // Previous char, needed to count CR-LF 137 // as a single line, not two. 138 UChar32 fPeekChar; // Saved char, if we've scanned ahead. 139 140 141 RegexPatternChar fC; // Current char for parse state machine 142 // processing. 143 144 // 145 // Data for the state machine that parses the regular expression. 146 // 147 RegexTableEl **fStateTable; // State Transition Table for regex Rule 148 // parsing. index by p[state][char-class] 149 150 uint16_t fStack[kStackSize]; // State stack, holds state pushes 151 int32_t fStackPtr; // and pops as specified in the state 152 // transition rules. 153 154 // 155 // Data associated with the generation of the pcode for the match engine 156 // 157 int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) 158 // Always has high bit (31) set so that flag values 159 // on the paren stack are distinguished from relocatable 160 // pcode addresses. 161 int32_t fNewModeFlags; // New flags, while compiling (?i, holds state 162 // until last flag is scanned. 163 UBool fSetModeFlag; // true for (?ismx, false for (?-ismx 164 165 166 int32_t fStringOpStart; // While a literal string is being scanned 167 // holds the start index within RegexPattern. 168 // fLiteralText where the string is being stored. 169 170 int64_t fPatternLength; // Length of the input pattern string. 171 172 UVector32 fParenStack; // parentheses stack. Each frame consists of 173 // the positions of compiled pattern operations 174 // needing fixup, followed by negative value. The 175 // first entry in each frame is the position of the 176 // spot reserved for use when a quantifier 177 // needs to add a SAVE at the start of a (block) 178 // The negative value (-1, -2,...) indicates 179 // the kind of paren that opened the frame. Some 180 // need special handling on close. 181 182 183 int32_t fMatchOpenParen; // The position in the compiled pattern 184 // of the slot reserved for a state save 185 // at the start of the most recently processed 186 // parenthesized block. 187 int32_t fMatchCloseParen; // The position in the pattern of the first 188 // location after the most recently processed 189 // parenthesized block. 190 191 int32_t fIntervalLow; // {lower, upper} interval quantifier values. 192 int32_t fIntervalUpper; // Placed here temporarily, when pattern is 193 // initially scanned. Each new interval 194 // encountered overwrites these values. 195 // -1 for the upper interval value means none 196 // was specified (unlimited occurences.) 197 198 int64_t fNameStartPos; // Starting position of a \N{NAME} name in a 199 // pattern, valid while remainder of name is 200 // scanned. 201 202 UStack fSetStack; // Stack of UnicodeSets, used while evaluating 203 // (at compile time) set expressions within 204 // the pattern. 205 UStack fSetOpStack; // Stack of pending set operators (&&, --, union) 206 207 UChar32 fLastSetLiteral; // The last single code point added to a set. 208 // needed when "-y" is scanned, and we need 209 // to turn "x-y" into a range. 210 }; 211 212 // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions] 213 // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. 214 215 enum SetOperations { 216 setStart = 0 << 16 | 1, 217 setEnd = 1 << 16 | 2, 218 setNegation = 2 << 16 | 3, 219 setCaseClose = 2 << 16 | 9, 220 setDifference2 = 3 << 16 | 4, // '--' set difference operator 221 setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator 222 setUnion = 4 << 16 | 6, // implicit union of adjacent items 223 setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. 224 setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. 225 }; 226 227 U_NAMESPACE_END 228 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 229 #endif // RBBISCAN_H 230