1 // 2 // regexcmp.h 3 // 4 // Copyright (C) 2002-2008, International Business Machines Corporation and others. 5 // All Rights Reserved. 6 // 7 // This file contains declarations for the class RegexCompile 8 // 9 // This class is internal to the regular expression implementation. 10 // For the public Regular Expression API, see the file "unicode/regex.h" 11 // 12 13 14 #ifndef RBBISCAN_H 15 #define RBBISCAN_H 16 17 #include "unicode/utypes.h" 18 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 19 20 #include "unicode/uobject.h" 21 #include "unicode/uniset.h" 22 #include "unicode/parseerr.h" 23 #include "uhash.h" 24 #include "uvector.h" 25 26 27 28 U_NAMESPACE_BEGIN 29 30 31 //-------------------------------------------------------------------------------- 32 // 33 // class RegexCompile Contains the regular expression compiler. 34 // 35 //-------------------------------------------------------------------------------- 36 struct RegexTableEl; 37 class RegexPattern; 38 39 40 class RegexCompile : public UMemory { 41 public: 42 43 enum { 44 kStackSize = 100 // The size of the state stack for 45 }; // pattern parsing. Corresponds roughly 46 // to the depth of parentheses nesting 47 // that is allowed in the rules. 48 49 struct RegexPatternChar { 50 UChar32 fChar; 51 UBool fQuoted; 52 }; 53 54 RegexCompile(RegexPattern *rp, UErrorCode &e); 55 56 void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); 57 58 59 virtual ~RegexCompile(); 60 61 void nextChar(RegexPatternChar &c); // Get the next char from the input stream. 62 63 static void cleanup(); // Memory cleanup 64 65 66 67 // Categories of parentheses in pattern. 68 // The category is saved in the compile-time parentheses stack frame, and 69 // determines the code to be generated when the matching close ) is encountered. 70 enum EParenClass { 71 plain = -1, // No special handling 72 capturing = -2, 73 atomic = -3, 74 lookAhead = -4, 75 negLookAhead = -5, 76 flags = -6, 77 lookBehind = -7, 78 lookBehindN = -8 79 }; 80 81 private: 82 83 84 UBool doParseActions(int32_t a); 85 void error(UErrorCode e); // error reporting convenience function. 86 87 UChar32 nextCharLL(); 88 UChar32 peekCharLL(); 89 UnicodeSet *scanProp(); 90 UnicodeSet *scanPosixProp(); 91 void handleCloseParen(); 92 int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern 93 // at the top of the just completed block 94 // or operation, and optionally ensure that 95 // there is space to add an opcode there. 96 void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for 97 // a reference to a UnicodeSet. 98 void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. 99 int32_t LoopOp); 100 UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier 101 void literalChar(UChar32 c); // Compile a literal char 102 void fixLiterals(UBool split=FALSE); // Fix literal strings. 103 void insertOp(int32_t where); // Open up a slot for a new op in the 104 // generated code at the specified location. 105 void emitONE_CHAR(UChar32 c); // EMit a ONE_CHAR op into the compiled code, 106 // taking case mode into account. 107 int32_t minMatchLength(int32_t start, 108 int32_t end); 109 int32_t maxMatchLength(int32_t start, 110 int32_t end); 111 void matchStartType(); 112 void stripNOPs(); 113 114 void setEval(int32_t op); 115 void setPushOp(int32_t op); 116 UChar32 scanNamedChar(); 117 UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); 118 119 120 UErrorCode *fStatus; 121 RegexPattern *fRXPat; 122 UParseError *fParseErr; 123 124 // 125 // Data associated with low level character scanning 126 // 127 int32_t fScanIndex; // Index of current character being processed 128 // in the rule input string. 129 int32_t fNextIndex; // Index of the next character, which 130 // is the first character not yet scanned. 131 UBool fQuoteMode; // Scan is in a \Q...\E quoted region 132 UBool fInBackslashQuote; // Scan is between a '\' and the following char. 133 UBool fEOLComments; // When scan is just after '(?', inhibit #... to 134 // end of line comments, in favor of (?#...) comments. 135 int32_t fLineNum; // Line number in input file. 136 int32_t fCharNum; // Char position within the line. 137 UChar32 fLastChar; // Previous char, needed to count CR-LF 138 // as a single line, not two. 139 UChar32 fPeekChar; // Saved char, if we've scanned ahead. 140 141 142 RegexPatternChar fC; // Current char for parse state machine 143 // processing. 144 145 // 146 // Data for the state machine that parses the regular expression. 147 // 148 RegexTableEl **fStateTable; // State Transition Table for regex Rule 149 // parsing. index by p[state][char-class] 150 151 uint16_t fStack[kStackSize]; // State stack, holds state pushes 152 int32_t fStackPtr; // and pops as specified in the state 153 // transition rules. 154 155 // 156 // Data associated with the generation of the pcode for the match engine 157 // 158 int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) 159 // Always has high bit (31) set so that flag values 160 // on the paren stack are distinguished from relocatable 161 // pcode addresses. 162 int32_t fNewModeFlags; // New flags, while compiling (?i, holds state 163 // until last flag is scanned. 164 UBool fSetModeFlag; // true for (?ismx, false for (?-ismx 165 166 167 int32_t fStringOpStart; // While a literal string is being scanned 168 // holds the start index within RegexPattern. 169 // fLiteralText where the string is being stored. 170 171 int32_t fPatternLength; // Length of the input pattern string. 172 173 UVector32 fParenStack; // parentheses stack. Each frame consists of 174 // the positions of compiled pattern operations 175 // needing fixup, followed by negative value. The 176 // first entry in each frame is the position of the 177 // spot reserved for use when a quantifier 178 // needs to add a SAVE at the start of a (block) 179 // The negative value (-1, -2,...) indicates 180 // the kind of paren that opened the frame. Some 181 // need special handling on close. 182 183 184 int32_t fMatchOpenParen; // The position in the compiled pattern 185 // of the slot reserved for a state save 186 // at the start of the most recently processed 187 // parenthesized block. 188 int32_t fMatchCloseParen; // The position in the pattern of the first 189 // location after the most recently processed 190 // parenthesized block. 191 192 int32_t fIntervalLow; // {lower, upper} interval quantifier values. 193 int32_t fIntervalUpper; // Placed here temporarily, when pattern is 194 // initially scanned. Each new interval 195 // encountered overwrites these values. 196 // -1 for the upper interval value means none 197 // was specified (unlimited occurences.) 198 199 int32_t fNameStartPos; // Starting position of a \N{NAME} name in a 200 // pattern, valid while remainder of name is 201 // scanned. 202 203 UStack fSetStack; // Stack of UnicodeSets, used while evaluating 204 // (at compile time) set expressions within 205 // the pattern. 206 UStack fSetOpStack; // Stack of pending set operators (&&, --, union) 207 208 UChar32 fLastSetLiteral; // The last single code point added to a set. 209 // needed when "-y" is scanned, and we need 210 // to turn "x-y" into a range. 211 212 }; 213 214 // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions] 215 // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. 216 217 enum SetOperations { 218 setStart = 0 << 16 | 1, 219 setEnd = 1 << 16 | 2, 220 setNegation = 2 << 16 | 3, 221 setCaseClose = 2 << 16 | 9, 222 setDifference2 = 3 << 16 | 4, // '--' set difference operator 223 setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator 224 setUnion = 4 << 16 | 6, // implicit union of adjacent items 225 setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. 226 setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. 227 }; 228 229 U_NAMESPACE_END 230 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 231 #endif // RBBISCAN_H 232