1 // Copyright 2007 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Compiled representation of regular expressions. 6 // See regexp.h for the Regexp class, which represents a regular 7 // expression symbolically. 8 9 #ifndef RE2_PROG_H__ 10 #define RE2_PROG_H__ 11 12 #include "util/util.h" 13 #include "re2/re2.h" 14 15 namespace re2 { 16 17 // Simple fixed-size bitmap. 18 template<int Bits> 19 class Bitmap { 20 public: 21 Bitmap() { Reset(); } 22 int Size() { return Bits; } 23 24 void Reset() { 25 for (int i = 0; i < Words; i++) 26 w_[i] = 0; 27 } 28 bool Get(int k) const { 29 return w_[k >> WordLog] & (1<<(k & 31)); 30 } 31 void Set(int k) { 32 w_[k >> WordLog] |= 1<<(k & 31); 33 } 34 void Clear(int k) { 35 w_[k >> WordLog] &= ~(1<<(k & 31)); 36 } 37 uint32 Word(int i) const { 38 return w_[i]; 39 } 40 41 private: 42 static const int WordLog = 5; 43 static const int Words = (Bits+31)/32; 44 uint32 w_[Words]; 45 DISALLOW_EVIL_CONSTRUCTORS(Bitmap); 46 }; 47 48 49 // Opcodes for Inst 50 enum InstOp { 51 kInstAlt = 0, // choose between out_ and out1_ 52 kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa. 53 kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_] 54 kInstCapture, // capturing parenthesis number cap_ 55 kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_ 56 kInstMatch, // found a match! 57 kInstNop, // no-op; occasionally unavoidable 58 kInstFail, // never match; occasionally unavoidable 59 }; 60 61 // Bit flags for empty-width specials 62 enum EmptyOp { 63 kEmptyBeginLine = 1<<0, // ^ - beginning of line 64 kEmptyEndLine = 1<<1, // $ - end of line 65 kEmptyBeginText = 1<<2, // \A - beginning of text 66 kEmptyEndText = 1<<3, // \z - end of text 67 kEmptyWordBoundary = 1<<4, // \b - word boundary 68 kEmptyNonWordBoundary = 1<<5, // \B - not \b 69 kEmptyAllFlags = (1<<6)-1, 70 }; 71 72 class Regexp; 73 74 class DFA; 75 struct OneState; 76 77 // Compiled form of regexp program. 78 class Prog { 79 public: 80 Prog(); 81 ~Prog(); 82 83 // Single instruction in regexp program. 84 class Inst { 85 public: 86 Inst() : out_opcode_(0), out1_(0) { } 87 88 // Constructors per opcode 89 void InitAlt(uint32 out, uint32 out1); 90 void InitByteRange(int lo, int hi, int foldcase, uint32 out); 91 void InitCapture(int cap, uint32 out); 92 void InitEmptyWidth(EmptyOp empty, uint32 out); 93 void InitMatch(int id); 94 void InitNop(uint32 out); 95 void InitFail(); 96 97 // Getters 98 int id(Prog* p) { return this - p->inst_; } 99 InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } 100 int out() { return out_opcode_>>3; } 101 int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } 102 int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } 103 int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } 104 int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } 105 int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; } 106 int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } 107 EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } 108 bool greedy(Prog *p) { 109 DCHECK_EQ(opcode(), kInstAltMatch); 110 return p->inst(out())->opcode() == kInstByteRange; 111 } 112 113 // Does this inst (an kInstByteRange) match c? 114 inline bool Matches(int c) { 115 DCHECK_EQ(opcode(), kInstByteRange); 116 if (foldcase_ && 'A' <= c && c <= 'Z') 117 c += 'a' - 'A'; 118 return lo_ <= c && c <= hi_; 119 } 120 121 // Returns string representation for debugging. 122 string Dump(); 123 124 // Maximum instruction id. 125 // (Must fit in out_opcode_, and PatchList steals another bit.) 126 static const int kMaxInst = (1<<28) - 1; 127 128 private: 129 void set_opcode(InstOp opcode) { 130 out_opcode_ = (out()<<3) | opcode; 131 } 132 133 void set_out(int out) { 134 out_opcode_ = (out<<3) | opcode(); 135 } 136 137 void set_out_opcode(int out, InstOp opcode) { 138 out_opcode_ = (out<<3) | opcode; 139 } 140 141 uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode 142 union { // additional instruction arguments: 143 uint32 out1_; // opcode == kInstAlt 144 // alternate next instruction 145 146 int32 cap_; // opcode == kInstCapture 147 // Index of capture register (holds text 148 // position recorded by capturing parentheses). 149 // For \n (the submatch for the nth parentheses), 150 // the left parenthesis captures into register 2*n 151 // and the right one captures into register 2*n+1. 152 153 int32 match_id_; // opcode == kInstMatch 154 // Match ID to identify this match (for re2::Set). 155 156 struct { // opcode == kInstByteRange 157 uint8 lo_; // byte range is lo_-hi_ inclusive 158 uint8 hi_; // 159 uint8 foldcase_; // convert A-Z to a-z before checking range. 160 }; 161 162 EmptyOp empty_; // opcode == kInstEmptyWidth 163 // empty_ is bitwise OR of kEmpty* flags above. 164 }; 165 166 friend class Compiler; 167 friend struct PatchList; 168 friend class Prog; 169 170 DISALLOW_EVIL_CONSTRUCTORS(Inst); 171 }; 172 173 // Whether to anchor the search. 174 enum Anchor { 175 kUnanchored, // match anywhere 176 kAnchored, // match only starting at beginning of text 177 }; 178 179 // Kind of match to look for (for anchor != kFullMatch) 180 // 181 // kLongestMatch mode finds the overall longest 182 // match but still makes its submatch choices the way 183 // Perl would, not in the way prescribed by POSIX. 184 // The POSIX rules are much more expensive to implement, 185 // and no one has needed them. 186 // 187 // kFullMatch is not strictly necessary -- we could use 188 // kLongestMatch and then check the length of the match -- but 189 // the matching code can run faster if it knows to consider only 190 // full matches. 191 enum MatchKind { 192 kFirstMatch, // like Perl, PCRE 193 kLongestMatch, // like egrep or POSIX 194 kFullMatch, // match only entire text; implies anchor==kAnchored 195 kManyMatch // for SearchDFA, records set of matches 196 }; 197 198 Inst *inst(int id) { return &inst_[id]; } 199 int start() { return start_; } 200 int start_unanchored() { return start_unanchored_; } 201 void set_start(int start) { start_ = start; } 202 void set_start_unanchored(int start) { start_unanchored_ = start; } 203 int64 size() { return size_; } 204 bool reversed() { return reversed_; } 205 void set_reversed(bool reversed) { reversed_ = reversed; } 206 int64 byte_inst_count() { return byte_inst_count_; } 207 const Bitmap<256>& byterange() { return byterange_; } 208 void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; } 209 int64 dfa_mem() { return dfa_mem_; } 210 int flags() { return flags_; } 211 void set_flags(int flags) { flags_ = flags; } 212 bool anchor_start() { return anchor_start_; } 213 void set_anchor_start(bool b) { anchor_start_ = b; } 214 bool anchor_end() { return anchor_end_; } 215 void set_anchor_end(bool b) { anchor_end_ = b; } 216 int bytemap_range() { return bytemap_range_; } 217 const uint8* bytemap() { return bytemap_; } 218 219 // Returns string representation of program for debugging. 220 string Dump(); 221 string DumpUnanchored(); 222 223 // Record that at some point in the prog, the bytes in the range 224 // lo-hi (inclusive) are treated as different from bytes outside the range. 225 // Tracking this lets the DFA collapse commonly-treated byte ranges 226 // when recording state pointers, greatly reducing its memory footprint. 227 void MarkByteRange(int lo, int hi); 228 229 // Returns the set of kEmpty flags that are in effect at 230 // position p within context. 231 static uint32 EmptyFlags(const StringPiece& context, const char* p); 232 233 // Returns whether byte c is a word character: ASCII only. 234 // Used by the implementation of \b and \B. 235 // This is not right for Unicode, but: 236 // - it's hard to get right in a byte-at-a-time matching world 237 // (the DFA has only one-byte lookahead). 238 // - even if the lookahead were possible, the Progs would be huge. 239 // This crude approximation is the same one PCRE uses. 240 static bool IsWordChar(uint8 c) { 241 return ('A' <= c && c <= 'Z') || 242 ('a' <= c && c <= 'z') || 243 ('0' <= c && c <= '9') || 244 c == '_'; 245 } 246 247 // Execution engines. They all search for the regexp (run the prog) 248 // in text, which is in the larger context (used for ^ $ \b etc). 249 // Anchor and kind control the kind of search. 250 // Returns true if match found, false if not. 251 // If match found, fills match[0..nmatch-1] with submatch info. 252 // match[0] is overall match, match[1] is first set of parens, etc. 253 // If a particular submatch is not matched during the regexp match, 254 // it is set to NULL. 255 // 256 // Matching text == StringPiece(NULL, 0) is treated as any other empty 257 // string, but note that on return, it will not be possible to distinguish 258 // submatches that matched that empty string from submatches that didn't 259 // match anything. Either way, match[i] == NULL. 260 261 // Search using NFA: can find submatches but kind of slow. 262 bool SearchNFA(const StringPiece& text, const StringPiece& context, 263 Anchor anchor, MatchKind kind, 264 StringPiece* match, int nmatch); 265 266 // Search using DFA: much faster than NFA but only finds 267 // end of match and can use a lot more memory. 268 // Returns whether a match was found. 269 // If the DFA runs out of memory, sets *failed to true and returns false. 270 // If matches != NULL and kind == kManyMatch and there is a match, 271 // SearchDFA fills matches with the match IDs of the final matching state. 272 bool SearchDFA(const StringPiece& text, const StringPiece& context, 273 Anchor anchor, MatchKind kind, 274 StringPiece* match0, bool* failed, 275 vector<int>* matches); 276 277 // Build the entire DFA for the given match kind. FOR TESTING ONLY. 278 // Usually the DFA is built out incrementally, as needed, which 279 // avoids lots of unnecessary work. This function is useful only 280 // for testing purposes. Returns number of states. 281 int BuildEntireDFA(MatchKind kind); 282 283 // Compute byte map. 284 void ComputeByteMap(); 285 286 // Run peep-hole optimizer on program. 287 void Optimize(); 288 289 // One-pass NFA: only correct if IsOnePass() is true, 290 // but much faster than NFA (competitive with PCRE) 291 // for those expressions. 292 bool IsOnePass(); 293 bool SearchOnePass(const StringPiece& text, const StringPiece& context, 294 Anchor anchor, MatchKind kind, 295 StringPiece* match, int nmatch); 296 297 // Bit-state backtracking. Fast on small cases but uses memory 298 // proportional to the product of the program size and the text size. 299 bool SearchBitState(const StringPiece& text, const StringPiece& context, 300 Anchor anchor, MatchKind kind, 301 StringPiece* match, int nmatch); 302 303 static const int kMaxOnePassCapture = 5; // $0 through $4 304 305 // Backtracking search: the gold standard against which the other 306 // implementations are checked. FOR TESTING ONLY. 307 // It allocates a ton of memory to avoid running forever. 308 // It is also recursive, so can't use in production (will overflow stacks). 309 // The name "Unsafe" here is supposed to be a flag that 310 // you should not be using this function. 311 bool UnsafeSearchBacktrack(const StringPiece& text, 312 const StringPiece& context, 313 Anchor anchor, MatchKind kind, 314 StringPiece* match, int nmatch); 315 316 // Computes range for any strings matching regexp. The min and max can in 317 // some cases be arbitrarily precise, so the caller gets to specify the 318 // maximum desired length of string returned. 319 // 320 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any 321 // string s that is an anchored match for this regexp satisfies 322 // min <= s && s <= max. 323 // 324 // Note that PossibleMatchRange() will only consider the first copy of an 325 // infinitely repeated element (i.e., any regexp element followed by a '*' or 326 // '+' operator). Regexps with "{N}" constructions are not affected, as those 327 // do not compile down to infinite repetitions. 328 // 329 // Returns true on success, false on error. 330 bool PossibleMatchRange(string* min, string* max, int maxlen); 331 332 // Compiles a collection of regexps to Prog. Each regexp will have 333 // its own Match instruction recording the index in the vector. 334 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, 335 Regexp* re); 336 337 private: 338 friend class Compiler; 339 340 DFA* GetDFA(MatchKind kind); 341 342 bool anchor_start_; // regexp has explicit start anchor 343 bool anchor_end_; // regexp has explicit end anchor 344 bool reversed_; // whether program runs backward over input 345 bool did_onepass_; // has IsOnePass been called? 346 347 int start_; // entry point for program 348 int start_unanchored_; // unanchored entry point for program 349 int size_; // number of instructions 350 int byte_inst_count_; // number of kInstByteRange instructions 351 int bytemap_range_; // bytemap_[x] < bytemap_range_ 352 int flags_; // regexp parse flags 353 int onepass_statesize_; // byte size of each OneState* node 354 355 Inst* inst_; // pointer to instruction array 356 357 Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_ 358 DFA* volatile dfa_first_; // DFA cached for kFirstMatch 359 DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch 360 int64 dfa_mem_; // Maximum memory for DFAs. 361 void (*delete_dfa_)(DFA* dfa); 362 363 Bitmap<256> byterange_; // byterange.Get(x) true if x ends a 364 // commonly-treated byte range. 365 uint8 bytemap_[256]; // map from input bytes to byte classes 366 uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x 367 368 uint8* onepass_nodes_; // data for OnePass nodes 369 OneState* onepass_start_; // start node for OnePass program 370 371 DISALLOW_EVIL_CONSTRUCTORS(Prog); 372 }; 373 374 } // namespace re2 375 376 #endif // RE2_PROG_H__ 377