1 // Copyright 2008 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc 6 // 7 // Prog::BadSearchBacktrack is a backtracking regular expression search, 8 // except that it remembers where it has been, trading a lot of 9 // memory for a lot of time. It exists only for testing purposes. 10 // 11 // Let me repeat that. 12 // 13 // THIS CODE SHOULD NEVER BE USED IN PRODUCTION: 14 // - It uses a ton of memory. 15 // - It uses a ton of stack. 16 // - It uses CHECK and LOG(FATAL). 17 // - It implements unanchored search by repeated anchored search. 18 // 19 // On the other hand, it is very simple and a good reference 20 // implementation for the more complicated regexp packages. 21 // 22 // In BUILD, this file is linked into the ":testing" library, 23 // not the main library, in order to make it harder to pick up 24 // accidentally. 25 26 #include "util/util.h" 27 #include "re2/prog.h" 28 #include "re2/regexp.h" 29 30 namespace re2 { 31 32 // Backtracker holds the state for a backtracking search. 33 // 34 // Excluding the search parameters, the main search state 35 // is just the "capture registers", which record, for the 36 // current execution, the string position at which each 37 // parenthesis was passed. cap_[0] and cap_[1] are the 38 // left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc. 39 // 40 // To avoid infinite loops during backtracking on expressions 41 // like (a*)*, the visited_[] bitmap marks the (state, string-position) 42 // pairs that have already been explored and are thus not worth 43 // re-exploring if we get there via another path. Modern backtracking 44 // libraries engineer their program representation differently, to make 45 // such infinite loops possible to avoid without keeping a giant visited_ 46 // bitmap, but visited_ works fine for a reference implementation 47 // and it has the nice benefit of making the search run in linear time. 48 class Backtracker { 49 public: 50 explicit Backtracker(Prog* prog); 51 ~Backtracker(); 52 53 bool Search(const StringPiece& text, const StringPiece& context, 54 bool anchored, bool longest, 55 StringPiece* submatch, int nsubmatch); 56 57 private: 58 // Explores from instruction ip at string position p looking for a match. 59 // Returns true if found (so that caller can stop trying other possibilities). 60 bool Visit(int id, const char* p); 61 62 // Search parameters 63 Prog* prog_; // program being run 64 StringPiece text_; // text being searched 65 StringPiece context_; // greater context of text being searched 66 bool anchored_; // whether search is anchored at text.begin() 67 bool longest_; // whether search wants leftmost-longest match 68 bool endmatch_; // whether search must end at text.end() 69 StringPiece *submatch_; // submatches to fill in 70 int nsubmatch_; // # of submatches to fill in 71 72 // Search state 73 const char* cap_[64]; // capture registers 74 uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked 75 int nvisited_; // # of words in bitmap 76 }; 77 78 Backtracker::Backtracker(Prog* prog) 79 : prog_(prog), 80 anchored_(false), 81 longest_(false), 82 endmatch_(false), 83 submatch_(NULL), 84 nsubmatch_(0), 85 visited_(NULL), 86 nvisited_(0) { 87 } 88 89 Backtracker::~Backtracker() { 90 delete[] visited_; 91 } 92 93 // Runs a backtracking search. 94 bool Backtracker::Search(const StringPiece& text, const StringPiece& context, 95 bool anchored, bool longest, 96 StringPiece* submatch, int nsubmatch) { 97 text_ = text; 98 context_ = context; 99 if (context_.begin() == NULL) 100 context_ = text; 101 if (prog_->anchor_start() && text.begin() > context_.begin()) 102 return false; 103 if (prog_->anchor_end() && text.end() < context_.end()) 104 return false; 105 anchored_ = anchored | prog_->anchor_start(); 106 longest_ = longest | prog_->anchor_end(); 107 endmatch_ = prog_->anchor_end(); 108 submatch_ = submatch; 109 nsubmatch_ = nsubmatch; 110 CHECK(2*nsubmatch_ < arraysize(cap_)); 111 memset(cap_, 0, sizeof cap_); 112 113 // We use submatch_[0] for our own bookkeeping, 114 // so it had better exist. 115 StringPiece sp0; 116 if (nsubmatch < 1) { 117 submatch_ = &sp0; 118 nsubmatch_ = 1; 119 } 120 submatch_[0] = NULL; 121 122 // Allocate new visited_ bitmap -- size is proportional 123 // to text, so have to reallocate on each call to Search. 124 delete[] visited_; 125 nvisited_ = (prog_->size()*(text.size()+1) + 31)/32; 126 visited_ = new uint32[nvisited_]; 127 memset(visited_, 0, nvisited_*sizeof visited_[0]); 128 129 // Anchored search must start at text.begin(). 130 if (anchored_) { 131 cap_[0] = text.begin(); 132 return Visit(prog_->start(), text.begin()); 133 } 134 135 // Unanchored search, starting from each possible text position. 136 // Notice that we have to try the empty string at the end of 137 // the text, so the loop condition is p <= text.end(), not p < text.end(). 138 for (const char* p = text.begin(); p <= text.end(); p++) { 139 cap_[0] = p; 140 if (Visit(prog_->start(), p)) // Match must be leftmost; done. 141 return true; 142 } 143 return false; 144 } 145 146 // Explores from instruction ip at string position p looking for a match. 147 // Return true if found (so that caller can stop trying other possibilities). 148 bool Backtracker::Visit(int id, const char* p) { 149 // Check bitmap. If we've already explored from here, 150 // either it didn't match or it did but we're hoping for a better match. 151 // Either way, don't go down that road again. 152 CHECK(p <= text_.end()); 153 int n = id*(text_.size()+1) + (p - text_.begin()); 154 CHECK_LT(n/32, nvisited_); 155 if (visited_[n/32] & (1 << (n&31))) 156 return false; 157 visited_[n/32] |= 1 << (n&31); 158 159 // Pick out byte at current position. If at end of string, 160 // have to explore in hope of finishing a match. Use impossible byte -1. 161 int c = -1; 162 if (p < text_.end()) 163 c = *p & 0xFF; 164 165 Prog::Inst* ip = prog_->inst(id); 166 switch (ip->opcode()) { 167 default: 168 LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode(); 169 return false; // not reached 170 171 case kInstAlt: 172 case kInstAltMatch: 173 // Try both possible next states: out is preferred to out1. 174 if (Visit(ip->out(), p)) { 175 if (longest_) 176 Visit(ip->out1(), p); 177 return true; 178 } 179 return Visit(ip->out1(), p); 180 181 case kInstByteRange: 182 if (ip->Matches(c)) 183 return Visit(ip->out(), p+1); 184 return false; 185 186 case kInstCapture: 187 if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) { 188 // Capture p to register, but save old value. 189 const char* q = cap_[ip->cap()]; 190 cap_[ip->cap()] = p; 191 bool ret = Visit(ip->out(), p); 192 // Restore old value as we backtrack. 193 cap_[ip->cap()] = q; 194 return ret; 195 } 196 return Visit(ip->out(), p); 197 198 case kInstEmptyWidth: 199 if (ip->empty() & ~Prog::EmptyFlags(context_, p)) 200 return false; 201 return Visit(ip->out(), p); 202 203 case kInstNop: 204 return Visit(ip->out(), p); 205 206 case kInstMatch: 207 // We found a match. If it's the best so far, record the 208 // parameters in the caller's submatch_ array. 209 if (endmatch_ && p != context_.end()) 210 return false; 211 cap_[1] = p; 212 if (submatch_[0].data() == NULL || // First match so far ... 213 (longest_ && p > submatch_[0].end())) { // ... or better match 214 for (int i = 0; i < nsubmatch_; i++) 215 submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]); 216 } 217 return true; 218 219 case kInstFail: 220 return false; 221 } 222 } 223 224 // Runs a backtracking search. 225 bool Prog::UnsafeSearchBacktrack(const StringPiece& text, 226 const StringPiece& context, 227 Anchor anchor, 228 MatchKind kind, 229 StringPiece* match, 230 int nmatch) { 231 // If full match, we ask for an anchored longest match 232 // and then check that match[0] == text. 233 // So make sure match[0] exists. 234 StringPiece sp0; 235 if (kind == kFullMatch) { 236 anchor = kAnchored; 237 if (nmatch < 1) { 238 match = &sp0; 239 nmatch = 1; 240 } 241 } 242 243 // Run the search. 244 Backtracker b(this); 245 bool anchored = anchor == kAnchored; 246 bool longest = kind != kFirstMatch; 247 if (!b.Search(text, context, anchored, longest, match, nmatch)) 248 return false; 249 if (kind == kFullMatch && match[0].end() != text.end()) 250 return false; 251 return true; 252 } 253 254 } // namespace re2 255