1 // Copyright 2008 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Tested by search_test.cc. 6 // 7 // Prog::SearchOnePass is an efficient implementation of 8 // regular expression search with submatch tracking for 9 // what I call "one-pass regular expressions". (An alternate 10 // name might be "backtracking-free regular expressions".) 11 // 12 // One-pass regular expressions have the property that 13 // at each input byte during an anchored match, there may be 14 // multiple alternatives but only one can proceed for any 15 // given input byte. 16 // 17 // For example, the regexp /x*yx*/ is one-pass: you read 18 // x's until a y, then you read the y, then you keep reading x's. 19 // At no point do you have to guess what to do or back up 20 // and try a different guess. 21 // 22 // On the other hand, /x*x/ is not one-pass: when you're 23 // looking at an input "x", it's not clear whether you should 24 // use it to extend the x* or as the final x. 25 // 26 // More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not. 27 // /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not. 28 // 29 // A simple intuition for identifying one-pass regular expressions 30 // is that it's always immediately obvious when a repetition ends. 31 // It must also be immediately obvious which branch of an | to take: 32 // 33 // /x(y|z)/ is one-pass, but /(xy|xz)/ is not. 34 // 35 // The NFA-based search in nfa.cc does some bookkeeping to 36 // avoid the need for backtracking and its associated exponential blowup. 37 // But if we have a one-pass regular expression, there is no 38 // possibility of backtracking, so there is no need for the 39 // extra bookkeeping. Hence, this code. 40 // 41 // On a one-pass regular expression, the NFA code in nfa.cc 42 // runs at about 1/20 of the backtracking-based PCRE speed. 43 // In contrast, the code in this file runs at about the same 44 // speed as PCRE. 45 // 46 // One-pass regular expressions get used a lot when RE is 47 // used for parsing simple strings, so it pays off to 48 // notice them and handle them efficiently. 49 // 50 // See also Anne Brggemann-Klein and Derick Wood, 51 // "One-unambiguous regular languages", Information and Computation 142(2). 52 53 #include <string.h> 54 #include <map> 55 #include "util/util.h" 56 #include "util/arena.h" 57 #include "util/sparse_set.h" 58 #include "re2/prog.h" 59 #include "re2/stringpiece.h" 60 61 namespace re2 { 62 63 static const int Debug = 0; 64 65 // The key insight behind this implementation is that the 66 // non-determinism in an NFA for a one-pass regular expression 67 // is contained. To explain what that means, first a 68 // refresher about what regular expression programs look like 69 // and how the usual NFA execution runs. 70 // 71 // In a regular expression program, only the kInstByteRange 72 // instruction processes an input byte c and moves on to the 73 // next byte in the string (it does so if c is in the given range). 74 // The kInstByteRange instructions correspond to literal characters 75 // and character classes in the regular expression. 76 // 77 // The kInstAlt instructions are used as wiring to connect the 78 // kInstByteRange instructions together in interesting ways when 79 // implementing | + and *. 80 // The kInstAlt instruction forks execution, like a goto that 81 // jumps to ip->out() and ip->out1() in parallel. Each of the 82 // resulting computation paths is called a thread. 83 // 84 // The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture -- 85 // are interesting in their own right but like kInstAlt they don't 86 // advance the input pointer. Only kInstByteRange does. 87 // 88 // The automaton execution in nfa.cc runs all the possible 89 // threads of execution in lock-step over the input. To process 90 // a particular byte, each thread gets run until it either dies 91 // or finds a kInstByteRange instruction matching the byte. 92 // If the latter happens, the thread stops just past the 93 // kInstByteRange instruction (at ip->out()) and waits for 94 // the other threads to finish processing the input byte. 95 // Then, once all the threads have processed that input byte, 96 // the whole process repeats. The kInstAlt state instruction 97 // might create new threads during input processing, but no 98 // matter what, all the threads stop after a kInstByteRange 99 // and wait for the other threads to "catch up". 100 // Running in lock step like this ensures that the NFA reads 101 // the input string only once. 102 // 103 // Each thread maintains its own set of capture registers 104 // (the string positions at which it executed the kInstCapture 105 // instructions corresponding to capturing parentheses in the 106 // regular expression). Repeated copying of the capture registers 107 // is the main performance bottleneck in the NFA implementation. 108 // 109 // A regular expression program is "one-pass" if, no matter what 110 // the input string, there is only one thread that makes it 111 // past a kInstByteRange instruction at each input byte. This means 112 // that there is in some sense only one active thread throughout 113 // the execution. Other threads might be created during the 114 // processing of an input byte, but they are ephemeral: only one 115 // thread is left to start processing the next input byte. 116 // This is what I meant above when I said the non-determinism 117 // was "contained". 118 // 119 // To execute a one-pass regular expression program, we can build 120 // a DFA (no non-determinism) that has at most as many states as 121 // the NFA (compare this to the possibly exponential number of states 122 // in the general case). Each state records, for each possible 123 // input byte, the next state along with the conditions required 124 // before entering that state -- empty-width flags that must be true 125 // and capture operations that must be performed. It also records 126 // whether a set of conditions required to finish a match at that 127 // point in the input rather than process the next byte. 128 129 // A state in the one-pass NFA (aka DFA) - just an array of actions. 130 struct OneState; 131 132 // A state in the one-pass NFA - just an array of actions indexed 133 // by the bytemap_[] of the next input byte. (The bytemap 134 // maps next input bytes into equivalence classes, to reduce 135 // the memory footprint.) 136 struct OneState { 137 uint32 matchcond; // conditions to match right now. 138 uint32 action[1]; 139 }; 140 141 // The uint32 conditions in the action are a combination of 142 // condition and capture bits and the next state. The bottom 16 bits 143 // are the condition and capture bits, and the top 16 are the index of 144 // the next state. 145 // 146 // Bits 0-5 are the empty-width flags from prog.h. 147 // Bit 6 is kMatchWins, which means the match takes 148 // priority over moving to next in a first-match search. 149 // The remaining bits mark capture registers that should 150 // be set to the current input position. The capture bits 151 // start at index 2, since the search loop can take care of 152 // cap[0], cap[1] (the overall match position). 153 // That means we can handle up to 5 capturing parens: $1 through $4, plus $0. 154 // No input position can satisfy both kEmptyWordBoundary 155 // and kEmptyNonWordBoundary, so we can use that as a sentinel 156 // instead of needing an extra bit. 157 158 static const int kIndexShift = 16; // number of bits below index 159 static const int kEmptyShift = 6; // number of empty flags in prog.h 160 static const int kRealCapShift = kEmptyShift + 1; 161 static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2; 162 163 // Parameters used to skip over cap[0], cap[1]. 164 static const int kCapShift = kRealCapShift - 2; 165 static const int kMaxCap = kRealMaxCap + 2; 166 167 static const uint32 kMatchWins = 1 << kEmptyShift; 168 static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift; 169 170 static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary; 171 172 // Check, at compile time, that prog.h agrees with math above. 173 // This function is never called. 174 void OnePass_Checks() { 175 COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags, 176 kEmptyShift_disagrees_with_kEmptyAllFlags); 177 // kMaxCap counts pointers, kMaxOnePassCapture counts pairs. 178 COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2, 179 kMaxCap_disagrees_with_kMaxOnePassCapture); 180 } 181 182 static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) { 183 uint32 satisfied = Prog::EmptyFlags(context, p); 184 if (cond & kEmptyAllFlags & ~satisfied) 185 return false; 186 return true; 187 } 188 189 // Apply the capture bits in cond, saving p to the appropriate 190 // locations in cap[]. 191 static void ApplyCaptures(uint32 cond, const char* p, 192 const char** cap, int ncap) { 193 for (int i = 2; i < ncap; i++) 194 if (cond & (1 << kCapShift << i)) 195 cap[i] = p; 196 } 197 198 // Compute a node pointer. 199 // Basically (OneState*)(nodes + statesize*nodeindex) 200 // but the version with the C++ casts overflows 80 characters (and is ugly). 201 static inline OneState* IndexToNode(volatile uint8* nodes, int statesize, 202 int nodeindex) { 203 return reinterpret_cast<OneState*>( 204 const_cast<uint8*>(nodes + statesize*nodeindex)); 205 } 206 207 bool Prog::SearchOnePass(const StringPiece& text, 208 const StringPiece& const_context, 209 Anchor anchor, MatchKind kind, 210 StringPiece* match, int nmatch) { 211 if (anchor != kAnchored && kind != kFullMatch) { 212 LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches."; 213 return false; 214 } 215 216 // Make sure we have at least cap[1], 217 // because we use it to tell if we matched. 218 int ncap = 2*nmatch; 219 if (ncap < 2) 220 ncap = 2; 221 222 const char* cap[kMaxCap]; 223 for (int i = 0; i < ncap; i++) 224 cap[i] = NULL; 225 226 const char* matchcap[kMaxCap]; 227 for (int i = 0; i < ncap; i++) 228 matchcap[i] = NULL; 229 230 StringPiece context = const_context; 231 if (context.begin() == NULL) 232 context = text; 233 if (anchor_start() && context.begin() != text.begin()) 234 return false; 235 if (anchor_end() && context.end() != text.end()) 236 return false; 237 if (anchor_end()) 238 kind = kFullMatch; 239 240 // State and act are marked volatile to 241 // keep the compiler from re-ordering the 242 // memory accesses walking over the NFA. 243 // This is worth about 5%. 244 volatile OneState* state = onepass_start_; 245 volatile uint8* nodes = onepass_nodes_; 246 volatile uint32 statesize = onepass_statesize_; 247 uint8* bytemap = bytemap_; 248 const char* bp = text.begin(); 249 const char* ep = text.end(); 250 const char* p; 251 bool matched = false; 252 matchcap[0] = bp; 253 cap[0] = bp; 254 uint32 nextmatchcond = state->matchcond; 255 for (p = bp; p < ep; p++) { 256 int c = bytemap[*p & 0xFF]; 257 uint32 matchcond = nextmatchcond; 258 uint32 cond = state->action[c]; 259 260 // Determine whether we can reach act->next. 261 // If so, advance state and nextmatchcond. 262 if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) { 263 uint32 nextindex = cond >> kIndexShift; 264 state = IndexToNode(nodes, statesize, nextindex); 265 nextmatchcond = state->matchcond; 266 } else { 267 state = NULL; 268 nextmatchcond = kImpossible; 269 } 270 271 // This code section is carefully tuned. 272 // The goto sequence is about 10% faster than the 273 // obvious rewrite as a large if statement in the 274 // ASCIIMatchRE2 and DotMatchRE2 benchmarks. 275 276 // Saving the match capture registers is expensive. 277 // Is this intermediate match worth thinking about? 278 279 // Not if we want a full match. 280 if (kind == kFullMatch) 281 goto skipmatch; 282 283 // Not if it's impossible. 284 if (matchcond == kImpossible) 285 goto skipmatch; 286 287 // Not if the possible match is beaten by the certain 288 // match at the next byte. When this test is useless 289 // (e.g., HTTPPartialMatchRE2) it slows the loop by 290 // about 10%, but when it avoids work (e.g., DotMatchRE2), 291 // it cuts the loop execution by about 45%. 292 if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0) 293 goto skipmatch; 294 295 // Finally, the match conditions must be satisfied. 296 if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) { 297 for (int i = 2; i < 2*nmatch; i++) 298 matchcap[i] = cap[i]; 299 if (nmatch > 1 && (matchcond & kCapMask)) 300 ApplyCaptures(matchcond, p, matchcap, ncap); 301 matchcap[1] = p; 302 matched = true; 303 304 // If we're in longest match mode, we have to keep 305 // going and see if we find a longer match. 306 // In first match mode, we can stop if the match 307 // takes priority over the next state for this input byte. 308 // That bit is per-input byte and thus in cond, not matchcond. 309 if (kind == kFirstMatch && (cond & kMatchWins)) 310 goto done; 311 } 312 313 skipmatch: 314 if (state == NULL) 315 goto done; 316 if ((cond & kCapMask) && nmatch > 1) 317 ApplyCaptures(cond, p, cap, ncap); 318 } 319 320 // Look for match at end of input. 321 { 322 uint32 matchcond = state->matchcond; 323 if (matchcond != kImpossible && 324 ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) { 325 if (nmatch > 1 && (matchcond & kCapMask)) 326 ApplyCaptures(matchcond, p, cap, ncap); 327 for (int i = 2; i < ncap; i++) 328 matchcap[i] = cap[i]; 329 matchcap[1] = p; 330 matched = true; 331 } 332 } 333 334 done: 335 if (!matched) 336 return false; 337 for (int i = 0; i < nmatch; i++) 338 match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]); 339 return true; 340 } 341 342 343 // Analysis to determine whether a given regexp program is one-pass. 344 345 // If ip is not on workq, adds ip to work queue and returns true. 346 // If ip is already on work queue, does nothing and returns false. 347 // If ip is NULL, does nothing and returns true (pretends to add it). 348 typedef SparseSet Instq; 349 static bool AddQ(Instq *q, int id) { 350 if (id == 0) 351 return true; 352 if (q->contains(id)) 353 return false; 354 q->insert(id); 355 return true; 356 } 357 358 struct InstCond { 359 int id; 360 uint32 cond; 361 }; 362 363 // Returns whether this is a one-pass program; that is, 364 // returns whether it is safe to use SearchOnePass on this program. 365 // These conditions must be true for any instruction ip: 366 // 367 // (1) for any other Inst nip, there is at most one input-free 368 // path from ip to nip. 369 // (2) there is at most one kInstByte instruction reachable from 370 // ip that matches any particular byte c. 371 // (3) there is at most one input-free path from ip to a kInstMatch 372 // instruction. 373 // 374 // This is actually just a conservative approximation: it might 375 // return false when the answer is true, when kInstEmptyWidth 376 // instructions are involved. 377 // Constructs and saves corresponding one-pass NFA on success. 378 bool Prog::IsOnePass() { 379 if (did_onepass_) 380 return onepass_start_ != NULL; 381 did_onepass_ = true; 382 383 if (start() == 0) // no match 384 return false; 385 386 // Steal memory for the one-pass NFA from the overall DFA budget. 387 // Willing to use at most 1/4 of the DFA budget (heuristic). 388 // Limit max node count to 65000 as a conservative estimate to 389 // avoid overflowing 16-bit node index in encoding. 390 int maxnodes = 2 + byte_inst_count_; 391 int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32); 392 if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes) 393 return false; 394 395 // Flood the graph starting at the start state, and check 396 // that in each reachable state, each possible byte leads 397 // to a unique next state. 398 int size = this->size(); 399 InstCond *stack = new InstCond[size]; 400 401 int* nodebyid = new int[size]; // indexed by ip 402 memset(nodebyid, 0xFF, size*sizeof nodebyid[0]); 403 404 uint8* nodes = new uint8[maxnodes*statesize]; 405 uint8* nodep = nodes; 406 407 Instq tovisit(size), workq(size); 408 AddQ(&tovisit, start()); 409 nodebyid[start()] = 0; 410 nodep += statesize; 411 int nalloc = 1; 412 for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { 413 int id = *it; 414 int nodeindex = nodebyid[id]; 415 OneState* node = IndexToNode(nodes, statesize, nodeindex); 416 417 // Flood graph using manual stack, filling in actions as found. 418 // Default is none. 419 for (int b = 0; b < bytemap_range_; b++) 420 node->action[b] = kImpossible; 421 node->matchcond = kImpossible; 422 423 workq.clear(); 424 bool matched = false; 425 int nstack = 0; 426 stack[nstack].id = id; 427 stack[nstack++].cond = 0; 428 while (nstack > 0) { 429 int id = stack[--nstack].id; 430 Prog::Inst* ip = inst(id); 431 uint32 cond = stack[nstack].cond; 432 switch (ip->opcode()) { 433 case kInstAltMatch: 434 // TODO(rsc): Ignoring kInstAltMatch optimization. 435 // Should implement it in this engine, but it's subtle. 436 // Fall through. 437 case kInstAlt: 438 // If already on work queue, (1) is violated: bail out. 439 if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1())) 440 goto fail; 441 stack[nstack].id = ip->out1(); 442 stack[nstack++].cond = cond; 443 stack[nstack].id = ip->out(); 444 stack[nstack++].cond = cond; 445 break; 446 447 case kInstByteRange: { 448 int nextindex = nodebyid[ip->out()]; 449 if (nextindex == -1) { 450 if (nalloc >= maxnodes) { 451 if (Debug) 452 LOG(ERROR) 453 << StringPrintf("Not OnePass: hit node limit %d > %d", 454 nalloc, maxnodes); 455 goto fail; 456 } 457 nextindex = nalloc; 458 nodep += statesize; 459 nodebyid[ip->out()] = nextindex; 460 nalloc++; 461 AddQ(&tovisit, ip->out()); 462 } 463 if (matched) 464 cond |= kMatchWins; 465 for (int c = ip->lo(); c <= ip->hi(); c++) { 466 int b = bytemap_[c]; 467 c = unbytemap_[b]; // last c in byte class 468 uint32 act = node->action[b]; 469 uint32 newact = (nextindex << kIndexShift) | cond; 470 if ((act & kImpossible) == kImpossible) { 471 node->action[b] = newact; 472 } else if (act != newact) { 473 if (Debug) { 474 LOG(ERROR) 475 << StringPrintf("Not OnePass: conflict on byte " 476 "%#x at state %d", 477 c, *it); 478 } 479 goto fail; 480 } 481 } 482 if (ip->foldcase()) { 483 Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a'; 484 Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a'; 485 for (int c = lo; c <= hi; c++) { 486 int b = bytemap_[c]; 487 c = unbytemap_[b]; // last c in class 488 uint32 act = node->action[b]; 489 uint32 newact = (nextindex << kIndexShift) | cond; 490 if ((act & kImpossible) == kImpossible) { 491 node->action[b] = newact; 492 } else if (act != newact) { 493 if (Debug) { 494 LOG(ERROR) 495 << StringPrintf("Not OnePass: conflict on byte " 496 "%#x at state %d", 497 c, *it); 498 } 499 goto fail; 500 } 501 } 502 } 503 break; 504 } 505 506 case kInstCapture: 507 if (ip->cap() < kMaxCap) 508 cond |= (1 << kCapShift) << ip->cap(); 509 goto QueueEmpty; 510 511 case kInstEmptyWidth: 512 cond |= ip->empty(); 513 goto QueueEmpty; 514 515 case kInstNop: 516 QueueEmpty: 517 // kInstCapture and kInstNop always proceed to ip->out(). 518 // kInstEmptyWidth only sometimes proceeds to ip->out(), 519 // but as a conservative approximation we assume it always does. 520 // We could be a little more precise by looking at what c 521 // is, but that seems like overkill. 522 523 // If already on work queue, (1) is violated: bail out. 524 if (!AddQ(&workq, ip->out())) { 525 if (Debug) { 526 LOG(ERROR) << StringPrintf("Not OnePass: multiple paths" 527 " %d -> %d\n", 528 *it, ip->out()); 529 } 530 goto fail; 531 } 532 stack[nstack].id = ip->out(); 533 stack[nstack++].cond = cond; 534 break; 535 536 case kInstMatch: 537 if (matched) { 538 // (3) is violated 539 if (Debug) { 540 LOG(ERROR) << StringPrintf("Not OnePass: multiple matches" 541 " from %d\n", *it); 542 } 543 goto fail; 544 } 545 matched = true; 546 node->matchcond = cond; 547 break; 548 549 case kInstFail: 550 break; 551 } 552 } 553 } 554 555 if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR). 556 string dump = "prog dump:\n" + Dump() + "node dump\n"; 557 map<int, int> idmap; 558 for (int i = 0; i < size; i++) 559 if (nodebyid[i] != -1) 560 idmap[nodebyid[i]] = i; 561 562 StringAppendF(&dump, "byte ranges:\n"); 563 int i = 0; 564 for (int b = 0; b < bytemap_range_; b++) { 565 int lo = i; 566 while (bytemap_[i] == b) 567 i++; 568 StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1); 569 } 570 571 for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { 572 int id = *it; 573 int nodeindex = nodebyid[id]; 574 if (nodeindex == -1) 575 continue; 576 OneState* node = IndexToNode(nodes, statesize, nodeindex); 577 string s; 578 StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n", 579 nodeindex, id, node->matchcond); 580 for (int i = 0; i < bytemap_range_; i++) { 581 if ((node->action[i] & kImpossible) == kImpossible) 582 continue; 583 StringAppendF(&dump, " %d cond %#x -> %d id=%d\n", 584 i, node->action[i] & 0xFFFF, 585 node->action[i] >> kIndexShift, 586 idmap[node->action[i] >> kIndexShift]); 587 } 588 } 589 LOG(ERROR) << dump; 590 } 591 592 // Overallocated earlier; cut down to actual size. 593 nodep = new uint8[nalloc*statesize]; 594 memmove(nodep, nodes, nalloc*statesize); 595 delete[] nodes; 596 nodes = nodep; 597 598 onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]); 599 onepass_nodes_ = nodes; 600 onepass_statesize_ = statesize; 601 dfa_mem_ -= nalloc*statesize; 602 603 delete[] stack; 604 delete[] nodebyid; 605 return true; 606 607 fail: 608 delete[] stack; 609 delete[] nodebyid; 610 delete[] nodes; 611 return false; 612 } 613 614 } // namespace re2 615