1 // Copyright 2006-2007 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Tested by search_test.cc. 6 // 7 // Prog::SearchNFA, an NFA search. 8 // This is an actual NFA like the theorists talk about, 9 // not the pseudo-NFA found in backtracking regexp implementations. 10 // 11 // IMPLEMENTATION 12 // 13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor, 14 // which is a variant of the one described in Thompson's 1968 CACM paper. 15 // See http://swtch.com/~rsc/regexp/ for various history. The main feature 16 // over the DFA implementation is that it tracks submatch boundaries. 17 // 18 // When the choice of submatch boundaries is ambiguous, this particular 19 // implementation makes the same choices that traditional backtracking 20 // implementations (in particular, Perl and PCRE) do. 21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential 22 // time in the length of the input. 23 // 24 // Like Thompson's original machine and like the DFA implementation, this 25 // implementation notices a match only once it is one byte past it. 26 27 #include "re2/prog.h" 28 #include "re2/regexp.h" 29 #include "util/sparse_array.h" 30 #include "util/sparse_set.h" 31 32 namespace re2 { 33 34 class NFA { 35 public: 36 NFA(Prog* prog); 37 ~NFA(); 38 39 // Searches for a matching string. 40 // * If anchored is true, only considers matches starting at offset. 41 // Otherwise finds lefmost match at or after offset. 42 // * If longest is true, returns the longest match starting 43 // at the chosen start point. Otherwise returns the so-called 44 // left-biased match, the one traditional backtracking engines 45 // (like Perl and PCRE) find. 46 // Records submatch boundaries in submatch[1..nsubmatch-1]. 47 // Submatch[0] is the entire match. When there is a choice in 48 // which text matches each subexpression, the submatch boundaries 49 // are chosen to match what a backtracking implementation would choose. 50 bool Search(const StringPiece& text, const StringPiece& context, 51 bool anchored, bool longest, 52 StringPiece* submatch, int nsubmatch); 53 54 static const int Debug = 0; 55 56 private: 57 struct Thread { 58 union { 59 int id; 60 Thread* next; // when on free list 61 }; 62 const char** capture; 63 }; 64 65 // State for explicit stack in AddToThreadq. 66 struct AddState { 67 int id; // Inst to process 68 int j; 69 const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip 70 71 AddState() 72 : id(0), j(-1), cap_j(NULL) {} 73 explicit AddState(int id) 74 : id(id), j(-1), cap_j(NULL) {} 75 AddState(int id, const char* cap_j, int j) 76 : id(id), j(j), cap_j(cap_j) {} 77 }; 78 79 // Threadq is a list of threads. The list is sorted by the order 80 // in which Perl would explore that particular state -- the earlier 81 // choices appear earlier in the list. 82 typedef SparseArray<Thread*> Threadq; 83 84 inline Thread* AllocThread(); 85 inline void FreeThread(Thread*); 86 87 // Add id (or its children, following unlabeled arrows) 88 // to the workqueue q with associated capture info. 89 void AddToThreadq(Threadq* q, int id, int flag, 90 const char* p, const char** capture); 91 92 // Run runq on byte c, appending new states to nextq. 93 // Updates matched_ and match_ as new, better matches are found. 94 // p is position of the next byte (the one after c) 95 // in the input string, used when processing capturing parens. 96 // flag is the bitwise or of Bol, Eol, etc., specifying whether 97 // ^, $ and \b match the current input point (after c). 98 inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p); 99 100 // Returns text version of capture information, for debugging. 101 string FormatCapture(const char** capture); 102 103 inline void CopyCapture(const char** dst, const char** src); 104 105 // Computes whether all matches must begin with the same first 106 // byte, and if so, returns that byte. If not, returns -1. 107 int ComputeFirstByte(); 108 109 Prog* prog_; // underlying program 110 int start_; // start instruction in program 111 int ncapture_; // number of submatches to track 112 bool longest_; // whether searching for longest match 113 bool endmatch_; // whether match must end at text.end() 114 const char* btext_; // beginning of text being matched (for FormatSubmatch) 115 const char* etext_; // end of text being matched (for endmatch_) 116 Threadq q0_, q1_; // pre-allocated for Search. 117 const char** match_; // best match so far 118 bool matched_; // any match so far? 119 AddState* astack_; // pre-allocated for AddToThreadq 120 int nastack_; 121 int first_byte_; // required first byte for match, or -1 if none 122 123 Thread* free_threads_; // free list 124 125 DISALLOW_EVIL_CONSTRUCTORS(NFA); 126 }; 127 128 NFA::NFA(Prog* prog) { 129 prog_ = prog; 130 start_ = prog->start(); 131 ncapture_ = 0; 132 longest_ = false; 133 endmatch_ = false; 134 btext_ = NULL; 135 etext_ = NULL; 136 q0_.resize(prog_->size()); 137 q1_.resize(prog_->size()); 138 nastack_ = 2*prog_->size(); 139 astack_ = new AddState[nastack_]; 140 match_ = NULL; 141 matched_ = false; 142 free_threads_ = NULL; 143 first_byte_ = ComputeFirstByte(); 144 } 145 146 NFA::~NFA() { 147 delete[] match_; 148 delete[] astack_; 149 Thread* next; 150 for (Thread* t = free_threads_; t; t = next) { 151 next = t->next; 152 delete[] t->capture; 153 delete t; 154 } 155 } 156 157 void NFA::FreeThread(Thread *t) { 158 if (t == NULL) 159 return; 160 t->next = free_threads_; 161 free_threads_ = t; 162 } 163 164 NFA::Thread* NFA::AllocThread() { 165 Thread* t = free_threads_; 166 if (t == NULL) { 167 t = new Thread; 168 t->capture = new const char*[ncapture_]; 169 return t; 170 } 171 free_threads_ = t->next; 172 return t; 173 } 174 175 void NFA::CopyCapture(const char** dst, const char** src) { 176 for (int i = 0; i < ncapture_; i+=2) { 177 dst[i] = src[i]; 178 dst[i+1] = src[i+1]; 179 } 180 } 181 182 // Follows all empty arrows from id0 and enqueues all the states reached. 183 // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match. 184 // The pointer p is the current input position, and m is the 185 // current set of match boundaries. 186 void NFA::AddToThreadq(Threadq* q, int id0, int flag, 187 const char* p, const char** capture) { 188 if (id0 == 0) 189 return; 190 191 // Astack_ is pre-allocated to avoid resize operations. 192 // It has room for 2*prog_->size() entries, which is enough: 193 // Each inst in prog can be processed at most once, 194 // pushing at most two entries on stk. 195 196 int nstk = 0; 197 AddState* stk = astack_; 198 stk[nstk++] = AddState(id0); 199 200 while (nstk > 0) { 201 DCHECK_LE(nstk, nastack_); 202 const AddState& a = stk[--nstk]; 203 if (a.j >= 0) 204 capture[a.j] = a.cap_j; 205 206 int id = a.id; 207 if (id == 0) 208 continue; 209 if (q->has_index(id)) { 210 if (Debug) 211 fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str()); 212 continue; 213 } 214 215 // Create entry in q no matter what. We might fill it in below, 216 // or we might not. Even if not, it is necessary to have it, 217 // so that we don't revisit id0 during the recursion. 218 q->set_new(id, NULL); 219 220 Thread** tp = &q->find(id)->second; 221 int j; 222 Thread* t; 223 Prog::Inst* ip = prog_->inst(id); 224 switch (ip->opcode()) { 225 default: 226 LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq"; 227 break; 228 229 case kInstFail: 230 break; 231 232 case kInstAltMatch: 233 // Save state; will pick up at next byte. 234 t = AllocThread(); 235 t->id = id; 236 CopyCapture(t->capture, capture); 237 *tp = t; 238 // fall through 239 240 case kInstAlt: 241 // Explore alternatives. 242 stk[nstk++] = AddState(ip->out1()); 243 stk[nstk++] = AddState(ip->out()); 244 break; 245 246 case kInstNop: 247 // Continue on. 248 stk[nstk++] = AddState(ip->out()); 249 break; 250 251 case kInstCapture: 252 if ((j=ip->cap()) < ncapture_) { 253 // Push a dummy whose only job is to restore capture[j] 254 // once we finish exploring this possibility. 255 stk[nstk++] = AddState(0, capture[j], j); 256 257 // Record capture. 258 capture[j] = p; 259 } 260 stk[nstk++] = AddState(ip->out()); 261 break; 262 263 case kInstMatch: 264 case kInstByteRange: 265 // Save state; will pick up at next byte. 266 t = AllocThread(); 267 t->id = id; 268 CopyCapture(t->capture, capture); 269 *tp = t; 270 if (Debug) 271 fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t); 272 break; 273 274 case kInstEmptyWidth: 275 // Continue on if we have all the right flag bits. 276 if (ip->empty() & ~flag) 277 break; 278 stk[nstk++] = AddState(ip->out()); 279 break; 280 } 281 } 282 } 283 284 // Run runq on byte c, appending new states to nextq. 285 // Updates match as new, better matches are found. 286 // p is position of the byte c in the input string, 287 // used when processing capturing parens. 288 // flag is the bitwise or of Bol, Eol, etc., specifying whether 289 // ^, $ and \b match the current input point (after c). 290 // Frees all the threads on runq. 291 // If there is a shortcut to the end, returns that shortcut. 292 int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) { 293 nextq->clear(); 294 295 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { 296 Thread* t = i->second; 297 if (t == NULL) 298 continue; 299 300 if (longest_) { 301 // Can skip any threads started after our current best match. 302 if (matched_ && match_[0] < t->capture[0]) { 303 FreeThread(t); 304 continue; 305 } 306 } 307 308 int id = t->id; 309 Prog::Inst* ip = prog_->inst(id); 310 311 switch (ip->opcode()) { 312 default: 313 // Should only see the values handled below. 314 LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step"; 315 break; 316 317 case kInstByteRange: 318 if (ip->Matches(c)) 319 AddToThreadq(nextq, ip->out(), flag, p+1, t->capture); 320 break; 321 322 case kInstAltMatch: 323 if (i != runq->begin()) 324 break; 325 // The match is ours if we want it. 326 if (ip->greedy(prog_) || longest_) { 327 CopyCapture((const char**)match_, t->capture); 328 FreeThread(t); 329 for (++i; i != runq->end(); ++i) 330 FreeThread(i->second); 331 runq->clear(); 332 matched_ = true; 333 if (ip->greedy(prog_)) 334 return ip->out1(); 335 return ip->out(); 336 } 337 break; 338 339 case kInstMatch: 340 if (endmatch_ && p != etext_) 341 break; 342 343 const char* old = t->capture[1]; // previous end pointer 344 t->capture[1] = p; 345 if (longest_) { 346 // Leftmost-longest mode: save this match only if 347 // it is either farther to the left or at the same 348 // point but longer than an existing match. 349 if (!matched_ || t->capture[0] < match_[0] || 350 (t->capture[0] == match_[0] && t->capture[1] > match_[1])) 351 CopyCapture((const char**)match_, t->capture); 352 } else { 353 // Leftmost-biased mode: this match is by definition 354 // better than what we've already found (see next line). 355 CopyCapture((const char**)match_, t->capture); 356 357 // Cut off the threads that can only find matches 358 // worse than the one we just found: don't run the 359 // rest of the current Threadq. 360 t->capture[0] = old; 361 FreeThread(t); 362 for (++i; i != runq->end(); ++i) 363 FreeThread(i->second); 364 runq->clear(); 365 matched_ = true; 366 return 0; 367 } 368 t->capture[0] = old; 369 matched_ = true; 370 break; 371 } 372 FreeThread(t); 373 } 374 runq->clear(); 375 return 0; 376 } 377 378 string NFA::FormatCapture(const char** capture) { 379 string s; 380 381 for (int i = 0; i < ncapture_; i+=2) { 382 if (capture[i] == NULL) 383 StringAppendF(&s, "(?,?)"); 384 else if (capture[i+1] == NULL) 385 StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_)); 386 else 387 StringAppendF(&s, "(%d,%d)", 388 (int)(capture[i] - btext_), 389 (int)(capture[i+1] - btext_)); 390 } 391 return s; 392 } 393 394 // Returns whether haystack contains needle's memory. 395 static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) { 396 return haystack.begin() <= needle.begin() && 397 haystack.end() >= needle.end(); 398 } 399 400 bool NFA::Search(const StringPiece& text, const StringPiece& const_context, 401 bool anchored, bool longest, 402 StringPiece* submatch, int nsubmatch) { 403 if (start_ == 0) 404 return false; 405 406 StringPiece context = const_context; 407 if (context.begin() == NULL) 408 context = text; 409 410 if (!StringPieceContains(context, text)) { 411 LOG(FATAL) << "Bad args: context does not contain text " 412 << reinterpret_cast<const void*>(context.begin()) 413 << "+" << context.size() << " " 414 << reinterpret_cast<const void*>(text.begin()) 415 << "+" << text.size(); 416 return false; 417 } 418 419 if (prog_->anchor_start() && context.begin() != text.begin()) 420 return false; 421 if (prog_->anchor_end() && context.end() != text.end()) 422 return false; 423 anchored |= prog_->anchor_start(); 424 if (prog_->anchor_end()) { 425 longest = true; 426 endmatch_ = true; 427 etext_ = text.end(); 428 } 429 430 if (nsubmatch < 0) { 431 LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch; 432 return false; 433 } 434 435 // Save search parameters. 436 ncapture_ = 2*nsubmatch; 437 longest_ = longest; 438 439 if (nsubmatch == 0) { 440 // We need to maintain match[0], both to distinguish the 441 // longest match (if longest is true) and also to tell 442 // whether we've seen any matches at all. 443 ncapture_ = 2; 444 } 445 446 match_ = new const char*[ncapture_]; 447 matched_ = false; 448 memset(match_, 0, ncapture_*sizeof match_[0]); 449 450 // For debugging prints. 451 btext_ = context.begin(); 452 453 if (Debug) { 454 fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", 455 text.as_string().c_str(), context.as_string().c_str(), anchored, 456 longest); 457 } 458 459 // Set up search. 460 Threadq* runq = &q0_; 461 Threadq* nextq = &q1_; 462 runq->clear(); 463 nextq->clear(); 464 memset(&match_[0], 0, ncapture_*sizeof match_[0]); 465 const char* bp = context.begin(); 466 int c = -1; 467 int wasword = 0; 468 469 if (text.begin() > context.begin()) { 470 c = text.begin()[-1] & 0xFF; 471 wasword = Prog::IsWordChar(c); 472 } 473 474 // Loop over the text, stepping the machine. 475 for (const char* p = text.begin();; p++) { 476 // Check for empty-width specials. 477 int flag = 0; 478 479 // ^ and \A 480 if (p == context.begin()) 481 flag |= kEmptyBeginText | kEmptyBeginLine; 482 else if (p <= context.end() && p[-1] == '\n') 483 flag |= kEmptyBeginLine; 484 485 // $ and \z 486 if (p == context.end()) 487 flag |= kEmptyEndText | kEmptyEndLine; 488 else if (p < context.end() && p[0] == '\n') 489 flag |= kEmptyEndLine; 490 491 // \b and \B 492 int isword = 0; 493 if (p < context.end()) 494 isword = Prog::IsWordChar(p[0] & 0xFF); 495 496 if (isword != wasword) 497 flag |= kEmptyWordBoundary; 498 else 499 flag |= kEmptyNonWordBoundary; 500 501 if (Debug) { 502 fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword); 503 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { 504 Thread* t = i->second; 505 if (t == NULL) 506 continue; 507 fprintf(stderr, " %d%s", t->id, 508 FormatCapture((const char**)t->capture).c_str()); 509 } 510 fprintf(stderr, "\n"); 511 } 512 513 // Process previous character (waited until now to avoid 514 // repeating the flag computation above). 515 // This is a no-op the first time around the loop, because 516 // runq is empty. 517 int id = Step(runq, nextq, c, flag, p-1); 518 DCHECK_EQ(runq->size(), 0); 519 swap(nextq, runq); 520 nextq->clear(); 521 if (id != 0) { 522 // We're done: full match ahead. 523 p = text.end(); 524 for (;;) { 525 Prog::Inst* ip = prog_->inst(id); 526 switch (ip->opcode()) { 527 default: 528 LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode(); 529 break; 530 531 case kInstCapture: 532 match_[ip->cap()] = p; 533 id = ip->out(); 534 continue; 535 536 case kInstNop: 537 id = ip->out(); 538 continue; 539 540 case kInstMatch: 541 match_[1] = p; 542 matched_ = true; 543 break; 544 545 case kInstEmptyWidth: 546 if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) { 547 LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty(); 548 break; 549 } 550 id = ip->out(); 551 continue; 552 } 553 break; 554 } 555 break; 556 } 557 558 if (p > text.end()) 559 break; 560 561 // Start a new thread if there have not been any matches. 562 // (No point in starting a new thread if there have been 563 // matches, since it would be to the right of the match 564 // we already found.) 565 if (!matched_ && (!anchored || p == text.begin())) { 566 // If there's a required first byte for an unanchored search 567 // and we're not in the middle of any possible matches, 568 // use memchr to search for the byte quickly. 569 if (!anchored && first_byte_ >= 0 && runq->size() == 0 && 570 p < text.end() && (p[0] & 0xFF) != first_byte_) { 571 p = reinterpret_cast<const char*>(memchr(p, first_byte_, 572 text.end() - p)); 573 if (p == NULL) { 574 p = text.end(); 575 isword = 0; 576 } else { 577 isword = Prog::IsWordChar(p[0] & 0xFF); 578 } 579 flag = Prog::EmptyFlags(context, p); 580 } 581 582 // Steal match storage (cleared but unused as of yet) 583 // temporarily to hold match boundaries for new thread. 584 match_[0] = p; 585 AddToThreadq(runq, start_, flag, p, match_); 586 match_[0] = NULL; 587 } 588 589 // If all the threads have died, stop early. 590 if (runq->size() == 0) { 591 if (Debug) 592 fprintf(stderr, "dead\n"); 593 break; 594 } 595 596 if (p == text.end()) 597 c = 0; 598 else 599 c = *p & 0xFF; 600 wasword = isword; 601 602 // Will run step(runq, nextq, c, ...) on next iteration. See above. 603 } 604 605 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) 606 FreeThread(i->second); 607 608 if (matched_) { 609 for (int i = 0; i < nsubmatch; i++) 610 submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]); 611 if (Debug) 612 fprintf(stderr, "match (%d,%d)\n", 613 static_cast<int>(match_[0] - btext_), 614 static_cast<int>(match_[1] - btext_)); 615 return true; 616 } 617 VLOG(1) << "No matches found"; 618 return false; 619 } 620 621 // Computes whether all successful matches have a common first byte, 622 // and if so, returns that byte. If not, returns -1. 623 int NFA::ComputeFirstByte() { 624 if (start_ == 0) 625 return -1; 626 627 int b = -1; // first byte, not yet computed 628 629 typedef SparseSet Workq; 630 Workq q(prog_->size()); 631 q.insert(start_); 632 for (Workq::iterator it = q.begin(); it != q.end(); ++it) { 633 int id = *it; 634 Prog::Inst* ip = prog_->inst(id); 635 switch (ip->opcode()) { 636 default: 637 LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte"; 638 break; 639 640 case kInstMatch: 641 // The empty string matches: no first byte. 642 return -1; 643 644 case kInstByteRange: 645 // Must match only a single byte 646 if (ip->lo() != ip->hi()) 647 return -1; 648 if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z') 649 return -1; 650 // If we haven't seen any bytes yet, record it; 651 // otherwise must match the one we saw before. 652 if (b == -1) 653 b = ip->lo(); 654 else if (b != ip->lo()) 655 return -1; 656 break; 657 658 case kInstNop: 659 case kInstCapture: 660 case kInstEmptyWidth: 661 // Continue on. 662 // Ignore ip->empty() flags for kInstEmptyWidth 663 // in order to be as conservative as possible 664 // (assume all possible empty-width flags are true). 665 if (ip->out()) 666 q.insert(ip->out()); 667 break; 668 669 case kInstAlt: 670 case kInstAltMatch: 671 // Explore alternatives. 672 if (ip->out()) 673 q.insert(ip->out()); 674 if (ip->out1()) 675 q.insert(ip->out1()); 676 break; 677 678 case kInstFail: 679 break; 680 } 681 } 682 return b; 683 } 684 685 bool 686 Prog::SearchNFA(const StringPiece& text, const StringPiece& context, 687 Anchor anchor, MatchKind kind, 688 StringPiece* match, int nmatch) { 689 if (NFA::Debug) 690 Dump(); 691 692 NFA nfa(this); 693 StringPiece sp; 694 if (kind == kFullMatch) { 695 anchor = kAnchored; 696 if (nmatch == 0) { 697 match = &sp; 698 nmatch = 1; 699 } 700 } 701 if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch)) 702 return false; 703 if (kind == kFullMatch && match[0].end() != text.end()) 704 return false; 705 return true; 706 } 707 708 } // namespace re2 709 710