Home | History | Annotate | Download | only in testing
      1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
      6 //
      7 // Prog::BadSearchBacktrack is a backtracking regular expression search,
      8 // except that it remembers where it has been, trading a lot of
      9 // memory for a lot of time. It exists only for testing purposes.
     10 //
     11 // Let me repeat that.
     12 //
     13 // THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
     14 //   - It uses a ton of memory.
     15 //   - It uses a ton of stack.
     16 //   - It uses CHECK and LOG(FATAL).
     17 //   - It implements unanchored search by repeated anchored search.
     18 //
     19 // On the other hand, it is very simple and a good reference
     20 // implementation for the more complicated regexp packages.
     21 //
     22 // In BUILD, this file is linked into the ":testing" library,
     23 // not the main library, in order to make it harder to pick up
     24 // accidentally.
     25 
     26 #include "util/util.h"
     27 #include "re2/prog.h"
     28 #include "re2/regexp.h"
     29 
     30 namespace re2 {
     31 
     32 // Backtracker holds the state for a backtracking search.
     33 //
     34 // Excluding the search parameters, the main search state
     35 // is just the "capture registers", which record, for the
     36 // current execution, the string position at which each
     37 // parenthesis was passed.  cap_[0] and cap_[1] are the
     38 // left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
     39 //
     40 // To avoid infinite loops during backtracking on expressions
     41 // like (a*)*, the visited_[] bitmap marks the (state, string-position)
     42 // pairs that have already been explored and are thus not worth
     43 // re-exploring if we get there via another path.  Modern backtracking
     44 // libraries engineer their program representation differently, to make
     45 // such infinite loops possible to avoid without keeping a giant visited_
     46 // bitmap, but visited_ works fine for a reference implementation
     47 // and it has the nice benefit of making the search run in linear time.
     48 class Backtracker {
     49  public:
     50   explicit Backtracker(Prog* prog);
     51   ~Backtracker();
     52 
     53   bool Search(const StringPiece& text, const StringPiece& context,
     54               bool anchored, bool longest,
     55               StringPiece* submatch, int nsubmatch);
     56 
     57  private:
     58   // Explores from instruction ip at string position p looking for a match.
     59   // Returns true if found (so that caller can stop trying other possibilities).
     60   bool Visit(int id, const char* p);
     61 
     62   // Search parameters
     63   Prog* prog_;              // program being run
     64   StringPiece text_;        // text being searched
     65   StringPiece context_;     // greater context of text being searched
     66   bool anchored_;           // whether search is anchored at text.begin()
     67   bool longest_;            // whether search wants leftmost-longest match
     68   bool endmatch_;           // whether search must end at text.end()
     69   StringPiece *submatch_;   // submatches to fill in
     70   int nsubmatch_;           //   # of submatches to fill in
     71 
     72   // Search state
     73   const char* cap_[64];     // capture registers
     74   uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
     75   int nvisited_;            //   # of words in bitmap
     76 };
     77 
     78 Backtracker::Backtracker(Prog* prog)
     79   : prog_(prog),
     80     anchored_(false),
     81     longest_(false),
     82     endmatch_(false),
     83     submatch_(NULL),
     84     nsubmatch_(0),
     85     visited_(NULL),
     86     nvisited_(0) {
     87 }
     88 
     89 Backtracker::~Backtracker() {
     90   delete[] visited_;
     91 }
     92 
     93 // Runs a backtracking search.
     94 bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
     95                          bool anchored, bool longest,
     96                          StringPiece* submatch, int nsubmatch) {
     97   text_ = text;
     98   context_ = context;
     99   if (context_.begin() == NULL)
    100     context_ = text;
    101   if (prog_->anchor_start() && text.begin() > context_.begin())
    102     return false;
    103   if (prog_->anchor_end() && text.end() < context_.end())
    104     return false;
    105   anchored_ = anchored | prog_->anchor_start();
    106   longest_ = longest | prog_->anchor_end();
    107   endmatch_ = prog_->anchor_end();
    108   submatch_ = submatch;
    109   nsubmatch_ = nsubmatch;
    110   CHECK(2*nsubmatch_ < arraysize(cap_));
    111   memset(cap_, 0, sizeof cap_);
    112 
    113   // We use submatch_[0] for our own bookkeeping,
    114   // so it had better exist.
    115   StringPiece sp0;
    116   if (nsubmatch < 1) {
    117     submatch_ = &sp0;
    118     nsubmatch_ = 1;
    119   }
    120   submatch_[0] = NULL;
    121 
    122   // Allocate new visited_ bitmap -- size is proportional
    123   // to text, so have to reallocate on each call to Search.
    124   delete[] visited_;
    125   nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
    126   visited_ = new uint32[nvisited_];
    127   memset(visited_, 0, nvisited_*sizeof visited_[0]);
    128 
    129   // Anchored search must start at text.begin().
    130   if (anchored_) {
    131     cap_[0] = text.begin();
    132     return Visit(prog_->start(), text.begin());
    133   }
    134 
    135   // Unanchored search, starting from each possible text position.
    136   // Notice that we have to try the empty string at the end of
    137   // the text, so the loop condition is p <= text.end(), not p < text.end().
    138   for (const char* p = text.begin(); p <= text.end(); p++) {
    139     cap_[0] = p;
    140     if (Visit(prog_->start(), p))  // Match must be leftmost; done.
    141       return true;
    142   }
    143   return false;
    144 }
    145 
    146 // Explores from instruction ip at string position p looking for a match.
    147 // Return true if found (so that caller can stop trying other possibilities).
    148 bool Backtracker::Visit(int id, const char* p) {
    149   // Check bitmap.  If we've already explored from here,
    150   // either it didn't match or it did but we're hoping for a better match.
    151   // Either way, don't go down that road again.
    152   CHECK(p <= text_.end());
    153   int n = id*(text_.size()+1) + (p - text_.begin());
    154   CHECK_LT(n/32, nvisited_);
    155   if (visited_[n/32] & (1 << (n&31)))
    156     return false;
    157   visited_[n/32] |= 1 << (n&31);
    158 
    159   // Pick out byte at current position.  If at end of string,
    160   // have to explore in hope of finishing a match.  Use impossible byte -1.
    161   int c = -1;
    162   if (p < text_.end())
    163     c = *p & 0xFF;
    164 
    165   Prog::Inst* ip = prog_->inst(id);
    166   switch (ip->opcode()) {
    167     default:
    168       LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
    169       return false;  // not reached
    170 
    171     case kInstAlt:
    172     case kInstAltMatch:
    173       // Try both possible next states: out is preferred to out1.
    174       if (Visit(ip->out(), p)) {
    175         if (longest_)
    176           Visit(ip->out1(), p);
    177         return true;
    178       }
    179       return Visit(ip->out1(), p);
    180 
    181     case kInstByteRange:
    182       if (ip->Matches(c))
    183         return Visit(ip->out(), p+1);
    184       return false;
    185 
    186     case kInstCapture:
    187       if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
    188         // Capture p to register, but save old value.
    189         const char* q = cap_[ip->cap()];
    190         cap_[ip->cap()] = p;
    191         bool ret = Visit(ip->out(), p);
    192         // Restore old value as we backtrack.
    193         cap_[ip->cap()] = q;
    194         return ret;
    195       }
    196       return Visit(ip->out(), p);
    197 
    198     case kInstEmptyWidth:
    199       if (ip->empty() & ~Prog::EmptyFlags(context_, p))
    200         return false;
    201       return Visit(ip->out(), p);
    202 
    203     case kInstNop:
    204       return Visit(ip->out(), p);
    205 
    206     case kInstMatch:
    207       // We found a match.  If it's the best so far, record the
    208       // parameters in the caller's submatch_ array.
    209       if (endmatch_ && p != context_.end())
    210         return false;
    211       cap_[1] = p;
    212       if (submatch_[0].data() == NULL ||           // First match so far ...
    213           (longest_ && p > submatch_[0].end())) {  // ... or better match
    214         for (int i = 0; i < nsubmatch_; i++)
    215           submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
    216       }
    217       return true;
    218 
    219     case kInstFail:
    220       return false;
    221   }
    222 }
    223 
    224 // Runs a backtracking search.
    225 bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
    226                                  const StringPiece& context,
    227                                  Anchor anchor,
    228                                  MatchKind kind,
    229                                  StringPiece* match,
    230                                  int nmatch) {
    231   // If full match, we ask for an anchored longest match
    232   // and then check that match[0] == text.
    233   // So make sure match[0] exists.
    234   StringPiece sp0;
    235   if (kind == kFullMatch) {
    236     anchor = kAnchored;
    237     if (nmatch < 1) {
    238       match = &sp0;
    239       nmatch = 1;
    240     }
    241   }
    242 
    243   // Run the search.
    244   Backtracker b(this);
    245   bool anchored = anchor == kAnchored;
    246   bool longest = kind != kFirstMatch;
    247   if (!b.Search(text, context, anchored, longest, match, nmatch))
    248     return false;
    249   if (kind == kFullMatch && match[0].end() != text.end())
    250     return false;
    251   return true;
    252 }
    253 
    254 }  // namespace re2
    255