Home | History | Annotate | Download | only in re2
      1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
      6 
      7 // Prog::SearchBitState is a regular expression search with submatch
      8 // tracking for small regular expressions and texts.  Like
      9 // testing/backtrack.cc, it allocates a bit vector with (length of
     10 // text) * (length of prog) bits, to make sure it never explores the
     11 // same (character position, instruction) state multiple times.  This
     12 // limits the search to run in time linear in the length of the text.
     13 //
     14 // Unlike testing/backtrack.cc, SearchBitState is not recursive
     15 // on the text.
     16 //
     17 // SearchBitState is a fast replacement for the NFA code on small
     18 // regexps and texts when SearchOnePass cannot be used.
     19 
     20 #include "re2/prog.h"
     21 #include "re2/regexp.h"
     22 
     23 namespace re2 {
     24 
     25 struct Job {
     26   int id;
     27   int arg;
     28   const char* p;
     29 };
     30 
     31 class BitState {
     32  public:
     33   explicit BitState(Prog* prog);
     34   ~BitState();
     35 
     36   // The usual Search prototype.
     37   // Can only call Search once per BitState.
     38   bool Search(const StringPiece& text, const StringPiece& context,
     39               bool anchored, bool longest,
     40               StringPiece* submatch, int nsubmatch);
     41 
     42  private:
     43   inline bool ShouldVisit(int id, const char* p);
     44   void Push(int id, const char* p, int arg);
     45   bool GrowStack();
     46   bool TrySearch(int id, const char* p);
     47 
     48   // Search parameters
     49   Prog* prog_;              // program being run
     50   StringPiece text_;        // text being searched
     51   StringPiece context_;     // greater context of text being searched
     52   bool anchored_;           // whether search is anchored at text.begin()
     53   bool longest_;            // whether search wants leftmost-longest match
     54   bool endmatch_;           // whether match must end at text.end()
     55   StringPiece *submatch_;   // submatches to fill in
     56   int nsubmatch_;           //   # of submatches to fill in
     57 
     58   // Search state
     59   const char** cap_;        // capture registers
     60   int ncap_;
     61 
     62   static const int VisitedBits = 32;
     63   uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
     64   int nvisited_;            //   # of words in bitmap
     65 
     66   Job *job_;                // stack of text positions to explore
     67   int njob_;
     68   int maxjob_;
     69 };
     70 
     71 BitState::BitState(Prog* prog)
     72   : prog_(prog),
     73     anchored_(false),
     74     longest_(false),
     75     endmatch_(false),
     76     submatch_(NULL),
     77     nsubmatch_(0),
     78     cap_(NULL),
     79     ncap_(0),
     80     visited_(NULL),
     81     nvisited_(0),
     82     job_(NULL),
     83     njob_(0),
     84     maxjob_(0) {
     85 }
     86 
     87 BitState::~BitState() {
     88   delete[] visited_;
     89   delete[] job_;
     90   delete[] cap_;
     91 }
     92 
     93 // Should the search visit the pair ip, p?
     94 // If so, remember that it was visited so that the next time,
     95 // we don't repeat the visit.
     96 bool BitState::ShouldVisit(int id, const char* p) {
     97   uint n = id * (text_.size() + 1) + (p - text_.begin());
     98   if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
     99     return false;
    100   visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
    101   return true;
    102 }
    103 
    104 // Grow the stack.
    105 bool BitState::GrowStack() {
    106   // VLOG(0) << "Reallocate.";
    107   maxjob_ *= 2;
    108   Job* newjob = new Job[maxjob_];
    109   memmove(newjob, job_, njob_*sizeof job_[0]);
    110   delete[] job_;
    111   job_ = newjob;
    112   if (njob_ >= maxjob_) {
    113     LOG(DFATAL) << "Job stack overflow.";
    114     return false;
    115   }
    116   return true;
    117 }
    118 
    119 // Push the triple (id, p, arg) onto the stack, growing it if necessary.
    120 void BitState::Push(int id, const char* p, int arg) {
    121   if (njob_ >= maxjob_) {
    122     if (!GrowStack())
    123       return;
    124   }
    125   int op = prog_->inst(id)->opcode();
    126   if (op == kInstFail)
    127     return;
    128 
    129   // Only check ShouldVisit when arg == 0.
    130   // When arg > 0, we are continuing a previous visit.
    131   if (arg == 0 && !ShouldVisit(id, p))
    132     return;
    133 
    134   Job* j = &job_[njob_++];
    135   j->id = id;
    136   j->p = p;
    137   j->arg = arg;
    138 }
    139 
    140 // Try a search from instruction id0 in state p0.
    141 // Return whether it succeeded.
    142 bool BitState::TrySearch(int id0, const char* p0) {
    143   bool matched = false;
    144   const char* end = text_.end();
    145   njob_ = 0;
    146   Push(id0, p0, 0);
    147   while (njob_ > 0) {
    148     // Pop job off stack.
    149     --njob_;
    150     int id = job_[njob_].id;
    151     const char* p = job_[njob_].p;
    152     int arg = job_[njob_].arg;
    153 
    154     // Optimization: rather than push and pop,
    155     // code that is going to Push and continue
    156     // the loop simply updates ip, p, and arg
    157     // and jumps to CheckAndLoop.  We have to
    158     // do the ShouldVisit check that Push
    159     // would have, but we avoid the stack
    160     // manipulation.
    161     if (0) {
    162     CheckAndLoop:
    163       if (!ShouldVisit(id, p))
    164         continue;
    165     }
    166 
    167     // Visit ip, p.
    168     // VLOG(0) << "Job: " << ip->id() << " "
    169     //         << (p - text_.begin()) << " " << arg;
    170     Prog::Inst* ip = prog_->inst(id);
    171     switch (ip->opcode()) {
    172       case kInstFail:
    173       default:
    174         LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
    175         return false;
    176 
    177       case kInstAlt:
    178         // Cannot just
    179         //   Push(ip->out1(), p, 0);
    180         //   Push(ip->out(), p, 0);
    181         // If, during the processing of ip->out(), we encounter
    182         // ip->out1() via another path, we want to process it then.
    183         // Pushing it here will inhibit that.  Instead, re-push
    184         // ip with arg==1 as a reminder to push ip->out1() later.
    185         switch (arg) {
    186           case 0:
    187             Push(id, p, 1);  // come back when we're done
    188             id = ip->out();
    189             goto CheckAndLoop;
    190 
    191           case 1:
    192             // Finished ip->out(); try ip->out1().
    193             arg = 0;
    194             id = ip->out1();
    195             goto CheckAndLoop;
    196         }
    197         LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
    198         continue;
    199 
    200       case kInstAltMatch:
    201         // One opcode is byte range; the other leads to match.
    202         if (ip->greedy(prog_)) {
    203           // out1 is the match
    204           Push(ip->out1(), p, 0);
    205           id = ip->out1();
    206           p = end;
    207           goto CheckAndLoop;
    208         }
    209         // out is the match - non-greedy
    210         Push(ip->out(), end, 0);
    211         id = ip->out();
    212         goto CheckAndLoop;
    213 
    214       case kInstByteRange: {
    215         int c = -1;
    216         if (p < end)
    217           c = *p & 0xFF;
    218         if (ip->Matches(c)) {
    219           id = ip->out();
    220           p++;
    221           goto CheckAndLoop;
    222         }
    223         continue;
    224       }
    225 
    226       case kInstCapture:
    227         switch (arg) {
    228           case 0:
    229             if (0 <= ip->cap() && ip->cap() < ncap_) {
    230               // Capture p to register, but save old value.
    231               Push(id, cap_[ip->cap()], 1);  // come back when we're done
    232               cap_[ip->cap()] = p;
    233             }
    234             // Continue on.
    235             id = ip->out();
    236             goto CheckAndLoop;
    237           case 1:
    238             // Finished ip->out(); restore the old value.
    239             cap_[ip->cap()] = p;
    240             continue;
    241         }
    242         LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
    243         continue;
    244 
    245       case kInstEmptyWidth:
    246         if (ip->empty() & ~Prog::EmptyFlags(context_, p))
    247           continue;
    248         id = ip->out();
    249         goto CheckAndLoop;
    250 
    251       case kInstNop:
    252         id = ip->out();
    253         goto CheckAndLoop;
    254 
    255       case kInstMatch: {
    256         if (endmatch_ && p != text_.end())
    257           continue;
    258 
    259         // VLOG(0) << "Found match.";
    260         // We found a match.  If the caller doesn't care
    261         // where the match is, no point going further.
    262         if (nsubmatch_ == 0)
    263           return true;
    264 
    265         // Record best match so far.
    266         // Only need to check end point, because this entire
    267         // call is only considering one start position.
    268         matched = true;
    269         cap_[1] = p;
    270         if (submatch_[0].data() == NULL ||
    271             (longest_ && p > submatch_[0].end())) {
    272           for (int i = 0; i < nsubmatch_; i++)
    273             submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
    274         }
    275 
    276         // If going for first match, we're done.
    277         if (!longest_)
    278           return true;
    279 
    280         // If we used the entire text, no longer match is possible.
    281         if (p == text_.end())
    282           return true;
    283 
    284         // Otherwise, continue on in hope of a longer match.
    285         continue;
    286       }
    287     }
    288   }
    289   return matched;
    290 }
    291 
    292 // Search text (within context) for prog_.
    293 bool BitState::Search(const StringPiece& text, const StringPiece& context,
    294                       bool anchored, bool longest,
    295                       StringPiece* submatch, int nsubmatch) {
    296   // Search parameters.
    297   text_ = text;
    298   context_ = context;
    299   if (context_.begin() == NULL)
    300     context_ = text;
    301   if (prog_->anchor_start() && context_.begin() != text.begin())
    302     return false;
    303   if (prog_->anchor_end() && context_.end() != text.end())
    304     return false;
    305   anchored_ = anchored || prog_->anchor_start();
    306   longest_ = longest || prog_->anchor_end();
    307   endmatch_ = prog_->anchor_end();
    308   submatch_ = submatch;
    309   nsubmatch_ = nsubmatch;
    310   for (int i = 0; i < nsubmatch_; i++)
    311     submatch_[i] = NULL;
    312 
    313   // Allocate scratch space.
    314   nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
    315   visited_ = new uint32[nvisited_];
    316   memset(visited_, 0, nvisited_*sizeof visited_[0]);
    317   // VLOG(0) << "nvisited_ = " << nvisited_;
    318 
    319   ncap_ = 2*nsubmatch;
    320   if (ncap_ < 2)
    321     ncap_ = 2;
    322   cap_ = new const char*[ncap_];
    323   memset(cap_, 0, ncap_*sizeof cap_[0]);
    324 
    325   maxjob_ = 256;
    326   job_ = new Job[maxjob_];
    327 
    328   // Anchored search must start at text.begin().
    329   if (anchored_) {
    330     cap_[0] = text.begin();
    331     return TrySearch(prog_->start(), text.begin());
    332   }
    333 
    334   // Unanchored search, starting from each possible text position.
    335   // Notice that we have to try the empty string at the end of
    336   // the text, so the loop condition is p <= text.end(), not p < text.end().
    337   // This looks like it's quadratic in the size of the text,
    338   // but we are not clearing visited_ between calls to TrySearch,
    339   // so no work is duplicated and it ends up still being linear.
    340   for (const char* p = text.begin(); p <= text.end(); p++) {
    341     cap_[0] = p;
    342     if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
    343       return true;
    344   }
    345   return false;
    346 }
    347 
    348 // Bit-state search.
    349 bool Prog::SearchBitState(const StringPiece& text,
    350                           const StringPiece& context,
    351                           Anchor anchor,
    352                           MatchKind kind,
    353                           StringPiece* match,
    354                           int nmatch) {
    355   // If full match, we ask for an anchored longest match
    356   // and then check that match[0] == text.
    357   // So make sure match[0] exists.
    358   StringPiece sp0;
    359   if (kind == kFullMatch) {
    360     anchor = kAnchored;
    361     if (nmatch < 1) {
    362       match = &sp0;
    363       nmatch = 1;
    364     }
    365   }
    366 
    367   // Run the search.
    368   BitState b(this);
    369   bool anchored = anchor == kAnchored;
    370   bool longest = kind != kFirstMatch;
    371   if (!b.Search(text, context, anchored, longest, match, nmatch))
    372     return false;
    373   if (kind == kFullMatch && match[0].end() != text.end())
    374     return false;
    375   return true;
    376 }
    377 
    378 }  // namespace re2
    379