Home | History | Annotate | Download | only in re2
      1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Helper class for traversing Regexps without recursion.
      6 // Clients should declare their own subclasses that override
      7 // the PreVisit and PostVisit methods, which are called before
      8 // and after visiting the subexpressions.
      9 
     10 // Not quite the Visitor pattern, because (among other things)
     11 // the Visitor pattern is recursive.
     12 
     13 #ifndef RE2_WALKER_INL_H__
     14 #define RE2_WALKER_INL_H__
     15 
     16 #include "re2/regexp.h"
     17 
     18 namespace re2 {
     19 
     20 template<typename T> struct WalkState;
     21 
     22 template<typename T> class Regexp::Walker {
     23  public:
     24   Walker();
     25   virtual ~Walker();
     26 
     27   // Virtual method called before visiting re's children.
     28   // PreVisit passes ownership of its return value to its caller.
     29   // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
     30   // and passed to the child PreVisits and PostVisits as parent_arg.
     31   // At the top-most Regexp, parent_arg is arg passed to walk.
     32   // If PreVisit sets *stop to true, the walk does not recurse
     33   // into the children.  Instead it behaves as though the return
     34   // value from PreVisit is the return value from PostVisit.
     35   // The default PreVisit returns parent_arg.
     36   virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
     37 
     38   // Virtual method called after visiting re's children.
     39   // The pre_arg is the T that PreVisit returned.
     40   // The child_args is a vector of the T that the child PostVisits returned.
     41   // PostVisit takes ownership of pre_arg.
     42   // PostVisit takes ownership of the Ts
     43   // in *child_args, but not the vector itself.
     44   // PostVisit passes ownership of its return value
     45   // to its caller.
     46   // The default PostVisit simply returns pre_arg.
     47   virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
     48                       T* child_args, int nchild_args);
     49 
     50   // Virtual method called to copy a T,
     51   // when Walk notices that more than one child is the same re.
     52   virtual T Copy(T arg);
     53 
     54   // Virtual method called to do a "quick visit" of the re,
     55   // but not its children.  Only called once the visit budget
     56   // has been used up and we're trying to abort the walk
     57   // as quickly as possible.  Should return a value that
     58   // makes sense for the parent PostVisits still to be run.
     59   // This function is (hopefully) only called by
     60   // WalkExponential, but must be implemented by all clients,
     61   // just in case.
     62   virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
     63 
     64   // Walks over a regular expression.
     65   // Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
     66   // Returns the T returned by PostVisit on re.
     67   T Walk(Regexp* re, T top_arg);
     68 
     69   // Like Walk, but doesn't use Copy.  This can lead to
     70   // exponential runtimes on cross-linked Regexps like the
     71   // ones generated by Simplify.  To help limit this,
     72   // at most max_visits nodes will be visited and then
     73   // the walk will be cut off early.
     74   // If the walk *is* cut off early, ShortVisit(re)
     75   // will be called on regexps that cannot be fully
     76   // visited rather than calling PreVisit/PostVisit.
     77   T WalkExponential(Regexp* re, T top_arg, int max_visits);
     78 
     79   // Clears the stack.  Should never be necessary, since
     80   // Walk always enters and exits with an empty stack.
     81   // Logs DFATAL if stack is not already clear.
     82   void Reset();
     83 
     84   // Returns whether walk was cut off.
     85   bool stopped_early() { return stopped_early_; }
     86 
     87  private:
     88   // Walk state for the entire traversal.
     89   stack<WalkState<T> >* stack_;
     90   bool stopped_early_;
     91   int max_visits_;
     92 
     93   T WalkInternal(Regexp* re, T top_arg, bool use_copy);
     94 
     95   DISALLOW_EVIL_CONSTRUCTORS(Walker);
     96 };
     97 
     98 template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
     99                                                    T parent_arg,
    100                                                    bool* stop) {
    101   return parent_arg;
    102 }
    103 
    104 template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
    105                                                     T parent_arg,
    106                                                     T pre_arg,
    107                                                     T* child_args,
    108                                                     int nchild_args) {
    109   return pre_arg;
    110 }
    111 
    112 template<typename T> T Regexp::Walker<T>::Copy(T arg) {
    113   return arg;
    114 }
    115 
    116 // State about a single level in the traversal.
    117 template<typename T> struct WalkState {
    118   WalkState<T>(Regexp* re, T parent)
    119     : re(re),
    120       n(-1),
    121       parent_arg(parent),
    122       child_args(NULL) { }
    123 
    124   Regexp* re;  // The regexp
    125   int n;  // The index of the next child to process; -1 means need to PreVisit
    126   T parent_arg;  // Accumulated arguments.
    127   T pre_arg;
    128   T child_arg;  // One-element buffer for child_args.
    129   T* child_args;
    130 };
    131 
    132 template<typename T> Regexp::Walker<T>::Walker() {
    133   stack_ = new stack<WalkState<T> >;
    134   stopped_early_ = false;
    135 }
    136 
    137 template<typename T> Regexp::Walker<T>::~Walker() {
    138   Reset();
    139   delete stack_;
    140 }
    141 
    142 // Clears the stack.  Should never be necessary, since
    143 // Walk always enters and exits with an empty stack.
    144 // Logs DFATAL if stack is not already clear.
    145 template<typename T> void Regexp::Walker<T>::Reset() {
    146   if (stack_ && stack_->size() > 0) {
    147     LOG(DFATAL) << "Stack not empty.";
    148     while (stack_->size() > 0) {
    149       delete stack_->top().child_args;
    150       stack_->pop();
    151     }
    152   }
    153 }
    154 
    155 template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
    156                                                        bool use_copy) {
    157   Reset();
    158 
    159   if (re == NULL) {
    160     LOG(DFATAL) << "Walk NULL";
    161     return top_arg;
    162   }
    163 
    164   stack_->push(WalkState<T>(re, top_arg));
    165 
    166   WalkState<T>* s;
    167   for (;;) {
    168     T t;
    169     s = &stack_->top();
    170     Regexp* re = s->re;
    171     switch (s->n) {
    172       case -1: {
    173         if (--max_visits_ < 0) {
    174           stopped_early_ = true;
    175           t = ShortVisit(re, s->parent_arg);
    176           break;
    177         }
    178         bool stop = false;
    179         s->pre_arg = PreVisit(re, s->parent_arg, &stop);
    180         if (stop) {
    181           t = s->pre_arg;
    182           break;
    183         }
    184         s->n = 0;
    185         s->child_args = NULL;
    186         if (re->nsub_ == 1)
    187           s->child_args = &s->child_arg;
    188         else if (re->nsub_ > 1)
    189           s->child_args = new T[re->nsub_];
    190         // Fall through.
    191       }
    192       default: {
    193         if (re->nsub_ > 0) {
    194           Regexp** sub = re->sub();
    195           if (s->n < re->nsub_) {
    196             if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
    197               s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
    198               s->n++;
    199             } else {
    200               stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
    201             }
    202             continue;
    203           }
    204         }
    205 
    206         t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
    207         if (re->nsub_ > 1)
    208           delete[] s->child_args;
    209         break;
    210       }
    211     }
    212 
    213     // We've finished stack_->top().
    214     // Update next guy down.
    215     stack_->pop();
    216     if (stack_->size() == 0)
    217       return t;
    218     s = &stack_->top();
    219     if (s->child_args != NULL)
    220       s->child_args[s->n] = t;
    221     else
    222       s->child_arg = t;
    223     s->n++;
    224   }
    225 }
    226 
    227 template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
    228   // Without the exponential walking behavior,
    229   // this budget should be more than enough for any
    230   // regexp, and yet not enough to get us in trouble
    231   // as far as CPU time.
    232   max_visits_ = 1000000;
    233   return WalkInternal(re, top_arg, true);
    234 }
    235 
    236 template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
    237                                                           int max_visits) {
    238   max_visits_ = max_visits;
    239   return WalkInternal(re, top_arg, false);
    240 }
    241 
    242 }  // namespace re2
    243 
    244 #endif  // RE2_WALKER_INL_H__
    245