1 // Copyright 2006 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Helper class for traversing Regexps without recursion. 6 // Clients should declare their own subclasses that override 7 // the PreVisit and PostVisit methods, which are called before 8 // and after visiting the subexpressions. 9 10 // Not quite the Visitor pattern, because (among other things) 11 // the Visitor pattern is recursive. 12 13 #ifndef RE2_WALKER_INL_H__ 14 #define RE2_WALKER_INL_H__ 15 16 #include "re2/regexp.h" 17 18 namespace re2 { 19 20 template<typename T> struct WalkState; 21 22 template<typename T> class Regexp::Walker { 23 public: 24 Walker(); 25 virtual ~Walker(); 26 27 // Virtual method called before visiting re's children. 28 // PreVisit passes ownership of its return value to its caller. 29 // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg 30 // and passed to the child PreVisits and PostVisits as parent_arg. 31 // At the top-most Regexp, parent_arg is arg passed to walk. 32 // If PreVisit sets *stop to true, the walk does not recurse 33 // into the children. Instead it behaves as though the return 34 // value from PreVisit is the return value from PostVisit. 35 // The default PreVisit returns parent_arg. 36 virtual T PreVisit(Regexp* re, T parent_arg, bool* stop); 37 38 // Virtual method called after visiting re's children. 39 // The pre_arg is the T that PreVisit returned. 40 // The child_args is a vector of the T that the child PostVisits returned. 41 // PostVisit takes ownership of pre_arg. 42 // PostVisit takes ownership of the Ts 43 // in *child_args, but not the vector itself. 44 // PostVisit passes ownership of its return value 45 // to its caller. 46 // The default PostVisit simply returns pre_arg. 47 virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg, 48 T* child_args, int nchild_args); 49 50 // Virtual method called to copy a T, 51 // when Walk notices that more than one child is the same re. 52 virtual T Copy(T arg); 53 54 // Virtual method called to do a "quick visit" of the re, 55 // but not its children. Only called once the visit budget 56 // has been used up and we're trying to abort the walk 57 // as quickly as possible. Should return a value that 58 // makes sense for the parent PostVisits still to be run. 59 // This function is (hopefully) only called by 60 // WalkExponential, but must be implemented by all clients, 61 // just in case. 62 virtual T ShortVisit(Regexp* re, T parent_arg) = 0; 63 64 // Walks over a regular expression. 65 // Top_arg is passed as parent_arg to PreVisit and PostVisit of re. 66 // Returns the T returned by PostVisit on re. 67 T Walk(Regexp* re, T top_arg); 68 69 // Like Walk, but doesn't use Copy. This can lead to 70 // exponential runtimes on cross-linked Regexps like the 71 // ones generated by Simplify. To help limit this, 72 // at most max_visits nodes will be visited and then 73 // the walk will be cut off early. 74 // If the walk *is* cut off early, ShortVisit(re) 75 // will be called on regexps that cannot be fully 76 // visited rather than calling PreVisit/PostVisit. 77 T WalkExponential(Regexp* re, T top_arg, int max_visits); 78 79 // Clears the stack. Should never be necessary, since 80 // Walk always enters and exits with an empty stack. 81 // Logs DFATAL if stack is not already clear. 82 void Reset(); 83 84 // Returns whether walk was cut off. 85 bool stopped_early() { return stopped_early_; } 86 87 private: 88 // Walk state for the entire traversal. 89 stack<WalkState<T> >* stack_; 90 bool stopped_early_; 91 int max_visits_; 92 93 T WalkInternal(Regexp* re, T top_arg, bool use_copy); 94 95 DISALLOW_EVIL_CONSTRUCTORS(Walker); 96 }; 97 98 template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re, 99 T parent_arg, 100 bool* stop) { 101 return parent_arg; 102 } 103 104 template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re, 105 T parent_arg, 106 T pre_arg, 107 T* child_args, 108 int nchild_args) { 109 return pre_arg; 110 } 111 112 template<typename T> T Regexp::Walker<T>::Copy(T arg) { 113 return arg; 114 } 115 116 // State about a single level in the traversal. 117 template<typename T> struct WalkState { 118 WalkState<T>(Regexp* re, T parent) 119 : re(re), 120 n(-1), 121 parent_arg(parent), 122 child_args(NULL) { } 123 124 Regexp* re; // The regexp 125 int n; // The index of the next child to process; -1 means need to PreVisit 126 T parent_arg; // Accumulated arguments. 127 T pre_arg; 128 T child_arg; // One-element buffer for child_args. 129 T* child_args; 130 }; 131 132 template<typename T> Regexp::Walker<T>::Walker() { 133 stack_ = new stack<WalkState<T> >; 134 stopped_early_ = false; 135 } 136 137 template<typename T> Regexp::Walker<T>::~Walker() { 138 Reset(); 139 delete stack_; 140 } 141 142 // Clears the stack. Should never be necessary, since 143 // Walk always enters and exits with an empty stack. 144 // Logs DFATAL if stack is not already clear. 145 template<typename T> void Regexp::Walker<T>::Reset() { 146 if (stack_ && stack_->size() > 0) { 147 LOG(DFATAL) << "Stack not empty."; 148 while (stack_->size() > 0) { 149 delete stack_->top().child_args; 150 stack_->pop(); 151 } 152 } 153 } 154 155 template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, 156 bool use_copy) { 157 Reset(); 158 159 if (re == NULL) { 160 LOG(DFATAL) << "Walk NULL"; 161 return top_arg; 162 } 163 164 stack_->push(WalkState<T>(re, top_arg)); 165 166 WalkState<T>* s; 167 for (;;) { 168 T t; 169 s = &stack_->top(); 170 Regexp* re = s->re; 171 switch (s->n) { 172 case -1: { 173 if (--max_visits_ < 0) { 174 stopped_early_ = true; 175 t = ShortVisit(re, s->parent_arg); 176 break; 177 } 178 bool stop = false; 179 s->pre_arg = PreVisit(re, s->parent_arg, &stop); 180 if (stop) { 181 t = s->pre_arg; 182 break; 183 } 184 s->n = 0; 185 s->child_args = NULL; 186 if (re->nsub_ == 1) 187 s->child_args = &s->child_arg; 188 else if (re->nsub_ > 1) 189 s->child_args = new T[re->nsub_]; 190 // Fall through. 191 } 192 default: { 193 if (re->nsub_ > 0) { 194 Regexp** sub = re->sub(); 195 if (s->n < re->nsub_) { 196 if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) { 197 s->child_args[s->n] = Copy(s->child_args[s->n - 1]); 198 s->n++; 199 } else { 200 stack_->push(WalkState<T>(sub[s->n], s->pre_arg)); 201 } 202 continue; 203 } 204 } 205 206 t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n); 207 if (re->nsub_ > 1) 208 delete[] s->child_args; 209 break; 210 } 211 } 212 213 // We've finished stack_->top(). 214 // Update next guy down. 215 stack_->pop(); 216 if (stack_->size() == 0) 217 return t; 218 s = &stack_->top(); 219 if (s->child_args != NULL) 220 s->child_args[s->n] = t; 221 else 222 s->child_arg = t; 223 s->n++; 224 } 225 } 226 227 template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) { 228 // Without the exponential walking behavior, 229 // this budget should be more than enough for any 230 // regexp, and yet not enough to get us in trouble 231 // as far as CPU time. 232 max_visits_ = 1000000; 233 return WalkInternal(re, top_arg, true); 234 } 235 236 template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg, 237 int max_visits) { 238 max_visits_ = max_visits; 239 return WalkInternal(re, top_arg, false); 240 } 241 242 } // namespace re2 243 244 #endif // RE2_WALKER_INL_H__ 245