1 // Copyright 2006 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Format a regular expression structure as a string. 6 // Tested by parse_test.cc 7 8 #include "util/util.h" 9 #include "re2/regexp.h" 10 #include "re2/walker-inl.h" 11 12 namespace re2 { 13 14 enum { 15 PrecAtom, 16 PrecUnary, 17 PrecConcat, 18 PrecAlternate, 19 PrecEmpty, 20 PrecParen, 21 PrecToplevel, 22 }; 23 24 // Helper function. See description below. 25 static void AppendCCRange(string* t, Rune lo, Rune hi); 26 27 // Walker to generate string in s_. 28 // The arg pointers are actually integers giving the 29 // context precedence. 30 // The child_args are always NULL. 31 class ToStringWalker : public Regexp::Walker<int> { 32 public: 33 explicit ToStringWalker(string* t) : t_(t) {} 34 35 virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); 36 virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, 37 int* child_args, int nchild_args); 38 virtual int ShortVisit(Regexp* re, int parent_arg) { 39 return 0; 40 } 41 42 private: 43 string* t_; // The string the walker appends to. 44 45 DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker); 46 }; 47 48 string Regexp::ToString() { 49 string t; 50 ToStringWalker w(&t); 51 w.WalkExponential(this, PrecToplevel, 100000); 52 if (w.stopped_early()) 53 t += " [truncated]"; 54 return t; 55 } 56 57 #define ToString DontCallToString // Avoid accidental recursion. 58 59 // Visits re before children are processed. 60 // Appends ( if needed and passes new precedence to children. 61 int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { 62 int prec = parent_arg; 63 int nprec = PrecAtom; 64 65 switch (re->op()) { 66 case kRegexpNoMatch: 67 case kRegexpEmptyMatch: 68 case kRegexpLiteral: 69 case kRegexpAnyChar: 70 case kRegexpAnyByte: 71 case kRegexpBeginLine: 72 case kRegexpEndLine: 73 case kRegexpBeginText: 74 case kRegexpEndText: 75 case kRegexpWordBoundary: 76 case kRegexpNoWordBoundary: 77 case kRegexpCharClass: 78 case kRegexpHaveMatch: 79 nprec = PrecAtom; 80 break; 81 82 case kRegexpConcat: 83 case kRegexpLiteralString: 84 if (prec < PrecConcat) 85 t_->append("(?:"); 86 nprec = PrecConcat; 87 break; 88 89 case kRegexpAlternate: 90 if (prec < PrecAlternate) 91 t_->append("(?:"); 92 nprec = PrecAlternate; 93 break; 94 95 case kRegexpCapture: 96 t_->append("("); 97 if (re->name()) { 98 t_->append("?P<"); 99 t_->append(*re->name()); 100 t_->append(">"); 101 } 102 nprec = PrecParen; 103 break; 104 105 case kRegexpStar: 106 case kRegexpPlus: 107 case kRegexpQuest: 108 case kRegexpRepeat: 109 if (prec < PrecUnary) 110 t_->append("(?:"); 111 // The subprecedence here is PrecAtom instead of PrecUnary 112 // because PCRE treats two unary ops in a row as a parse error. 113 nprec = PrecAtom; 114 break; 115 } 116 117 return nprec; 118 } 119 120 static void AppendLiteral(string *t, Rune r, bool foldcase) { 121 if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) { 122 t->append(1, '\\'); 123 t->append(1, r); 124 } else if (foldcase && 'a' <= r && r <= 'z') { 125 if ('a' <= r && r <= 'z') 126 r += 'A' - 'a'; 127 t->append(1, '['); 128 t->append(1, r); 129 t->append(1, r + 'a' - 'A'); 130 t->append(1, ']'); 131 } else { 132 AppendCCRange(t, r, r); 133 } 134 } 135 136 // Visits re after children are processed. 137 // For childless regexps, all the work is done here. 138 // For regexps with children, append any unary suffixes or ). 139 int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, 140 int* child_args, int nchild_args) { 141 int prec = parent_arg; 142 switch (re->op()) { 143 case kRegexpNoMatch: 144 // There's no simple symbol for "no match", but 145 // [^0-Runemax] excludes everything. 146 t_->append("[^\\x00-\\x{10ffff}]"); 147 break; 148 149 case kRegexpEmptyMatch: 150 // Append (?:) to make empty string visible, 151 // unless this is already being parenthesized. 152 if (prec < PrecEmpty) 153 t_->append("(?:)"); 154 break; 155 156 case kRegexpLiteral: 157 AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase); 158 break; 159 160 case kRegexpLiteralString: 161 for (int i = 0; i < re->nrunes(); i++) 162 AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase); 163 if (prec < PrecConcat) 164 t_->append(")"); 165 break; 166 167 case kRegexpConcat: 168 if (prec < PrecConcat) 169 t_->append(")"); 170 break; 171 172 case kRegexpAlternate: 173 // Clumsy but workable: the children all appended | 174 // at the end of their strings, so just remove the last one. 175 if ((*t_)[t_->size()-1] == '|') 176 t_->erase(t_->size()-1); 177 else 178 LOG(DFATAL) << "Bad final char: " << t_; 179 if (prec < PrecAlternate) 180 t_->append(")"); 181 break; 182 183 case kRegexpStar: 184 t_->append("*"); 185 if (re->parse_flags() & Regexp::NonGreedy) 186 t_->append("?"); 187 if (prec < PrecUnary) 188 t_->append(")"); 189 break; 190 191 case kRegexpPlus: 192 t_->append("+"); 193 if (re->parse_flags() & Regexp::NonGreedy) 194 t_->append("?"); 195 if (prec < PrecUnary) 196 t_->append(")"); 197 break; 198 199 case kRegexpQuest: 200 t_->append("?"); 201 if (re->parse_flags() & Regexp::NonGreedy) 202 t_->append("?"); 203 if (prec < PrecUnary) 204 t_->append(")"); 205 break; 206 207 case kRegexpRepeat: 208 if (re->max() == -1) 209 t_->append(StringPrintf("{%d,}", re->min())); 210 else if (re->min() == re->max()) 211 t_->append(StringPrintf("{%d}", re->min())); 212 else 213 t_->append(StringPrintf("{%d,%d}", re->min(), re->max())); 214 if (re->parse_flags() & Regexp::NonGreedy) 215 t_->append("?"); 216 if (prec < PrecUnary) 217 t_->append(")"); 218 break; 219 220 case kRegexpAnyChar: 221 t_->append("."); 222 break; 223 224 case kRegexpAnyByte: 225 t_->append("\\C"); 226 break; 227 228 case kRegexpBeginLine: 229 t_->append("^"); 230 break; 231 232 case kRegexpEndLine: 233 t_->append("$"); 234 break; 235 236 case kRegexpBeginText: 237 t_->append("(?-m:^)"); 238 break; 239 240 case kRegexpEndText: 241 if (re->parse_flags() & Regexp::WasDollar) 242 t_->append("(?-m:$)"); 243 else 244 t_->append("\\z"); 245 break; 246 247 case kRegexpWordBoundary: 248 t_->append("\\b"); 249 break; 250 251 case kRegexpNoWordBoundary: 252 t_->append("\\B"); 253 break; 254 255 case kRegexpCharClass: { 256 if (re->cc()->size() == 0) { 257 t_->append("[^\\x00-\\x{10ffff}]"); 258 break; 259 } 260 t_->append("["); 261 // Heuristic: show class as negated if it contains the 262 // non-character 0xFFFE. 263 CharClass* cc = re->cc(); 264 if (cc->Contains(0xFFFE)) { 265 cc = cc->Negate(); 266 t_->append("^"); 267 } 268 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) 269 AppendCCRange(t_, i->lo, i->hi); 270 if (cc != re->cc()) 271 cc->Delete(); 272 t_->append("]"); 273 break; 274 } 275 276 case kRegexpCapture: 277 t_->append(")"); 278 break; 279 280 case kRegexpHaveMatch: 281 // There's no syntax accepted by the parser to generate 282 // this node (it is generated by RE2::Set) so make something 283 // up that is readable but won't compile. 284 t_->append("(?HaveMatch:%d)", re->match_id()); 285 break; 286 } 287 288 // If the parent is an alternation, append the | for it. 289 if (prec == PrecAlternate) 290 t_->append("|"); 291 292 return 0; 293 } 294 295 // Appends a rune for use in a character class to the string t. 296 static void AppendCCChar(string* t, Rune r) { 297 if (0x20 <= r && r <= 0x7E) { 298 if (strchr("[]^-\\", r)) 299 t->append("\\"); 300 t->append(1, r); 301 return; 302 } 303 switch (r) { 304 default: 305 break; 306 307 case '\r': 308 t->append("\\r"); 309 return; 310 311 case '\t': 312 t->append("\\t"); 313 return; 314 315 case '\n': 316 t->append("\\n"); 317 return; 318 319 case '\f': 320 t->append("\\f"); 321 return; 322 } 323 324 if (r < 0x100) { 325 StringAppendF(t, "\\x%02x", static_cast<int>(r)); 326 return; 327 } 328 StringAppendF(t, "\\x{%x}", static_cast<int>(r)); 329 } 330 331 static void AppendCCRange(string* t, Rune lo, Rune hi) { 332 if (lo > hi) 333 return; 334 AppendCCChar(t, lo); 335 if (lo < hi) { 336 t->append("-"); 337 AppendCCChar(t, hi); 338 } 339 } 340 341 } // namespace re2 342