1 // Copyright 2008 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Regular expression engine tester -- test all the implementations against each other. 6 7 #include "util/util.h" 8 #include "util/flags.h" 9 #include "re2/testing/tester.h" 10 #include "re2/prog.h" 11 #include "re2/re2.h" 12 #include "re2/regexp.h" 13 14 DEFINE_bool(dump_prog, false, "dump regexp program"); 15 DEFINE_bool(log_okay, false, "log successful runs"); 16 DEFINE_bool(dump_rprog, false, "dump reversed regexp program"); 17 18 DEFINE_int32(max_regexp_failures, 100, 19 "maximum number of regexp test failures (-1 = unlimited)"); 20 21 DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test"); 22 23 namespace re2 { 24 25 enum { 26 kMaxSubmatch = 1+16, // $0...$16 27 }; 28 29 const char* engine_types[kEngineMax] = { 30 "Backtrack", 31 "NFA", 32 "DFA", 33 "DFA1", 34 "OnePass", 35 "BitState", 36 "RE2", 37 "RE2a", 38 "RE2b", 39 "PCRE", 40 }; 41 42 // Returns the name string for the type t. 43 static string EngineString(Engine t) { 44 if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) { 45 return StringPrintf("type%d", static_cast<int>(t)); 46 } 47 return engine_types[t]; 48 } 49 50 // Returns bit mask of engines to use. 51 static uint32 Engines() { 52 static uint32 cached_engines; 53 static bool did_parse; 54 55 if (did_parse) 56 return cached_engines; 57 58 if (FLAGS_regexp_engines.empty()) { 59 cached_engines = ~0; 60 } else { 61 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) 62 if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str())) 63 cached_engines |= 1<<i; 64 } 65 66 if (cached_engines == 0) 67 LOG(INFO) << "Warning: no engines enabled."; 68 if (!UsingPCRE) 69 cached_engines &= ~(1<<kEnginePCRE); 70 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) { 71 if (cached_engines & (1<<i)) 72 LOG(INFO) << EngineString(i) << " enabled"; 73 } 74 did_parse = true; 75 return cached_engines; 76 } 77 78 // The result of running a match. 79 struct TestInstance::Result { 80 bool skipped; // test skipped: wasn't applicable 81 bool matched; // found a match 82 bool untrusted; // don't really trust the answer 83 bool have_submatch; // computed all submatch info 84 bool have_submatch0; // computed just submatch[0] 85 StringPiece submatch[kMaxSubmatch]; 86 }; 87 88 typedef TestInstance::Result Result; 89 90 // Formats a single capture range s in text in the form (a,b) 91 // where a and b are the starting and ending offsets of s in text. 92 static string FormatCapture(const StringPiece& text, const StringPiece& s) { 93 if (s.begin() == NULL) 94 return "(?,?)"; 95 return StringPrintf("(%d,%d)", 96 static_cast<int>(s.begin() - text.begin()), 97 static_cast<int>(s.end() - text.begin())); 98 } 99 100 // Returns whether text contains non-ASCII (>= 0x80) bytes. 101 static bool NonASCII(const StringPiece& text) { 102 for (int i = 0; i < text.size(); i++) 103 if ((uint8)text[i] >= 0x80) 104 return true; 105 return false; 106 } 107 108 // Returns string representation of match kind. 109 static string FormatKind(Prog::MatchKind kind) { 110 switch (kind) { 111 case Prog::kFullMatch: 112 return "full match"; 113 case Prog::kLongestMatch: 114 return "longest match"; 115 case Prog::kFirstMatch: 116 return "first match"; 117 case Prog::kManyMatch: 118 return "many match"; 119 } 120 return "???"; 121 } 122 123 // Returns string representation of anchor kind. 124 static string FormatAnchor(Prog::Anchor anchor) { 125 switch (anchor) { 126 case Prog::kAnchored: 127 return "anchored"; 128 case Prog::kUnanchored: 129 return "unanchored"; 130 } 131 return "???"; 132 } 133 134 struct ParseMode { 135 Regexp::ParseFlags parse_flags; 136 string desc; 137 }; 138 139 static const Regexp::ParseFlags single_line = 140 Regexp::LikePerl; 141 static const Regexp::ParseFlags multi_line = 142 static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine); 143 144 static ParseMode parse_modes[] = { 145 { single_line, "single-line" }, 146 { single_line|Regexp::Latin1, "single-line, latin1" }, 147 { multi_line, "multiline" }, 148 { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, 149 { multi_line|Regexp::Latin1, "multiline, latin1" }, 150 }; 151 152 static string FormatMode(Regexp::ParseFlags flags) { 153 for (int i = 0; i < arraysize(parse_modes); i++) 154 if (parse_modes[i].parse_flags == flags) 155 return parse_modes[i].desc; 156 return StringPrintf("%#x", static_cast<uint>(flags)); 157 } 158 159 // Constructs and saves all the matching engines that 160 // will be required for the given tests. 161 TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, 162 Regexp::ParseFlags flags) 163 : regexp_str_(regexp_str), 164 kind_(kind), 165 flags_(flags), 166 error_(false), 167 regexp_(NULL), 168 num_captures_(0), 169 prog_(NULL), 170 rprog_(NULL), 171 re_(NULL), 172 re2_(NULL) { 173 174 VLOG(1) << CEscape(regexp_str); 175 176 // Compile regexp to prog. 177 // Always required - needed for backtracking (reference implementation). 178 RegexpStatus status; 179 regexp_ = Regexp::Parse(regexp_str, flags, &status); 180 if (regexp_ == NULL) { 181 LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) 182 << " mode: " << FormatMode(flags); 183 error_ = true; 184 return; 185 } 186 num_captures_ = regexp_->NumCaptures(); 187 prog_ = regexp_->CompileToProg(0); 188 if (prog_ == NULL) { 189 LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_); 190 error_ = true; 191 return; 192 } 193 if (FLAGS_dump_prog) { 194 LOG(INFO) << "Prog for " 195 << " regexp " 196 << CEscape(regexp_str_) 197 << " (" << FormatKind(kind_) 198 << ", " << FormatMode(flags_) 199 << ")\n" 200 << prog_->Dump(); 201 } 202 203 // Compile regexp to reversed prog. Only needed for DFA engines. 204 if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) { 205 rprog_ = regexp_->CompileToReverseProg(0); 206 if (rprog_ == NULL) { 207 LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_); 208 error_ = true; 209 return; 210 } 211 if (FLAGS_dump_rprog) 212 LOG(INFO) << rprog_->Dump(); 213 } 214 215 // Create re string that will be used for RE and RE2. 216 string re = regexp_str.as_string(); 217 // Accomodate flags. 218 // Regexp::Latin1 will be accomodated below. 219 if (!(flags & Regexp::OneLine)) 220 re = "(?m)" + re; 221 if (flags & Regexp::NonGreedy) 222 re = "(?U)" + re; 223 if (flags & Regexp::DotNL) 224 re = "(?s)" + re; 225 226 // Compile regexp to RE2. 227 if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) { 228 RE2::Options options; 229 if (flags & Regexp::Latin1) 230 options.set_encoding(RE2::Options::EncodingLatin1); 231 if (kind_ == Prog::kLongestMatch) 232 options.set_longest_match(true); 233 re2_ = new RE2(re, options); 234 if (!re2_->error().empty()) { 235 LOG(INFO) << "Cannot RE2: " << CEscape(re); 236 error_ = true; 237 return; 238 } 239 } 240 241 // Compile regexp to RE. 242 // PCRE as exposed by the RE interface isn't always usable. 243 // 1. It disagrees about handling of empty-string reptitions 244 // like matching (a*)* against "b". PCRE treats the (a*) as 245 // occurring once, while we treat it as occurring not at all. 246 // 2. It treats $ as this weird thing meaning end of string 247 // or before the \n at the end of the string. 248 // 3. It doesn't implement POSIX leftmost-longest matching. 249 // MimicsPCRE() detects 1 and 2. 250 if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() && 251 kind_ != Prog::kLongestMatch) { 252 PCRE_Options o; 253 o.set_option(PCRE::UTF8); 254 if (flags & Regexp::Latin1) 255 o.set_option(PCRE::None); 256 // PCRE has interface bug keeping us from finding $0, so 257 // add one more layer of parens. 258 re_ = new PCRE("("+re+")", o); 259 if (!re_->error().empty()) { 260 LOG(INFO) << "Cannot PCRE: " << CEscape(re); 261 error_ = true; 262 return; 263 } 264 } 265 } 266 267 TestInstance::~TestInstance() { 268 if (regexp_) 269 regexp_->Decref(); 270 delete prog_; 271 delete rprog_; 272 delete re_; 273 delete re2_; 274 } 275 276 // Runs a single search using the named engine type. 277 // This interface hides all the irregularities of the various 278 // engine interfaces from the rest of this file. 279 void TestInstance::RunSearch(Engine type, 280 const StringPiece& orig_text, 281 const StringPiece& orig_context, 282 Prog::Anchor anchor, 283 Result *result) { 284 memset(result, 0, sizeof *result); 285 if (regexp_ == NULL) { 286 result->skipped = true; 287 return; 288 } 289 int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0 290 if (nsubmatch > kMaxSubmatch) 291 nsubmatch = kMaxSubmatch; 292 293 StringPiece text = orig_text; 294 StringPiece context = orig_context; 295 296 switch (type) { 297 default: 298 LOG(FATAL) << "Bad RunSearch type: " << (int)type; 299 300 case kEngineBacktrack: 301 if (prog_ == NULL) { 302 result->skipped = true; 303 break; 304 } 305 result->matched = 306 prog_->UnsafeSearchBacktrack(text, context, anchor, kind_, 307 result->submatch, nsubmatch); 308 result->have_submatch = true; 309 break; 310 311 case kEngineNFA: 312 if (prog_ == NULL) { 313 result->skipped = true; 314 break; 315 } 316 result->matched = 317 prog_->SearchNFA(text, context, anchor, kind_, 318 result->submatch, nsubmatch); 319 result->have_submatch = true; 320 break; 321 322 case kEngineDFA: 323 if (prog_ == NULL) { 324 result->skipped = true; 325 break; 326 } 327 result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL, 328 &result->skipped, NULL); 329 break; 330 331 case kEngineDFA1: 332 if (prog_ == NULL || rprog_ == NULL) { 333 result->skipped = true; 334 break; 335 } 336 result->matched = 337 prog_->SearchDFA(text, context, anchor, kind_, result->submatch, 338 &result->skipped, NULL); 339 // If anchored, no need for second run, 340 // but do it anyway to find more bugs. 341 if (result->matched) { 342 if (!rprog_->SearchDFA(result->submatch[0], context, 343 Prog::kAnchored, Prog::kLongestMatch, 344 result->submatch, 345 &result->skipped, NULL)) { 346 LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_) 347 << " on " << CEscape(text); 348 result->matched = false; 349 } 350 } 351 result->have_submatch0 = true; 352 break; 353 354 case kEngineOnePass: 355 if (prog_ == NULL || 356 anchor == Prog::kUnanchored || 357 !prog_->IsOnePass() || 358 nsubmatch > Prog::kMaxOnePassCapture) { 359 result->skipped = true; 360 break; 361 } 362 result->matched = prog_->SearchOnePass(text, context, anchor, kind_, 363 result->submatch, nsubmatch); 364 result->have_submatch = true; 365 break; 366 367 case kEngineBitState: 368 if (prog_ == NULL) { 369 result->skipped = true; 370 break; 371 } 372 result->matched = prog_->SearchBitState(text, context, anchor, kind_, 373 result->submatch, nsubmatch); 374 result->have_submatch = true; 375 break; 376 377 case kEngineRE2: 378 case kEngineRE2a: 379 case kEngineRE2b: { 380 if (!re2_ || text.end() != context.end()) { 381 result->skipped = true; 382 break; 383 } 384 385 RE2::Anchor re_anchor; 386 if (anchor == Prog::kAnchored) 387 re_anchor = RE2::ANCHOR_START; 388 else 389 re_anchor = RE2::UNANCHORED; 390 if (kind_ == Prog::kFullMatch) 391 re_anchor = RE2::ANCHOR_BOTH; 392 393 result->matched = re2_->Match(context, 394 text.begin() - context.begin(), 395 text.end() - context.begin(), 396 re_anchor, result->submatch, nsubmatch); 397 result->have_submatch = nsubmatch > 0; 398 break; 399 } 400 401 case kEnginePCRE: { 402 if (!re_ || text.begin() != context.begin() || 403 text.end() != context.end()) { 404 result->skipped = true; 405 break; 406 } 407 408 const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; 409 PCRE::Arg *a = new PCRE::Arg[nsubmatch]; 410 for (int i = 0; i < nsubmatch; i++) { 411 a[i] = PCRE::Arg(&result->submatch[i]); 412 argptr[i] = &a[i]; 413 } 414 int consumed; 415 PCRE::Anchor pcre_anchor; 416 if (anchor == Prog::kAnchored) 417 pcre_anchor = PCRE::ANCHOR_START; 418 else 419 pcre_anchor = PCRE::UNANCHORED; 420 if (kind_ == Prog::kFullMatch) 421 pcre_anchor = PCRE::ANCHOR_BOTH; 422 re_->ClearHitLimit(); 423 result->matched = 424 re_->DoMatch(text, 425 pcre_anchor, 426 &consumed, 427 argptr, nsubmatch); 428 if (re_->HitLimit()) { 429 result->untrusted = true; 430 delete[] argptr; 431 delete[] a; 432 break; 433 } 434 result->have_submatch = true; 435 436 // Work around RE interface bug: PCRE returns -1 as the 437 // offsets for an unmatched subexpression, and RE should 438 // turn that into StringPiece(NULL) but in fact it uses 439 // StringPiece(text.begin() - 1, 0). Oops. 440 for (int i = 0; i < nsubmatch; i++) 441 if (result->submatch[i].begin() == text.begin() - 1) 442 result->submatch[i] = NULL; 443 delete[] argptr; 444 delete[] a; 445 break; 446 } 447 } 448 449 if (!result->matched) 450 memset(result->submatch, 0, sizeof result->submatch); 451 } 452 453 // Checks whether r is okay given that correct is the right answer. 454 // Specifically, r's answers have to match (but it doesn't have to 455 // claim to have all the answers). 456 static bool ResultOkay(const Result& r, const Result& correct) { 457 if (r.skipped) 458 return true; 459 if (r.matched != correct.matched) 460 return false; 461 if (r.have_submatch || r.have_submatch0) { 462 for (int i = 0; i < kMaxSubmatch; i++) { 463 if (correct.submatch[i].begin() != r.submatch[i].begin() || 464 correct.submatch[i].size() != r.submatch[i].size()) 465 return false; 466 if (!r.have_submatch) 467 break; 468 } 469 } 470 return true; 471 } 472 473 // Runs a single test. 474 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, 475 Prog::Anchor anchor) { 476 // Backtracking is the gold standard. 477 Result correct; 478 RunSearch(kEngineBacktrack, text, context, anchor, &correct); 479 if (correct.skipped) { 480 if (regexp_ == NULL) 481 return true; 482 LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) 483 << " " << FormatMode(flags_); 484 return false; 485 } 486 VLOG(1) << "Try: regexp " << CEscape(regexp_str_) 487 << " text " << CEscape(text) 488 << " (" << FormatKind(kind_) 489 << ", " << FormatAnchor(anchor) 490 << ", " << FormatMode(flags_) 491 << ")"; 492 493 // Compare the others. 494 bool all_okay = true; 495 for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) { 496 if (!(Engines() & (1<<i))) 497 continue; 498 499 Result r; 500 RunSearch(i, text, context, anchor, &r); 501 if (ResultOkay(r, correct)) { 502 if (FLAGS_log_okay) 503 LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor); 504 continue; 505 } 506 507 // We disagree with PCRE on the meaning of some Unicode matches. 508 // In particular, we treat all non-ASCII UTF-8 as word characters. 509 // We also treat "empty" character sets like [^\w\W] as being 510 // impossible to match, while PCRE apparently excludes some code 511 // points (e.g., 0x0080) from both \w and \W. 512 if (i == kEnginePCRE && NonASCII(text)) 513 continue; 514 515 if (!r.untrusted) 516 all_okay = false; 517 518 LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text, 519 context, anchor); 520 if (r.matched != correct.matched) { 521 if (r.matched) { 522 LOG(INFO) << " Should not match (but does)."; 523 } else { 524 LOG(INFO) << " Should match (but does not)."; 525 continue; 526 } 527 } 528 for (int i = 0; i < 1+num_captures_; i++) { 529 if (r.submatch[i].begin() != correct.submatch[i].begin() || 530 r.submatch[i].end() != correct.submatch[i].end()) { 531 LOG(INFO) << 532 StringPrintf(" $%d: should be %s is %s", 533 i, 534 FormatCapture(text, correct.submatch[i]).c_str(), 535 FormatCapture(text, r.submatch[i]).c_str()); 536 } else { 537 LOG(INFO) << 538 StringPrintf(" $%d: %s ok", i, 539 FormatCapture(text, r.submatch[i]).c_str()); 540 } 541 } 542 } 543 544 if (!all_okay) { 545 if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0) 546 LOG(QFATAL) << "Too many regexp failures."; 547 } 548 549 return all_okay; 550 } 551 552 void TestInstance::LogMatch(const char* prefix, Engine e, 553 const StringPiece& text, const StringPiece& context, 554 Prog::Anchor anchor) { 555 LOG(INFO) << prefix 556 << EngineString(e) 557 << " regexp " 558 << CEscape(regexp_str_) 559 << " " 560 << CEscape(regexp_->ToString()) 561 << " text " 562 << CEscape(text) 563 << " (" 564 << text.begin() - context.begin() 565 << "," 566 << text.end() - context.begin() 567 << ") of context " 568 << CEscape(context) 569 << " (" << FormatKind(kind_) 570 << ", " << FormatAnchor(anchor) 571 << ", " << FormatMode(flags_) 572 << ")"; 573 } 574 575 static Prog::MatchKind kinds[] = { 576 Prog::kFirstMatch, 577 Prog::kLongestMatch, 578 Prog::kFullMatch, 579 }; 580 581 // Test all possible match kinds and parse modes. 582 Tester::Tester(const StringPiece& regexp) { 583 error_ = false; 584 for (int i = 0; i < arraysize(kinds); i++) { 585 for (int j = 0; j < arraysize(parse_modes); j++) { 586 TestInstance* t = new TestInstance(regexp, kinds[i], 587 parse_modes[j].parse_flags); 588 error_ |= t->error(); 589 v_.push_back(t); 590 } 591 } 592 } 593 594 Tester::~Tester() { 595 for (int i = 0; i < v_.size(); i++) 596 delete v_[i]; 597 } 598 599 bool Tester::TestCase(const StringPiece& text, const StringPiece& context, 600 Prog::Anchor anchor) { 601 bool okay = true; 602 for (int i = 0; i < v_.size(); i++) 603 okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); 604 return okay; 605 } 606 607 static Prog::Anchor anchors[] = { 608 Prog::kAnchored, 609 Prog::kUnanchored 610 }; 611 612 bool Tester::TestInput(const StringPiece& text) { 613 bool okay = TestInputInContext(text, text); 614 if (text.size() > 0) { 615 StringPiece sp; 616 sp = text; 617 sp.remove_prefix(1); 618 okay &= TestInputInContext(sp, text); 619 sp = text; 620 sp.remove_suffix(1); 621 okay &= TestInputInContext(sp, text); 622 } 623 return okay; 624 } 625 626 bool Tester::TestInputInContext(const StringPiece& text, 627 const StringPiece& context) { 628 bool okay = true; 629 for (int i = 0; i < arraysize(anchors); i++) 630 okay &= TestCase(text, context, anchors[i]); 631 return okay; 632 } 633 634 bool TestRegexpOnText(const StringPiece& regexp, 635 const StringPiece& text) { 636 Tester t(regexp); 637 return t.TestInput(text); 638 } 639 640 } // namespace re2 641