Home | History | Annotate | Download | only in testing
      1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Regular expression engine tester -- test all the implementations against each other.
      6 
      7 #include "util/util.h"
      8 #include "util/flags.h"
      9 #include "re2/testing/tester.h"
     10 #include "re2/prog.h"
     11 #include "re2/re2.h"
     12 #include "re2/regexp.h"
     13 
     14 DEFINE_bool(dump_prog, false, "dump regexp program");
     15 DEFINE_bool(log_okay, false, "log successful runs");
     16 DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
     17 
     18 DEFINE_int32(max_regexp_failures, 100,
     19              "maximum number of regexp test failures (-1 = unlimited)");
     20 
     21 DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
     22 
     23 namespace re2 {
     24 
     25 enum {
     26   kMaxSubmatch = 1+16,  // $0...$16
     27 };
     28 
     29 const char* engine_types[kEngineMax] = {
     30   "Backtrack",
     31   "NFA",
     32   "DFA",
     33   "DFA1",
     34   "OnePass",
     35   "BitState",
     36   "RE2",
     37   "RE2a",
     38   "RE2b",
     39   "PCRE",
     40 };
     41 
     42 // Returns the name string for the type t.
     43 static string EngineString(Engine t) {
     44   if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {
     45     return StringPrintf("type%d", static_cast<int>(t));
     46   }
     47   return engine_types[t];
     48 }
     49 
     50 // Returns bit mask of engines to use.
     51 static uint32 Engines() {
     52   static uint32 cached_engines;
     53   static bool did_parse;
     54 
     55   if (did_parse)
     56     return cached_engines;
     57 
     58   if (FLAGS_regexp_engines.empty()) {
     59     cached_engines = ~0;
     60   } else {
     61     for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
     62       if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))
     63         cached_engines |= 1<<i;
     64   }
     65 
     66   if (cached_engines == 0)
     67     LOG(INFO) << "Warning: no engines enabled.";
     68   if (!UsingPCRE)
     69     cached_engines &= ~(1<<kEnginePCRE);
     70   for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
     71     if (cached_engines & (1<<i))
     72       LOG(INFO) << EngineString(i) << " enabled";
     73   }
     74   did_parse = true;
     75   return cached_engines;
     76 }
     77 
     78 // The result of running a match.
     79 struct TestInstance::Result {
     80   bool skipped;         // test skipped: wasn't applicable
     81   bool matched;         // found a match
     82   bool untrusted;       // don't really trust the answer
     83   bool have_submatch;   // computed all submatch info
     84   bool have_submatch0;  // computed just submatch[0]
     85   StringPiece submatch[kMaxSubmatch];
     86 };
     87 
     88 typedef TestInstance::Result Result;
     89 
     90 // Formats a single capture range s in text in the form (a,b)
     91 // where a and b are the starting and ending offsets of s in text.
     92 static string FormatCapture(const StringPiece& text, const StringPiece& s) {
     93   if (s.begin() == NULL)
     94     return "(?,?)";
     95   return StringPrintf("(%d,%d)",
     96                       static_cast<int>(s.begin() - text.begin()),
     97                       static_cast<int>(s.end() - text.begin()));
     98 }
     99 
    100 // Returns whether text contains non-ASCII (>= 0x80) bytes.
    101 static bool NonASCII(const StringPiece& text) {
    102   for (int i = 0; i < text.size(); i++)
    103     if ((uint8)text[i] >= 0x80)
    104       return true;
    105   return false;
    106 }
    107 
    108 // Returns string representation of match kind.
    109 static string FormatKind(Prog::MatchKind kind) {
    110   switch (kind) {
    111     case Prog::kFullMatch:
    112       return "full match";
    113     case Prog::kLongestMatch:
    114       return "longest match";
    115     case Prog::kFirstMatch:
    116       return "first match";
    117     case Prog::kManyMatch:
    118       return "many match";
    119   }
    120   return "???";
    121 }
    122 
    123 // Returns string representation of anchor kind.
    124 static string FormatAnchor(Prog::Anchor anchor) {
    125   switch (anchor) {
    126     case Prog::kAnchored:
    127       return "anchored";
    128     case Prog::kUnanchored:
    129       return "unanchored";
    130   }
    131   return "???";
    132 }
    133 
    134 struct ParseMode {
    135   Regexp::ParseFlags parse_flags;
    136   string desc;
    137 };
    138 
    139 static const Regexp::ParseFlags single_line =
    140   Regexp::LikePerl;
    141 static const Regexp::ParseFlags multi_line =
    142   static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
    143 
    144 static ParseMode parse_modes[] = {
    145   { single_line,                   "single-line"          },
    146   { single_line|Regexp::Latin1,    "single-line, latin1"  },
    147   { multi_line,                    "multiline"            },
    148   { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
    149   { multi_line|Regexp::Latin1,     "multiline, latin1"    },
    150 };
    151 
    152 static string FormatMode(Regexp::ParseFlags flags) {
    153   for (int i = 0; i < arraysize(parse_modes); i++)
    154     if (parse_modes[i].parse_flags == flags)
    155       return parse_modes[i].desc;
    156   return StringPrintf("%#x", static_cast<uint>(flags));
    157 }
    158 
    159 // Constructs and saves all the matching engines that
    160 // will be required for the given tests.
    161 TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
    162                            Regexp::ParseFlags flags)
    163   : regexp_str_(regexp_str),
    164     kind_(kind),
    165     flags_(flags),
    166     error_(false),
    167     regexp_(NULL),
    168     num_captures_(0),
    169     prog_(NULL),
    170     rprog_(NULL),
    171     re_(NULL),
    172     re2_(NULL) {
    173 
    174   VLOG(1) << CEscape(regexp_str);
    175 
    176   // Compile regexp to prog.
    177   // Always required - needed for backtracking (reference implementation).
    178   RegexpStatus status;
    179   regexp_ = Regexp::Parse(regexp_str, flags, &status);
    180   if (regexp_ == NULL) {
    181     LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
    182               << " mode: " << FormatMode(flags);
    183     error_ = true;
    184     return;
    185   }
    186   num_captures_ = regexp_->NumCaptures();
    187   prog_ = regexp_->CompileToProg(0);
    188   if (prog_ == NULL) {
    189     LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
    190     error_ = true;
    191     return;
    192   }
    193   if (FLAGS_dump_prog) {
    194     LOG(INFO) << "Prog for "
    195               << " regexp "
    196               << CEscape(regexp_str_)
    197               << " (" << FormatKind(kind_)
    198               << ", " << FormatMode(flags_)
    199               << ")\n"
    200               << prog_->Dump();
    201   }
    202 
    203   // Compile regexp to reversed prog.  Only needed for DFA engines.
    204   if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
    205     rprog_ = regexp_->CompileToReverseProg(0);
    206     if (rprog_ == NULL) {
    207       LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
    208       error_ = true;
    209       return;
    210     }
    211     if (FLAGS_dump_rprog)
    212       LOG(INFO) << rprog_->Dump();
    213   }
    214 
    215   // Create re string that will be used for RE and RE2.
    216   string re = regexp_str.as_string();
    217   // Accomodate flags.
    218   // Regexp::Latin1 will be accomodated below.
    219   if (!(flags & Regexp::OneLine))
    220     re = "(?m)" + re;
    221   if (flags & Regexp::NonGreedy)
    222     re = "(?U)" + re;
    223   if (flags & Regexp::DotNL)
    224     re = "(?s)" + re;
    225 
    226   // Compile regexp to RE2.
    227   if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
    228     RE2::Options options;
    229     if (flags & Regexp::Latin1)
    230       options.set_encoding(RE2::Options::EncodingLatin1);
    231     if (kind_ == Prog::kLongestMatch)
    232       options.set_longest_match(true);
    233     re2_ = new RE2(re, options);
    234     if (!re2_->error().empty()) {
    235       LOG(INFO) << "Cannot RE2: " << CEscape(re);
    236       error_ = true;
    237       return;
    238     }
    239   }
    240 
    241   // Compile regexp to RE.
    242   // PCRE as exposed by the RE interface isn't always usable.
    243   // 1. It disagrees about handling of empty-string reptitions
    244   //    like matching (a*)* against "b".  PCRE treats the (a*) as
    245   //    occurring once, while we treat it as occurring not at all.
    246   // 2. It treats $ as this weird thing meaning end of string
    247   //    or before the \n at the end of the string.
    248   // 3. It doesn't implement POSIX leftmost-longest matching.
    249   // MimicsPCRE() detects 1 and 2.
    250   if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
    251       kind_ != Prog::kLongestMatch) {
    252     PCRE_Options o;
    253     o.set_option(PCRE::UTF8);
    254     if (flags & Regexp::Latin1)
    255       o.set_option(PCRE::None);
    256     // PCRE has interface bug keeping us from finding $0, so
    257     // add one more layer of parens.
    258     re_ = new PCRE("("+re+")", o);
    259     if (!re_->error().empty()) {
    260       LOG(INFO) << "Cannot PCRE: " << CEscape(re);
    261       error_ = true;
    262       return;
    263     }
    264   }
    265 }
    266 
    267 TestInstance::~TestInstance() {
    268   if (regexp_)
    269     regexp_->Decref();
    270   delete prog_;
    271   delete rprog_;
    272   delete re_;
    273   delete re2_;
    274 }
    275 
    276 // Runs a single search using the named engine type.
    277 // This interface hides all the irregularities of the various
    278 // engine interfaces from the rest of this file.
    279 void TestInstance::RunSearch(Engine type,
    280                              const StringPiece& orig_text,
    281                              const StringPiece& orig_context,
    282                              Prog::Anchor anchor,
    283                              Result *result) {
    284   memset(result, 0, sizeof *result);
    285   if (regexp_ == NULL) {
    286     result->skipped = true;
    287     return;
    288   }
    289   int nsubmatch = 1 + num_captures_;  // NumCaptures doesn't count $0
    290   if (nsubmatch > kMaxSubmatch)
    291     nsubmatch = kMaxSubmatch;
    292 
    293   StringPiece text = orig_text;
    294   StringPiece context = orig_context;
    295 
    296   switch (type) {
    297     default:
    298       LOG(FATAL) << "Bad RunSearch type: " << (int)type;
    299 
    300     case kEngineBacktrack:
    301       if (prog_ == NULL) {
    302         result->skipped = true;
    303         break;
    304       }
    305       result->matched =
    306         prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
    307                                      result->submatch, nsubmatch);
    308       result->have_submatch = true;
    309       break;
    310 
    311     case kEngineNFA:
    312       if (prog_ == NULL) {
    313         result->skipped = true;
    314         break;
    315       }
    316       result->matched =
    317         prog_->SearchNFA(text, context, anchor, kind_,
    318                         result->submatch, nsubmatch);
    319       result->have_submatch = true;
    320       break;
    321 
    322     case kEngineDFA:
    323       if (prog_ == NULL) {
    324         result->skipped = true;
    325         break;
    326       }
    327       result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
    328                                          &result->skipped, NULL);
    329       break;
    330 
    331     case kEngineDFA1:
    332       if (prog_ == NULL || rprog_ == NULL) {
    333         result->skipped = true;
    334         break;
    335       }
    336       result->matched =
    337         prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
    338                          &result->skipped, NULL);
    339       // If anchored, no need for second run,
    340       // but do it anyway to find more bugs.
    341       if (result->matched) {
    342         if (!rprog_->SearchDFA(result->submatch[0], context,
    343                                Prog::kAnchored, Prog::kLongestMatch,
    344                                result->submatch,
    345                                &result->skipped, NULL)) {
    346           LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
    347                      << " on " << CEscape(text);
    348           result->matched = false;
    349         }
    350       }
    351       result->have_submatch0 = true;
    352       break;
    353 
    354     case kEngineOnePass:
    355       if (prog_ == NULL ||
    356           anchor == Prog::kUnanchored ||
    357           !prog_->IsOnePass() ||
    358           nsubmatch > Prog::kMaxOnePassCapture) {
    359         result->skipped = true;
    360         break;
    361       }
    362       result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
    363                                       result->submatch, nsubmatch);
    364       result->have_submatch = true;
    365       break;
    366 
    367     case kEngineBitState:
    368       if (prog_ == NULL) {
    369         result->skipped = true;
    370         break;
    371       }
    372       result->matched = prog_->SearchBitState(text, context, anchor, kind_,
    373                                               result->submatch, nsubmatch);
    374       result->have_submatch = true;
    375       break;
    376 
    377     case kEngineRE2:
    378     case kEngineRE2a:
    379     case kEngineRE2b: {
    380       if (!re2_ || text.end() != context.end()) {
    381         result->skipped = true;
    382         break;
    383       }
    384 
    385       RE2::Anchor re_anchor;
    386       if (anchor == Prog::kAnchored)
    387         re_anchor = RE2::ANCHOR_START;
    388       else
    389         re_anchor = RE2::UNANCHORED;
    390       if (kind_ == Prog::kFullMatch)
    391         re_anchor = RE2::ANCHOR_BOTH;
    392 
    393       result->matched = re2_->Match(context,
    394                                     text.begin() - context.begin(),
    395                                     text.end() - context.begin(),
    396                                     re_anchor, result->submatch, nsubmatch);
    397       result->have_submatch = nsubmatch > 0;
    398       break;
    399     }
    400 
    401     case kEnginePCRE: {
    402       if (!re_ || text.begin() != context.begin() ||
    403           text.end() != context.end()) {
    404         result->skipped = true;
    405         break;
    406       }
    407 
    408       const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
    409       PCRE::Arg *a = new PCRE::Arg[nsubmatch];
    410       for (int i = 0; i < nsubmatch; i++) {
    411         a[i] = PCRE::Arg(&result->submatch[i]);
    412         argptr[i] = &a[i];
    413       }
    414       int consumed;
    415       PCRE::Anchor pcre_anchor;
    416       if (anchor == Prog::kAnchored)
    417         pcre_anchor = PCRE::ANCHOR_START;
    418       else
    419         pcre_anchor = PCRE::UNANCHORED;
    420       if (kind_ == Prog::kFullMatch)
    421         pcre_anchor = PCRE::ANCHOR_BOTH;
    422       re_->ClearHitLimit();
    423       result->matched =
    424         re_->DoMatch(text,
    425                      pcre_anchor,
    426                      &consumed,
    427                      argptr, nsubmatch);
    428       if (re_->HitLimit()) {
    429         result->untrusted = true;
    430         delete[] argptr;
    431         delete[] a;
    432         break;
    433       }
    434       result->have_submatch = true;
    435 
    436       // Work around RE interface bug: PCRE returns -1 as the
    437       // offsets for an unmatched subexpression, and RE should
    438       // turn that into StringPiece(NULL) but in fact it uses
    439       // StringPiece(text.begin() - 1, 0).  Oops.
    440       for (int i = 0; i < nsubmatch; i++)
    441         if (result->submatch[i].begin() == text.begin() - 1)
    442           result->submatch[i] = NULL;
    443       delete[] argptr;
    444       delete[] a;
    445       break;
    446     }
    447   }
    448 
    449   if (!result->matched)
    450     memset(result->submatch, 0, sizeof result->submatch);
    451 }
    452 
    453 // Checks whether r is okay given that correct is the right answer.
    454 // Specifically, r's answers have to match (but it doesn't have to
    455 // claim to have all the answers).
    456 static bool ResultOkay(const Result& r, const Result& correct) {
    457   if (r.skipped)
    458     return true;
    459   if (r.matched != correct.matched)
    460     return false;
    461   if (r.have_submatch || r.have_submatch0) {
    462     for (int i = 0; i < kMaxSubmatch; i++) {
    463       if (correct.submatch[i].begin() != r.submatch[i].begin() ||
    464           correct.submatch[i].size() != r.submatch[i].size())
    465         return false;
    466       if (!r.have_submatch)
    467         break;
    468     }
    469   }
    470   return true;
    471 }
    472 
    473 // Runs a single test.
    474 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
    475                            Prog::Anchor anchor) {
    476   // Backtracking is the gold standard.
    477   Result correct;
    478   RunSearch(kEngineBacktrack, text, context, anchor, &correct);
    479   if (correct.skipped) {
    480     if (regexp_ == NULL)
    481       return true;
    482     LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
    483                << " " << FormatMode(flags_);
    484     return false;
    485   }
    486   VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
    487           << " text " << CEscape(text)
    488           << " (" << FormatKind(kind_)
    489           << ", " << FormatAnchor(anchor)
    490           << ", " << FormatMode(flags_)
    491           << ")";
    492 
    493   // Compare the others.
    494   bool all_okay = true;
    495   for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
    496     if (!(Engines() & (1<<i)))
    497       continue;
    498 
    499     Result r;
    500     RunSearch(i, text, context, anchor, &r);
    501     if (ResultOkay(r, correct)) {
    502       if (FLAGS_log_okay)
    503         LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
    504       continue;
    505     }
    506 
    507     // We disagree with PCRE on the meaning of some Unicode matches.
    508     // In particular, we treat all non-ASCII UTF-8 as word characters.
    509     // We also treat "empty" character sets like [^\w\W] as being
    510     // impossible to match, while PCRE apparently excludes some code
    511     // points (e.g., 0x0080) from both \w and \W.
    512     if (i == kEnginePCRE && NonASCII(text))
    513       continue;
    514 
    515     if (!r.untrusted)
    516       all_okay = false;
    517 
    518     LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
    519              context, anchor);
    520     if (r.matched != correct.matched) {
    521       if (r.matched) {
    522         LOG(INFO) << "   Should not match (but does).";
    523       } else {
    524         LOG(INFO) << "   Should match (but does not).";
    525         continue;
    526       }
    527     }
    528     for (int i = 0; i < 1+num_captures_; i++) {
    529       if (r.submatch[i].begin() != correct.submatch[i].begin() ||
    530           r.submatch[i].end() != correct.submatch[i].end()) {
    531         LOG(INFO) <<
    532           StringPrintf("   $%d: should be %s is %s",
    533                        i,
    534                        FormatCapture(text, correct.submatch[i]).c_str(),
    535                        FormatCapture(text, r.submatch[i]).c_str());
    536       } else {
    537         LOG(INFO) <<
    538           StringPrintf("   $%d: %s ok", i,
    539                        FormatCapture(text, r.submatch[i]).c_str());
    540       }
    541     }
    542   }
    543 
    544   if (!all_okay) {
    545     if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
    546       LOG(QFATAL) << "Too many regexp failures.";
    547   }
    548 
    549   return all_okay;
    550 }
    551 
    552 void TestInstance::LogMatch(const char* prefix, Engine e,
    553                             const StringPiece& text, const StringPiece& context,
    554                             Prog::Anchor anchor) {
    555   LOG(INFO) << prefix
    556     << EngineString(e)
    557     << " regexp "
    558     << CEscape(regexp_str_)
    559     << " "
    560     << CEscape(regexp_->ToString())
    561     << " text "
    562     << CEscape(text)
    563     << " ("
    564     << text.begin() - context.begin()
    565     << ","
    566     << text.end() - context.begin()
    567     << ") of context "
    568     << CEscape(context)
    569     << " (" << FormatKind(kind_)
    570     << ", " << FormatAnchor(anchor)
    571     << ", " << FormatMode(flags_)
    572     << ")";
    573 }
    574 
    575 static Prog::MatchKind kinds[] = {
    576   Prog::kFirstMatch,
    577   Prog::kLongestMatch,
    578   Prog::kFullMatch,
    579 };
    580 
    581 // Test all possible match kinds and parse modes.
    582 Tester::Tester(const StringPiece& regexp) {
    583   error_ = false;
    584   for (int i = 0; i < arraysize(kinds); i++) {
    585     for (int j = 0; j < arraysize(parse_modes); j++) {
    586       TestInstance* t = new TestInstance(regexp, kinds[i],
    587                                          parse_modes[j].parse_flags);
    588       error_ |= t->error();
    589       v_.push_back(t);
    590     }
    591   }
    592 }
    593 
    594 Tester::~Tester() {
    595   for (int i = 0; i < v_.size(); i++)
    596     delete v_[i];
    597 }
    598 
    599 bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
    600                          Prog::Anchor anchor) {
    601   bool okay = true;
    602   for (int i = 0; i < v_.size(); i++)
    603     okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
    604   return okay;
    605 }
    606 
    607 static Prog::Anchor anchors[] = {
    608   Prog::kAnchored,
    609   Prog::kUnanchored
    610 };
    611 
    612 bool Tester::TestInput(const StringPiece& text) {
    613   bool okay = TestInputInContext(text, text);
    614   if (text.size() > 0) {
    615     StringPiece sp;
    616     sp = text;
    617     sp.remove_prefix(1);
    618     okay &= TestInputInContext(sp, text);
    619     sp = text;
    620     sp.remove_suffix(1);
    621     okay &= TestInputInContext(sp, text);
    622   }
    623   return okay;
    624 }
    625 
    626 bool Tester::TestInputInContext(const StringPiece& text,
    627                                 const StringPiece& context) {
    628   bool okay = true;
    629   for (int i = 0; i < arraysize(anchors); i++)
    630     okay &= TestCase(text, context, anchors[i]);
    631   return okay;
    632 }
    633 
    634 bool TestRegexpOnText(const StringPiece& regexp,
    635                       const StringPiece& text) {
    636   Tester t(regexp);
    637   return t.TestInput(text);
    638 }
    639 
    640 }  // namespace re2
    641