Home | History | Annotate | Download | only in testing
      1 // Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "util/test.h"
      6 #include "util/thread.h"
      7 #include "re2/prog.h"
      8 #include "re2/re2.h"
      9 #include "re2/regexp.h"
     10 #include "re2/testing/regexp_generator.h"
     11 #include "re2/testing/string_generator.h"
     12 
     13 DECLARE_bool(re2_dfa_bail_when_slow);
     14 
     15 DEFINE_int32(size, 8, "log2(number of DFA nodes)");
     16 DEFINE_int32(repeat, 2, "Repetition count.");
     17 DEFINE_int32(threads, 4, "number of threads");
     18 
     19 namespace re2 {
     20 
     21 // Check that multithreaded access to DFA class works.
     22 
     23 // Helper thread: builds entire DFA for prog.
     24 class BuildThread : public Thread {
     25  public:
     26   BuildThread(Prog* prog) : prog_(prog) {}
     27   virtual void Run() {
     28     CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
     29   }
     30 
     31  private:
     32   Prog* prog_;
     33 };
     34 
     35 TEST(Multithreaded, BuildEntireDFA) {
     36   // Create regexp with 2^FLAGS_size states in DFA.
     37   string s = "a";
     38   for (int i = 0; i < FLAGS_size; i++)
     39     s += "[ab]";
     40   s += "b";
     41 
     42   // Check that single-threaded code works.
     43   {
     44     //LOG(INFO) << s;
     45     Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
     46     CHECK(re);
     47     Prog* prog = re->CompileToProg(0);
     48     CHECK(prog);
     49     BuildThread* t = new BuildThread(prog);
     50     t->SetJoinable(true);
     51     t->Start();
     52     t->Join();
     53     delete t;
     54     delete prog;
     55     re->Decref();
     56   }
     57 
     58   // Build the DFA simultaneously in a bunch of threads.
     59   for (int i = 0; i < FLAGS_repeat; i++) {
     60     Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
     61     CHECK(re);
     62     Prog* prog = re->CompileToProg(0);
     63     CHECK(prog);
     64 
     65     vector<BuildThread*> threads;
     66     for (int j = 0; j < FLAGS_threads; j++) {
     67       BuildThread *t = new BuildThread(prog);
     68       t->SetJoinable(true);
     69       threads.push_back(t);
     70     }
     71     for (int j = 0; j < FLAGS_threads; j++)
     72       threads[j]->Start();
     73     for (int j = 0; j < FLAGS_threads; j++) {
     74       threads[j]->Join();
     75       delete threads[j];
     76     }
     77 
     78     // One more compile, to make sure everything is okay.
     79     prog->BuildEntireDFA(Prog::kFirstMatch);
     80     delete prog;
     81     re->Decref();
     82   }
     83 }
     84 
     85 // Check that DFA size requirements are followed.
     86 // BuildEntireDFA will, like SearchDFA, stop building out
     87 // the DFA once the memory limits are reached.
     88 TEST(SingleThreaded, BuildEntireDFA) {
     89   // Create regexp with 2^30 states in DFA.
     90   string s = "a";
     91   for (int i = 0; i < 30; i++)
     92     s += "[ab]";
     93   s += "b";
     94 
     95   //LOG(INFO) << s;
     96   Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
     97   CHECK(re);
     98   int max = 24;
     99   for (int i = 17; i < max; i++) {
    100     int limit = 1<<i;
    101     int usage;
    102     //int progusage, dfamem;
    103     {
    104       testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
    105       Prog* prog = re->CompileToProg(limit);
    106       CHECK(prog);
    107       //progusage = m.HeapGrowth();
    108       //dfamem = prog->dfa_mem();
    109       prog->BuildEntireDFA(Prog::kFirstMatch);
    110       prog->BuildEntireDFA(Prog::kLongestMatch);
    111       usage = m.HeapGrowth();
    112       delete prog;
    113     }
    114     if (!UsingMallocCounter)
    115       continue;
    116     //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n",
    117     //                          limit, progusage, dfamem, usage);
    118     CHECK_GT(usage, limit*9/10);
    119     CHECK_LT(usage, limit + (16<<10));  // 16kB of slop okay
    120   }
    121   re->Decref();
    122 }
    123 
    124 // Generates and returns a string over binary alphabet {0,1} that contains
    125 // all possible binary sequences of length n as subsequences.  The obvious
    126 // brute force method would generate a string of length n * 2^n, but this
    127 // generates a string of length n + 2^n - 1 called a De Bruijn cycle.
    128 // See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
    129 // Such a string is useful for testing a DFA.  If you have a DFA
    130 // where distinct last n bytes implies distinct states, then running on a
    131 // DeBruijn string causes the DFA to need to create a new state at every
    132 // position in the input, never reusing any states until it gets to the
    133 // end of the string.  This is the worst possible case for DFA execution.
    134 static string DeBruijnString(int n) {
    135   CHECK_LT(n, 8*sizeof(int));
    136   CHECK_GT(n, 0);
    137 
    138   vector<bool> did(1<<n);
    139   for (int i = 0; i < 1<<n; i++)
    140     did[i] = false;
    141 
    142   string s;
    143   for (int i = 0; i < n-1; i++)
    144     s.append("0");
    145   int bits = 0;
    146   int mask = (1<<n) - 1;
    147   for (int i = 0; i < (1<<n); i++) {
    148     bits <<= 1;
    149     bits &= mask;
    150     if (!did[bits|1]) {
    151       bits |= 1;
    152       s.append("1");
    153     } else {
    154       s.append("0");
    155     }
    156     CHECK(!did[bits]);
    157     did[bits] = true;
    158   }
    159   return s;
    160 }
    161 
    162 // Test that the DFA gets the right result even if it runs
    163 // out of memory during a search.  The regular expression
    164 // 0[01]{n}$ matches a binary string of 0s and 1s only if
    165 // the (n+1)th-to-last character is a 0.  Matching this in
    166 // a single forward pass (as done by the DFA) requires
    167 // keeping one bit for each of the last n+1 characters
    168 // (whether each was a 0), or 2^(n+1) possible states.
    169 // If we run this regexp to search in a string that contains
    170 // every possible n-character binary string as a substring,
    171 // then it will have to run through at least 2^n states.
    172 // States are big data structures -- certainly more than 1 byte --
    173 // so if the DFA can search correctly while staying within a
    174 // 2^n byte limit, it must be handling out-of-memory conditions
    175 // gracefully.
    176 TEST(SingleThreaded, SearchDFA) {
    177   // Choice of n is mostly arbitrary, except that:
    178   //   * making n too big makes the test run for too long.
    179   //   * making n too small makes the DFA refuse to run,
    180   //     because it has so little memory compared to the program size.
    181   // Empirically, n = 18 is a good compromise between the two.
    182   const int n = 18;
    183 
    184   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
    185                              Regexp::LikePerl, NULL);
    186   CHECK(re);
    187 
    188   // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
    189   // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
    190   string no_match = DeBruijnString(n);
    191   string match = no_match + "0";
    192 
    193   // The De Bruijn string is the worst case input for this regexp.
    194   // By default, the DFA will notice that it is flushing its cache
    195   // too frequently and will bail out early, so that RE2 can use the
    196   // NFA implementation instead.  (The DFA loses its speed advantage
    197   // if it can't get a good cache hit rate.)
    198   // Tell the DFA to trudge along instead.
    199   FLAGS_re2_dfa_bail_when_slow = false;
    200 
    201   int64 usage;
    202   int64 peak_usage;
    203   {
    204     testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
    205     Prog* prog = re->CompileToProg(1<<n);
    206     CHECK(prog);
    207     for (int i = 0; i < 10; i++) {
    208       bool matched, failed = false;
    209       matched = prog->SearchDFA(match, NULL,
    210                                 Prog::kUnanchored, Prog::kFirstMatch,
    211                                 NULL, &failed, NULL);
    212       CHECK(!failed);
    213       CHECK(matched);
    214       matched = prog->SearchDFA(no_match, NULL,
    215                                 Prog::kUnanchored, Prog::kFirstMatch,
    216                                 NULL, &failed, NULL);
    217       CHECK(!failed);
    218       CHECK(!matched);
    219     }
    220     usage = m.HeapGrowth();
    221     peak_usage = m.PeakHeapGrowth();
    222     delete prog;
    223   }
    224   re->Decref();
    225 
    226   if (!UsingMallocCounter)
    227     return;
    228   //LOG(INFO) << "usage " << usage << " " << peak_usage;
    229   CHECK_LT(usage, 1<<n);
    230   CHECK_LT(peak_usage, 1<<n);
    231 }
    232 
    233 // Helper thread: searches for match, which should match,
    234 // and no_match, which should not.
    235 class SearchThread : public Thread {
    236  public:
    237   SearchThread(Prog* prog, const StringPiece& match,
    238                const StringPiece& no_match)
    239     : prog_(prog), match_(match), no_match_(no_match) {}
    240 
    241   virtual void Run() {
    242     for (int i = 0; i < 2; i++) {
    243       bool matched, failed = false;
    244       matched = prog_->SearchDFA(match_, NULL,
    245                                  Prog::kUnanchored, Prog::kFirstMatch,
    246                                  NULL, &failed, NULL);
    247       CHECK(!failed);
    248       CHECK(matched);
    249       matched = prog_->SearchDFA(no_match_, NULL,
    250                                  Prog::kUnanchored, Prog::kFirstMatch,
    251                                  NULL, &failed, NULL);
    252       CHECK(!failed);
    253       CHECK(!matched);
    254     }
    255   }
    256 
    257  private:
    258   Prog* prog_;
    259   StringPiece match_;
    260   StringPiece no_match_;
    261 };
    262 
    263 TEST(Multithreaded, SearchDFA) {
    264   // Same as single-threaded test above.
    265   const int n = 18;
    266   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
    267                              Regexp::LikePerl, NULL);
    268   CHECK(re);
    269   string no_match = DeBruijnString(n);
    270   string match = no_match + "0";
    271   FLAGS_re2_dfa_bail_when_slow = false;
    272 
    273   // Check that single-threaded code works.
    274   {
    275     Prog* prog = re->CompileToProg(1<<n);
    276     CHECK(prog);
    277     SearchThread* t = new SearchThread(prog, match, no_match);
    278     t->SetJoinable(true);
    279     t->Start();
    280     t->Join();
    281     delete t;
    282     delete prog;
    283   }
    284 
    285   // Run the search simultaneously in a bunch of threads.
    286   // Reuse same flags for Multithreaded.BuildDFA above.
    287   for (int i = 0; i < FLAGS_repeat; i++) {
    288     //LOG(INFO) << "Search " << i;
    289     Prog* prog = re->CompileToProg(1<<n);
    290     CHECK(prog);
    291 
    292     vector<SearchThread*> threads;
    293     for (int j = 0; j < FLAGS_threads; j++) {
    294       SearchThread *t = new SearchThread(prog, match, no_match);
    295       t->SetJoinable(true);
    296       threads.push_back(t);
    297     }
    298     for (int j = 0; j < FLAGS_threads; j++)
    299       threads[j]->Start();
    300     for (int j = 0; j < FLAGS_threads; j++) {
    301       threads[j]->Join();
    302       delete threads[j];
    303     }
    304     delete prog;
    305   }
    306   re->Decref();
    307 }
    308 
    309 struct ReverseTest {
    310   const char *regexp;
    311   const char *text;
    312   bool match;
    313 };
    314 
    315 // Test that reverse DFA handles anchored/unanchored correctly.
    316 // It's in the DFA interface but not used by RE2.
    317 ReverseTest reverse_tests[] = {
    318   { "\\A(a|b)", "abc", true },
    319   { "(a|b)\\z", "cba", true },
    320   { "\\A(a|b)", "cba", false },
    321   { "(a|b)\\z", "abc", false },
    322 };
    323 
    324 TEST(DFA, ReverseMatch) {
    325   int nfail = 0;
    326   for (int i = 0; i < arraysize(reverse_tests); i++) {
    327     const ReverseTest& t = reverse_tests[i];
    328     Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
    329     CHECK(re);
    330     Prog *prog = re->CompileToReverseProg(0);
    331     CHECK(prog);
    332     bool failed = false;
    333     bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed, NULL);
    334     if (matched != t.match) {
    335       LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
    336       nfail++;
    337     }
    338     delete prog;
    339     re->Decref();
    340   }
    341   EXPECT_EQ(nfail, 0);
    342 }
    343 
    344 }  // namespace re2
    345