Home | History | Annotate | Download | only in testing
      1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "util/test.h"
      6 #include "re2/filtered_re2.h"
      7 #include "re2/re2.h"
      8 
      9 DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc
     10 
     11 namespace re2 {
     12 
     13 struct FilterTestVars {
     14   vector<string> atoms;
     15   vector<int> atom_indices;
     16   vector<int> matches;
     17   RE2::Options opts;
     18   FilteredRE2 f;
     19 };
     20 
     21 TEST(FilteredRE2Test, EmptyTest) {
     22   FilterTestVars v;
     23   v.f.AllMatches("foo", v.atom_indices, &v.matches);
     24   EXPECT_EQ(0, v.matches.size());
     25 }
     26 
     27 TEST(FilteredRE2Test, SmallOrTest) {
     28   FLAGS_filtered_re2_min_atom_len = 4;
     29 
     30   FilterTestVars v;
     31   int id;
     32   v.f.Add("(foo|bar)", v.opts, &id);
     33 
     34   v.f.Compile(&v.atoms);
     35   EXPECT_EQ(0, v.atoms.size());
     36 
     37   v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches);
     38   EXPECT_EQ(1, v.matches.size());
     39   EXPECT_EQ(id, v.matches[0]);
     40 }
     41 
     42 TEST(FilteredRE2Test, SmallLatinTest) {
     43   FLAGS_filtered_re2_min_atom_len = 3;
     44   FilterTestVars v;
     45   int id;
     46 
     47   v.opts.set_utf8(false);
     48   v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id);
     49   v.f.Compile(&v.atoms);
     50   EXPECT_EQ(1, v.atoms.size());
     51   EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef");
     52 
     53   v.atom_indices.push_back(0);
     54   v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches);
     55   EXPECT_EQ(1, v.matches.size());
     56   EXPECT_EQ(id, v.matches[0]);
     57 }
     58 
     59 struct AtomTest {
     60   const char* testname;
     61   // If any test needs more than this many regexps or atoms, increase
     62   // the size of the corresponding array.
     63   const char* regexps[20];
     64   const char* atoms[20];
     65 };
     66 
     67 AtomTest atom_tests[] = {
     68   {
     69     // This test checks to make sure empty patterns are allowed.
     70     "CheckEmptyPattern",
     71     {""},
     72     {}
     73   }, {
     74     // This test checks that all atoms of length greater than min length
     75     // are found, and no atoms that are of smaller length are found.
     76     "AllAtomsGtMinLengthFound", {
     77       "(abc123|def456|ghi789).*mnop[x-z]+",
     78       "abc..yyy..zz",
     79       "mnmnpp[a-z]+PPP"
     80     }, {
     81       "abc123",
     82       "def456",
     83       "ghi789",
     84       "mnop",
     85       "abc",
     86       "yyy",
     87       "mnmnpp",
     88       "ppp"
     89     }
     90   }, {
     91     // Test to make sure that any atoms that have another atom as a
     92     // substring in an OR are removed; that is, only the shortest
     93     // substring is kept.
     94     "SubstrAtomRemovesSuperStrInOr", {
     95       "(abc123|abc|ghi789|abc1234).*[x-z]+",
     96       "abcd..yyy..yyyzzz",
     97       "mnmnpp[a-z]+PPP"
     98     }, {
     99       "abc",
    100       "ghi789",
    101       "abcd",
    102       "yyy",
    103       "yyyzzz",
    104       "mnmnpp",
    105       "ppp"
    106     }
    107   }, {
    108     // Test character class expansion.
    109     "CharClassExpansion", {
    110       "m[a-c][d-f]n.*[x-z]+",
    111       "[x-y]bcde[ab]"
    112     }, {
    113       "madn", "maen", "mafn",
    114       "mbdn", "mben", "mbfn",
    115       "mcdn", "mcen", "mcfn",
    116       "xbcdea", "xbcdeb",
    117       "ybcdea", "ybcdeb"
    118     }
    119   }, {
    120     // Test upper/lower of non-ASCII.
    121     "UnicodeLower", {
    122       "(?i)",
    123       "",
    124       "",
    125     }, {
    126       "",
    127       "",
    128       "",
    129     },
    130   },
    131 };
    132 
    133 void AddRegexpsAndCompile(const char* regexps[],
    134                           int n,
    135                           struct FilterTestVars* v) {
    136   for (int i = 0; i < n; i++) {
    137     int id;
    138     v->f.Add(regexps[i], v->opts, &id);
    139   }
    140   v->f.Compile(&v->atoms);
    141 }
    142 
    143 bool CheckExpectedAtoms(const char* atoms[],
    144                         int n,
    145                         const char* testname,
    146                         struct FilterTestVars* v) {
    147   vector<string> expected;
    148   for (int i = 0; i < n; i++)
    149     expected.push_back(atoms[i]);
    150 
    151   bool pass = expected.size() == v->atoms.size();
    152 
    153   sort(v->atoms.begin(), v->atoms.end());
    154   sort(expected.begin(), expected.end());
    155   for (int i = 0; pass && i < n; i++)
    156       pass = pass && expected[i] == v->atoms[i];
    157 
    158   if (!pass) {
    159     LOG(WARNING) << "Failed " << testname;
    160     LOG(WARNING) << "Expected #atoms = " << expected.size();
    161     for (int i = 0; i < expected.size(); i++)
    162       LOG(WARNING) << expected[i];
    163     LOG(WARNING) << "Found #atoms = " << v->atoms.size();
    164     for (int i = 0; i < v->atoms.size(); i++)
    165       LOG(WARNING) << v->atoms[i];
    166   }
    167 
    168   return pass;
    169 }
    170 
    171 TEST(FilteredRE2Test, AtomTests) {
    172   FLAGS_filtered_re2_min_atom_len = 3;
    173 
    174   int nfail = 0;
    175   for (int i = 0; i < arraysize(atom_tests); i++) {
    176     FilterTestVars v;
    177     AtomTest* t = &atom_tests[i];
    178     int natom, nregexp;
    179     for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
    180       if (t->regexps[nregexp] == NULL)
    181         break;
    182     for (natom = 0; natom < arraysize(t->atoms); natom++)
    183       if (t->atoms[natom] == NULL)
    184         break;
    185     AddRegexpsAndCompile(t->regexps, nregexp, &v);
    186     if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v))
    187       nfail++;
    188   }
    189   EXPECT_EQ(0, nfail);
    190 }
    191 
    192 void FindAtomIndices(const vector<string> atoms,
    193                      const vector<string> matched_atoms,
    194                      vector<int>* atom_indices) {
    195   atom_indices->clear();
    196   for (int i = 0; i < matched_atoms.size(); i++) {
    197     int j = 0;
    198     for (; j < atoms.size(); j++) {
    199       if (matched_atoms[i] == atoms[j]) {
    200         atom_indices->push_back(j);
    201         break;
    202       }
    203       EXPECT_LT(j, atoms.size());
    204     }
    205   }
    206 }
    207 
    208 TEST(FilteredRE2Test, MatchEmptyPattern) {
    209   FLAGS_filtered_re2_min_atom_len = 3;
    210 
    211   FilterTestVars v;
    212   AtomTest* t = &atom_tests[0];
    213   // We are using the regexps used in one of the atom tests
    214   // for this test. Adding the EXPECT here to make sure
    215   // the index we use for the test is for the correct test.
    216   EXPECT_EQ("CheckEmptyPattern", string(t->testname));
    217   int nregexp;
    218   for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
    219     if (t->regexps[nregexp] == NULL)
    220       break;
    221   AddRegexpsAndCompile(t->regexps, nregexp, &v);
    222   string text = "0123";
    223   vector<int> atom_ids;
    224   vector<int> matching_regexps;
    225   EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
    226 }
    227 
    228 TEST(FilteredRE2Test, MatchTests) {
    229   FLAGS_filtered_re2_min_atom_len = 3;
    230 
    231   FilterTestVars v;
    232   AtomTest* t = &atom_tests[2];
    233   // We are using the regexps used in one of the atom tests
    234   // for this test.
    235   EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname));
    236   int nregexp;
    237   for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
    238     if (t->regexps[nregexp] == NULL)
    239       break;
    240   AddRegexpsAndCompile(t->regexps, nregexp, &v);
    241 
    242   string text = "abc121212xyz";
    243   // atoms = abc
    244   vector<int> atom_ids;
    245   vector<string> atoms;
    246   atoms.push_back("abc");
    247   FindAtomIndices(v.atoms, atoms, &atom_ids);
    248   vector<int> matching_regexps;
    249   v.f.AllMatches(text, atom_ids, &matching_regexps);
    250   EXPECT_EQ(1, matching_regexps.size());
    251 
    252   text = "abc12312yyyzzz";
    253   atoms.clear();
    254   atoms.push_back("abc");
    255   atoms.push_back("yyy");
    256   atoms.push_back("yyyzzz");
    257   FindAtomIndices(v.atoms, atoms, &atom_ids);
    258   v.f.AllMatches(text, atom_ids, &matching_regexps);
    259   EXPECT_EQ(1, matching_regexps.size());
    260 
    261   text = "abcd12yyy32yyyzzz";
    262   atoms.clear();
    263   atoms.push_back("abc");
    264   atoms.push_back("abcd");
    265   atoms.push_back("yyy");
    266   atoms.push_back("yyyzzz");
    267   FindAtomIndices(v.atoms, atoms, &atom_ids);
    268   LOG(INFO) << "S: " << atom_ids.size();
    269   for (int i = 0; i < atom_ids.size(); i++)
    270     LOG(INFO) << "i: " << i << " : " << atom_ids[i];
    271   v.f.AllMatches(text, atom_ids, &matching_regexps);
    272   EXPECT_EQ(2, matching_regexps.size());
    273 }
    274 
    275 }  //  namespace re2
    276