1 // Copyright 2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #include "util/test.h" 6 #include "re2/filtered_re2.h" 7 #include "re2/re2.h" 8 9 DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc 10 11 namespace re2 { 12 13 struct FilterTestVars { 14 vector<string> atoms; 15 vector<int> atom_indices; 16 vector<int> matches; 17 RE2::Options opts; 18 FilteredRE2 f; 19 }; 20 21 TEST(FilteredRE2Test, EmptyTest) { 22 FilterTestVars v; 23 v.f.AllMatches("foo", v.atom_indices, &v.matches); 24 EXPECT_EQ(0, v.matches.size()); 25 } 26 27 TEST(FilteredRE2Test, SmallOrTest) { 28 FLAGS_filtered_re2_min_atom_len = 4; 29 30 FilterTestVars v; 31 int id; 32 v.f.Add("(foo|bar)", v.opts, &id); 33 34 v.f.Compile(&v.atoms); 35 EXPECT_EQ(0, v.atoms.size()); 36 37 v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches); 38 EXPECT_EQ(1, v.matches.size()); 39 EXPECT_EQ(id, v.matches[0]); 40 } 41 42 TEST(FilteredRE2Test, SmallLatinTest) { 43 FLAGS_filtered_re2_min_atom_len = 3; 44 FilterTestVars v; 45 int id; 46 47 v.opts.set_utf8(false); 48 v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id); 49 v.f.Compile(&v.atoms); 50 EXPECT_EQ(1, v.atoms.size()); 51 EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef"); 52 53 v.atom_indices.push_back(0); 54 v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches); 55 EXPECT_EQ(1, v.matches.size()); 56 EXPECT_EQ(id, v.matches[0]); 57 } 58 59 struct AtomTest { 60 const char* testname; 61 // If any test needs more than this many regexps or atoms, increase 62 // the size of the corresponding array. 63 const char* regexps[20]; 64 const char* atoms[20]; 65 }; 66 67 AtomTest atom_tests[] = { 68 { 69 // This test checks to make sure empty patterns are allowed. 70 "CheckEmptyPattern", 71 {""}, 72 {} 73 }, { 74 // This test checks that all atoms of length greater than min length 75 // are found, and no atoms that are of smaller length are found. 76 "AllAtomsGtMinLengthFound", { 77 "(abc123|def456|ghi789).*mnop[x-z]+", 78 "abc..yyy..zz", 79 "mnmnpp[a-z]+PPP" 80 }, { 81 "abc123", 82 "def456", 83 "ghi789", 84 "mnop", 85 "abc", 86 "yyy", 87 "mnmnpp", 88 "ppp" 89 } 90 }, { 91 // Test to make sure that any atoms that have another atom as a 92 // substring in an OR are removed; that is, only the shortest 93 // substring is kept. 94 "SubstrAtomRemovesSuperStrInOr", { 95 "(abc123|abc|ghi789|abc1234).*[x-z]+", 96 "abcd..yyy..yyyzzz", 97 "mnmnpp[a-z]+PPP" 98 }, { 99 "abc", 100 "ghi789", 101 "abcd", 102 "yyy", 103 "yyyzzz", 104 "mnmnpp", 105 "ppp" 106 } 107 }, { 108 // Test character class expansion. 109 "CharClassExpansion", { 110 "m[a-c][d-f]n.*[x-z]+", 111 "[x-y]bcde[ab]" 112 }, { 113 "madn", "maen", "mafn", 114 "mbdn", "mben", "mbfn", 115 "mcdn", "mcen", "mcfn", 116 "xbcdea", "xbcdeb", 117 "ybcdea", "ybcdeb" 118 } 119 }, { 120 // Test upper/lower of non-ASCII. 121 "UnicodeLower", { 122 "(?i)", 123 "", 124 "", 125 }, { 126 "", 127 "", 128 "", 129 }, 130 }, 131 }; 132 133 void AddRegexpsAndCompile(const char* regexps[], 134 int n, 135 struct FilterTestVars* v) { 136 for (int i = 0; i < n; i++) { 137 int id; 138 v->f.Add(regexps[i], v->opts, &id); 139 } 140 v->f.Compile(&v->atoms); 141 } 142 143 bool CheckExpectedAtoms(const char* atoms[], 144 int n, 145 const char* testname, 146 struct FilterTestVars* v) { 147 vector<string> expected; 148 for (int i = 0; i < n; i++) 149 expected.push_back(atoms[i]); 150 151 bool pass = expected.size() == v->atoms.size(); 152 153 sort(v->atoms.begin(), v->atoms.end()); 154 sort(expected.begin(), expected.end()); 155 for (int i = 0; pass && i < n; i++) 156 pass = pass && expected[i] == v->atoms[i]; 157 158 if (!pass) { 159 LOG(WARNING) << "Failed " << testname; 160 LOG(WARNING) << "Expected #atoms = " << expected.size(); 161 for (int i = 0; i < expected.size(); i++) 162 LOG(WARNING) << expected[i]; 163 LOG(WARNING) << "Found #atoms = " << v->atoms.size(); 164 for (int i = 0; i < v->atoms.size(); i++) 165 LOG(WARNING) << v->atoms[i]; 166 } 167 168 return pass; 169 } 170 171 TEST(FilteredRE2Test, AtomTests) { 172 FLAGS_filtered_re2_min_atom_len = 3; 173 174 int nfail = 0; 175 for (int i = 0; i < arraysize(atom_tests); i++) { 176 FilterTestVars v; 177 AtomTest* t = &atom_tests[i]; 178 int natom, nregexp; 179 for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) 180 if (t->regexps[nregexp] == NULL) 181 break; 182 for (natom = 0; natom < arraysize(t->atoms); natom++) 183 if (t->atoms[natom] == NULL) 184 break; 185 AddRegexpsAndCompile(t->regexps, nregexp, &v); 186 if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v)) 187 nfail++; 188 } 189 EXPECT_EQ(0, nfail); 190 } 191 192 void FindAtomIndices(const vector<string> atoms, 193 const vector<string> matched_atoms, 194 vector<int>* atom_indices) { 195 atom_indices->clear(); 196 for (int i = 0; i < matched_atoms.size(); i++) { 197 int j = 0; 198 for (; j < atoms.size(); j++) { 199 if (matched_atoms[i] == atoms[j]) { 200 atom_indices->push_back(j); 201 break; 202 } 203 EXPECT_LT(j, atoms.size()); 204 } 205 } 206 } 207 208 TEST(FilteredRE2Test, MatchEmptyPattern) { 209 FLAGS_filtered_re2_min_atom_len = 3; 210 211 FilterTestVars v; 212 AtomTest* t = &atom_tests[0]; 213 // We are using the regexps used in one of the atom tests 214 // for this test. Adding the EXPECT here to make sure 215 // the index we use for the test is for the correct test. 216 EXPECT_EQ("CheckEmptyPattern", string(t->testname)); 217 int nregexp; 218 for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) 219 if (t->regexps[nregexp] == NULL) 220 break; 221 AddRegexpsAndCompile(t->regexps, nregexp, &v); 222 string text = "0123"; 223 vector<int> atom_ids; 224 vector<int> matching_regexps; 225 EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids)); 226 } 227 228 TEST(FilteredRE2Test, MatchTests) { 229 FLAGS_filtered_re2_min_atom_len = 3; 230 231 FilterTestVars v; 232 AtomTest* t = &atom_tests[2]; 233 // We are using the regexps used in one of the atom tests 234 // for this test. 235 EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname)); 236 int nregexp; 237 for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) 238 if (t->regexps[nregexp] == NULL) 239 break; 240 AddRegexpsAndCompile(t->regexps, nregexp, &v); 241 242 string text = "abc121212xyz"; 243 // atoms = abc 244 vector<int> atom_ids; 245 vector<string> atoms; 246 atoms.push_back("abc"); 247 FindAtomIndices(v.atoms, atoms, &atom_ids); 248 vector<int> matching_regexps; 249 v.f.AllMatches(text, atom_ids, &matching_regexps); 250 EXPECT_EQ(1, matching_regexps.size()); 251 252 text = "abc12312yyyzzz"; 253 atoms.clear(); 254 atoms.push_back("abc"); 255 atoms.push_back("yyy"); 256 atoms.push_back("yyyzzz"); 257 FindAtomIndices(v.atoms, atoms, &atom_ids); 258 v.f.AllMatches(text, atom_ids, &matching_regexps); 259 EXPECT_EQ(1, matching_regexps.size()); 260 261 text = "abcd12yyy32yyyzzz"; 262 atoms.clear(); 263 atoms.push_back("abc"); 264 atoms.push_back("abcd"); 265 atoms.push_back("yyy"); 266 atoms.push_back("yyyzzz"); 267 FindAtomIndices(v.atoms, atoms, &atom_ids); 268 LOG(INFO) << "S: " << atom_ids.size(); 269 for (int i = 0; i < atom_ids.size(); i++) 270 LOG(INFO) << "i: " << i << " : " << atom_ids[i]; 271 v.f.AllMatches(text, atom_ids, &matching_regexps); 272 EXPECT_EQ(2, matching_regexps.size()); 273 } 274 275 } // namespace re2 276