1 // Copyright 2006 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Test parse.cc, dump.cc, and tostring.cc. 6 7 #include <string> 8 #include <vector> 9 #include "util/test.h" 10 #include "re2/regexp.h" 11 12 namespace re2 { 13 14 static const Regexp::ParseFlags TestZeroFlags = Regexp::ParseFlags(1<<30); 15 16 struct Test { 17 const char* regexp; 18 const char* parse; 19 Regexp::ParseFlags flags; 20 }; 21 22 static Regexp::ParseFlags kTestFlags = Regexp::MatchNL | 23 Regexp::PerlX | 24 Regexp::PerlClasses | 25 Regexp::UnicodeGroups; 26 27 static Test tests[] = { 28 // Base cases 29 { "a", "lit{a}" }, 30 { "a.", "cat{lit{a}dot{}}" }, 31 { "a.b", "cat{lit{a}dot{}lit{b}}" }, 32 { "ab", "str{ab}" }, 33 { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" }, 34 { "abc", "str{abc}" }, 35 { "a|^", "alt{lit{a}bol{}}" }, 36 { "a|b", "cc{0x61-0x62}" }, 37 { "(a)", "cap{lit{a}}" }, 38 { "(a)|b", "alt{cap{lit{a}}lit{b}}" }, 39 { "a*", "star{lit{a}}" }, 40 { "a+", "plus{lit{a}}" }, 41 { "a?", "que{lit{a}}" }, 42 { "a{2}", "rep{2,2 lit{a}}" }, 43 { "a{2,3}", "rep{2,3 lit{a}}" }, 44 { "a{2,}", "rep{2,-1 lit{a}}" }, 45 { "a*?", "nstar{lit{a}}" }, 46 { "a+?", "nplus{lit{a}}" }, 47 { "a??", "nque{lit{a}}" }, 48 { "a{2}?", "nrep{2,2 lit{a}}" }, 49 { "a{2,3}?", "nrep{2,3 lit{a}}" }, 50 { "a{2,}?", "nrep{2,-1 lit{a}}" }, 51 { "", "emp{}" }, 52 { "|", "emp{}" }, // alt{emp{}emp{}} but got factored 53 { "|x|", "alt{emp{}lit{x}emp{}}" }, 54 { ".", "dot{}" }, 55 { "^", "bol{}" }, 56 { "$", "eol{}" }, 57 { "\\|", "lit{|}" }, 58 { "\\(", "lit{(}" }, 59 { "\\)", "lit{)}" }, 60 { "\\*", "lit{*}" }, 61 { "\\+", "lit{+}" }, 62 { "\\?", "lit{?}" }, 63 { "{", "lit{{}" }, 64 { "}", "lit{}}" }, 65 { "\\.", "lit{.}" }, 66 { "\\^", "lit{^}" }, 67 { "\\$", "lit{$}" }, 68 { "\\\\", "lit{\\}" }, 69 { "[ace]", "cc{0x61 0x63 0x65}" }, 70 { "[abc]", "cc{0x61-0x63}" }, 71 { "[a-z]", "cc{0x61-0x7a}" }, 72 { "[a]", "lit{a}" }, 73 { "\\-", "lit{-}" }, 74 { "-", "lit{-}" }, 75 { "\\_", "lit{_}" }, 76 77 // Posix and Perl extensions 78 { "[[:lower:]]", "cc{0x61-0x7a}" }, 79 { "[a-z]", "cc{0x61-0x7a}" }, 80 { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" }, 81 { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" }, 82 { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, 83 { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, 84 { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, 85 { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, 86 { "\\d", "cc{0x30-0x39}" }, 87 { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" }, 88 { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" }, 89 { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" }, 90 { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" }, 91 { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" }, 92 { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" }, 93 { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, 94 { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" }, 95 { "\\C", "byte{}" }, 96 97 // Unicode, negatives, and a double negative. 98 { "\\p{Braille}", "cc{0x2800-0x28ff}" }, 99 { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" }, 100 { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" }, 101 { "\\P{^Braille}", "cc{0x2800-0x28ff}" }, 102 103 // More interesting regular expressions. 104 { "a{,2}", "str{a{,2}}" }, 105 { "\\.\\^\\$\\\\", "str{.^$\\}" }, 106 { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" }, 107 { "[^a]", "cc{0-0x60 0x62-0x10ffff}" }, 108 { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8 109 { "a*{", "cat{star{lit{a}}lit{{}}" }, 110 111 // Test precedences 112 { "(?:ab)*", "star{str{ab}}" }, 113 { "(ab)*", "star{cap{str{ab}}}" }, 114 { "ab|cd", "alt{str{ab}str{cd}}" }, 115 { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" }, 116 117 // Test flattening. 118 { "(?:a)", "lit{a}" }, 119 { "(?:ab)(?:cd)", "str{abcd}" }, 120 { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" }, 121 { "a|.", "dot{}" }, 122 { ".|a", "dot{}" }, 123 124 // Test Perl quoted literals 125 { "\\Q+|*?{[\\E", "str{+|*?{[}" }, 126 { "\\Q+\\E+", "plus{lit{+}}" }, 127 { "\\Q\\\\E", "lit{\\}" }, 128 { "\\Q\\\\\\E", "str{\\\\}" }, 129 130 // Test Perl \A and \z 131 { "(?m)^", "bol{}" }, 132 { "(?m)$", "eol{}" }, 133 { "(?-m)^", "bot{}" }, 134 { "(?-m)$", "eot{}" }, 135 { "(?m)\\A", "bot{}" }, 136 { "(?m)\\z", "eot{\\z}" }, 137 { "(?-m)\\A", "bot{}" }, 138 { "(?-m)\\z", "eot{\\z}" }, 139 140 // Test named captures 141 { "(?P<name>a)", "cap{name:lit{a}}" }, 142 143 // Case-folded literals 144 { "[Aa]", "litfold{a}" }, 145 146 // Strings 147 { "abcde", "str{abcde}" }, 148 { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" }, 149 150 // Reported bug involving \n leaking in despite use of NeverNL. 151 { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags }, 152 { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, 153 { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, 154 { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, 155 { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags }, 156 { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, 157 { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, 158 { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, 159 { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags }, 160 { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, 161 { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, 162 { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, 163 { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags }, 164 { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, 165 { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, 166 { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, 167 { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags }, 168 { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, 169 { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, 170 { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, 171 { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, 172 { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, 173 { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, 174 { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, 175 { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, 176 { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, 177 { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, 178 { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, 179 { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", 180 Regexp::PerlClasses }, 181 { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", 182 Regexp::PerlClasses | Regexp::FoldCase }, 183 { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", 184 Regexp::PerlClasses | Regexp::NeverNL }, 185 { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", 186 Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, 187 { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", 188 Regexp::PerlClasses }, 189 { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", 190 Regexp::PerlClasses | Regexp::FoldCase }, 191 { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", 192 Regexp::PerlClasses | Regexp::NeverNL }, 193 { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", 194 Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, 195 }; 196 197 bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) { 198 return Regexp::Equal(a, b); 199 } 200 201 void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags, 202 const string& title) { 203 Regexp** re = new Regexp*[ntests]; 204 for (int i = 0; i < ntests; i++) { 205 RegexpStatus status; 206 Regexp::ParseFlags f = flags; 207 if (tests[i].flags != 0) { 208 f = tests[i].flags & ~TestZeroFlags; 209 } 210 re[i] = Regexp::Parse(tests[i].regexp, f, &status); 211 CHECK(re[i] != NULL) << " " << tests[i].regexp << " " 212 << status.Text(); 213 string s = re[i]->Dump(); 214 EXPECT_EQ(string(tests[i].parse), s) << "Regexp: " << tests[i].regexp 215 << "\nparse: " << tests[i].parse << " s: " << s << " flag=" << f; 216 } 217 218 for (int i = 0; i < ntests; i++) { 219 for (int j = 0; j < ntests; j++) { 220 EXPECT_EQ(string(tests[i].parse) == tests[j].parse, 221 RegexpEqualTestingOnly(re[i], re[j])) 222 << "Regexp: " << tests[i].regexp << " " << tests[j].regexp; 223 } 224 } 225 226 for (int i = 0; i < ntests; i++) 227 re[i]->Decref(); 228 delete[] re; 229 } 230 231 // Test that regexps parse to expected structures. 232 TEST(TestParse, SimpleRegexps) { 233 TestParse(tests, arraysize(tests), kTestFlags, "simple"); 234 } 235 236 Test foldcase_tests[] = { 237 { "AbCdE", "strfold{abcde}" }, 238 { "[Aa]", "litfold{a}" }, 239 { "a", "litfold{a}" }, 240 241 // 0x17F is an old English long s (looks like an f) and folds to s. 242 // 0x212A is the Kelvin symbol and folds to k. 243 { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...] 244 { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, 245 { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, 246 }; 247 248 // Test that parsing with FoldCase works. 249 TEST(TestParse, FoldCase) { 250 TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase"); 251 } 252 253 Test literal_tests[] = { 254 { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" }, 255 }; 256 257 // Test that parsing with Literal works. 258 TEST(TestParse, Literal) { 259 TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal"); 260 } 261 262 Test matchnl_tests[] = { 263 { ".", "dot{}" }, 264 { "\n", "lit{\n}" }, 265 { "[^a]", "cc{0-0x60 0x62-0x10ffff}" }, 266 { "[a\\n]", "cc{0xa 0x61}" }, 267 }; 268 269 // Test that parsing with MatchNL works. 270 // (Also tested above during simple cases.) 271 TEST(TestParse, MatchNL) { 272 TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL"); 273 } 274 275 Test nomatchnl_tests[] = { 276 { ".", "cc{0-0x9 0xb-0x10ffff}" }, 277 { "\n", "lit{\n}" }, 278 { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" }, 279 { "[a\\n]", "cc{0xa 0x61}" }, 280 }; 281 282 // Test that parsing without MatchNL works. 283 TEST(TestParse, NoMatchNL) { 284 TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL"); 285 } 286 287 Test prefix_tests[] = { 288 { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" }, 289 { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" }, 290 { "abc|abd|aef|bcx|bcy", 291 "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}" 292 "cat{str{bc}cc{0x78-0x79}}}" }, 293 { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" }, 294 { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" }, 295 { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" }, 296 { "(?:xx|yy)c|(?:xx|yy)d", 297 "cat{alt{str{xx}str{yy}}cc{0x63-0x64}}" }, 298 { "x{2}|x{2}[0-9]", 299 "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" }, 300 { "x{2}y|x{2}[0-9]y", 301 "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" }, 302 }; 303 304 // Test that prefix factoring works. 305 TEST(TestParse, Prefix) { 306 TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix"); 307 } 308 309 // Invalid regular expressions 310 const char* badtests[] = { 311 "(", 312 ")", 313 "(a", 314 "(a|b|", 315 "(a|b", 316 "[a-z", 317 "([a-z)", 318 "x{1001}", 319 "\xff", // Invalid UTF-8 320 "[\xff]", 321 "[\\\xff]", 322 "\\\xff", 323 "(?P<name>a", 324 "(?P<name>", 325 "(?P<name", 326 "(?P<x y>a)", 327 "(?P<>a)", 328 "[a-Z]", 329 "(?i)[a-Z]", 330 "a{100000}", 331 "a{100000,}", 332 }; 333 334 // Valid in Perl, bad in POSIX 335 const char* only_perl[] = { 336 "[a-b-c]", 337 "\\Qabc\\E", 338 "\\Q*+?{[\\E", 339 "\\Q\\\\E", 340 "\\Q\\\\\\E", 341 "\\Q\\\\\\\\E", 342 "\\Q\\\\\\\\\\E", 343 "(?:a)", 344 "(?P<name>a)", 345 }; 346 347 // Valid in POSIX, bad in Perl. 348 const char* only_posix[] = { 349 "a++", 350 "a**", 351 "a?*", 352 "a+*", 353 "a{1}*", 354 }; 355 356 // Test that parser rejects bad regexps. 357 TEST(TestParse, InvalidRegexps) { 358 for (int i = 0; i < arraysize(badtests); i++) { 359 CHECK(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL) 360 << " " << badtests[i]; 361 CHECK(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL) 362 << " " << badtests[i]; 363 } 364 for (int i = 0; i < arraysize(only_posix); i++) { 365 CHECK(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL) 366 << " " << only_posix[i]; 367 Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL); 368 CHECK(re) << " " << only_posix[i]; 369 re->Decref(); 370 } 371 for (int i = 0; i < arraysize(only_perl); i++) { 372 CHECK(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL) 373 << " " << only_perl[i]; 374 Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL); 375 CHECK(re) << " " << only_perl[i]; 376 re->Decref(); 377 } 378 } 379 380 // Test that ToString produces original regexp or equivalent one. 381 TEST(TestToString, EquivalentParse) { 382 for (int i = 0; i < arraysize(tests); i++) { 383 RegexpStatus status; 384 Regexp::ParseFlags f = kTestFlags; 385 if (tests[i].flags != 0) { 386 f = tests[i].flags & ~TestZeroFlags; 387 } 388 Regexp* re = Regexp::Parse(tests[i].regexp, f, &status); 389 CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text(); 390 string s = re->Dump(); 391 EXPECT_EQ(string(tests[i].parse), s) << " " << tests[i].regexp << " " << string(tests[i].parse) << " " << s; 392 string t = re->ToString(); 393 if (t != tests[i].regexp) { 394 // If ToString didn't return the original regexp, 395 // it must have found one with fewer parens. 396 // Unfortunately we can't check the length here, because 397 // ToString produces "\\{" for a literal brace, 398 // but "{" is a shorter equivalent. 399 // CHECK_LT(t.size(), strlen(tests[i].regexp)) 400 // << " t=" << t << " regexp=" << tests[i].regexp; 401 402 // Test that if we parse the new regexp we get the same structure. 403 Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status); 404 CHECK(nre != NULL) << " reparse " << t << " " << status.Text(); 405 string ss = nre->Dump(); 406 string tt = nre->ToString(); 407 if (s != ss || t != tt) 408 LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t; 409 EXPECT_EQ(s, ss); 410 EXPECT_EQ(t, tt); 411 nre->Decref(); 412 } 413 re->Decref(); 414 } 415 } 416 417 // Test that capture error args are correct. 418 TEST(NamedCaptures, ErrorArgs) { 419 RegexpStatus status; 420 Regexp* re; 421 422 re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status); 423 EXPECT_TRUE(re == NULL); 424 EXPECT_EQ(status.code(), kRegexpBadNamedCapture); 425 EXPECT_EQ(status.error_arg(), "(?P<name"); 426 427 re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status); 428 EXPECT_TRUE(re == NULL); 429 EXPECT_EQ(status.code(), kRegexpBadNamedCapture); 430 EXPECT_EQ(status.error_arg(), "(?P<space bar>"); 431 } 432 433 } // namespace re2 434