1 // -*- coding: utf-8 -*- 2 // Copyright 2002-2009 The RE2 Authors. All Rights Reserved. 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 // TODO: Test extractions for PartialMatch/Consume 7 8 #include <sys/types.h> 9 #include <sys/mman.h> 10 #include <sys/stat.h> 11 #include <errno.h> 12 #include <vector> 13 #include "util/test.h" 14 #include "re2/re2.h" 15 #include "re2/regexp.h" 16 17 DECLARE_bool(logtostderr); 18 19 namespace re2 { 20 21 TEST(RE2, HexTests) { 22 23 VLOG(1) << "hex tests"; 24 25 #define CHECK_HEX(type, value) \ 26 do { \ 27 type v; \ 28 CHECK(RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \ 29 CHECK_EQ(v, 0x ## value); \ 30 CHECK(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \ 31 CHECK_EQ(v, 0x ## value); \ 32 } while(0) 33 34 CHECK_HEX(short, 2bad); 35 CHECK_HEX(unsigned short, 2badU); 36 CHECK_HEX(int, dead); 37 CHECK_HEX(unsigned int, deadU); 38 CHECK_HEX(long, 7eadbeefL); 39 CHECK_HEX(unsigned long, deadbeefUL); 40 CHECK_HEX(long long, 12345678deadbeefLL); 41 CHECK_HEX(unsigned long long, cafebabedeadbeefULL); 42 43 #undef CHECK_HEX 44 } 45 46 TEST(RE2, OctalTests) { 47 VLOG(1) << "octal tests"; 48 49 #define CHECK_OCTAL(type, value) \ 50 do { \ 51 type v; \ 52 CHECK(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \ 53 CHECK_EQ(v, 0 ## value); \ 54 CHECK(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \ 55 CHECK_EQ(v, 0 ## value); \ 56 } while(0) 57 58 CHECK_OCTAL(short, 77777); 59 CHECK_OCTAL(unsigned short, 177777U); 60 CHECK_OCTAL(int, 17777777777); 61 CHECK_OCTAL(unsigned int, 37777777777U); 62 CHECK_OCTAL(long, 17777777777L); 63 CHECK_OCTAL(unsigned long, 37777777777UL); 64 CHECK_OCTAL(long long, 777777777777777777777LL); 65 CHECK_OCTAL(unsigned long long, 1777777777777777777777ULL); 66 67 #undef CHECK_OCTAL 68 } 69 70 TEST(RE2, DecimalTests) { 71 VLOG(1) << "decimal tests"; 72 73 #define CHECK_DECIMAL(type, value) \ 74 do { \ 75 type v; \ 76 CHECK(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \ 77 CHECK_EQ(v, value); \ 78 CHECK(RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \ 79 CHECK_EQ(v, value); \ 80 } while(0) 81 82 CHECK_DECIMAL(short, -1); 83 CHECK_DECIMAL(unsigned short, 9999); 84 CHECK_DECIMAL(int, -1000); 85 CHECK_DECIMAL(unsigned int, 12345U); 86 CHECK_DECIMAL(long, -10000000L); 87 CHECK_DECIMAL(unsigned long, 3083324652U); 88 CHECK_DECIMAL(long long, -100000000000000LL); 89 CHECK_DECIMAL(unsigned long long, 1234567890987654321ULL); 90 91 #undef CHECK_DECIMAL 92 } 93 94 TEST(RE2, Replace) { 95 VLOG(1) << "TestReplace"; 96 97 struct ReplaceTest { 98 const char *regexp; 99 const char *rewrite; 100 const char *original; 101 const char *single; 102 const char *global; 103 int greplace_count; 104 }; 105 static const ReplaceTest tests[] = { 106 { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)", 107 "\\2\\1ay", 108 "the quick brown fox jumps over the lazy dogs.", 109 "ethay quick brown fox jumps over the lazy dogs.", 110 "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", 111 9 }, 112 { "\\w+", 113 "\\0-NOSPAM", 114 "abcd.efghi (at) google.com", 115 "abcd-NOSPAM.efghi (at) google.com", 116 "abcd-NOSPAM.efghi-NOSPAM (at) google-NOSPAM.com-NOSPAM", 117 4 }, 118 { "^", 119 "(START)", 120 "foo", 121 "(START)foo", 122 "(START)foo", 123 1 }, 124 { "^", 125 "(START)", 126 "", 127 "(START)", 128 "(START)", 129 1 }, 130 { "$", 131 "(END)", 132 "", 133 "(END)", 134 "(END)", 135 1 }, 136 { "b", 137 "bb", 138 "ababababab", 139 "abbabababab", 140 "abbabbabbabbabb", 141 5 }, 142 { "b", 143 "bb", 144 "bbbbbb", 145 "bbbbbbb", 146 "bbbbbbbbbbbb", 147 6 }, 148 { "b+", 149 "bb", 150 "bbbbbb", 151 "bb", 152 "bb", 153 1 }, 154 { "b*", 155 "bb", 156 "bbbbbb", 157 "bb", 158 "bb", 159 1 }, 160 { "b*", 161 "bb", 162 "aaaaa", 163 "bbaaaaa", 164 "bbabbabbabbabbabb", 165 6 }, 166 // Check newline handling 167 { "a.*a", 168 "(\\0)", 169 "aba\naba", 170 "(aba)\naba", 171 "(aba)\n(aba)", 172 2 }, 173 { "", NULL, NULL, NULL, NULL, 0 } 174 }; 175 176 for (const ReplaceTest *t = tests; t->original != NULL; ++t) { 177 VLOG(1) << StringPrintf("\"%s\" =~ s/%s/%s/g", t->original, t->regexp, t->rewrite); 178 string one(t->original); 179 CHECK(RE2::Replace(&one, t->regexp, t->rewrite)); 180 CHECK_EQ(one, t->single); 181 string all(t->original); 182 CHECK_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count) 183 << "Got: " << all; 184 CHECK_EQ(all, t->global); 185 } 186 } 187 188 static void TestCheckRewriteString(const char* regexp, const char* rewrite, 189 bool expect_ok) { 190 string error; 191 RE2 exp(regexp); 192 bool actual_ok = exp.CheckRewriteString(rewrite, &error); 193 EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error; 194 } 195 196 TEST(CheckRewriteString, all) { 197 TestCheckRewriteString("abc", "foo", true); 198 TestCheckRewriteString("abc", "foo\\", false); 199 TestCheckRewriteString("abc", "foo\\0bar", true); 200 201 TestCheckRewriteString("a(b)c", "foo", true); 202 TestCheckRewriteString("a(b)c", "foo\\0bar", true); 203 TestCheckRewriteString("a(b)c", "foo\\1bar", true); 204 TestCheckRewriteString("a(b)c", "foo\\2bar", false); 205 TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true); 206 207 TestCheckRewriteString("a(b)(c)", "foo\\12", true); 208 TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true); 209 TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false); 210 } 211 212 TEST(RE2, Extract) { 213 VLOG(1) << "TestExtract"; 214 215 string s; 216 217 CHECK(RE2::Extract("boris (at) kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s)); 218 CHECK_EQ(s, "kremvax!boris"); 219 220 CHECK(RE2::Extract("foo", ".*", "'\\0'", &s)); 221 CHECK_EQ(s, "'foo'"); 222 // check that false match doesn't overwrite 223 CHECK(!RE2::Extract("baz", "bar", "'\\0'", &s)); 224 CHECK_EQ(s, "'foo'"); 225 } 226 227 TEST(RE2, Consume) { 228 VLOG(1) << "TestConsume"; 229 230 RE2 r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace 231 string word; 232 233 string s(" aaa b!@#$@#$cccc"); 234 StringPiece input(s); 235 236 CHECK(RE2::Consume(&input, r, &word)); 237 CHECK_EQ(word, "aaa") << " input: " << input; 238 CHECK(RE2::Consume(&input, r, &word)); 239 CHECK_EQ(word, "b") << " input: " << input; 240 CHECK(! RE2::Consume(&input, r, &word)) << " input: " << input; 241 } 242 243 TEST(RE2, ConsumeN) { 244 const string s(" one two three 4"); 245 StringPiece input(s); 246 247 RE2::Arg argv[2]; 248 const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; 249 250 // 0 arg 251 EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 0)); // Skips "one". 252 253 // 1 arg 254 string word; 255 argv[0] = &word; 256 EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 1)); 257 EXPECT_EQ("two", word); 258 259 // Multi-args 260 int n; 261 argv[1] = &n; 262 EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)\\s*(\\d+)", args, 2)); 263 EXPECT_EQ("three", word); 264 EXPECT_EQ(4, n); 265 } 266 267 TEST(RE2, FindAndConsume) { 268 VLOG(1) << "TestFindAndConsume"; 269 270 RE2 r("(\\w+)"); // matches a word 271 string word; 272 273 string s(" aaa b!@#$@#$cccc"); 274 StringPiece input(s); 275 276 CHECK(RE2::FindAndConsume(&input, r, &word)); 277 CHECK_EQ(word, "aaa"); 278 CHECK(RE2::FindAndConsume(&input, r, &word)); 279 CHECK_EQ(word, "b"); 280 CHECK(RE2::FindAndConsume(&input, r, &word)); 281 CHECK_EQ(word, "cccc"); 282 CHECK(! RE2::FindAndConsume(&input, r, &word)); 283 284 // Check that FindAndConsume works without any submatches. 285 // Earlier version used uninitialized data for 286 // length to consume. 287 input = "aaa"; 288 CHECK(RE2::FindAndConsume(&input, "aaa")); 289 CHECK_EQ(input, ""); 290 } 291 292 TEST(RE2, FindAndConsumeN) { 293 const string s(" one two three 4"); 294 StringPiece input(s); 295 296 RE2::Arg argv[2]; 297 const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; 298 299 // 0 arg 300 EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 0)); // Skips "one". 301 302 // 1 arg 303 string word; 304 argv[0] = &word; 305 EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 1)); 306 EXPECT_EQ("two", word); 307 308 // Multi-args 309 int n; 310 argv[1] = &n; 311 EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)\\s*(\\d+)", args, 2)); 312 EXPECT_EQ("three", word); 313 EXPECT_EQ(4, n); 314 } 315 316 TEST(RE2, MatchNumberPeculiarity) { 317 VLOG(1) << "TestMatchNumberPeculiarity"; 318 319 RE2 r("(foo)|(bar)|(baz)"); 320 string word1; 321 string word2; 322 string word3; 323 324 CHECK(RE2::PartialMatch("foo", r, &word1, &word2, &word3)); 325 CHECK_EQ(word1, "foo"); 326 CHECK_EQ(word2, ""); 327 CHECK_EQ(word3, ""); 328 CHECK(RE2::PartialMatch("bar", r, &word1, &word2, &word3)); 329 CHECK_EQ(word1, ""); 330 CHECK_EQ(word2, "bar"); 331 CHECK_EQ(word3, ""); 332 CHECK(RE2::PartialMatch("baz", r, &word1, &word2, &word3)); 333 CHECK_EQ(word1, ""); 334 CHECK_EQ(word2, ""); 335 CHECK_EQ(word3, "baz"); 336 CHECK(!RE2::PartialMatch("f", r, &word1, &word2, &word3)); 337 338 string a; 339 CHECK(RE2::FullMatch("hello", "(foo)|hello", &a)); 340 CHECK_EQ(a, ""); 341 } 342 343 TEST(RE2, Match) { 344 RE2 re("((\\w+):([0-9]+))"); // extracts host and port 345 StringPiece group[4]; 346 347 // No match. 348 StringPiece s = "zyzzyva"; 349 CHECK(!re.Match(s, 0, s.size(), RE2::UNANCHORED, 350 group, arraysize(group))); 351 352 // Matches and extracts. 353 s = "a chrisr:9000 here"; 354 CHECK(re.Match(s, 0, s.size(), RE2::UNANCHORED, 355 group, arraysize(group))); 356 CHECK_EQ(group[0], "chrisr:9000"); 357 CHECK_EQ(group[1], "chrisr:9000"); 358 CHECK_EQ(group[2], "chrisr"); 359 CHECK_EQ(group[3], "9000"); 360 361 string all, host; 362 int port; 363 CHECK(RE2::PartialMatch("a chrisr:9000 here", re, &all, &host, &port)); 364 CHECK_EQ(all, "chrisr:9000"); 365 CHECK_EQ(host, "chrisr"); 366 CHECK_EQ(port, 9000); 367 } 368 369 static void TestRecursion(int size, const char *pattern) { 370 // Fill up a string repeating the pattern given 371 string domain; 372 domain.resize(size); 373 int patlen = strlen(pattern); 374 for (int i = 0; i < size; ++i) { 375 domain[i] = pattern[i % patlen]; 376 } 377 // Just make sure it doesn't crash due to too much recursion. 378 RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet); 379 RE2::FullMatch(domain, re); 380 } 381 382 // A meta-quoted string, interpreted as a pattern, should always match 383 // the original unquoted string. 384 static void TestQuoteMeta(string unquoted, 385 const RE2::Options& options = RE2::DefaultOptions) { 386 string quoted = RE2::QuoteMeta(unquoted); 387 RE2 re(quoted, options); 388 EXPECT_TRUE_M(RE2::FullMatch(unquoted, re), 389 "Unquoted='" + unquoted + "', quoted='" + quoted + "'."); 390 } 391 392 // A meta-quoted string, interpreted as a pattern, should always match 393 // the original unquoted string. 394 static void NegativeTestQuoteMeta(string unquoted, string should_not_match, 395 const RE2::Options& options = RE2::DefaultOptions) { 396 string quoted = RE2::QuoteMeta(unquoted); 397 RE2 re(quoted, options); 398 EXPECT_FALSE_M(RE2::FullMatch(should_not_match, re), 399 "Unquoted='" + unquoted + "', quoted='" + quoted + "'."); 400 } 401 402 // Tests that quoted meta characters match their original strings, 403 // and that a few things that shouldn't match indeed do not. 404 TEST(QuoteMeta, Simple) { 405 TestQuoteMeta("foo"); 406 TestQuoteMeta("foo.bar"); 407 TestQuoteMeta("foo\\.bar"); 408 TestQuoteMeta("[1-9]"); 409 TestQuoteMeta("1.5-2.0?"); 410 TestQuoteMeta("\\d"); 411 TestQuoteMeta("Who doesn't like ice cream?"); 412 TestQuoteMeta("((a|b)c?d*e+[f-h]i)"); 413 TestQuoteMeta("((?!)xxx).*yyy"); 414 TestQuoteMeta("(["); 415 } 416 TEST(QuoteMeta, SimpleNegative) { 417 NegativeTestQuoteMeta("foo", "bar"); 418 NegativeTestQuoteMeta("...", "bar"); 419 NegativeTestQuoteMeta("\\.", "."); 420 NegativeTestQuoteMeta("\\.", ".."); 421 NegativeTestQuoteMeta("(a)", "a"); 422 NegativeTestQuoteMeta("(a|b)", "a"); 423 NegativeTestQuoteMeta("(a|b)", "(a)"); 424 NegativeTestQuoteMeta("(a|b)", "a|b"); 425 NegativeTestQuoteMeta("[0-9]", "0"); 426 NegativeTestQuoteMeta("[0-9]", "0-9"); 427 NegativeTestQuoteMeta("[0-9]", "[9]"); 428 NegativeTestQuoteMeta("((?!)xxx)", "xxx"); 429 } 430 431 TEST(QuoteMeta, Latin1) { 432 TestQuoteMeta("3\xb2 = 9", RE2::Latin1); 433 } 434 435 TEST(QuoteMeta, UTF8) { 436 TestQuoteMeta("Plcido Domingo"); 437 TestQuoteMeta("xyz"); // No fancy utf8. 438 TestQuoteMeta("\xc2\xb0"); // 2-byte utf8 -- a degree symbol. 439 TestQuoteMeta("27\xc2\xb0 degrees"); // As a middle character. 440 TestQuoteMeta("\xe2\x80\xb3"); // 3-byte utf8 -- a double prime. 441 TestQuoteMeta("\xf0\x9d\x85\x9f"); // 4-byte utf8 -- a music note. 442 TestQuoteMeta("27\xc2\xb0"); // Interpreted as Latin-1, this should 443 // still work. 444 NegativeTestQuoteMeta("27\xc2\xb0", 445 "27\\\xc2\\\xb0"); // 2-byte utf8 -- a degree symbol. 446 } 447 448 TEST(QuoteMeta, HasNull) { 449 string has_null; 450 451 // string with one null character 452 has_null += '\0'; 453 TestQuoteMeta(has_null); 454 NegativeTestQuoteMeta(has_null, ""); 455 456 // Don't want null-followed-by-'1' to be interpreted as '\01'. 457 has_null += '1'; 458 TestQuoteMeta(has_null); 459 NegativeTestQuoteMeta(has_null, "\1"); 460 } 461 462 TEST(ProgramSize, BigProgram) { 463 RE2 re_simple("simple regexp"); 464 RE2 re_medium("medium.*regexp"); 465 RE2 re_complex("hard.{1,128}regexp"); 466 467 CHECK_GT(re_simple.ProgramSize(), 0); 468 CHECK_GT(re_medium.ProgramSize(), re_simple.ProgramSize()); 469 CHECK_GT(re_complex.ProgramSize(), re_medium.ProgramSize()); 470 } 471 472 // Issue 956519: handling empty character sets was 473 // causing NULL dereference. This tests a few empty character sets. 474 // (The way to get an empty character set is to negate a full one.) 475 TEST(EmptyCharset, Fuzz) { 476 static const char *empties[] = { 477 "[^\\S\\s]", 478 "[^\\S[:space:]]", 479 "[^\\D\\d]", 480 "[^\\D[:digit:]]" 481 }; 482 for (int i = 0; i < arraysize(empties); i++) 483 CHECK(!RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0)); 484 } 485 486 // Test that named groups work correctly. 487 TEST(Capture, NamedGroups) { 488 { 489 RE2 re("(hello world)"); 490 CHECK_EQ(re.NumberOfCapturingGroups(), 1); 491 const map<string, int>& m = re.NamedCapturingGroups(); 492 CHECK_EQ(m.size(), 0); 493 } 494 495 { 496 RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))"); 497 CHECK_EQ(re.NumberOfCapturingGroups(), 6); 498 const map<string, int>& m = re.NamedCapturingGroups(); 499 CHECK_EQ(m.size(), 4); 500 CHECK_EQ(m.find("A")->second, 1); 501 CHECK_EQ(m.find("B")->second, 2); 502 CHECK_EQ(m.find("C")->second, 3); 503 CHECK_EQ(m.find("D")->second, 6); // $4 and $5 are anonymous 504 } 505 } 506 507 TEST(RE2, FullMatchWithNoArgs) { 508 CHECK(RE2::FullMatch("h", "h")); 509 CHECK(RE2::FullMatch("hello", "hello")); 510 CHECK(RE2::FullMatch("hello", "h.*o")); 511 CHECK(!RE2::FullMatch("othello", "h.*o")); // Must be anchored at front 512 CHECK(!RE2::FullMatch("hello!", "h.*o")); // Must be anchored at end 513 } 514 515 TEST(RE2, PartialMatch) { 516 CHECK(RE2::PartialMatch("x", "x")); 517 CHECK(RE2::PartialMatch("hello", "h.*o")); 518 CHECK(RE2::PartialMatch("othello", "h.*o")); 519 CHECK(RE2::PartialMatch("hello!", "h.*o")); 520 CHECK(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))")); 521 } 522 523 TEST(RE2, PartialMatchN) { 524 RE2::Arg argv[2]; 525 const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; 526 527 // 0 arg 528 EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", args, 0)); 529 EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", args, 0)); 530 531 // 1 arg 532 int i; 533 argv[0] = &i; 534 EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", args, 1)); 535 EXPECT_EQ(1001, i); 536 EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", args, 1)); 537 538 // Multi-arg 539 string s; 540 argv[1] = &s; 541 EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", args, 2)); 542 EXPECT_EQ(42, i); 543 EXPECT_EQ("life", s); 544 EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", args, 2)); 545 } 546 547 TEST(RE2, FullMatchZeroArg) { 548 // Zero-arg 549 CHECK(RE2::FullMatch("1001", "\\d+")); 550 } 551 552 TEST(RE2, FullMatchOneArg) { 553 int i; 554 555 // Single-arg 556 CHECK(RE2::FullMatch("1001", "(\\d+)", &i)); 557 CHECK_EQ(i, 1001); 558 CHECK(RE2::FullMatch("-123", "(-?\\d+)", &i)); 559 CHECK_EQ(i, -123); 560 CHECK(!RE2::FullMatch("10", "()\\d+", &i)); 561 CHECK(!RE2::FullMatch("1234567890123456789012345678901234567890", 562 "(\\d+)", &i)); 563 } 564 565 TEST(RE2, FullMatchIntegerArg) { 566 int i; 567 568 // Digits surrounding integer-arg 569 CHECK(RE2::FullMatch("1234", "1(\\d*)4", &i)); 570 CHECK_EQ(i, 23); 571 CHECK(RE2::FullMatch("1234", "(\\d)\\d+", &i)); 572 CHECK_EQ(i, 1); 573 CHECK(RE2::FullMatch("-1234", "(-\\d)\\d+", &i)); 574 CHECK_EQ(i, -1); 575 CHECK(RE2::PartialMatch("1234", "(\\d)", &i)); 576 CHECK_EQ(i, 1); 577 CHECK(RE2::PartialMatch("-1234", "(-\\d)", &i)); 578 CHECK_EQ(i, -1); 579 } 580 581 TEST(RE2, FullMatchStringArg) { 582 string s; 583 // String-arg 584 CHECK(RE2::FullMatch("hello", "h(.*)o", &s)); 585 CHECK_EQ(s, string("ell")); 586 } 587 588 TEST(RE2, FullMatchStringPieceArg) { 589 int i; 590 // StringPiece-arg 591 StringPiece sp; 592 CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i)); 593 CHECK_EQ(sp.size(), 4); 594 CHECK(memcmp(sp.data(), "ruby", 4) == 0); 595 CHECK_EQ(i, 1234); 596 } 597 598 TEST(RE2, FullMatchMultiArg) { 599 int i; 600 string s; 601 // Multi-arg 602 CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); 603 CHECK_EQ(s, string("ruby")); 604 CHECK_EQ(i, 1234); 605 } 606 607 TEST(RE2, FullMatchN) { 608 RE2::Arg argv[2]; 609 const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; 610 611 // 0 arg 612 EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", args, 0)); 613 EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", args, 0)); 614 615 // 1 arg 616 int i; 617 argv[0] = &i; 618 EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", args, 1)); 619 EXPECT_EQ(1001, i); 620 EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", args, 1)); 621 622 // Multi-arg 623 string s; 624 argv[1] = &s; 625 EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", args, 2)); 626 EXPECT_EQ(42, i); 627 EXPECT_EQ("life", s); 628 EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", args, 2)); 629 } 630 631 TEST(RE2, FullMatchIgnoredArg) { 632 int i; 633 string s; 634 // Ignored arg 635 CHECK(RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)", &s, (void*)NULL, &i)); 636 CHECK_EQ(s, string("ruby")); 637 CHECK_EQ(i, 1234); 638 } 639 640 TEST(RE2, FullMatchTypedNullArg) { 641 string s; 642 643 // Ignore non-void* NULL arg 644 CHECK(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL)); 645 CHECK(RE2::FullMatch("hello", "h(.*)o", (string*)NULL)); 646 CHECK(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL)); 647 CHECK(RE2::FullMatch("1234", "(.*)", (int*)NULL)); 648 CHECK(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL)); 649 CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL)); 650 CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (float*)NULL)); 651 652 // Fail on non-void* NULL arg if the match doesn't parse for the given type. 653 CHECK(!RE2::FullMatch("hello", "h(.*)lo", &s, (char*)NULL)); 654 CHECK(!RE2::FullMatch("hello", "(.*)", (int*)NULL)); 655 CHECK(!RE2::FullMatch("1234567890123456", "(.*)", (int*)NULL)); 656 CHECK(!RE2::FullMatch("hello", "(.*)", (double*)NULL)); 657 CHECK(!RE2::FullMatch("hello", "(.*)", (float*)NULL)); 658 } 659 660 // Check that numeric parsing code does not read past the end of 661 // the number being parsed. 662 TEST(RE2, NULTerminated) { 663 char *v; 664 int x; 665 long pagesize = sysconf(_SC_PAGE_SIZE); 666 667 #ifndef MAP_ANONYMOUS 668 #define MAP_ANONYMOUS MAP_ANON 669 #endif 670 v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE, 671 MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)); 672 CHECK(v != reinterpret_cast<char*>(-1)); 673 LOG(INFO) << "Memory at " << (void*)v; 674 CHECK_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno; 675 v[pagesize - 1] = '1'; 676 677 x = 0; 678 CHECK(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x)); 679 CHECK_EQ(x, 1); 680 } 681 682 TEST(RE2, FullMatchTypeTests) { 683 // Type tests 684 string zeros(100, '0'); 685 { 686 char c; 687 CHECK(RE2::FullMatch("Hello", "(H)ello", &c)); 688 CHECK_EQ(c, 'H'); 689 } 690 { 691 unsigned char c; 692 CHECK(RE2::FullMatch("Hello", "(H)ello", &c)); 693 CHECK_EQ(c, static_cast<unsigned char>('H')); 694 } 695 { 696 int16 v; 697 CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100); 698 CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100); 699 CHECK(RE2::FullMatch("32767", "(-?\\d+)", &v)); CHECK_EQ(v, 32767); 700 CHECK(RE2::FullMatch("-32768", "(-?\\d+)", &v)); CHECK_EQ(v, -32768); 701 CHECK(!RE2::FullMatch("-32769", "(-?\\d+)", &v)); 702 CHECK(!RE2::FullMatch("32768", "(-?\\d+)", &v)); 703 } 704 { 705 uint16 v; 706 CHECK(RE2::FullMatch("100", "(\\d+)", &v)); CHECK_EQ(v, 100); 707 CHECK(RE2::FullMatch("32767", "(\\d+)", &v)); CHECK_EQ(v, 32767); 708 CHECK(RE2::FullMatch("65535", "(\\d+)", &v)); CHECK_EQ(v, 65535); 709 CHECK(!RE2::FullMatch("65536", "(\\d+)", &v)); 710 } 711 { 712 int32 v; 713 static const int32 max = 0x7fffffff; 714 static const int32 min = -max - 1; 715 CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100); 716 CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100); 717 CHECK(RE2::FullMatch("2147483647", "(-?\\d+)", &v)); CHECK_EQ(v, max); 718 CHECK(RE2::FullMatch("-2147483648", "(-?\\d+)", &v)); CHECK_EQ(v, min); 719 CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)", &v)); 720 CHECK(!RE2::FullMatch("2147483648", "(-?\\d+)", &v)); 721 722 CHECK(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v)); 723 CHECK_EQ(v, max); 724 CHECK(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v)); 725 CHECK_EQ(v, min); 726 727 CHECK(!RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v)); 728 CHECK(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v))); 729 CHECK_EQ(v, max); 730 CHECK(!RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v))); 731 } 732 { 733 uint32 v; 734 static const uint32 max = 0xfffffffful; 735 CHECK(RE2::FullMatch("100", "(\\d+)", &v)); CHECK_EQ(v, 100); 736 CHECK(RE2::FullMatch("4294967295", "(\\d+)", &v)); CHECK_EQ(v, max); 737 CHECK(!RE2::FullMatch("4294967296", "(\\d+)", &v)); 738 CHECK(!RE2::FullMatch("-1", "(\\d+)", &v)); 739 740 CHECK(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); CHECK_EQ(v, max); 741 } 742 { 743 int64 v; 744 static const int64 max = 0x7fffffffffffffffull; 745 static const int64 min = -max - 1; 746 char buf[32]; 747 748 CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100); 749 CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100); 750 751 snprintf(buf, sizeof(buf), "%lld", (long long int)max); 752 CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, max); 753 754 snprintf(buf, sizeof(buf), "%lld", (long long int)min); 755 CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, min); 756 757 snprintf(buf, sizeof(buf), "%lld", (long long int)max); 758 assert(buf[strlen(buf)-1] != '9'); 759 buf[strlen(buf)-1]++; 760 CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); 761 762 snprintf(buf, sizeof(buf), "%lld", (long long int)min); 763 assert(buf[strlen(buf)-1] != '9'); 764 buf[strlen(buf)-1]++; 765 CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); 766 } 767 { 768 uint64 v; 769 int64 v2; 770 static const uint64 max = 0xffffffffffffffffull; 771 char buf[32]; 772 773 CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100); 774 CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v2)); CHECK_EQ(v2, -100); 775 776 snprintf(buf, sizeof(buf), "%llu", (long long unsigned)max); 777 CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, max); 778 779 assert(buf[strlen(buf)-1] != '9'); 780 buf[strlen(buf)-1]++; 781 CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); 782 } 783 } 784 785 TEST(RE2, FloatingPointFullMatchTypes) { 786 string zeros(100, '0'); 787 { 788 float v; 789 CHECK(RE2::FullMatch("100", "(.*)", &v)); CHECK_EQ(v, 100); 790 CHECK(RE2::FullMatch("-100.", "(.*)", &v)); CHECK_EQ(v, -100); 791 CHECK(RE2::FullMatch("1e23", "(.*)", &v)); CHECK_EQ(v, float(1e23)); 792 793 CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); 794 CHECK_EQ(v, float(1e23)); 795 796 // 6700000000081920.1 is an edge case. 797 // 6700000000081920 is exactly halfway between 798 // two float32s, so the .1 should make it round up. 799 // However, the .1 is outside the precision possible with 800 // a float64: the nearest float64 is 6700000000081920. 801 // So if the code uses strtod and then converts to float32, 802 // round-to-even will make it round down instead of up. 803 // To pass the test, the parser must call strtof directly. 804 // This test case is carefully chosen to use only a 17-digit 805 // number, since C does not guarantee to get the correctly 806 // rounded answer for strtod and strtof unless the input is 807 // short. 808 CHECK(RE2::FullMatch("0.1", "(.*)", &v)); 809 CHECK_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f); 810 CHECK(RE2::FullMatch("6700000000081920.1", "(.*)", &v)); 811 CHECK_EQ(v, 6700000000081920.1f) 812 << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f); 813 } 814 { 815 double v; 816 CHECK(RE2::FullMatch("100", "(.*)", &v)); CHECK_EQ(v, 100); 817 CHECK(RE2::FullMatch("-100.", "(.*)", &v)); CHECK_EQ(v, -100); 818 CHECK(RE2::FullMatch("1e23", "(.*)", &v)); CHECK_EQ(v, 1e23); 819 CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); 820 CHECK_EQ(v, double(1e23)); 821 822 CHECK(RE2::FullMatch("0.1", "(.*)", &v)); 823 CHECK_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1); 824 CHECK(RE2::FullMatch("1.00000005960464485", "(.*)", &v)); 825 CHECK_EQ(v, 1.0000000596046448) 826 << StringPrintf("%.17g != %.17g", v, 1.0000000596046448); 827 } 828 } 829 830 TEST(RE2, FullMatchAnchored) { 831 int i; 832 // Check that matching is fully anchored 833 CHECK(!RE2::FullMatch("x1001", "(\\d+)", &i)); 834 CHECK(!RE2::FullMatch("1001x", "(\\d+)", &i)); 835 CHECK(RE2::FullMatch("x1001", "x(\\d+)", &i)); CHECK_EQ(i, 1001); 836 CHECK(RE2::FullMatch("1001x", "(\\d+)x", &i)); CHECK_EQ(i, 1001); 837 } 838 839 TEST(RE2, FullMatchBraces) { 840 // Braces 841 CHECK(RE2::FullMatch("0abcd", "[0-9a-f+.-]{5,}")); 842 CHECK(RE2::FullMatch("0abcde", "[0-9a-f+.-]{5,}")); 843 CHECK(!RE2::FullMatch("0abc", "[0-9a-f+.-]{5,}")); 844 } 845 846 TEST(RE2, Complicated) { 847 // Complicated RE2 848 CHECK(RE2::FullMatch("foo", "foo|bar|[A-Z]")); 849 CHECK(RE2::FullMatch("bar", "foo|bar|[A-Z]")); 850 CHECK(RE2::FullMatch("X", "foo|bar|[A-Z]")); 851 CHECK(!RE2::FullMatch("XY", "foo|bar|[A-Z]")); 852 } 853 854 TEST(RE2, FullMatchEnd) { 855 // Check full-match handling (needs '$' tacked on internally) 856 CHECK(RE2::FullMatch("fo", "fo|foo")); 857 CHECK(RE2::FullMatch("foo", "fo|foo")); 858 CHECK(RE2::FullMatch("fo", "fo|foo$")); 859 CHECK(RE2::FullMatch("foo", "fo|foo$")); 860 CHECK(RE2::FullMatch("foo", "foo$")); 861 CHECK(!RE2::FullMatch("foo$bar", "foo\\$")); 862 CHECK(!RE2::FullMatch("fox", "fo|bar")); 863 864 // Uncomment the following if we change the handling of '$' to 865 // prevent it from matching a trailing newline 866 if (false) { 867 // Check that we don't get bitten by pcre's special handling of a 868 // '\n' at the end of the string matching '$' 869 CHECK(!RE2::PartialMatch("foo\n", "foo$")); 870 } 871 } 872 873 TEST(RE2, FullMatchArgCount) { 874 // Number of args 875 int a[16]; 876 CHECK(RE2::FullMatch("", "")); 877 878 memset(a, 0, sizeof(0)); 879 CHECK(RE2::FullMatch("1", 880 "(\\d){1}", 881 &a[0])); 882 CHECK_EQ(a[0], 1); 883 884 memset(a, 0, sizeof(0)); 885 CHECK(RE2::FullMatch("12", 886 "(\\d)(\\d)", 887 &a[0], &a[1])); 888 CHECK_EQ(a[0], 1); 889 CHECK_EQ(a[1], 2); 890 891 memset(a, 0, sizeof(0)); 892 CHECK(RE2::FullMatch("123", 893 "(\\d)(\\d)(\\d)", 894 &a[0], &a[1], &a[2])); 895 CHECK_EQ(a[0], 1); 896 CHECK_EQ(a[1], 2); 897 CHECK_EQ(a[2], 3); 898 899 memset(a, 0, sizeof(0)); 900 CHECK(RE2::FullMatch("1234", 901 "(\\d)(\\d)(\\d)(\\d)", 902 &a[0], &a[1], &a[2], &a[3])); 903 CHECK_EQ(a[0], 1); 904 CHECK_EQ(a[1], 2); 905 CHECK_EQ(a[2], 3); 906 CHECK_EQ(a[3], 4); 907 908 memset(a, 0, sizeof(0)); 909 CHECK(RE2::FullMatch("12345", 910 "(\\d)(\\d)(\\d)(\\d)(\\d)", 911 &a[0], &a[1], &a[2], &a[3], 912 &a[4])); 913 CHECK_EQ(a[0], 1); 914 CHECK_EQ(a[1], 2); 915 CHECK_EQ(a[2], 3); 916 CHECK_EQ(a[3], 4); 917 CHECK_EQ(a[4], 5); 918 919 memset(a, 0, sizeof(0)); 920 CHECK(RE2::FullMatch("123456", 921 "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", 922 &a[0], &a[1], &a[2], &a[3], 923 &a[4], &a[5])); 924 CHECK_EQ(a[0], 1); 925 CHECK_EQ(a[1], 2); 926 CHECK_EQ(a[2], 3); 927 CHECK_EQ(a[3], 4); 928 CHECK_EQ(a[4], 5); 929 CHECK_EQ(a[5], 6); 930 931 memset(a, 0, sizeof(0)); 932 CHECK(RE2::FullMatch("1234567", 933 "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", 934 &a[0], &a[1], &a[2], &a[3], 935 &a[4], &a[5], &a[6])); 936 CHECK_EQ(a[0], 1); 937 CHECK_EQ(a[1], 2); 938 CHECK_EQ(a[2], 3); 939 CHECK_EQ(a[3], 4); 940 CHECK_EQ(a[4], 5); 941 CHECK_EQ(a[5], 6); 942 CHECK_EQ(a[6], 7); 943 944 memset(a, 0, sizeof(0)); 945 CHECK(RE2::FullMatch("1234567890123456", 946 "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)" 947 "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", 948 &a[0], &a[1], &a[2], &a[3], 949 &a[4], &a[5], &a[6], &a[7], 950 &a[8], &a[9], &a[10], &a[11], 951 &a[12], &a[13], &a[14], &a[15])); 952 CHECK_EQ(a[0], 1); 953 CHECK_EQ(a[1], 2); 954 CHECK_EQ(a[2], 3); 955 CHECK_EQ(a[3], 4); 956 CHECK_EQ(a[4], 5); 957 CHECK_EQ(a[5], 6); 958 CHECK_EQ(a[6], 7); 959 CHECK_EQ(a[7], 8); 960 CHECK_EQ(a[8], 9); 961 CHECK_EQ(a[9], 0); 962 CHECK_EQ(a[10], 1); 963 CHECK_EQ(a[11], 2); 964 CHECK_EQ(a[12], 3); 965 CHECK_EQ(a[13], 4); 966 CHECK_EQ(a[14], 5); 967 CHECK_EQ(a[15], 6); 968 } 969 970 TEST(RE2, Accessors) { 971 // Check the pattern() accessor 972 { 973 const string kPattern = "http://([^/]+)/.*"; 974 const RE2 re(kPattern); 975 CHECK_EQ(kPattern, re.pattern()); 976 } 977 978 // Check RE2 error field. 979 { 980 RE2 re("foo"); 981 CHECK(re.error().empty()); // Must have no error 982 CHECK(re.ok()); 983 CHECK(re.error_code() == RE2::NoError); 984 } 985 } 986 987 TEST(RE2, UTF8) { 988 // Check UTF-8 handling 989 // Three Japanese characters (nihongo) 990 const char utf8_string[] = { 991 0xe6, 0x97, 0xa5, // 65e5 992 0xe6, 0x9c, 0xac, // 627c 993 0xe8, 0xaa, 0x9e, // 8a9e 994 0 995 }; 996 const char utf8_pattern[] = { 997 '.', 998 0xe6, 0x9c, 0xac, // 627c 999 '.', 1000 0 1001 }; 1002 1003 // Both should match in either mode, bytes or UTF-8 1004 RE2 re_test1(".........", RE2::Latin1); 1005 CHECK(RE2::FullMatch(utf8_string, re_test1)); 1006 RE2 re_test2("..."); 1007 CHECK(RE2::FullMatch(utf8_string, re_test2)); 1008 1009 // Check that '.' matches one byte or UTF-8 character 1010 // according to the mode. 1011 string s; 1012 RE2 re_test3("(.)", RE2::Latin1); 1013 CHECK(RE2::PartialMatch(utf8_string, re_test3, &s)); 1014 CHECK_EQ(s, string("\xe6")); 1015 RE2 re_test4("(.)"); 1016 CHECK(RE2::PartialMatch(utf8_string, re_test4, &s)); 1017 CHECK_EQ(s, string("\xe6\x97\xa5")); 1018 1019 // Check that string matches itself in either mode 1020 RE2 re_test5(utf8_string, RE2::Latin1); 1021 CHECK(RE2::FullMatch(utf8_string, re_test5)); 1022 RE2 re_test6(utf8_string); 1023 CHECK(RE2::FullMatch(utf8_string, re_test6)); 1024 1025 // Check that pattern matches string only in UTF8 mode 1026 RE2 re_test7(utf8_pattern, RE2::Latin1); 1027 CHECK(!RE2::FullMatch(utf8_string, re_test7)); 1028 RE2 re_test8(utf8_pattern); 1029 CHECK(RE2::FullMatch(utf8_string, re_test8)); 1030 } 1031 1032 TEST(RE2, UngreedyUTF8) { 1033 // Check that ungreedy, UTF8 regular expressions don't match when they 1034 // oughtn't -- see bug 82246. 1035 { 1036 // This code always worked. 1037 const char* pattern = "\\w+X"; 1038 const string target = "a aX"; 1039 RE2 match_sentence(pattern, RE2::Latin1); 1040 RE2 match_sentence_re(pattern); 1041 1042 CHECK(!RE2::FullMatch(target, match_sentence)); 1043 CHECK(!RE2::FullMatch(target, match_sentence_re)); 1044 } 1045 { 1046 const char* pattern = "(?U)\\w+X"; 1047 const string target = "a aX"; 1048 RE2 match_sentence(pattern, RE2::Latin1); 1049 CHECK_EQ(match_sentence.error(), ""); 1050 RE2 match_sentence_re(pattern); 1051 1052 CHECK(!RE2::FullMatch(target, match_sentence)); 1053 CHECK(!RE2::FullMatch(target, match_sentence_re)); 1054 } 1055 } 1056 1057 TEST(RE2, Rejects) { 1058 { RE2 re("a\\1", RE2::Quiet); CHECK(!re.ok()); } 1059 { 1060 RE2 re("a[x", RE2::Quiet); 1061 CHECK(!re.ok()); 1062 } 1063 { 1064 RE2 re("a[z-a]", RE2::Quiet); 1065 CHECK(!re.ok()); 1066 } 1067 { 1068 RE2 re("a[[:foobar:]]", RE2::Quiet); 1069 CHECK(!re.ok()); 1070 } 1071 { 1072 RE2 re("a(b", RE2::Quiet); 1073 CHECK(!re.ok()); 1074 } 1075 { 1076 RE2 re("a\\", RE2::Quiet); 1077 CHECK(!re.ok()); 1078 } 1079 } 1080 1081 TEST(RE2, NoCrash) { 1082 // Test that using a bad regexp doesn't crash. 1083 { 1084 RE2 re("a\\", RE2::Quiet); 1085 CHECK(!re.ok()); 1086 CHECK(!RE2::PartialMatch("a\\b", re)); 1087 } 1088 1089 // Test that using an enormous regexp doesn't crash 1090 { 1091 RE2 re("(((.{100}){100}){100}){100}", RE2::Quiet); 1092 CHECK(!re.ok()); 1093 CHECK(!RE2::PartialMatch("aaa", re)); 1094 } 1095 1096 // Test that a crazy regexp still compiles and runs. 1097 { 1098 RE2 re(".{512}x", RE2::Quiet); 1099 CHECK(re.ok()); 1100 string s; 1101 s.append(515, 'c'); 1102 s.append("x"); 1103 CHECK(RE2::PartialMatch(s, re)); 1104 } 1105 } 1106 1107 TEST(RE2, Recursion) { 1108 // Test that recursion is stopped. 1109 // This test is PCRE-legacy -- there's no recursion in RE2. 1110 int bytes = 15 * 1024; // enough to crash PCRE 1111 TestRecursion(bytes, "."); 1112 TestRecursion(bytes, "a"); 1113 TestRecursion(bytes, "a."); 1114 TestRecursion(bytes, "ab."); 1115 TestRecursion(bytes, "abc."); 1116 } 1117 1118 TEST(RE2, BigCountedRepetition) { 1119 // Test that counted repetition works, given tons of memory. 1120 RE2::Options opt; 1121 opt.set_max_mem(256<<20); 1122 1123 RE2 re(".{512}x", opt); 1124 CHECK(re.ok()); 1125 string s; 1126 s.append(515, 'c'); 1127 s.append("x"); 1128 CHECK(RE2::PartialMatch(s, re)); 1129 } 1130 1131 TEST(RE2, DeepRecursion) { 1132 // Test for deep stack recursion. This would fail with a 1133 // segmentation violation due to stack overflow before pcre was 1134 // patched. 1135 // Again, a PCRE legacy test. RE2 doesn't recurse. 1136 string comment("x*"); 1137 string a(131072, 'a'); 1138 comment += a; 1139 comment += "*x"; 1140 RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)"); 1141 CHECK(RE2::FullMatch(comment, re)); 1142 } 1143 1144 // Suggested by Josh Hyman. Failed when SearchOnePass was 1145 // not implementing case-folding. 1146 TEST(CaseInsensitive, MatchAndConsume) { 1147 string result; 1148 string text = "A fish named *Wanda*"; 1149 StringPiece sp(text); 1150 1151 EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})", &result)); 1152 EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result)); 1153 } 1154 1155 // RE2 should permit implicit conversions from string, StringPiece, const char*, 1156 // and C string literals. 1157 TEST(RE2, ImplicitConversions) { 1158 string re_string("."); 1159 StringPiece re_stringpiece("."); 1160 const char* re_cstring = "."; 1161 EXPECT_TRUE(RE2::PartialMatch("e", re_string)); 1162 EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece)); 1163 EXPECT_TRUE(RE2::PartialMatch("e", re_cstring)); 1164 EXPECT_TRUE(RE2::PartialMatch("e", ".")); 1165 } 1166 1167 // Bugs introduced by 8622304 1168 TEST(RE2, CL8622304) { 1169 // reported by ingow 1170 string dir; 1171 EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])")); // ok 1172 EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir)); // fails 1173 1174 // reported by jacobsa 1175 string key, val; 1176 EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true", 1177 "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?", 1178 &key, 1179 &val)); 1180 EXPECT_EQ(key, "bar"); 1181 EXPECT_EQ(val, "1,0x2F,030,4,5"); 1182 } 1183 1184 1185 // Check that RE2 returns correct regexp pieces on error. 1186 // In particular, make sure it returns whole runes 1187 // and that it always reports invalid UTF-8. 1188 // Also check that Perl error flag piece is big enough. 1189 static struct ErrorTest { 1190 const char *regexp; 1191 const char *error; 1192 } error_tests[] = { 1193 { "ab\\cd", "\\" }, 1194 { "ef\\x01", "\\x0" }, 1195 { "gh\\x101", "\\x1" }, 1196 { "ij\\x1", "\\x1" }, 1197 { "kl\\x", "\\x" }, 1198 { "uv\\x{0000}", "\\x{0000" }, 1199 { "wx\\p{ABC", "\\p{ABC" }, 1200 { "yz(?smiUX:abc)", "(?smiUX" }, // used to return (?s but the error is X 1201 { "aa(?smi", "(?sm" }, 1202 { "bb[abc", "[abc" }, 1203 1204 { "mn\\x1\377", "" }, // no argument string returned for invalid UTF-8 1205 { "op\377qr", "" }, 1206 { "st\\x{00000\377", "" }, 1207 { "zz\\p{\377}", "" }, 1208 { "zz\\x{00\377}", "" }, 1209 { "zz(?P<name\377>abc)", "" }, 1210 }; 1211 TEST(RE2, ErrorArgs) { 1212 for (int i = 0; i < arraysize(error_tests); i++) { 1213 RE2 re(error_tests[i].regexp, RE2::Quiet); 1214 EXPECT_FALSE(re.ok()); 1215 EXPECT_EQ(re.error_arg(), error_tests[i].error) << re.error(); 1216 } 1217 } 1218 1219 // Check that "never match \n" mode never matches \n. 1220 static struct NeverTest { 1221 const char* regexp; 1222 const char* text; 1223 const char* match; 1224 } never_tests[] = { 1225 { "(.*)", "abc\ndef\nghi\n", "abc" }, 1226 { "(?s)(abc.*def)", "abc\ndef\n", NULL }, 1227 { "(abc(.|\n)*def)", "abc\ndef\n", NULL }, 1228 { "(abc[^x]*def)", "abc\ndef\n", NULL }, 1229 { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" }, 1230 }; 1231 TEST(RE2, NeverNewline) { 1232 RE2::Options opt; 1233 opt.set_never_nl(true); 1234 for (int i = 0; i < arraysize(never_tests); i++) { 1235 const NeverTest& t = never_tests[i]; 1236 RE2 re(t.regexp, opt); 1237 if (t.match == NULL) { 1238 EXPECT_FALSE(re.PartialMatch(t.text, re)); 1239 } else { 1240 StringPiece m; 1241 EXPECT_TRUE(re.PartialMatch(t.text, re, &m)); 1242 EXPECT_EQ(m, t.match); 1243 } 1244 } 1245 } 1246 1247 // Check that there are no capturing groups in "never capture" mode. 1248 TEST(RE2, NeverCapture) { 1249 RE2::Options opt; 1250 opt.set_never_capture(true); 1251 RE2 re("(r)(e)", opt); 1252 EXPECT_EQ(0, re.NumberOfCapturingGroups()); 1253 } 1254 1255 // Bitstate bug was looking at submatch[0] even if nsubmatch == 0. 1256 // Triggered by a failed DFA search falling back to Bitstate when 1257 // using Match with a NULL submatch set. Bitstate tried to read 1258 // the submatch[0] entry even if nsubmatch was 0. 1259 TEST(RE2, BitstateCaptureBug) { 1260 RE2::Options opt; 1261 opt.set_max_mem(20000); 1262 RE2 re("(_________$)", opt); 1263 StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x"; 1264 EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0)); 1265 } 1266 1267 // C++ version of bug 609710. 1268 TEST(RE2, UnicodeClasses) { 1269 const string str = "ABCDEFGHI"; 1270 string a, b, c; 1271 1272 EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}")); 1273 EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}")); 1274 EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}")); 1275 EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}")); 1276 EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}")); 1277 EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}")); 1278 1279 EXPECT_TRUE(RE2::FullMatch("", "\\p{L}")); 1280 EXPECT_FALSE(RE2::FullMatch("", "\\p{Lu}")); 1281 EXPECT_FALSE(RE2::FullMatch("", "\\p{Ll}")); 1282 EXPECT_FALSE(RE2::FullMatch("", "\\P{L}")); 1283 EXPECT_TRUE(RE2::FullMatch("", "\\P{Lu}")); 1284 EXPECT_TRUE(RE2::FullMatch("", "\\P{Ll}")); 1285 1286 EXPECT_TRUE(RE2::FullMatch("", "\\p{L}")); 1287 EXPECT_FALSE(RE2::FullMatch("", "\\p{Lu}")); 1288 EXPECT_FALSE(RE2::FullMatch("", "\\p{Ll}")); 1289 EXPECT_FALSE(RE2::FullMatch("", "\\P{L}")); 1290 EXPECT_TRUE(RE2::FullMatch("", "\\P{Lu}")); 1291 EXPECT_TRUE(RE2::FullMatch("", "\\P{Ll}")); 1292 1293 EXPECT_TRUE(RE2::FullMatch("", "\\p{L}")); 1294 EXPECT_FALSE(RE2::FullMatch("", "\\p{Lu}")); 1295 EXPECT_FALSE(RE2::FullMatch("", "\\p{Ll}")); 1296 EXPECT_FALSE(RE2::FullMatch("", "\\P{L}")); 1297 EXPECT_TRUE(RE2::FullMatch("", "\\P{Lu}")); 1298 EXPECT_TRUE(RE2::FullMatch("", "\\P{Ll}")); 1299 1300 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c)); 1301 EXPECT_EQ("A", a); 1302 EXPECT_EQ("B", b); 1303 EXPECT_EQ("C", c); 1304 1305 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c)); 1306 EXPECT_EQ("A", a); 1307 EXPECT_EQ("B", b); 1308 EXPECT_EQ("C", c); 1309 1310 EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}")); 1311 1312 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c)); 1313 EXPECT_EQ("A", a); 1314 EXPECT_EQ("B", b); 1315 EXPECT_EQ("C", c); 1316 1317 EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]")); 1318 1319 EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c)); 1320 EXPECT_EQ("", a); 1321 EXPECT_EQ("", b); 1322 EXPECT_EQ("", c); 1323 } 1324 1325 // Bug reported by saito. 2009/02/17 1326 TEST(RE2, NullVsEmptyString) { 1327 RE2 re2(".*"); 1328 StringPiece v1(""); 1329 EXPECT_TRUE(RE2::FullMatch(v1, re2)); 1330 1331 StringPiece v2; 1332 EXPECT_TRUE(RE2::FullMatch(v2, re2)); 1333 } 1334 1335 // Issue 1816809 1336 TEST(RE2, Bug1816809) { 1337 RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))"); 1338 StringPiece piece("llx-3;llx4"); 1339 string x; 1340 EXPECT_TRUE(RE2::Consume(&piece, re, &x)); 1341 } 1342 1343 // Issue 3061120 1344 TEST(RE2, Bug3061120) { 1345 RE2 re("(?i)\\W"); 1346 EXPECT_FALSE(RE2::PartialMatch("x", re)); // always worked 1347 EXPECT_FALSE(RE2::PartialMatch("k", re)); // broke because of kelvin 1348 EXPECT_FALSE(RE2::PartialMatch("s", re)); // broke because of latin long s 1349 } 1350 1351 TEST(RE2, CapturingGroupNames) { 1352 // Opening parentheses annotated with group IDs: 1353 // 12 3 45 6 7 1354 RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))"); 1355 EXPECT_TRUE(re.ok()); 1356 const map<int, string>& have = re.CapturingGroupNames(); 1357 map<int, string> want; 1358 want[3] = "G2"; 1359 want[6] = "G2"; 1360 want[7] = "G1"; 1361 EXPECT_EQ(want, have); 1362 } 1363 1364 TEST(RE2, RegexpToStringLossOfAnchor) { 1365 EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at"); 1366 EXPECT_EQ(RE2("^[a-c]at").Regexp()->ToString(), "(?-m:^)[a-c]at"); 1367 EXPECT_EQ(RE2("ca[t-z]$", RE2::POSIX).Regexp()->ToString(), "ca[t-z]$"); 1368 EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)"); 1369 } 1370 1371 } // namespace re2 1372