re2/testing/exhaustive2_test.cc

// Copyright 2008 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Exhaustive testing of regular expression matching.

#include <stddef.h>

#include "util/test.h"
#include "re2/re2.h"
#include "re2/testing/exhaustive_tester.h"

DECLARE_string(regexp_engines);

namespace re2 {

// Exhaustively exercise matching of the empty regexp, spelled "(?:)",
// alongside the literal "a".
TEST(EmptyString, Exhaustive) {
  // Regexp pieces handed to the tester: the empty group and one literal.
  const vector<string> regexp_atoms = Split(" ", "(?:) a");
  // Alphabet argument, produced by splitting "ab" on the empty separator.
  const vector<string> alphabet = Split("", "ab");

  ExhaustiveTest(2, 2, regexp_atoms, RegexpGenerator::EgrepOps(),
                 5, alphabet, "", "");
}

// Test escaped versions of regexp syntax.
// Every entry that Explode() produces from the punctuation string is
// prefixed with a backslash to form the regexp pieces; the unescaped
// entries are passed alongside them as the alphabet argument.
TEST(Punctuation, Literals) {
  vector<string> alphabet = Explode("()*+?{}[]\\^$.");
  vector<string> escaped = alphabet;
  // Index with size_t: size() is unsigned, so an int index would make
  // the loop condition a signed/unsigned comparison.
  for (size_t i = 0; i < escaped.size(); i++)
    escaped[i] = "\\" + escaped[i];
  ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
                 2, alphabet, "", "");
}

// Exercise ^ $ . \A \z when the alphabet contains a line ending.
// The empty-width operators are wrapped in (?:) so that repetition
// can be applied to them -- PCRE rejects ^* but accepts (?:^)*
TEST(LineEnds, Exhaustive) {
  // Regexp pieces: wrapped empty-width operators, dot, "a" and "\n".
  const vector<string> regexp_atoms =
      Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)");
  // Alphabet argument; note that it includes a real newline character.
  const vector<string> alphabet = Explode("ab\n");

  ExhaustiveTest(2, 2, regexp_atoms, RegexpGenerator::EgrepOps(),
                 4, alphabet, "", "");
}

// Test what does and does not match \n.
// This would be a good test, except that PCRE seems to have a bug:
// in single-byte character set mode (the default),
// [^a] matches \n, but in UTF-8 mode it does not.
// So when we run the test, the tester complains that
// we don't agree with PCRE, but it's PCRE that is at fault.
// For what it's worth, Perl gets this right (matches
// regardless of whether UTF-8 input is selected):
//
//     #!/usr/bin/perl
//     use POSIX qw(locale_h);
//     print "matches in latin1\n" if "\n" =~ /[^a]/;
//     setlocale("en_US.utf8");
//     print "matches in utf8\n" if "\n" =~ /[^a]/;
//
// The rule chosen for RE2 is that by default, like Perl,
// dot does not match \n but negated character classes [^a] do.
// (?s) will allow dot to match \n; there is no way in RE2
// to stop [^a] from matching \n, though the underlying library
// provides a mechanism, and RE2 could add new syntax if needed.
//
// TEST(Newlines, Exhaustive) {
//   vector<string> empty_vector;
//   ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
//                  RegexpGenerator::EgrepOps(),
//                  4, Explode("a\n"), "");
// }

}  // namespace re2