Home | History | Annotate | Download | only in testing
      1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Exhaustive testing of regular expression matching.
      6 
      7 #include "util/test.h"
      8 #include "re2/re2.h"
      9 #include "re2/testing/exhaustive_tester.h"
     10 
     11 DECLARE_string(regexp_engines);
     12 
     13 namespace re2 {
     14 
     15 // Test empty string matches (aka "(?:)")
     16 TEST(EmptyString, Exhaustive) {
     17   ExhaustiveTest(2, 2, Split(" ", "(?:) a"),
     18                  RegexpGenerator::EgrepOps(),
     19                  5, Split("", "ab"), "", "");
     20 }
     21 
     22 // Test escaped versions of regexp syntax.
     23 TEST(Punctuation, Literals) {
     24   vector<string> alphabet = Explode("()*+?{}[]\\^$.");
     25   vector<string> escaped = alphabet;
     26   for (int i = 0; i < escaped.size(); i++)
     27     escaped[i] = "\\" + escaped[i];
     28   ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
     29                  2, alphabet, "", "");
     30 }
     31 
     32 // Test ^ $ . \A \z in presence of line endings.
     33 // Have to wrap the empty-width ones in (?:) so that
     34 // they can be repeated -- PCRE rejects ^* but allows (?:^)*
     35 TEST(LineEnds, Exhaustive) {
     36   ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"),
     37                  RegexpGenerator::EgrepOps(),
     38                  4, Explode("ab\n"), "", "");
     39 }
     40 
     41 // Test what does and does not match \n.
     42 // This would be a good test, except that PCRE seems to have a bug:
     43 // in single-byte character set mode (the default),
     44 // [^a] matches \n, but in UTF-8 mode it does not.
     45 // So when we run the test, the tester complains that
     46 // we don't agree with PCRE, but it's PCRE that is at fault.
     47 // For what it's worth, Perl gets this right (matches
     48 // regardless of whether UTF-8 input is selected):
     49 //
     50 //     #!/usr/bin/perl
     51 //     use POSIX qw(locale_h);
     52 //     print "matches in latin1\n" if "\n" =~ /[^a]/;
     53 //     setlocale(LC_ALL, "en_US.utf8");
     54 //     print "matches in utf8\n" if "\n" =~ /[^a]/;
     55 //
     56 // The rule chosen for RE2 is that by default, like Perl,
     57 // dot does not match \n but negated character classes [^a] do.
     58 // (?s) will allow dot to match \n; there is no way in RE2
     59 // to stop [^a] from matching \n, though the underlying library
     60 // provides a mechanism, and RE2 could add new syntax if needed.
     61 //
     62 // TEST(Newlines, Exhaustive) {
     63 //   vector<string> empty_vector;
     64 //   ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
     65 //                  RegexpGenerator::EgrepOps(),
     66 //                  4, Explode("a\n"), "");
     67 // }
     68 
     69 }  // namespace re2
     70 
     71