Home | History | Annotate | Download | only in testing
      1 // Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include <vector>
      6 #include "util/test.h"
      7 #include "re2/prog.h"
      8 #include "re2/re2.h"
      9 #include "re2/regexp.h"
     10 #include "re2/testing/regexp_generator.h"
     11 #include "re2/testing/string_generator.h"
     12 
     13 namespace re2 {
     14 
     15 // Test that C++ strings are compared as uint8s, not int8s.
     16 // PossibleMatchRange doesn't depend on this, but callers probably will.
     17 TEST(CplusplusStrings, EightBit) {
     18   string s = "\x70";
     19   string t = "\xA0";
     20   EXPECT_LT(s, t);
     21 }
     22 
     23 struct PrefixTest {
     24   const char* regexp;
     25   int maxlen;
     26   const char* min;
     27   const char* max;
     28 };
     29 
     30 static PrefixTest tests[] = {
     31   { "",                  10,  "",           "",        },
     32   { "Abcdef",            10,  "Abcdef",     "Abcdef"   },
     33   { "abc(def|ghi)",      10,  "abcdef",     "abcghi"   },
     34   { "a+hello",           10,  "aa",         "ahello"   },
     35   { "a*hello",           10,  "a",          "hello"    },
     36   { "def|abc",           10,  "abc",        "def"      },
     37   { "a(b)(c)[d]",        10,  "abcd",       "abcd"     },
     38   { "ab(cab|cat)",       10,  "abcab",      "abcat"    },
     39   { "ab(cab|ca)x",       10,  "abcabx",     "abcax"    },
     40   { "(ab|x)(c|de)",      10,  "abc",        "xde"      },
     41   { "(ab|x)?(c|z)?",     10,  "",           "z"        },
     42   { "[^\\s\\S]",         10,  "",           ""         },
     43   { "(abc)+",             5,  "abc",        "abcac"    },
     44   { "(abc)+",             2,  "ab",         "ac"       },
     45   { "(abc)+",             1,  "a",          "b"        },
     46   { "[a\xC3\xA1]",        4,  "a",          "\xC3\xA1" },
     47   { "a*",                10,  "",           "ab"       },
     48 
     49   { "(?i)Abcdef",        10,  "ABCDEF",     "abcdef"   },
     50   { "(?i)abc(def|ghi)",  10,  "ABCDEF",     "abcghi"   },
     51   { "(?i)a+hello",       10,  "AA",         "ahello"   },
     52   { "(?i)a*hello",       10,  "A",          "hello"    },
     53   { "(?i)def|abc",       10,  "ABC",        "def"      },
     54   { "(?i)a(b)(c)[d]",    10,  "ABCD",       "abcd"     },
     55   { "(?i)ab(cab|cat)",   10,  "ABCAB",      "abcat"    },
     56   { "(?i)ab(cab|ca)x",   10,  "ABCABX",     "abcax"    },
     57   { "(?i)(ab|x)(c|de)",  10,  "ABC",        "xde"      },
     58   { "(?i)(ab|x)?(c|z)?", 10,  "",           "z"        },
     59   { "(?i)[^\\s\\S]",     10,  "",           ""         },
     60   { "(?i)(abc)+",         5,  "ABC",        "abcac"    },
     61   { "(?i)(abc)+",         2,  "AB",         "ac"       },
     62   { "(?i)(abc)+",         1,  "A",          "b"        },
     63   { "(?i)[a\xC3\xA1]",    4,  "A",          "\xC3\xA1" },
     64   { "(?i)a*",            10,  "",           "ab"       },
     65   { "(?i)A*",            10,  "",           "ab"       },
     66 
     67   { "\\AAbcdef",         10,  "Abcdef",     "Abcdef"   },
     68   { "\\Aabc(def|ghi)",   10,  "abcdef",     "abcghi"   },
     69   { "\\Aa+hello",        10,  "aa",         "ahello"   },
     70   { "\\Aa*hello",        10,  "a",          "hello"    },
     71   { "\\Adef|abc",        10,  "abc",        "def"      },
     72   { "\\Aa(b)(c)[d]",     10,  "abcd",       "abcd"     },
     73   { "\\Aab(cab|cat)",    10,  "abcab",      "abcat"    },
     74   { "\\Aab(cab|ca)x",    10,  "abcabx",     "abcax"    },
     75   { "\\A(ab|x)(c|de)",   10,  "abc",        "xde"      },
     76   { "\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
     77   { "\\A[^\\s\\S]",      10,  "",           ""         },
     78   { "\\A(abc)+",          5,  "abc",        "abcac"    },
     79   { "\\A(abc)+",          2,  "ab",         "ac"       },
     80   { "\\A(abc)+",          1,  "a",          "b"        },
     81   { "\\A[a\xC3\xA1]",     4,  "a",          "\xC3\xA1" },
     82   { "\\Aa*",             10,  "",           "ab"       },
     83 
     84   { "(?i)\\AAbcdef",         10,  "ABCDEF",     "abcdef"   },
     85   { "(?i)\\Aabc(def|ghi)",   10,  "ABCDEF",     "abcghi"   },
     86   { "(?i)\\Aa+hello",        10,  "AA",         "ahello"   },
     87   { "(?i)\\Aa*hello",        10,  "A",          "hello"    },
     88   { "(?i)\\Adef|abc",        10,  "ABC",        "def"      },
     89   { "(?i)\\Aa(b)(c)[d]",     10,  "ABCD",       "abcd"     },
     90   { "(?i)\\Aab(cab|cat)",    10,  "ABCAB",      "abcat"    },
     91   { "(?i)\\Aab(cab|ca)x",    10,  "ABCABX",     "abcax"    },
     92   { "(?i)\\A(ab|x)(c|de)",   10,  "ABC",        "xde"      },
     93   { "(?i)\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
     94   { "(?i)\\A[^\\s\\S]",      10,  "",           ""         },
     95   { "(?i)\\A(abc)+",          5,  "ABC",        "abcac"    },
     96   { "(?i)\\A(abc)+",          2,  "AB",         "ac"       },
     97   { "(?i)\\A(abc)+",          1,  "A",          "b"        },
     98   { "(?i)\\A[a\xC3\xA1]",     4,  "A",          "\xC3\xA1" },
     99   { "(?i)\\Aa*",             10,  "",           "ab"       },
    100   { "(?i)\\AA*",             10,  "",           "ab"       },
    101 };
    102 
    103 TEST(PossibleMatchRange, HandWritten) {
    104   for (int i = 0; i < arraysize(tests); i++) {
    105     for (int j = 0; j < 2; j++) {
    106       const PrefixTest& t = tests[i];
    107       string min, max;
    108       if (j == 0) {
    109         LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
    110         Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
    111         CHECK(re);
    112         Prog* prog = re->CompileToProg(0);
    113         CHECK(prog);
    114         CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen))
    115           << " " << t.regexp;
    116         delete prog;
    117         re->Decref();
    118       } else {
    119         CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
    120       }
    121       EXPECT_EQ(t.min, min) << t.regexp;
    122       EXPECT_EQ(t.max, max) << t.regexp;
    123     }
    124   }
    125 }
    126 
    127 // Test cases where PossibleMatchRange should return false.
    128 TEST(PossibleMatchRange, Failures) {
    129   string min, max;
    130 
    131   // Fails because no room to write max.
    132   EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
    133 
    134   // Fails because there is no max -- any non-empty string matches
    135   // or begins a match.  Have to use Latin-1 input, because there
    136   // are no valid UTF-8 strings beginning with byte 0xFF.
    137   EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
    138                PossibleMatchRange(&min, &max, 10))
    139     << "min=" << CEscape(min) << ", max=" << CEscape(max);
    140   EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
    141                PossibleMatchRange(&min, &max, 10))
    142     << "min=" << CEscape(min) << ", max=" << CEscape(max);
    143   EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
    144                PossibleMatchRange(&min, &max, 10))
    145     << "min=" << CEscape(min) << ", max=" << CEscape(max);
    146   EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
    147                PossibleMatchRange(&min, &max, 10))
    148     << "min=" << CEscape(min) << ", max=" << CEscape(max);
    149   EXPECT_FALSE(RE2(".*", RE2::Latin1).
    150                PossibleMatchRange(&min, &max, 10))
    151     << "min=" << CEscape(min) << ", max=" << CEscape(max);
    152   EXPECT_FALSE(RE2("\\C*").
    153                PossibleMatchRange(&min, &max, 10))
    154     << "min=" << CEscape(min) << ", max=" << CEscape(max);
    155 
    156   // Fails because it's a malformed regexp.
    157   EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
    158     << "min=" << CEscape(min) << ", max=" << CEscape(max);
    159 }
    160 
    161 // Exhaustive test: generate all regexps within parameters,
    162 // then generate all strings of a given length over a given alphabet,
    163 // then check that the prefix information agrees with whether
    164 // the regexp matches each of the strings.
    165 class PossibleMatchTester : public RegexpGenerator {
    166  public:
    167   PossibleMatchTester(int maxatoms,
    168                       int maxops,
    169                       const vector<string>& alphabet,
    170                       const vector<string>& ops,
    171                       int maxstrlen,
    172                       const vector<string>& stralphabet)
    173     : RegexpGenerator(maxatoms, maxops, alphabet, ops),
    174       strgen_(maxstrlen, stralphabet),
    175       regexps_(0), tests_(0) { }
    176 
    177   int regexps()  { return regexps_; }
    178   int tests()    { return tests_; }
    179 
    180   // Needed for RegexpGenerator interface.
    181   void HandleRegexp(const string& regexp);
    182 
    183  private:
    184   StringGenerator strgen_;
    185 
    186   int regexps_;   // Number of HandleRegexp calls
    187   int tests_;     // Number of regexp tests.
    188 
    189   DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester);
    190 };
    191 
    192 // Processes a single generated regexp.
    193 // Checks that all accepted strings agree with the prefix range.
    194 void PossibleMatchTester::HandleRegexp(const string& regexp) {
    195   regexps_++;
    196 
    197   VLOG(3) << CEscape(regexp);
    198 
    199   RE2 re(regexp, RE2::Latin1);
    200   CHECK_EQ(re.error(), "");
    201 
    202   string min, max;
    203   if(!re.PossibleMatchRange(&min, &max, 10)) {
    204     // There's no good max for "\\C*".  Can't use strcmp
    205     // because sometimes it gets embedded in more
    206     // complicated expressions.
    207     if(strstr(regexp.c_str(), "\\C*"))
    208       return;
    209     LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
    210   }
    211 
    212   strgen_.Reset();
    213   while (strgen_.HasNext()) {
    214     const StringPiece& s = strgen_.Next();
    215     tests_++;
    216     if (!RE2::FullMatch(s, re))
    217       continue;
    218     CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max;
    219     CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min;
    220   }
    221 }
    222 
    223 TEST(PossibleMatchRange, Exhaustive) {
    224   int natom = 3;
    225   int noperator = 3;
    226   int stringlen = 5;
    227   if (DEBUG_MODE) {
    228     natom = 2;
    229     noperator = 3;
    230     stringlen = 3;
    231   }
    232   PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
    233                  RegexpGenerator::EgrepOps(),
    234                  stringlen, Explode("ab4"));
    235   t.Generate();
    236   LOG(INFO) << t.regexps() << " regexps, "
    237             << t.tests() << " tests";
    238 }
    239 
    240 }  // namespace re2
    241