Home | History | Annotate | Download | only in testing
      1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Test simplify.cc.
      6 
      7 #include <string>
      8 #include <vector>
      9 #include "util/test.h"
     10 #include "re2/regexp.h"
     11 
     12 namespace re2 {
     13 
     14 struct Test {
     15   const char* regexp;
     16   const char* simplified;
     17 };
     18 
     19 static Test tests[] = {
     20   // Already-simple constructs
     21   { "a", "a" },
     22   { "ab", "ab" },
     23   { "a|b", "[a-b]" },
     24   { "ab|cd", "ab|cd" },
     25   { "(ab)*", "(ab)*" },
     26   { "(ab)+", "(ab)+" },
     27   { "(ab)?", "(ab)?" },
     28   { ".", "." },
     29   { "^", "^" },
     30   { "$", "$" },
     31   { "[ac]", "[ac]" },
     32   { "[^ac]", "[^ac]" },
     33 
     34   // Posix character classes
     35   { "[[:alnum:]]", "[0-9A-Za-z]" },
     36   { "[[:alpha:]]", "[A-Za-z]" },
     37   { "[[:blank:]]", "[\\t ]" },
     38   { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
     39   { "[[:digit:]]", "[0-9]" },
     40   { "[[:graph:]]", "[!-~]" },
     41   { "[[:lower:]]", "[a-z]" },
     42   { "[[:print:]]", "[ -~]" },
     43   { "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
     44   { "[[:space:]]" , "[\\t-\\r ]" },
     45   { "[[:upper:]]", "[A-Z]" },
     46   { "[[:xdigit:]]", "[0-9A-Fa-f]" },
     47 
     48   // Perl character classes
     49   { "\\d", "[0-9]" },
     50   { "\\s", "[\\t-\\n\\f-\\r ]" },
     51   { "\\w", "[0-9A-Z_a-z]" },
     52   { "\\D", "[^0-9]" },
     53   { "\\S", "[^\\t-\\n\\f-\\r ]" },
     54   { "\\W", "[^0-9A-Z_a-z]" },
     55   { "[\\d]", "[0-9]" },
     56   { "[\\s]", "[\\t-\\n\\f-\\r ]" },
     57   { "[\\w]", "[0-9A-Z_a-z]" },
     58   { "[\\D]", "[^0-9]" },
     59   { "[\\S]", "[^\\t-\\n\\f-\\r ]" },
     60   { "[\\W]", "[^0-9A-Z_a-z]" },
     61 
     62   // Posix repetitions
     63   { "a{1}", "a" },
     64   { "a{2}", "aa" },
     65   { "a{5}", "aaaaa" },
     66   { "a{0,1}", "a?" },
     67   // The next three are illegible because Simplify inserts (?:)
     68   // parens instead of () parens to avoid creating extra
     69   // captured subexpressions.  The comments show a version fewer parens.
     70   { "(a){0,2}",                   "(?:(a)(a)?)?"     },  //       (aa?)?
     71   { "(a){0,4}",       "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" },  //   (a(a(aa?)?)?)?
     72   { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" },  // aa(a(a(aa?)?)?)?
     73   { "a{0,2}",           "(?:aa?)?"     },  //       (aa?)?
     74   { "a{0,4}",   "(?:a(?:a(?:aa?)?)?)?" },  //   (a(a(aa?)?)?)?
     75   { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" },  // aa(a(a(aa?)?)?)?
     76   { "a{0,}", "a*" },
     77   { "a{1,}", "a+" },
     78   { "a{2,}", "aa+" },
     79   { "a{5,}", "aaaaa+" },
     80 
     81   // Test that operators simplify their arguments.
     82   // (Simplify used to not simplify arguments to a {} repeat.)
     83   { "(?:a{1,}){1,}", "a+" },
     84   { "(a{1,}b{1,})", "(a+b+)" },
     85   { "a{1,}|b{1,}", "a+|b+" },
     86   { "(?:a{1,})*", "(?:a+)*" },
     87   { "(?:a{1,})+", "a+" },
     88   { "(?:a{1,})?", "(?:a+)?" },
     89   { "a{0}", "" },
     90 
     91   // Character class simplification
     92   { "[ab]", "[a-b]" },
     93   { "[a-za-za-z]", "[a-z]" },
     94   { "[A-Za-zA-Za-z]", "[A-Za-z]" },
     95   { "[ABCDEFGH]", "[A-H]" },
     96   { "[AB-CD-EF-GH]", "[A-H]" },
     97   { "[W-ZP-XE-R]", "[E-Z]" },
     98   { "[a-ee-gg-m]", "[a-m]" },
     99   { "[a-ea-ha-m]", "[a-m]" },
    100   { "[a-ma-ha-e]", "[a-m]" },
    101   { "[a-zA-Z0-9 -~]", "[ -~]" },
    102 
    103   // Empty character classes
    104   { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
    105 
    106   // Full character classes
    107   { "[[:cntrl:][:^cntrl:]]", "." },
    108 
    109   // Unicode case folding.
    110   { "(?i)A", "[Aa]" },
    111   { "(?i)a", "[Aa]" },
    112   { "(?i)K", "[Kk\\x{212a}]" },
    113   { "(?i)k", "[Kk\\x{212a}]" },
    114   { "(?i)\\x{212a}", "[Kk\\x{212a}]" },
    115   { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
    116   { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
    117   { "(?i)[\\x00-\\x{10ffff}]", "." },
    118 
    119   // Empty string as a regular expression.
    120   // Empty string must be preserved inside parens in order
    121   // to make submatches work right, so these are less
    122   // interesting than they used to be.  ToString inserts
    123   // explicit (?:) in place of non-parenthesized empty strings,
    124   // to make them easier to spot for other parsers.
    125   { "(a|b|)", "([a-b]|(?:))" },
    126   { "(|)", "()" },
    127   { "a()", "a()" },
    128   { "(()|())", "(()|())" },
    129   { "(a|)", "(a|(?:))" },
    130   { "ab()cd()", "ab()cd()" },
    131   { "()", "()" },
    132   { "()*", "()*" },
    133   { "()+", "()+" },
    134   { "()?" , "()?" },
    135   { "(){0}", "" },
    136   { "(){1}", "()" },
    137   { "(){1,}", "()+" },
    138   { "(){0,2}", "(?:()()?)?" },
    139 };
    140 
    141 TEST(TestSimplify, SimpleRegexps) {
    142   for (int i = 0; i < arraysize(tests); i++) {
    143     RegexpStatus status;
    144     VLOG(1) << "Testing " << tests[i].regexp;
    145     Regexp* re = Regexp::Parse(tests[i].regexp,
    146                                Regexp::MatchNL | (Regexp::LikePerl &
    147                                                   ~Regexp::OneLine),
    148                                &status);
    149     CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
    150     Regexp* sre = re->Simplify();
    151     CHECK(sre != NULL);
    152 
    153     // Check that already-simple regexps don't allocate new ones.
    154     if (strcmp(tests[i].regexp, tests[i].simplified) == 0) {
    155       CHECK(re == sre) << " " << tests[i].regexp
    156         << " " << re->ToString() << " " << sre->ToString();
    157     }
    158 
    159     EXPECT_EQ(tests[i].simplified, sre->ToString())
    160       << " " << tests[i].regexp << " " << sre->Dump();
    161 
    162     re->Decref();
    163     sre->Decref();
    164   }
    165 }
    166 
    167 }  // namespace re2
    168