1 // Copyright 2006 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Test simplify.cc. 6 7 #include <string> 8 #include <vector> 9 #include "util/test.h" 10 #include "re2/regexp.h" 11 12 namespace re2 { 13 14 struct Test { 15 const char* regexp; 16 const char* simplified; 17 }; 18 19 static Test tests[] = { 20 // Already-simple constructs 21 { "a", "a" }, 22 { "ab", "ab" }, 23 { "a|b", "[a-b]" }, 24 { "ab|cd", "ab|cd" }, 25 { "(ab)*", "(ab)*" }, 26 { "(ab)+", "(ab)+" }, 27 { "(ab)?", "(ab)?" }, 28 { ".", "." }, 29 { "^", "^" }, 30 { "$", "$" }, 31 { "[ac]", "[ac]" }, 32 { "[^ac]", "[^ac]" }, 33 34 // Posix character classes 35 { "[[:alnum:]]", "[0-9A-Za-z]" }, 36 { "[[:alpha:]]", "[A-Za-z]" }, 37 { "[[:blank:]]", "[\\t ]" }, 38 { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" }, 39 { "[[:digit:]]", "[0-9]" }, 40 { "[[:graph:]]", "[!-~]" }, 41 { "[[:lower:]]", "[a-z]" }, 42 { "[[:print:]]", "[ -~]" }, 43 { "[[:punct:]]", "[!-/:-@\\[-`{-~]" }, 44 { "[[:space:]]" , "[\\t-\\r ]" }, 45 { "[[:upper:]]", "[A-Z]" }, 46 { "[[:xdigit:]]", "[0-9A-Fa-f]" }, 47 48 // Perl character classes 49 { "\\d", "[0-9]" }, 50 { "\\s", "[\\t-\\n\\f-\\r ]" }, 51 { "\\w", "[0-9A-Z_a-z]" }, 52 { "\\D", "[^0-9]" }, 53 { "\\S", "[^\\t-\\n\\f-\\r ]" }, 54 { "\\W", "[^0-9A-Z_a-z]" }, 55 { "[\\d]", "[0-9]" }, 56 { "[\\s]", "[\\t-\\n\\f-\\r ]" }, 57 { "[\\w]", "[0-9A-Z_a-z]" }, 58 { "[\\D]", "[^0-9]" }, 59 { "[\\S]", "[^\\t-\\n\\f-\\r ]" }, 60 { "[\\W]", "[^0-9A-Z_a-z]" }, 61 62 // Posix repetitions 63 { "a{1}", "a" }, 64 { "a{2}", "aa" }, 65 { "a{5}", "aaaaa" }, 66 { "a{0,1}", "a?" }, 67 // The next three are illegible because Simplify inserts (?:) 68 // parens instead of () parens to avoid creating extra 69 // captured subexpressions. The comments show a version fewer parens. 70 { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)? 71 { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)? 72 { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)? 73 { "a{0,2}", "(?:aa?)?" }, // (aa?)? 74 { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)? 75 { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)? 76 { "a{0,}", "a*" }, 77 { "a{1,}", "a+" }, 78 { "a{2,}", "aa+" }, 79 { "a{5,}", "aaaaa+" }, 80 81 // Test that operators simplify their arguments. 82 // (Simplify used to not simplify arguments to a {} repeat.) 83 { "(?:a{1,}){1,}", "a+" }, 84 { "(a{1,}b{1,})", "(a+b+)" }, 85 { "a{1,}|b{1,}", "a+|b+" }, 86 { "(?:a{1,})*", "(?:a+)*" }, 87 { "(?:a{1,})+", "a+" }, 88 { "(?:a{1,})?", "(?:a+)?" }, 89 { "a{0}", "" }, 90 91 // Character class simplification 92 { "[ab]", "[a-b]" }, 93 { "[a-za-za-z]", "[a-z]" }, 94 { "[A-Za-zA-Za-z]", "[A-Za-z]" }, 95 { "[ABCDEFGH]", "[A-H]" }, 96 { "[AB-CD-EF-GH]", "[A-H]" }, 97 { "[W-ZP-XE-R]", "[E-Z]" }, 98 { "[a-ee-gg-m]", "[a-m]" }, 99 { "[a-ea-ha-m]", "[a-m]" }, 100 { "[a-ma-ha-e]", "[a-m]" }, 101 { "[a-zA-Z0-9 -~]", "[ -~]" }, 102 103 // Empty character classes 104 { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" }, 105 106 // Full character classes 107 { "[[:cntrl:][:^cntrl:]]", "." }, 108 109 // Unicode case folding. 110 { "(?i)A", "[Aa]" }, 111 { "(?i)a", "[Aa]" }, 112 { "(?i)K", "[Kk\\x{212a}]" }, 113 { "(?i)k", "[Kk\\x{212a}]" }, 114 { "(?i)\\x{212a}", "[Kk\\x{212a}]" }, 115 { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" }, 116 { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" }, 117 { "(?i)[\\x00-\\x{10ffff}]", "." }, 118 119 // Empty string as a regular expression. 120 // Empty string must be preserved inside parens in order 121 // to make submatches work right, so these are less 122 // interesting than they used to be. ToString inserts 123 // explicit (?:) in place of non-parenthesized empty strings, 124 // to make them easier to spot for other parsers. 125 { "(a|b|)", "([a-b]|(?:))" }, 126 { "(|)", "()" }, 127 { "a()", "a()" }, 128 { "(()|())", "(()|())" }, 129 { "(a|)", "(a|(?:))" }, 130 { "ab()cd()", "ab()cd()" }, 131 { "()", "()" }, 132 { "()*", "()*" }, 133 { "()+", "()+" }, 134 { "()?" , "()?" }, 135 { "(){0}", "" }, 136 { "(){1}", "()" }, 137 { "(){1,}", "()+" }, 138 { "(){0,2}", "(?:()()?)?" }, 139 }; 140 141 TEST(TestSimplify, SimpleRegexps) { 142 for (int i = 0; i < arraysize(tests); i++) { 143 RegexpStatus status; 144 VLOG(1) << "Testing " << tests[i].regexp; 145 Regexp* re = Regexp::Parse(tests[i].regexp, 146 Regexp::MatchNL | (Regexp::LikePerl & 147 ~Regexp::OneLine), 148 &status); 149 CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text(); 150 Regexp* sre = re->Simplify(); 151 CHECK(sre != NULL); 152 153 // Check that already-simple regexps don't allocate new ones. 154 if (strcmp(tests[i].regexp, tests[i].simplified) == 0) { 155 CHECK(re == sre) << " " << tests[i].regexp 156 << " " << re->ToString() << " " << sre->ToString(); 157 } 158 159 EXPECT_EQ(tests[i].simplified, sre->ToString()) 160 << " " << tests[i].regexp << " " << sre->Dump(); 161 162 re->Decref(); 163 sre->Decref(); 164 } 165 } 166 167 } // namespace re2 168