1 // Copyright 2007 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Test prog.cc, compile.cc 6 7 #include <string> 8 #include <vector> 9 #include "util/test.h" 10 #include "re2/regexp.h" 11 #include "re2/prog.h" 12 13 DEFINE_string(show, "", "regular expression to compile and dump"); 14 15 namespace re2 { 16 17 // Simple input/output tests checking that 18 // the regexp compiles to the expected code. 19 // These are just to sanity check the basic implementation. 20 // The real confidence tests happen by testing the NFA/DFA 21 // that run the compiled code. 22 23 struct Test { 24 const char* regexp; 25 const char* code; 26 }; 27 28 static Test tests[] = { 29 { "a", 30 "1. byte [61-61] -> 2\n" 31 "2. match! 0\n" }, 32 { "ab", 33 "1. byte [61-61] -> 2\n" 34 "2. byte [62-62] -> 3\n" 35 "3. match! 0\n" }, 36 { "a|c", 37 "3. alt -> 1 | 2\n" 38 "1. byte [61-61] -> 4\n" 39 "2. byte [63-63] -> 4\n" 40 "4. match! 0\n" }, 41 { "a|b", 42 "1. byte [61-62] -> 2\n" 43 "2. match! 0\n" }, 44 { "[ab]", 45 "1. byte [61-62] -> 2\n" 46 "2. match! 0\n" }, 47 { "a+", 48 "1. byte [61-61] -> 2\n" 49 "2. alt -> 1 | 3\n" 50 "3. match! 0\n" }, 51 { "a+?", 52 "1. byte [61-61] -> 2\n" 53 "2. alt -> 3 | 1\n" 54 "3. match! 0\n" }, 55 { "a*", 56 "2. alt -> 1 | 3\n" 57 "1. byte [61-61] -> 2\n" 58 "3. match! 0\n" }, 59 { "a*?", 60 "2. alt -> 3 | 1\n" 61 "3. match! 0\n" 62 "1. byte [61-61] -> 2\n" }, 63 { "a?", 64 "2. alt -> 1 | 3\n" 65 "1. byte [61-61] -> 3\n" 66 "3. match! 0\n" }, 67 { "a??", 68 "2. alt -> 3 | 1\n" 69 "3. match! 0\n" 70 "1. byte [61-61] -> 3\n" }, 71 { "a{4}", 72 "1. byte [61-61] -> 2\n" 73 "2. byte [61-61] -> 3\n" 74 "3. byte [61-61] -> 4\n" 75 "4. byte [61-61] -> 5\n" 76 "5. match! 0\n" }, 77 { "(a)", 78 "2. capture 2 -> 1\n" 79 "1. byte [61-61] -> 3\n" 80 "3. capture 3 -> 4\n" 81 "4. match! 0\n" }, 82 { "(?:a)", 83 "1. byte [61-61] -> 2\n" 84 "2. match! 0\n" }, 85 { "", 86 "2. match! 0\n" }, 87 { ".", 88 "3. alt -> 1 | 2\n" 89 "1. byte [00-09] -> 4\n" 90 "2. byte [0b-ff] -> 4\n" 91 "4. match! 0\n" }, 92 { "[^ab]", 93 "5. alt -> 3 | 4\n" 94 "3. alt -> 1 | 2\n" 95 "4. byte [63-ff] -> 6\n" 96 "1. byte [00-09] -> 6\n" 97 "2. byte [0b-60] -> 6\n" 98 "6. match! 0\n" }, 99 { "[Aa]", 100 "1. byte/i [61-61] -> 2\n" 101 "2. match! 0\n" }, 102 }; 103 104 TEST(TestRegexpCompileToProg, Simple) { 105 int failed = 0; 106 for (int i = 0; i < arraysize(tests); i++) { 107 const re2::Test& t = tests[i]; 108 Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL); 109 if (re == NULL) { 110 LOG(ERROR) << "Cannot parse: " << t.regexp; 111 failed++; 112 continue; 113 } 114 Prog* prog = re->CompileToProg(0); 115 if (prog == NULL) { 116 LOG(ERROR) << "Cannot compile: " << t.regexp; 117 re->Decref(); 118 failed++; 119 continue; 120 } 121 CHECK(re->CompileToProg(1) == NULL); 122 string s = prog->Dump(); 123 if (s != t.code) { 124 LOG(ERROR) << "Incorrect compiled code for: " << t.regexp; 125 LOG(ERROR) << "Want:\n" << t.code; 126 LOG(ERROR) << "Got:\n" << s; 127 failed++; 128 } 129 delete prog; 130 re->Decref(); 131 } 132 EXPECT_EQ(failed, 0); 133 } 134 135 // The distinct byte ranges involved in the UTF-8 dot ([^\n]). 136 // Once, erroneously split between 0x3f and 0x40 because it is 137 // a 6-bit boundary. 138 static struct UTF8ByteRange { 139 int lo; 140 int hi; 141 } utf8ranges[] = { 142 { 0x00, 0x09 }, 143 { 0x0A, 0x0A }, 144 { 0x10, 0x7F }, 145 { 0x80, 0x8F }, 146 { 0x90, 0x9F }, 147 { 0xA0, 0xBF }, 148 { 0xC0, 0xC1 }, 149 { 0xC2, 0xDF }, 150 { 0xE0, 0xE0 }, 151 { 0xE1, 0xEF }, 152 { 0xF0, 0xF0 }, 153 { 0xF1, 0xF3 }, 154 { 0xF4, 0xF4 }, 155 { 0xF5, 0xFF }, 156 }; 157 158 TEST(TestCompile, ByteRanges) { 159 Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL); 160 EXPECT_TRUE(re != NULL); 161 Prog* prog = re->CompileToProg(0); 162 EXPECT_TRUE(prog != NULL); 163 EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges)); 164 for (int i = 0; i < arraysize(utf8ranges); i++) 165 for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++) 166 EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j; 167 delete prog; 168 re->Decref(); 169 } 170 171 } // namespace re2 172