Home | History | Annotate | Download | only in testing
      1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Test prog.cc, compile.cc
      6 
      7 #include <string>
      8 #include <vector>
      9 #include "util/test.h"
     10 #include "re2/regexp.h"
     11 #include "re2/prog.h"
     12 
     13 DEFINE_string(show, "", "regular expression to compile and dump");
     14 
     15 namespace re2 {
     16 
     17 // Simple input/output tests checking that
     18 // the regexp compiles to the expected code.
     19 // These are just to sanity check the basic implementation.
     20 // The real confidence tests happen by testing the NFA/DFA
     21 // that run the compiled code.
     22 
     23 struct Test {
     24   const char* regexp;
     25   const char* code;
     26 };
     27 
     28 static Test tests[] = {
     29   { "a",
     30     "1. byte [61-61] -> 2\n"
     31     "2. match! 0\n" },
     32   { "ab",
     33     "1. byte [61-61] -> 2\n"
     34     "2. byte [62-62] -> 3\n"
     35     "3. match! 0\n" },
     36   { "a|c",
     37     "3. alt -> 1 | 2\n"
     38     "1. byte [61-61] -> 4\n"
     39     "2. byte [63-63] -> 4\n"
     40     "4. match! 0\n" },
     41   { "a|b",
     42     "1. byte [61-62] -> 2\n"
     43     "2. match! 0\n" },
     44   { "[ab]",
     45     "1. byte [61-62] -> 2\n"
     46     "2. match! 0\n" },
     47   { "a+",
     48     "1. byte [61-61] -> 2\n"
     49     "2. alt -> 1 | 3\n"
     50     "3. match! 0\n" },
     51   { "a+?",
     52     "1. byte [61-61] -> 2\n"
     53     "2. alt -> 3 | 1\n"
     54     "3. match! 0\n" },
     55   { "a*",
     56     "2. alt -> 1 | 3\n"
     57     "1. byte [61-61] -> 2\n"
     58     "3. match! 0\n" },
     59   { "a*?",
     60     "2. alt -> 3 | 1\n"
     61     "3. match! 0\n"
     62     "1. byte [61-61] -> 2\n" },
     63   { "a?",
     64     "2. alt -> 1 | 3\n"
     65     "1. byte [61-61] -> 3\n"
     66     "3. match! 0\n" },
     67   { "a??",
     68     "2. alt -> 3 | 1\n"
     69     "3. match! 0\n"
     70     "1. byte [61-61] -> 3\n" },
     71   { "a{4}",
     72     "1. byte [61-61] -> 2\n"
     73     "2. byte [61-61] -> 3\n"
     74     "3. byte [61-61] -> 4\n"
     75     "4. byte [61-61] -> 5\n"
     76     "5. match! 0\n" },
     77   { "(a)",
     78     "2. capture 2 -> 1\n"
     79     "1. byte [61-61] -> 3\n"
     80     "3. capture 3 -> 4\n"
     81     "4. match! 0\n" },
     82   { "(?:a)",
     83     "1. byte [61-61] -> 2\n"
     84     "2. match! 0\n" },
     85   { "",
     86     "2. match! 0\n" },
     87   { ".",
     88     "3. alt -> 1 | 2\n"
     89     "1. byte [00-09] -> 4\n"
     90     "2. byte [0b-ff] -> 4\n"
     91     "4. match! 0\n" },
     92   { "[^ab]",
     93     "5. alt -> 3 | 4\n"
     94     "3. alt -> 1 | 2\n"
     95     "4. byte [63-ff] -> 6\n"
     96     "1. byte [00-09] -> 6\n"
     97     "2. byte [0b-60] -> 6\n"
     98     "6. match! 0\n" },
     99   { "[Aa]",
    100     "1. byte/i [61-61] -> 2\n"
    101     "2. match! 0\n" },
    102 };
    103 
    104 TEST(TestRegexpCompileToProg, Simple) {
    105   int failed = 0;
    106   for (int i = 0; i < arraysize(tests); i++) {
    107     const re2::Test& t = tests[i];
    108     Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
    109     if (re == NULL) {
    110       LOG(ERROR) << "Cannot parse: " << t.regexp;
    111       failed++;
    112       continue;
    113     }
    114     Prog* prog = re->CompileToProg(0);
    115     if (prog == NULL) {
    116       LOG(ERROR) << "Cannot compile: " << t.regexp;
    117       re->Decref();
    118       failed++;
    119       continue;
    120     }
    121     CHECK(re->CompileToProg(1) == NULL);
    122     string s = prog->Dump();
    123     if (s != t.code) {
    124       LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
    125       LOG(ERROR) << "Want:\n" << t.code;
    126       LOG(ERROR) << "Got:\n" << s;
    127       failed++;
    128     }
    129     delete prog;
    130     re->Decref();
    131   }
    132   EXPECT_EQ(failed, 0);
    133 }
    134 
    135 // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
    136 // Once, erroneously split between 0x3f and 0x40 because it is
    137 // a 6-bit boundary.
    138 static struct UTF8ByteRange {
    139   int lo;
    140   int hi;
    141 } utf8ranges[] = {
    142   { 0x00, 0x09 },
    143   { 0x0A, 0x0A },
    144   { 0x10, 0x7F },
    145   { 0x80, 0x8F },
    146   { 0x90, 0x9F },
    147   { 0xA0, 0xBF },
    148   { 0xC0, 0xC1 },
    149   { 0xC2, 0xDF },
    150   { 0xE0, 0xE0 },
    151   { 0xE1, 0xEF },
    152   { 0xF0, 0xF0 },
    153   { 0xF1, 0xF3 },
    154   { 0xF4, 0xF4 },
    155   { 0xF5, 0xFF },
    156 };
    157 
    158 TEST(TestCompile, ByteRanges) {
    159   Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
    160   EXPECT_TRUE(re != NULL);
    161   Prog* prog = re->CompileToProg(0);
    162   EXPECT_TRUE(prog != NULL);
    163   EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
    164   for (int i = 0; i < arraysize(utf8ranges); i++)
    165     for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++)
    166       EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
    167   delete prog;
    168   re->Decref();
    169 }
    170 
    171 }  // namespace re2
    172