Home | History | Annotate | Download | only in testing
      1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Regular expression generator: generates all possible
      6 // regular expressions within parameters (see regexp_generator.h for details).
      7 
      8 // The regexp generator first generates a sequence of commands in a simple
      9 // postfix language.  Each command in the language is a string,
     10 // like "a" or "%s*" or "%s|%s".
     11 //
     12 // To evaluate a command, enough arguments are popped from the value stack to
     13 // plug into the %s slots.  Then the result is pushed onto the stack.
     14 // For example, the command sequence
     15 //      a b %s%s c
     16 // results in the stack
     17 //      ab c
     18 //
     19 // GeneratePostfix generates all possible command sequences.
     20 // Then RunPostfix turns each sequence into a regular expression
     21 // and passes the regexp to HandleRegexp.
     22 
     23 #include <string.h>
     24 #include <string>
     25 #include <stack>
     26 #include <vector>
     27 #include "util/test.h"
     28 #include "re2/testing/regexp_generator.h"
     29 
     30 namespace re2 {
     31 
     32 // Returns a vector of the egrep regexp operators.
     33 const vector<string>& RegexpGenerator::EgrepOps() {
     34   static const char *ops[] = {
     35     "%s%s",
     36     "%s|%s",
     37     "%s*",
     38     "%s+",
     39     "%s?",
     40     "%s\\C*",
     41   };
     42   static vector<string> v(ops, ops + arraysize(ops));
     43   return v;
     44 }
     45 
     46 RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
     47                                  const vector<string>& atoms,
     48                                  const vector<string>& ops)
     49     : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
     50   // Degenerate case.
     51   if (atoms_.size() == 0)
     52     maxatoms_ = 0;
     53   if (ops_.size() == 0)
     54     maxops_ = 0;
     55 }
     56 
     57 // Generates all possible regular expressions (within the parameters),
     58 // calling HandleRegexp for each one.
     59 void RegexpGenerator::Generate() {
     60   vector<string> postfix;
     61   GeneratePostfix(&postfix, 0, 0, 0);
     62 }
     63 
     64 // Generates random regular expressions, calling HandleRegexp for each one.
     65 void RegexpGenerator::GenerateRandom(int32 seed, int n) {
     66   ACMRandom acm(seed);
     67   acm_ = &acm;
     68 
     69   for (int i = 0; i < n; i++) {
     70     vector<string> postfix;
     71     GenerateRandomPostfix(&postfix, 0, 0, 0);
     72   }
     73 
     74   acm_ = NULL;
     75 }
     76 
     77 // Counts and returns the number of occurrences of "%s" in s.
     78 static int CountArgs(const string& s) {
     79   const char *p = s.c_str();
     80   int n = 0;
     81   while ((p = strstr(p, "%s")) != NULL) {
     82     p += 2;
     83     n++;
     84   }
     85   return n;
     86 }
     87 
     88 // Generates all possible postfix command sequences.
     89 // Each sequence is handed off to RunPostfix to generate a regular expression.
     90 // The arguments are:
     91 //   post:  the current postfix sequence
     92 //   nstk:  the number of elements that would be on the stack after executing
     93 //          the sequence
     94 //   ops:   the number of operators used in the sequence
     95 //   atoms: the number of atoms used in the sequence
     96 // For example, if post were ["a", "b", "%s%s", "c"],
     97 // then nstk = 2, ops = 1, atoms = 3.
     98 //
     99 // The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
    100 //
    101 void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk,
    102                                       int ops, int atoms) {
    103   if (nstk == 1)
    104     RunPostfix(*post);
    105 
    106   // Early out: if used too many operators or can't
    107   // get back down to a single expression on the stack
    108   // using binary operators, give up.
    109   if (ops + nstk - 1 > maxops_)
    110     return;
    111 
    112   // Add atoms if there is room.
    113   if (atoms < maxatoms_) {
    114     for (int i = 0; i < atoms_.size(); i++) {
    115       post->push_back(atoms_[i]);
    116       GeneratePostfix(post, nstk + 1, ops, atoms + 1);
    117       post->pop_back();
    118     }
    119   }
    120 
    121   // Add operators if there are enough arguments.
    122   if (ops < maxops_) {
    123     for (int i = 0; i < ops_.size(); i++) {
    124       const string& fmt = ops_[i];
    125       int nargs = CountArgs(fmt);
    126       if (nargs <= nstk) {
    127         post->push_back(fmt);
    128         GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
    129         post->pop_back();
    130       }
    131     }
    132   }
    133 }
    134 
    135 // Generates a random postfix command sequence.
    136 // Stops and returns true once a single sequence has been generated.
    137 bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk,
    138                                             int ops, int atoms) {
    139   for (;;) {
    140     // Stop if we get to a single element, but only sometimes.
    141     if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) {
    142       RunPostfix(*post);
    143       return true;
    144     }
    145 
    146     // Early out: if used too many operators or can't
    147     // get back down to a single expression on the stack
    148     // using binary operators, give up.
    149     if (ops + nstk - 1 > maxops_)
    150       return false;
    151 
    152     // Add operators if there are enough arguments.
    153     if (ops < maxops_ && acm_->Uniform(2) == 0) {
    154       const string& fmt = ops_[acm_->Uniform(ops_.size())];
    155       int nargs = CountArgs(fmt);
    156       if (nargs <= nstk) {
    157         post->push_back(fmt);
    158         bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
    159                                          ops + 1, atoms);
    160         post->pop_back();
    161         if (ret)
    162           return true;
    163       }
    164     }
    165 
    166     // Add atoms if there is room.
    167     if (atoms < maxatoms_ && acm_->Uniform(2) == 0) {
    168       post->push_back(atoms_[acm_->Uniform(atoms_.size())]);
    169       bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
    170       post->pop_back();
    171       if (ret)
    172         return true;
    173     }
    174   }
    175 }
    176 
    177 // Interprets the postfix command sequence to create a regular expression
    178 // passed to HandleRegexp.  The results of operators like %s|%s are wrapped
    179 // in (?: ) to avoid needing to maintain a precedence table.
    180 void RegexpGenerator::RunPostfix(const vector<string>& post) {
    181   stack<string> regexps;
    182   for (int i = 0; i < post.size(); i++) {
    183     switch (CountArgs(post[i])) {
    184       default:
    185         LOG(FATAL) << "Bad operator: " << post[i];
    186       case 0:
    187         regexps.push(post[i]);
    188         break;
    189       case 1: {
    190         string a = regexps.top();
    191         regexps.pop();
    192         regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
    193         break;
    194       }
    195       case 2: {
    196         string b = regexps.top();
    197         regexps.pop();
    198         string a = regexps.top();
    199         regexps.pop();
    200         regexps.push("(?:" +
    201                      StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
    202                      ")");
    203         break;
    204       }
    205     }
    206   }
    207 
    208   if (regexps.size() != 1) {
    209     // Internal error - should never happen.
    210     printf("Bad regexp program:\n");
    211     for (int i = 0; i < post.size(); i++) {
    212       printf("  %s\n", CEscape(post[i]).c_str());
    213     }
    214     printf("Stack after running program:\n");
    215     while (!regexps.empty()) {
    216       printf("  %s\n", CEscape(regexps.top()).c_str());
    217       regexps.pop();
    218     }
    219     LOG(FATAL) << "Bad regexp program.";
    220   }
    221 
    222   HandleRegexp(regexps.top());
    223   HandleRegexp("^(?:" + regexps.top() + ")$");
    224   HandleRegexp("^(?:" + regexps.top() + ")");
    225   HandleRegexp("(?:" + regexps.top() + ")$");
    226 }
    227 
    228 // Split s into an vector of strings, one for each UTF-8 character.
    229 vector<string> Explode(const StringPiece& s) {
    230   vector<string> v;
    231 
    232   for (const char *q = s.begin(); q < s.end(); ) {
    233     const char* p = q;
    234     Rune r;
    235     q += chartorune(&r, q);
    236     v.push_back(string(p, q - p));
    237   }
    238 
    239   return v;
    240 }
    241 
    242 // Split string everywhere a substring is found, returning
    243 // vector of pieces.
    244 vector<string> Split(const StringPiece& sep, const StringPiece& s) {
    245   vector<string> v;
    246 
    247   if (sep.size() == 0)
    248     return Explode(s);
    249 
    250   const char *p = s.begin();
    251   for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) {
    252     if (StringPiece(q, sep.size()) == sep) {
    253       v.push_back(string(p, q - p));
    254       p = q + sep.size();
    255       q = p - 1;  // -1 for ++ in loop
    256       continue;
    257     }
    258   }
    259   if (p < s.end())
    260     v.push_back(string(p, s.end() - p));
    261   return v;
    262 }
    263 
    264 }  // namespace re2
    265