Home | History | Annotate | Download | only in re2
      1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Compiled regular expression representation.
      6 // Tested by compile_test.cc
      7 
      8 #include "util/util.h"
      9 #include "util/sparse_set.h"
     10 #include "re2/prog.h"
     11 #include "re2/stringpiece.h"
     12 
     13 namespace re2 {
     14 
     15 // Constructors per Inst opcode
     16 
     17 void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
     18   DCHECK_EQ(out_opcode_, 0);
     19   set_out_opcode(out, kInstAlt);
     20   out1_ = out1;
     21 }
     22 
     23 void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
     24   DCHECK_EQ(out_opcode_, 0);
     25   set_out_opcode(out, kInstByteRange);
     26   lo_ = lo & 0xFF;
     27   hi_ = hi & 0xFF;
     28   foldcase_ = foldcase;
     29 }
     30 
     31 void Prog::Inst::InitCapture(int cap, uint32 out) {
     32   DCHECK_EQ(out_opcode_, 0);
     33   set_out_opcode(out, kInstCapture);
     34   cap_ = cap;
     35 }
     36 
     37 void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
     38   DCHECK_EQ(out_opcode_, 0);
     39   set_out_opcode(out, kInstEmptyWidth);
     40   empty_ = empty;
     41 }
     42 
     43 void Prog::Inst::InitMatch(int32 id) {
     44   DCHECK_EQ(out_opcode_, 0);
     45   set_opcode(kInstMatch);
     46   match_id_ = id;
     47 }
     48 
     49 void Prog::Inst::InitNop(uint32 out) {
     50   DCHECK_EQ(out_opcode_, 0);
     51   set_opcode(kInstNop);
     52 }
     53 
     54 void Prog::Inst::InitFail() {
     55   DCHECK_EQ(out_opcode_, 0);
     56   set_opcode(kInstFail);
     57 }
     58 
     59 string Prog::Inst::Dump() {
     60   switch (opcode()) {
     61     default:
     62       return StringPrintf("opcode %d", static_cast<int>(opcode()));
     63 
     64     case kInstAlt:
     65       return StringPrintf("alt -> %d | %d", out(), out1_);
     66 
     67     case kInstAltMatch:
     68       return StringPrintf("altmatch -> %d | %d", out(), out1_);
     69 
     70     case kInstByteRange:
     71       return StringPrintf("byte%s [%02x-%02x] -> %d",
     72                           foldcase_ ? "/i" : "",
     73                           lo_, hi_, out());
     74 
     75     case kInstCapture:
     76       return StringPrintf("capture %d -> %d", cap_, out());
     77 
     78     case kInstEmptyWidth:
     79       return StringPrintf("emptywidth %#x -> %d",
     80                           static_cast<int>(empty_), out());
     81 
     82     case kInstMatch:
     83       return StringPrintf("match! %d", match_id());
     84 
     85     case kInstNop:
     86       return StringPrintf("nop -> %d", out());
     87 
     88     case kInstFail:
     89       return StringPrintf("fail");
     90   }
     91 }
     92 
     93 Prog::Prog()
     94   : anchor_start_(false),
     95     anchor_end_(false),
     96     reversed_(false),
     97     did_onepass_(false),
     98     start_(0),
     99     start_unanchored_(0),
    100     size_(0),
    101     byte_inst_count_(0),
    102     bytemap_range_(0),
    103     flags_(0),
    104     onepass_statesize_(0),
    105     inst_(NULL),
    106     dfa_first_(NULL),
    107     dfa_longest_(NULL),
    108     dfa_mem_(0),
    109     delete_dfa_(NULL),
    110     unbytemap_(NULL),
    111     onepass_nodes_(NULL),
    112     onepass_start_(NULL) {
    113 }
    114 
    115 Prog::~Prog() {
    116   if (delete_dfa_) {
    117     if (dfa_first_)
    118       delete_dfa_(dfa_first_);
    119     if (dfa_longest_)
    120       delete_dfa_(dfa_longest_);
    121   }
    122   delete[] onepass_nodes_;
    123   delete[] inst_;
    124   delete[] unbytemap_;
    125 }
    126 
    127 typedef SparseSet Workq;
    128 
    129 static inline void AddToQueue(Workq* q, int id) {
    130   if (id != 0)
    131     q->insert(id);
    132 }
    133 
    134 static string ProgToString(Prog* prog, Workq* q) {
    135   string s;
    136 
    137   for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
    138     int id = *i;
    139     Prog::Inst* ip = prog->inst(id);
    140     StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
    141     AddToQueue(q, ip->out());
    142     if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
    143       AddToQueue(q, ip->out1());
    144   }
    145   return s;
    146 }
    147 
    148 string Prog::Dump() {
    149   string map;
    150   if (false) {  // Debugging
    151     int lo = 0;
    152     StringAppendF(&map, "byte map:\n");
    153     for (int i = 0; i < bytemap_range_; i++) {
    154       StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
    155       lo = unbytemap_[i] + 1;
    156     }
    157     StringAppendF(&map, "\n");
    158   }
    159 
    160   Workq q(size_);
    161   AddToQueue(&q, start_);
    162   return map + ProgToString(this, &q);
    163 }
    164 
    165 string Prog::DumpUnanchored() {
    166   Workq q(size_);
    167   AddToQueue(&q, start_unanchored_);
    168   return ProgToString(this, &q);
    169 }
    170 
    171 static bool IsMatch(Prog*, Prog::Inst*);
    172 
    173 // Peep-hole optimizer.
    174 void Prog::Optimize() {
    175   Workq q(size_);
    176 
    177   // Eliminate nops.  Most are taken out during compilation
    178   // but a few are hard to avoid.
    179   q.clear();
    180   AddToQueue(&q, start_);
    181   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
    182     int id = *i;
    183 
    184     Inst* ip = inst(id);
    185     int j = ip->out();
    186     Inst* jp;
    187     while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
    188       j = jp->out();
    189     }
    190     ip->set_out(j);
    191     AddToQueue(&q, ip->out());
    192 
    193     if (ip->opcode() == kInstAlt) {
    194       j = ip->out1();
    195       while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
    196         j = jp->out();
    197       }
    198       ip->out1_ = j;
    199       AddToQueue(&q, ip->out1());
    200     }
    201   }
    202 
    203   // Insert kInstAltMatch instructions
    204   // Look for
    205   //   ip: Alt -> j | k
    206   //	  j: ByteRange [00-FF] -> ip
    207   //    k: Match
    208   // or the reverse (the above is the greedy one).
    209   // Rewrite Alt to AltMatch.
    210   q.clear();
    211   AddToQueue(&q, start_);
    212   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
    213     int id = *i;
    214     Inst* ip = inst(id);
    215     AddToQueue(&q, ip->out());
    216     if (ip->opcode() == kInstAlt)
    217       AddToQueue(&q, ip->out1());
    218 
    219     if (ip->opcode() == kInstAlt) {
    220       Inst* j = inst(ip->out());
    221       Inst* k = inst(ip->out1());
    222       if (j->opcode() == kInstByteRange && j->out() == id &&
    223           j->lo() == 0x00 && j->hi() == 0xFF &&
    224           IsMatch(this, k)) {
    225         ip->set_opcode(kInstAltMatch);
    226         continue;
    227       }
    228       if (IsMatch(this, j) &&
    229           k->opcode() == kInstByteRange && k->out() == id &&
    230           k->lo() == 0x00 && k->hi() == 0xFF) {
    231         ip->set_opcode(kInstAltMatch);
    232       }
    233     }
    234   }
    235 }
    236 
    237 // Is ip a guaranteed match at end of text, perhaps after some capturing?
    238 static bool IsMatch(Prog* prog, Prog::Inst* ip) {
    239   for (;;) {
    240     switch (ip->opcode()) {
    241       default:
    242         LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
    243         return false;
    244 
    245       case kInstAlt:
    246       case kInstAltMatch:
    247       case kInstByteRange:
    248       case kInstFail:
    249       case kInstEmptyWidth:
    250         return false;
    251 
    252       case kInstCapture:
    253       case kInstNop:
    254         ip = prog->inst(ip->out());
    255         break;
    256 
    257       case kInstMatch:
    258         return true;
    259     }
    260   }
    261 }
    262 
    263 uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
    264   int flags = 0;
    265 
    266   // ^ and \A
    267   if (p == text.begin())
    268     flags |= kEmptyBeginText | kEmptyBeginLine;
    269   else if (p[-1] == '\n')
    270     flags |= kEmptyBeginLine;
    271 
    272   // $ and \z
    273   if (p == text.end())
    274     flags |= kEmptyEndText | kEmptyEndLine;
    275   else if (p < text.end() && p[0] == '\n')
    276     flags |= kEmptyEndLine;
    277 
    278   // \b and \B
    279   if (p == text.begin() && p == text.end()) {
    280     // no word boundary here
    281   } else if (p == text.begin()) {
    282     if (IsWordChar(p[0]))
    283       flags |= kEmptyWordBoundary;
    284   } else if (p == text.end()) {
    285     if (IsWordChar(p[-1]))
    286       flags |= kEmptyWordBoundary;
    287   } else {
    288     if (IsWordChar(p[-1]) != IsWordChar(p[0]))
    289       flags |= kEmptyWordBoundary;
    290   }
    291   if (!(flags & kEmptyWordBoundary))
    292     flags |= kEmptyNonWordBoundary;
    293 
    294   return flags;
    295 }
    296 
    297 void Prog::MarkByteRange(int lo, int hi) {
    298   CHECK_GE(lo, 0);
    299   CHECK_GE(hi, 0);
    300   CHECK_LE(lo, 255);
    301   CHECK_LE(hi, 255);
    302   if (lo > 0)
    303     byterange_.Set(lo - 1);
    304   byterange_.Set(hi);
    305 }
    306 
    307 void Prog::ComputeByteMap() {
    308   // Fill in bytemap with byte classes for prog_.
    309   // Ranges of bytes that are treated as indistinguishable
    310   // by the regexp program are mapped to a single byte class.
    311   // The vector prog_->byterange() marks the end of each
    312   // such range.
    313   const Bitmap<256>& v = byterange();
    314 
    315   COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
    316   uint8 n = 0;
    317   uint32 bits = 0;
    318   for (int i = 0; i < 256; i++) {
    319     if ((i&31) == 0)
    320       bits = v.Word(i >> 5);
    321     bytemap_[i] = n;
    322     n += bits & 1;
    323     bits >>= 1;
    324   }
    325   bytemap_range_ = bytemap_[255] + 1;
    326   unbytemap_ = new uint8[bytemap_range_];
    327   for (int i = 0; i < 256; i++)
    328     unbytemap_[bytemap_[i]] = i;
    329 
    330   if (0) {  // For debugging: use trivial byte map.
    331     for (int i = 0; i < 256; i++) {
    332       bytemap_[i] = i;
    333       unbytemap_[i] = i;
    334     }
    335     bytemap_range_ = 256;
    336     LOG(INFO) << "Using trivial bytemap.";
    337   }
    338 }
    339 
    340 }  // namespace re2
    341 
    342