1 // Copyright 2007 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Compiled regular expression representation. 6 // Tested by compile_test.cc 7 8 #include "util/util.h" 9 #include "util/sparse_set.h" 10 #include "re2/prog.h" 11 #include "re2/stringpiece.h" 12 13 namespace re2 { 14 15 // Constructors per Inst opcode 16 17 void Prog::Inst::InitAlt(uint32 out, uint32 out1) { 18 DCHECK_EQ(out_opcode_, 0); 19 set_out_opcode(out, kInstAlt); 20 out1_ = out1; 21 } 22 23 void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) { 24 DCHECK_EQ(out_opcode_, 0); 25 set_out_opcode(out, kInstByteRange); 26 lo_ = lo & 0xFF; 27 hi_ = hi & 0xFF; 28 foldcase_ = foldcase; 29 } 30 31 void Prog::Inst::InitCapture(int cap, uint32 out) { 32 DCHECK_EQ(out_opcode_, 0); 33 set_out_opcode(out, kInstCapture); 34 cap_ = cap; 35 } 36 37 void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) { 38 DCHECK_EQ(out_opcode_, 0); 39 set_out_opcode(out, kInstEmptyWidth); 40 empty_ = empty; 41 } 42 43 void Prog::Inst::InitMatch(int32 id) { 44 DCHECK_EQ(out_opcode_, 0); 45 set_opcode(kInstMatch); 46 match_id_ = id; 47 } 48 49 void Prog::Inst::InitNop(uint32 out) { 50 DCHECK_EQ(out_opcode_, 0); 51 set_opcode(kInstNop); 52 } 53 54 void Prog::Inst::InitFail() { 55 DCHECK_EQ(out_opcode_, 0); 56 set_opcode(kInstFail); 57 } 58 59 string Prog::Inst::Dump() { 60 switch (opcode()) { 61 default: 62 return StringPrintf("opcode %d", static_cast<int>(opcode())); 63 64 case kInstAlt: 65 return StringPrintf("alt -> %d | %d", out(), out1_); 66 67 case kInstAltMatch: 68 return StringPrintf("altmatch -> %d | %d", out(), out1_); 69 70 case kInstByteRange: 71 return StringPrintf("byte%s [%02x-%02x] -> %d", 72 foldcase_ ? "/i" : "", 73 lo_, hi_, out()); 74 75 case kInstCapture: 76 return StringPrintf("capture %d -> %d", cap_, out()); 77 78 case kInstEmptyWidth: 79 return StringPrintf("emptywidth %#x -> %d", 80 static_cast<int>(empty_), out()); 81 82 case kInstMatch: 83 return StringPrintf("match! %d", match_id()); 84 85 case kInstNop: 86 return StringPrintf("nop -> %d", out()); 87 88 case kInstFail: 89 return StringPrintf("fail"); 90 } 91 } 92 93 Prog::Prog() 94 : anchor_start_(false), 95 anchor_end_(false), 96 reversed_(false), 97 did_onepass_(false), 98 start_(0), 99 start_unanchored_(0), 100 size_(0), 101 byte_inst_count_(0), 102 bytemap_range_(0), 103 flags_(0), 104 onepass_statesize_(0), 105 inst_(NULL), 106 dfa_first_(NULL), 107 dfa_longest_(NULL), 108 dfa_mem_(0), 109 delete_dfa_(NULL), 110 unbytemap_(NULL), 111 onepass_nodes_(NULL), 112 onepass_start_(NULL) { 113 } 114 115 Prog::~Prog() { 116 if (delete_dfa_) { 117 if (dfa_first_) 118 delete_dfa_(dfa_first_); 119 if (dfa_longest_) 120 delete_dfa_(dfa_longest_); 121 } 122 delete[] onepass_nodes_; 123 delete[] inst_; 124 delete[] unbytemap_; 125 } 126 127 typedef SparseSet Workq; 128 129 static inline void AddToQueue(Workq* q, int id) { 130 if (id != 0) 131 q->insert(id); 132 } 133 134 static string ProgToString(Prog* prog, Workq* q) { 135 string s; 136 137 for (Workq::iterator i = q->begin(); i != q->end(); ++i) { 138 int id = *i; 139 Prog::Inst* ip = prog->inst(id); 140 StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str()); 141 AddToQueue(q, ip->out()); 142 if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch) 143 AddToQueue(q, ip->out1()); 144 } 145 return s; 146 } 147 148 string Prog::Dump() { 149 string map; 150 if (false) { // Debugging 151 int lo = 0; 152 StringAppendF(&map, "byte map:\n"); 153 for (int i = 0; i < bytemap_range_; i++) { 154 StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]); 155 lo = unbytemap_[i] + 1; 156 } 157 StringAppendF(&map, "\n"); 158 } 159 160 Workq q(size_); 161 AddToQueue(&q, start_); 162 return map + ProgToString(this, &q); 163 } 164 165 string Prog::DumpUnanchored() { 166 Workq q(size_); 167 AddToQueue(&q, start_unanchored_); 168 return ProgToString(this, &q); 169 } 170 171 static bool IsMatch(Prog*, Prog::Inst*); 172 173 // Peep-hole optimizer. 174 void Prog::Optimize() { 175 Workq q(size_); 176 177 // Eliminate nops. Most are taken out during compilation 178 // but a few are hard to avoid. 179 q.clear(); 180 AddToQueue(&q, start_); 181 for (Workq::iterator i = q.begin(); i != q.end(); ++i) { 182 int id = *i; 183 184 Inst* ip = inst(id); 185 int j = ip->out(); 186 Inst* jp; 187 while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { 188 j = jp->out(); 189 } 190 ip->set_out(j); 191 AddToQueue(&q, ip->out()); 192 193 if (ip->opcode() == kInstAlt) { 194 j = ip->out1(); 195 while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { 196 j = jp->out(); 197 } 198 ip->out1_ = j; 199 AddToQueue(&q, ip->out1()); 200 } 201 } 202 203 // Insert kInstAltMatch instructions 204 // Look for 205 // ip: Alt -> j | k 206 // j: ByteRange [00-FF] -> ip 207 // k: Match 208 // or the reverse (the above is the greedy one). 209 // Rewrite Alt to AltMatch. 210 q.clear(); 211 AddToQueue(&q, start_); 212 for (Workq::iterator i = q.begin(); i != q.end(); ++i) { 213 int id = *i; 214 Inst* ip = inst(id); 215 AddToQueue(&q, ip->out()); 216 if (ip->opcode() == kInstAlt) 217 AddToQueue(&q, ip->out1()); 218 219 if (ip->opcode() == kInstAlt) { 220 Inst* j = inst(ip->out()); 221 Inst* k = inst(ip->out1()); 222 if (j->opcode() == kInstByteRange && j->out() == id && 223 j->lo() == 0x00 && j->hi() == 0xFF && 224 IsMatch(this, k)) { 225 ip->set_opcode(kInstAltMatch); 226 continue; 227 } 228 if (IsMatch(this, j) && 229 k->opcode() == kInstByteRange && k->out() == id && 230 k->lo() == 0x00 && k->hi() == 0xFF) { 231 ip->set_opcode(kInstAltMatch); 232 } 233 } 234 } 235 } 236 237 // Is ip a guaranteed match at end of text, perhaps after some capturing? 238 static bool IsMatch(Prog* prog, Prog::Inst* ip) { 239 for (;;) { 240 switch (ip->opcode()) { 241 default: 242 LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); 243 return false; 244 245 case kInstAlt: 246 case kInstAltMatch: 247 case kInstByteRange: 248 case kInstFail: 249 case kInstEmptyWidth: 250 return false; 251 252 case kInstCapture: 253 case kInstNop: 254 ip = prog->inst(ip->out()); 255 break; 256 257 case kInstMatch: 258 return true; 259 } 260 } 261 } 262 263 uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) { 264 int flags = 0; 265 266 // ^ and \A 267 if (p == text.begin()) 268 flags |= kEmptyBeginText | kEmptyBeginLine; 269 else if (p[-1] == '\n') 270 flags |= kEmptyBeginLine; 271 272 // $ and \z 273 if (p == text.end()) 274 flags |= kEmptyEndText | kEmptyEndLine; 275 else if (p < text.end() && p[0] == '\n') 276 flags |= kEmptyEndLine; 277 278 // \b and \B 279 if (p == text.begin() && p == text.end()) { 280 // no word boundary here 281 } else if (p == text.begin()) { 282 if (IsWordChar(p[0])) 283 flags |= kEmptyWordBoundary; 284 } else if (p == text.end()) { 285 if (IsWordChar(p[-1])) 286 flags |= kEmptyWordBoundary; 287 } else { 288 if (IsWordChar(p[-1]) != IsWordChar(p[0])) 289 flags |= kEmptyWordBoundary; 290 } 291 if (!(flags & kEmptyWordBoundary)) 292 flags |= kEmptyNonWordBoundary; 293 294 return flags; 295 } 296 297 void Prog::MarkByteRange(int lo, int hi) { 298 CHECK_GE(lo, 0); 299 CHECK_GE(hi, 0); 300 CHECK_LE(lo, 255); 301 CHECK_LE(hi, 255); 302 if (lo > 0) 303 byterange_.Set(lo - 1); 304 byterange_.Set(hi); 305 } 306 307 void Prog::ComputeByteMap() { 308 // Fill in bytemap with byte classes for prog_. 309 // Ranges of bytes that are treated as indistinguishable 310 // by the regexp program are mapped to a single byte class. 311 // The vector prog_->byterange() marks the end of each 312 // such range. 313 const Bitmap<256>& v = byterange(); 314 315 COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize); 316 uint8 n = 0; 317 uint32 bits = 0; 318 for (int i = 0; i < 256; i++) { 319 if ((i&31) == 0) 320 bits = v.Word(i >> 5); 321 bytemap_[i] = n; 322 n += bits & 1; 323 bits >>= 1; 324 } 325 bytemap_range_ = bytemap_[255] + 1; 326 unbytemap_ = new uint8[bytemap_range_]; 327 for (int i = 0; i < 256; i++) 328 unbytemap_[bytemap_[i]] = i; 329 330 if (0) { // For debugging: use trivial byte map. 331 for (int i = 0; i < 256; i++) { 332 bytemap_[i] = i; 333 unbytemap_[i] = i; 334 } 335 bytemap_range_ = 256; 336 LOG(INFO) << "Using trivial bytemap."; 337 } 338 } 339 340 } // namespace re2 341 342