1 // Copyright 2007 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Compile regular expression to Prog. 6 // 7 // Prog and Inst are defined in prog.h. 8 // This file's external interface is just Regexp::CompileToProg. 9 // The Compiler class defined in this file is private. 10 11 #include "re2/prog.h" 12 #include "re2/re2.h" 13 #include "re2/regexp.h" 14 #include "re2/walker-inl.h" 15 16 namespace re2 { 17 18 // List of pointers to Inst* that need to be filled in (patched). 19 // Because the Inst* haven't been filled in yet, 20 // we can use the Inst* word to hold the list's "next" pointer. 21 // It's kind of sleazy, but it works well in practice. 22 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. 23 // 24 // Because the out and out1 fields in Inst are no longer pointers, 25 // we can't use pointers directly here either. Instead, p refers 26 // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). 27 // p == 0 represents the NULL list. This is okay because instruction #0 28 // is always the fail instruction, which never appears on a list. 29 30 struct PatchList { 31 uint32 p; 32 33 // Returns patch list containing just p. 34 static PatchList Mk(uint32 p); 35 36 // Patches all the entries on l to have value v. 37 // Caller must not ever use patch list again. 38 static void Patch(Prog::Inst *inst0, PatchList l, uint32 v); 39 40 // Deref returns the next pointer pointed at by p. 41 static PatchList Deref(Prog::Inst *inst0, PatchList l); 42 43 // Appends two patch lists and returns result. 44 static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); 45 }; 46 47 static PatchList nullPatchList = { 0 }; 48 49 // Returns patch list containing just p. 50 PatchList PatchList::Mk(uint32 p) { 51 PatchList l; 52 l.p = p; 53 return l; 54 } 55 56 // Returns the next pointer pointed at by l. 57 PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { 58 Prog::Inst* ip = &inst0[l.p>>1]; 59 if (l.p&1) 60 l.p = ip->out1(); 61 else 62 l.p = ip->out(); 63 return l; 64 } 65 66 // Patches all the entries on l to have value v. 67 void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) { 68 while (l.p != 0) { 69 Prog::Inst* ip = &inst0[l.p>>1]; 70 if (l.p&1) { 71 l.p = ip->out1(); 72 ip->out1_ = val; 73 } else { 74 l.p = ip->out(); 75 ip->set_out(val); 76 } 77 } 78 } 79 80 // Appends two patch lists and returns result. 81 PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { 82 if (l1.p == 0) 83 return l2; 84 if (l2.p == 0) 85 return l1; 86 87 PatchList l = l1; 88 for (;;) { 89 PatchList next = PatchList::Deref(inst0, l); 90 if (next.p == 0) 91 break; 92 l = next; 93 } 94 95 Prog::Inst* ip = &inst0[l.p>>1]; 96 if (l.p&1) 97 ip->out1_ = l2.p; 98 else 99 ip->set_out(l2.p); 100 101 return l1; 102 } 103 104 // Compiled program fragment. 105 struct Frag { 106 uint32 begin; 107 PatchList end; 108 109 Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector 110 Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} 111 }; 112 113 static Frag kNullFrag; 114 115 // Input encodings. 116 enum Encoding { 117 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) 118 kEncodingLatin1, // Latin1 (0-FF) 119 }; 120 121 class Compiler : public Regexp::Walker<Frag> { 122 public: 123 explicit Compiler(); 124 ~Compiler(); 125 126 // Compiles Regexp to a new Prog. 127 // Caller is responsible for deleting Prog when finished with it. 128 // If reversed is true, compiles for walking over the input 129 // string backward (reverses all concatenations). 130 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); 131 132 // Compiles alternation of all the re to a new Prog. 133 // Each re has a match with an id equal to its index in the vector. 134 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, 135 Regexp* re); 136 137 // Interface for Regexp::Walker, which helps traverse the Regexp. 138 // The walk is purely post-recursive: given the machines for the 139 // children, PostVisit combines them to create the machine for 140 // the current node. The child_args are Frags. 141 // The Compiler traverses the Regexp parse tree, visiting 142 // each node in depth-first order. It invokes PreVisit before 143 // visiting the node's children and PostVisit after visiting 144 // the children. 145 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); 146 Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, 147 int nchild_args); 148 Frag ShortVisit(Regexp* re, Frag parent_arg); 149 Frag Copy(Frag arg); 150 151 // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? 152 Frag Plus(Frag a, bool nongreedy); 153 Frag Star(Frag a, bool nongreedy); 154 Frag Quest(Frag a, bool nongreedy); 155 156 // Given fragment a, returns (a) capturing as \n. 157 Frag Capture(Frag a, int n); 158 159 // Given fragments a and b, returns ab; a|b 160 Frag Cat(Frag a, Frag b); 161 Frag Alt(Frag a, Frag b); 162 163 // Returns a fragment that can't match anything. 164 Frag NoMatch(); 165 166 // Returns a fragment that matches the empty string. 167 Frag Match(int32 id); 168 169 // Returns a no-op fragment. 170 Frag Nop(); 171 172 // Returns a fragment matching the byte range lo-hi. 173 Frag ByteRange(int lo, int hi, bool foldcase); 174 175 // Returns a fragment matching an empty-width special op. 176 Frag EmptyWidth(EmptyOp op); 177 178 // Adds n instructions to the program. 179 // Returns the index of the first one. 180 // Returns -1 if no more instructions are available. 181 int AllocInst(int n); 182 183 // Deletes unused instructions. 184 void Trim(); 185 186 // Rune range compiler. 187 188 // Begins a new alternation. 189 void BeginRange(); 190 191 // Adds a fragment matching the rune range lo-hi. 192 void AddRuneRange(Rune lo, Rune hi, bool foldcase); 193 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); 194 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); 195 void Add_80_10ffff(); 196 197 // New suffix that matches the byte range lo-hi, then goes to next. 198 int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); 199 int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); 200 201 // Adds a suffix to alternation. 202 void AddSuffix(int id); 203 204 // Returns the alternation of all the added suffixes. 205 Frag EndRange(); 206 207 // Single rune. 208 Frag Literal(Rune r, bool foldcase); 209 210 void Setup(Regexp::ParseFlags, int64, RE2::Anchor); 211 Prog* Finish(); 212 213 // Returns .* where dot = any byte 214 Frag DotStar(); 215 216 private: 217 Prog* prog_; // Program being built. 218 bool failed_; // Did we give up compiling? 219 Encoding encoding_; // Input encoding 220 bool reversed_; // Should program run backward over text? 221 222 int max_inst_; // Maximum number of instructions. 223 224 Prog::Inst* inst_; // Pointer to first instruction. 225 int inst_len_; // Number of instructions used. 226 int inst_cap_; // Number of instructions allocated. 227 228 int64 max_mem_; // Total memory budget. 229 230 map<uint64, int> rune_cache_; 231 Frag rune_range_; 232 233 RE2::Anchor anchor_; // anchor mode for RE2::Set 234 235 DISALLOW_EVIL_CONSTRUCTORS(Compiler); 236 }; 237 238 Compiler::Compiler() { 239 prog_ = new Prog(); 240 failed_ = false; 241 encoding_ = kEncodingUTF8; 242 reversed_ = false; 243 inst_ = NULL; 244 inst_len_ = 0; 245 inst_cap_ = 0; 246 max_inst_ = 1; // make AllocInst for fail instruction okay 247 max_mem_ = 0; 248 int fail = AllocInst(1); 249 inst_[fail].InitFail(); 250 max_inst_ = 0; // Caller must change 251 } 252 253 Compiler::~Compiler() { 254 delete prog_; 255 delete[] inst_; 256 } 257 258 int Compiler::AllocInst(int n) { 259 if (failed_ || inst_len_ + n > max_inst_) { 260 failed_ = true; 261 return -1; 262 } 263 264 if (inst_len_ + n > inst_cap_) { 265 if (inst_cap_ == 0) 266 inst_cap_ = 8; 267 while (inst_len_ + n > inst_cap_) 268 inst_cap_ *= 2; 269 Prog::Inst* ip = new Prog::Inst[inst_cap_]; 270 memmove(ip, inst_, inst_len_ * sizeof ip[0]); 271 memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); 272 delete[] inst_; 273 inst_ = ip; 274 } 275 int id = inst_len_; 276 inst_len_ += n; 277 return id; 278 } 279 280 void Compiler::Trim() { 281 if (inst_len_ < inst_cap_) { 282 Prog::Inst* ip = new Prog::Inst[inst_len_]; 283 memmove(ip, inst_, inst_len_ * sizeof ip[0]); 284 delete[] inst_; 285 inst_ = ip; 286 inst_cap_ = inst_len_; 287 } 288 } 289 290 // These routines are somewhat hard to visualize in text -- 291 // see http://swtch.com/~rsc/regexp/regexp1.html for 292 // pictures explaining what is going on here. 293 294 // Returns an unmatchable fragment. 295 Frag Compiler::NoMatch() { 296 return Frag(0, nullPatchList); 297 } 298 299 // Is a an unmatchable fragment? 300 static bool IsNoMatch(Frag a) { 301 return a.begin == 0; 302 } 303 304 // Given fragments a and b, returns fragment for ab. 305 Frag Compiler::Cat(Frag a, Frag b) { 306 if (IsNoMatch(a) || IsNoMatch(b)) 307 return NoMatch(); 308 309 // Elide no-op. 310 Prog::Inst* begin = &inst_[a.begin]; 311 if (begin->opcode() == kInstNop && 312 a.end.p == (a.begin << 1) && 313 begin->out() == 0) { 314 PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere 315 return b; 316 } 317 318 // To run backward over string, reverse all concatenations. 319 if (reversed_) { 320 PatchList::Patch(inst_, b.end, a.begin); 321 return Frag(b.begin, a.end); 322 } 323 324 PatchList::Patch(inst_, a.end, b.begin); 325 return Frag(a.begin, b.end); 326 } 327 328 // Given fragments for a and b, returns fragment for a|b. 329 Frag Compiler::Alt(Frag a, Frag b) { 330 // Special case for convenience in loops. 331 if (IsNoMatch(a)) 332 return b; 333 if (IsNoMatch(b)) 334 return a; 335 336 int id = AllocInst(1); 337 if (id < 0) 338 return NoMatch(); 339 340 inst_[id].InitAlt(a.begin, b.begin); 341 return Frag(id, PatchList::Append(inst_, a.end, b.end)); 342 } 343 344 // When capturing submatches in like-Perl mode, a kOpAlt Inst 345 // treats out_ as the first choice, out1_ as the second. 346 // 347 // For *, +, and ?, if out_ causes another repetition, 348 // then the operator is greedy. If out1_ is the repetition 349 // (and out_ moves forward), then the operator is non-greedy. 350 351 // Given a fragment a, returns a fragment for a* or a*? (if nongreedy) 352 Frag Compiler::Star(Frag a, bool nongreedy) { 353 int id = AllocInst(1); 354 if (id < 0) 355 return NoMatch(); 356 inst_[id].InitAlt(0, 0); 357 PatchList::Patch(inst_, a.end, id); 358 if (nongreedy) { 359 inst_[id].out1_ = a.begin; 360 return Frag(id, PatchList::Mk(id << 1)); 361 } else { 362 inst_[id].set_out(a.begin); 363 return Frag(id, PatchList::Mk((id << 1) | 1)); 364 } 365 } 366 367 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) 368 Frag Compiler::Plus(Frag a, bool nongreedy) { 369 // a+ is just a* with a different entry point. 370 Frag f = Star(a, nongreedy); 371 return Frag(a.begin, f.end); 372 } 373 374 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) 375 Frag Compiler::Quest(Frag a, bool nongreedy) { 376 int id = AllocInst(1); 377 if (id < 0) 378 return NoMatch(); 379 PatchList pl; 380 if (nongreedy) { 381 inst_[id].InitAlt(0, a.begin); 382 pl = PatchList::Mk(id << 1); 383 } else { 384 inst_[id].InitAlt(a.begin, 0); 385 pl = PatchList::Mk((id << 1) | 1); 386 } 387 return Frag(id, PatchList::Append(inst_, pl, a.end)); 388 } 389 390 // Returns a fragment for the byte range lo-hi. 391 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { 392 int id = AllocInst(1); 393 if (id < 0) 394 return NoMatch(); 395 inst_[id].InitByteRange(lo, hi, foldcase, 0); 396 prog_->byte_inst_count_++; 397 prog_->MarkByteRange(lo, hi); 398 if (foldcase && lo <= 'z' && hi >= 'a') { 399 if (lo < 'a') 400 lo = 'a'; 401 if (hi > 'z') 402 hi = 'z'; 403 if (lo <= hi) 404 prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a'); 405 } 406 return Frag(id, PatchList::Mk(id << 1)); 407 } 408 409 // Returns a no-op fragment. Sometimes unavoidable. 410 Frag Compiler::Nop() { 411 int id = AllocInst(1); 412 if (id < 0) 413 return NoMatch(); 414 inst_[id].InitNop(0); 415 return Frag(id, PatchList::Mk(id << 1)); 416 } 417 418 // Returns a fragment that signals a match. 419 Frag Compiler::Match(int32 match_id) { 420 int id = AllocInst(1); 421 if (id < 0) 422 return NoMatch(); 423 inst_[id].InitMatch(match_id); 424 return Frag(id, nullPatchList); 425 } 426 427 // Returns a fragment matching a particular empty-width op (like ^ or $) 428 Frag Compiler::EmptyWidth(EmptyOp empty) { 429 int id = AllocInst(1); 430 if (id < 0) 431 return NoMatch(); 432 inst_[id].InitEmptyWidth(empty, 0); 433 if (empty & (kEmptyBeginLine|kEmptyEndLine)) 434 prog_->MarkByteRange('\n', '\n'); 435 if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { 436 int j; 437 for (int i = 0; i < 256; i = j) { 438 for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++) 439 ; 440 prog_->MarkByteRange(i, j-1); 441 } 442 } 443 return Frag(id, PatchList::Mk(id << 1)); 444 } 445 446 // Given a fragment a, returns a fragment with capturing parens around a. 447 Frag Compiler::Capture(Frag a, int n) { 448 int id = AllocInst(2); 449 if (id < 0) 450 return NoMatch(); 451 inst_[id].InitCapture(2*n, a.begin); 452 inst_[id+1].InitCapture(2*n+1, 0); 453 PatchList::Patch(inst_, a.end, id+1); 454 455 return Frag(id, PatchList::Mk((id+1) << 1)); 456 } 457 458 // A Rune is a name for a Unicode code point. 459 // Returns maximum rune encoded by UTF-8 sequence of length len. 460 static int MaxRune(int len) { 461 int b; // number of Rune blents lenn len-byte UTF-8 sequence (len < UTFmax) 462 if (len == 1) 463 b = 7; 464 else 465 b = 8-(len+1) + 6*(len-1); 466 return (1<<b) - 1; // maximum Rune for b bits. 467 } 468 469 // The rune range compiler caches common suffix fragments, 470 // which are very common in UTF-8 (e.g., [80-bf]). 471 // The fragment suffixes are identified by their start 472 // instructions. NULL denotes the eventual end match. 473 // The Frag accumulates in rune_range_. Caching common 474 // suffixes reduces the UTF-8 "." from 32 to 24 instructions, 475 // and it reduces the corresponding one-pass NFA from 16 nodes to 8. 476 477 void Compiler::BeginRange() { 478 rune_cache_.clear(); 479 rune_range_.begin = 0; 480 rune_range_.end = nullPatchList; 481 } 482 483 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, 484 int next) { 485 Frag f = ByteRange(lo, hi, foldcase); 486 if (next != 0) { 487 PatchList::Patch(inst_, f.end, next); 488 } else { 489 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); 490 } 491 return f.begin; 492 } 493 494 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { 495 // In Latin1 mode, there's no point in caching. 496 // In forward UTF-8 mode, only need to cache continuation bytes. 497 if (encoding_ == kEncodingLatin1 || 498 (encoding_ == kEncodingUTF8 && 499 !reversed_ && 500 !(0x80 <= lo && hi <= 0xbf))) { 501 return UncachedRuneByteSuffix(lo, hi, foldcase, next); 502 } 503 504 uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase; 505 map<uint64, int>::iterator it = rune_cache_.find(key); 506 if (it != rune_cache_.end()) 507 return it->second; 508 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); 509 rune_cache_[key] = id; 510 return id; 511 } 512 513 void Compiler::AddSuffix(int id) { 514 if (rune_range_.begin == 0) { 515 rune_range_.begin = id; 516 return; 517 } 518 519 int alt = AllocInst(1); 520 if (alt < 0) { 521 rune_range_.begin = 0; 522 return; 523 } 524 inst_[alt].InitAlt(rune_range_.begin, id); 525 rune_range_.begin = alt; 526 } 527 528 Frag Compiler::EndRange() { 529 return rune_range_; 530 } 531 532 // Converts rune range lo-hi into a fragment that recognizes 533 // the bytes that would make up those runes in the current 534 // encoding (Latin 1 or UTF-8). 535 // This lets the machine work byte-by-byte even when 536 // using multibyte encodings. 537 538 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { 539 switch (encoding_) { 540 default: 541 case kEncodingUTF8: 542 AddRuneRangeUTF8(lo, hi, foldcase); 543 break; 544 case kEncodingLatin1: 545 AddRuneRangeLatin1(lo, hi, foldcase); 546 break; 547 } 548 } 549 550 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { 551 // Latin1 is easy: runes *are* bytes. 552 if (lo > hi || lo > 0xFF) 553 return; 554 if (hi > 0xFF) 555 hi = 0xFF; 556 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 557 } 558 559 // Table describing how to make a UTF-8 matching machine 560 // for the rune range 80-10FFFF (Runeself-Runemax). 561 // This range happens frequently enough (for example /./ and /[^a-z]/) 562 // and the rune_cache_ map is slow enough that this is worth 563 // special handling. Makes compilation of a small expression 564 // with a dot in it about 10% faster. 565 // The * in the comments below mark whole sequences. 566 static struct ByteRangeProg { 567 int next; 568 int lo; 569 int hi; 570 } prog_80_10ffff[] = { 571 // Two-byte 572 { -1, 0x80, 0xBF, }, // 0: 80-BF 573 { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* 574 575 // Three-byte 576 { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF 577 { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* 578 { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF 579 { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* 580 581 // Four-byte 582 { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF 583 { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* 584 { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF 585 { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* 586 { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF 587 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* 588 }; 589 590 void Compiler::Add_80_10ffff() { 591 int inst[arraysize(prog_80_10ffff)]; 592 for (int i = 0; i < arraysize(prog_80_10ffff); i++) { 593 const ByteRangeProg& p = prog_80_10ffff[i]; 594 int next = 0; 595 if (p.next >= 0) 596 next = inst[p.next]; 597 inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next); 598 if ((p.lo & 0xC0) != 0x80) 599 AddSuffix(inst[i]); 600 } 601 } 602 603 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { 604 if (lo > hi) 605 return; 606 607 // Pick off 80-10FFFF as a common special case 608 // that can bypass the slow rune_cache_. 609 if (lo == 0x80 && hi == 0x10ffff && !reversed_) { 610 Add_80_10ffff(); 611 return; 612 } 613 614 // Split range into same-length sized ranges. 615 for (int i = 1; i < UTFmax; i++) { 616 Rune max = MaxRune(i); 617 if (lo <= max && max < hi) { 618 AddRuneRangeUTF8(lo, max, foldcase); 619 AddRuneRangeUTF8(max+1, hi, foldcase); 620 return; 621 } 622 } 623 624 // ASCII range is always a special case. 625 if (hi < Runeself) { 626 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 627 return; 628 } 629 630 // Split range into sections that agree on leading bytes. 631 for (int i = 1; i < UTFmax; i++) { 632 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence 633 if ((lo & ~m) != (hi & ~m)) { 634 if ((lo & m) != 0) { 635 AddRuneRangeUTF8(lo, lo|m, foldcase); 636 AddRuneRangeUTF8((lo|m)+1, hi, foldcase); 637 return; 638 } 639 if ((hi & m) != m) { 640 AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); 641 AddRuneRangeUTF8(hi&~m, hi, foldcase); 642 return; 643 } 644 } 645 } 646 647 // Finally. Generate byte matching equivalent for lo-hi. 648 uint8 ulo[UTFmax], uhi[UTFmax]; 649 int n = runetochar(reinterpret_cast<char*>(ulo), &lo); 650 int m = runetochar(reinterpret_cast<char*>(uhi), &hi); 651 (void)m; // USED(m) 652 DCHECK_EQ(n, m); 653 654 int id = 0; 655 if (reversed_) { 656 for (int i = 0; i < n; i++) 657 id = RuneByteSuffix(ulo[i], uhi[i], false, id); 658 } else { 659 for (int i = n-1; i >= 0; i--) 660 id = RuneByteSuffix(ulo[i], uhi[i], false, id); 661 } 662 AddSuffix(id); 663 } 664 665 // Should not be called. 666 Frag Compiler::Copy(Frag arg) { 667 // We're using WalkExponential; there should be no copying. 668 LOG(DFATAL) << "Compiler::Copy called!"; 669 failed_ = true; 670 return NoMatch(); 671 } 672 673 // Visits a node quickly; called once WalkExponential has 674 // decided to cut this walk short. 675 Frag Compiler::ShortVisit(Regexp* re, Frag) { 676 failed_ = true; 677 return NoMatch(); 678 } 679 680 // Called before traversing a node's children during the walk. 681 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { 682 // Cut off walk if we've already failed. 683 if (failed_) 684 *stop = true; 685 686 return kNullFrag; // not used by caller 687 } 688 689 Frag Compiler::Literal(Rune r, bool foldcase) { 690 switch (encoding_) { 691 default: 692 return kNullFrag; 693 694 case kEncodingLatin1: 695 return ByteRange(r, r, foldcase); 696 697 case kEncodingUTF8: { 698 if (r < Runeself) // Make common case fast. 699 return ByteRange(r, r, foldcase); 700 uint8 buf[UTFmax]; 701 int n = runetochar(reinterpret_cast<char*>(buf), &r); 702 Frag f = ByteRange((uint8)buf[0], buf[0], false); 703 for (int i = 1; i < n; i++) 704 f = Cat(f, ByteRange((uint8)buf[i], buf[i], false)); 705 return f; 706 } 707 } 708 } 709 710 // Called after traversing the node's children during the walk. 711 // Given their frags, build and return the frag for this re. 712 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, 713 int nchild_frags) { 714 // If a child failed, don't bother going forward, especially 715 // since the child_frags might contain Frags with NULLs in them. 716 if (failed_) 717 return NoMatch(); 718 719 // Given the child fragments, return the fragment for this node. 720 switch (re->op()) { 721 case kRegexpRepeat: 722 // Should not see; code at bottom of function will print error 723 break; 724 725 case kRegexpNoMatch: 726 return NoMatch(); 727 728 case kRegexpEmptyMatch: 729 return Nop(); 730 731 case kRegexpHaveMatch: { 732 Frag f = Match(re->match_id()); 733 // Remember unanchored match to end of string. 734 if (anchor_ != RE2::ANCHOR_BOTH) 735 f = Cat(DotStar(), f); 736 return f; 737 } 738 739 case kRegexpConcat: { 740 Frag f = child_frags[0]; 741 for (int i = 1; i < nchild_frags; i++) 742 f = Cat(f, child_frags[i]); 743 return f; 744 } 745 746 case kRegexpAlternate: { 747 Frag f = child_frags[0]; 748 for (int i = 1; i < nchild_frags; i++) 749 f = Alt(f, child_frags[i]); 750 return f; 751 } 752 753 case kRegexpStar: 754 return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 755 756 case kRegexpPlus: 757 return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 758 759 case kRegexpQuest: 760 return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 761 762 case kRegexpLiteral: 763 return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase); 764 765 case kRegexpLiteralString: { 766 // Concatenation of literals. 767 if (re->nrunes() == 0) 768 return Nop(); 769 Frag f; 770 for (int i = 0; i < re->nrunes(); i++) { 771 Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase); 772 if (i == 0) 773 f = f1; 774 else 775 f = Cat(f, f1); 776 } 777 return f; 778 } 779 780 case kRegexpAnyChar: 781 BeginRange(); 782 AddRuneRange(0, Runemax, false); 783 return EndRange(); 784 785 case kRegexpAnyByte: 786 return ByteRange(0x00, 0xFF, false); 787 788 case kRegexpCharClass: { 789 CharClass* cc = re->cc(); 790 if (cc->empty()) { 791 // This can't happen. 792 LOG(DFATAL) << "No ranges in char class"; 793 failed_ = true; 794 return NoMatch(); 795 } 796 797 // ASCII case-folding optimization: if the char class 798 // behaves the same on A-Z as it does on a-z, 799 // discard any ranges wholly contained in A-Z 800 // and mark the other ranges as foldascii. 801 // This reduces the size of a program for 802 // (?i)abc from 3 insts per letter to 1 per letter. 803 bool foldascii = cc->FoldsASCII(); 804 805 // Character class is just a big OR of the different 806 // character ranges in the class. 807 BeginRange(); 808 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { 809 // ASCII case-folding optimization (see above). 810 if (foldascii && 'A' <= i->lo && i->hi <= 'Z') 811 continue; 812 813 // If this range contains all of A-Za-z or none of it, 814 // the fold flag is unnecessary; don't bother. 815 bool fold = foldascii; 816 if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo) 817 fold = false; 818 819 AddRuneRange(i->lo, i->hi, fold); 820 } 821 return EndRange(); 822 } 823 824 case kRegexpCapture: 825 // If this is a non-capturing parenthesis -- (?:foo) -- 826 // just use the inner expression. 827 if (re->cap() < 0) 828 return child_frags[0]; 829 return Capture(child_frags[0], re->cap()); 830 831 case kRegexpBeginLine: 832 return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); 833 834 case kRegexpEndLine: 835 return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); 836 837 case kRegexpBeginText: 838 return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); 839 840 case kRegexpEndText: 841 return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); 842 843 case kRegexpWordBoundary: 844 return EmptyWidth(kEmptyWordBoundary); 845 846 case kRegexpNoWordBoundary: 847 return EmptyWidth(kEmptyNonWordBoundary); 848 } 849 LOG(DFATAL) << "Missing case in Compiler: " << re->op(); 850 failed_ = true; 851 return NoMatch(); 852 } 853 854 // Is this regexp required to start at the beginning of the text? 855 // Only approximate; can return false for complicated regexps like (\Aa|\Ab), 856 // but handles (\A(a|b)). Could use the Walker to write a more exact one. 857 static bool IsAnchorStart(Regexp** pre, int depth) { 858 Regexp* re = *pre; 859 Regexp* sub; 860 // The depth limit makes sure that we don't overflow 861 // the stack on a deeply nested regexp. As the comment 862 // above says, IsAnchorStart is conservative, so returning 863 // a false negative is okay. The exact limit is somewhat arbitrary. 864 if (re == NULL || depth >= 4) 865 return false; 866 switch (re->op()) { 867 default: 868 break; 869 case kRegexpConcat: 870 if (re->nsub() > 0) { 871 sub = re->sub()[0]->Incref(); 872 if (IsAnchorStart(&sub, depth+1)) { 873 Regexp** subcopy = new Regexp*[re->nsub()]; 874 subcopy[0] = sub; // already have reference 875 for (int i = 1; i < re->nsub(); i++) 876 subcopy[i] = re->sub()[i]->Incref(); 877 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); 878 delete[] subcopy; 879 re->Decref(); 880 return true; 881 } 882 sub->Decref(); 883 } 884 break; 885 case kRegexpCapture: 886 sub = re->sub()[0]->Incref(); 887 if (IsAnchorStart(&sub, depth+1)) { 888 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); 889 re->Decref(); 890 return true; 891 } 892 sub->Decref(); 893 break; 894 case kRegexpBeginText: 895 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); 896 re->Decref(); 897 return true; 898 } 899 return false; 900 } 901 902 // Is this regexp required to start at the end of the text? 903 // Only approximate; can return false for complicated regexps like (a\z|b\z), 904 // but handles ((a|b)\z). Could use the Walker to write a more exact one. 905 static bool IsAnchorEnd(Regexp** pre, int depth) { 906 Regexp* re = *pre; 907 Regexp* sub; 908 // The depth limit makes sure that we don't overflow 909 // the stack on a deeply nested regexp. As the comment 910 // above says, IsAnchorEnd is conservative, so returning 911 // a false negative is okay. The exact limit is somewhat arbitrary. 912 if (re == NULL || depth >= 4) 913 return false; 914 switch (re->op()) { 915 default: 916 break; 917 case kRegexpConcat: 918 if (re->nsub() > 0) { 919 sub = re->sub()[re->nsub() - 1]->Incref(); 920 if (IsAnchorEnd(&sub, depth+1)) { 921 Regexp** subcopy = new Regexp*[re->nsub()]; 922 subcopy[re->nsub() - 1] = sub; // already have reference 923 for (int i = 0; i < re->nsub() - 1; i++) 924 subcopy[i] = re->sub()[i]->Incref(); 925 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); 926 delete[] subcopy; 927 re->Decref(); 928 return true; 929 } 930 sub->Decref(); 931 } 932 break; 933 case kRegexpCapture: 934 sub = re->sub()[0]->Incref(); 935 if (IsAnchorEnd(&sub, depth+1)) { 936 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); 937 re->Decref(); 938 return true; 939 } 940 sub->Decref(); 941 break; 942 case kRegexpEndText: 943 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); 944 re->Decref(); 945 return true; 946 } 947 return false; 948 } 949 950 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, 951 RE2::Anchor anchor) { 952 prog_->set_flags(flags); 953 954 if (flags & Regexp::Latin1) 955 encoding_ = kEncodingLatin1; 956 max_mem_ = max_mem; 957 if (max_mem <= 0) { 958 max_inst_ = 100000; // more than enough 959 } else if (max_mem <= sizeof(Prog)) { 960 // No room for anything. 961 max_inst_ = 0; 962 } else { 963 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); 964 // Limit instruction count so that inst->id() fits nicely in an int. 965 // SparseArray also assumes that the indices (inst->id()) are ints. 966 // The call to WalkExponential uses 2*max_inst_ below, 967 // and other places in the code use 2 or 3 * prog->size(). 968 // Limiting to 2^24 should avoid overflow in those places. 969 // (The point of allowing more than 32 bits of memory is to 970 // have plenty of room for the DFA states, not to use it up 971 // on the program.) 972 if (m >= 1<<24) 973 m = 1<<24; 974 975 // Inst imposes its own limit (currently bigger than 2^24 but be safe). 976 if (m > Prog::Inst::kMaxInst) 977 m = Prog::Inst::kMaxInst; 978 979 max_inst_ = m; 980 } 981 982 anchor_ = anchor; 983 } 984 985 // Compiles re, returning program. 986 // Caller is responsible for deleting prog_. 987 // If reversed is true, compiles a program that expects 988 // to run over the input string backward (reverses all concatenations). 989 // The reversed flag is also recorded in the returned program. 990 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { 991 Compiler c; 992 993 c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); 994 c.reversed_ = reversed; 995 996 // Simplify to remove things like counted repetitions 997 // and character classes like \d. 998 Regexp* sre = re->Simplify(); 999 if (sre == NULL) 1000 return NULL; 1001 1002 // Record whether prog is anchored, removing the anchors. 1003 // (They get in the way of other optimizations.) 1004 bool is_anchor_start = IsAnchorStart(&sre, 0); 1005 bool is_anchor_end = IsAnchorEnd(&sre, 0); 1006 1007 // Generate fragment for entire regexp. 1008 Frag f = c.WalkExponential(sre, kNullFrag, 2*c.max_inst_); 1009 sre->Decref(); 1010 if (c.failed_) 1011 return NULL; 1012 1013 // Success! Finish by putting Match node at end, and record start. 1014 // Turn off c.reversed_ (if it is set) to force the remaining concatenations 1015 // to behave normally. 1016 c.reversed_ = false; 1017 Frag all = c.Cat(f, c.Match(0)); 1018 c.prog_->set_start(all.begin); 1019 1020 if (reversed) { 1021 c.prog_->set_anchor_start(is_anchor_end); 1022 c.prog_->set_anchor_end(is_anchor_start); 1023 } else { 1024 c.prog_->set_anchor_start(is_anchor_start); 1025 c.prog_->set_anchor_end(is_anchor_end); 1026 } 1027 1028 // Also create unanchored version, which starts with a .*? loop. 1029 if (c.prog_->anchor_start()) { 1030 c.prog_->set_start_unanchored(c.prog_->start()); 1031 } else { 1032 Frag unanchored = c.Cat(c.DotStar(), all); 1033 c.prog_->set_start_unanchored(unanchored.begin); 1034 } 1035 1036 c.prog_->set_reversed(reversed); 1037 1038 // Hand ownership of prog_ to caller. 1039 return c.Finish(); 1040 } 1041 1042 Prog* Compiler::Finish() { 1043 if (failed_) 1044 return NULL; 1045 1046 if (prog_->start() == 0 && prog_->start_unanchored() == 0) { 1047 // No possible matches; keep Fail instruction only. 1048 inst_len_ = 1; 1049 } 1050 1051 // Trim instruction to minimum array and transfer to Prog. 1052 Trim(); 1053 prog_->inst_ = inst_; 1054 prog_->size_ = inst_len_; 1055 inst_ = NULL; 1056 1057 // Compute byte map. 1058 prog_->ComputeByteMap(); 1059 1060 prog_->Optimize(); 1061 1062 // Record remaining memory for DFA. 1063 if (max_mem_ <= 0) { 1064 prog_->set_dfa_mem(1<<20); 1065 } else { 1066 int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); 1067 if (m < 0) 1068 m = 0; 1069 prog_->set_dfa_mem(m); 1070 } 1071 1072 Prog* p = prog_; 1073 prog_ = NULL; 1074 return p; 1075 } 1076 1077 // Converts Regexp to Prog. 1078 Prog* Regexp::CompileToProg(int64 max_mem) { 1079 return Compiler::Compile(this, false, max_mem); 1080 } 1081 1082 Prog* Regexp::CompileToReverseProg(int64 max_mem) { 1083 return Compiler::Compile(this, true, max_mem); 1084 } 1085 1086 Frag Compiler::DotStar() { 1087 return Star(ByteRange(0x00, 0xff, false), true); 1088 } 1089 1090 // Compiles RE set to Prog. 1091 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 1092 Regexp* re) { 1093 Compiler c; 1094 1095 Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags()); 1096 c.Setup(pf, options.max_mem(), anchor); 1097 1098 // Compile alternation of fragments. 1099 Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_); 1100 re->Decref(); 1101 if (c.failed_) 1102 return NULL; 1103 1104 if (anchor == RE2::UNANCHORED) { 1105 // The trailing .* was added while handling kRegexpHaveMatch. 1106 // We just have to add the leading one. 1107 all = c.Cat(c.DotStar(), all); 1108 } 1109 1110 c.prog_->set_start(all.begin); 1111 c.prog_->set_start_unanchored(all.begin); 1112 c.prog_->set_anchor_start(true); 1113 c.prog_->set_anchor_end(true); 1114 1115 Prog* prog = c.Finish(); 1116 if (prog == NULL) 1117 return NULL; 1118 1119 // Make sure DFA has enough memory to operate, 1120 // since we're not going to fall back to the NFA. 1121 bool failed; 1122 StringPiece sp = "hello, world"; 1123 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, 1124 NULL, &failed, NULL); 1125 if (failed) { 1126 delete prog; 1127 return NULL; 1128 } 1129 1130 return prog; 1131 } 1132 1133 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 1134 Regexp* re) { 1135 return Compiler::CompileSet(options, anchor, re); 1136 } 1137 1138 } // namespace re2 1139