1 // Copyright 2007 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Compile regular expression to Prog. 6 // 7 // Prog and Inst are defined in prog.h. 8 // This file's external interface is just Regexp::CompileToProg. 9 // The Compiler class defined in this file is private. 10 11 #include "re2/prog.h" 12 #include "re2/re2.h" 13 #include "re2/regexp.h" 14 #include "re2/walker-inl.h" 15 16 namespace re2 { 17 18 // List of pointers to Inst* that need to be filled in (patched). 19 // Because the Inst* haven't been filled in yet, 20 // we can use the Inst* word to hold the list's "next" pointer. 21 // It's kind of sleazy, but it works well in practice. 22 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. 23 // 24 // Because the out and out1 fields in Inst are no longer pointers, 25 // we can't use pointers directly here either. Instead, p refers 26 // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). 27 // p == 0 represents the NULL list. This is okay because instruction #0 28 // is always the fail instruction, which never appears on a list. 29 30 struct PatchList { 31 uint32 p; 32 33 // Returns patch list containing just p. 34 static PatchList Mk(uint32 p); 35 36 // Patches all the entries on l to have value v. 37 // Caller must not ever use patch list again. 38 static void Patch(Prog::Inst *inst0, PatchList l, uint32 v); 39 40 // Deref returns the next pointer pointed at by p. 41 static PatchList Deref(Prog::Inst *inst0, PatchList l); 42 43 // Appends two patch lists and returns result. 44 static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); 45 }; 46 47 static PatchList nullPatchList; 48 49 // Returns patch list containing just p. 50 PatchList PatchList::Mk(uint32 p) { 51 PatchList l; 52 l.p = p; 53 return l; 54 } 55 56 // Returns the next pointer pointed at by l. 57 PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { 58 Prog::Inst* ip = &inst0[l.p>>1]; 59 if (l.p&1) 60 l.p = ip->out1(); 61 else 62 l.p = ip->out(); 63 return l; 64 } 65 66 // Patches all the entries on l to have value v. 67 void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) { 68 while (l.p != 0) { 69 Prog::Inst* ip = &inst0[l.p>>1]; 70 if (l.p&1) { 71 l.p = ip->out1(); 72 ip->out1_ = val; 73 } else { 74 l.p = ip->out(); 75 ip->set_out(val); 76 } 77 } 78 } 79 80 // Appends two patch lists and returns result. 81 PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { 82 if (l1.p == 0) 83 return l2; 84 if (l2.p == 0) 85 return l1; 86 87 PatchList l = l1; 88 for (;;) { 89 PatchList next = PatchList::Deref(inst0, l); 90 if (next.p == 0) 91 break; 92 l = next; 93 } 94 95 Prog::Inst* ip = &inst0[l.p>>1]; 96 if (l.p&1) 97 ip->out1_ = l2.p; 98 else 99 ip->set_out(l2.p); 100 101 return l1; 102 } 103 104 // Compiled program fragment. 105 struct Frag { 106 uint32 begin; 107 PatchList end; 108 109 explicit Frag(LinkerInitialized) {} 110 Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector 111 Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} 112 }; 113 114 static Frag kNullFrag(LINKER_INITIALIZED); 115 116 // Input encodings. 117 enum Encoding { 118 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) 119 kEncodingLatin1, // Latin1 (0-FF) 120 }; 121 122 class Compiler : public Regexp::Walker<Frag> { 123 public: 124 explicit Compiler(); 125 ~Compiler(); 126 127 // Compiles Regexp to a new Prog. 128 // Caller is responsible for deleting Prog when finished with it. 129 // If reversed is true, compiles for walking over the input 130 // string backward (reverses all concatenations). 131 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); 132 133 // Compiles alternation of all the re to a new Prog. 134 // Each re has a match with an id equal to its index in the vector. 135 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, 136 Regexp* re); 137 138 // Interface for Regexp::Walker, which helps traverse the Regexp. 139 // The walk is purely post-recursive: given the machines for the 140 // children, PostVisit combines them to create the machine for 141 // the current node. The child_args are Frags. 142 // The Compiler traverses the Regexp parse tree, visiting 143 // each node in depth-first order. It invokes PreVisit before 144 // visiting the node's children and PostVisit after visiting 145 // the children. 146 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); 147 Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, 148 int nchild_args); 149 Frag ShortVisit(Regexp* re, Frag parent_arg); 150 Frag Copy(Frag arg); 151 152 // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? 153 Frag Plus(Frag a, bool nongreedy); 154 Frag Star(Frag a, bool nongreedy); 155 Frag Quest(Frag a, bool nongreedy); 156 157 // Given fragment a, returns (a) capturing as \n. 158 Frag Capture(Frag a, int n); 159 160 // Given fragments a and b, returns ab; a|b 161 Frag Cat(Frag a, Frag b); 162 Frag Alt(Frag a, Frag b); 163 164 // Returns a fragment that can't match anything. 165 Frag NoMatch(); 166 167 // Returns a fragment that matches the empty string. 168 Frag Match(int32 id); 169 170 // Returns a no-op fragment. 171 Frag Nop(); 172 173 // Returns a fragment matching the byte range lo-hi. 174 Frag ByteRange(int lo, int hi, bool foldcase); 175 176 // Returns a fragment matching an empty-width special op. 177 Frag EmptyWidth(EmptyOp op); 178 179 // Adds n instructions to the program. 180 // Returns the index of the first one. 181 // Returns -1 if no more instructions are available. 182 int AllocInst(int n); 183 184 // Deletes unused instructions. 185 void Trim(); 186 187 // Rune range compiler. 188 189 // Begins a new alternation. 190 void BeginRange(); 191 192 // Adds a fragment matching the rune range lo-hi. 193 void AddRuneRange(Rune lo, Rune hi, bool foldcase); 194 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); 195 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); 196 void Add_80_10ffff(); 197 198 // New suffix that matches the byte range lo-hi, then goes to next. 199 int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); 200 int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); 201 202 // Adds a suffix to alternation. 203 void AddSuffix(int id); 204 205 // Returns the alternation of all the added suffixes. 206 Frag EndRange(); 207 208 // Single rune. 209 Frag Literal(Rune r, bool foldcase); 210 211 void Setup(Regexp::ParseFlags, int64, RE2::Anchor); 212 Prog* Finish(); 213 214 // Returns .* where dot = any byte 215 Frag DotStar(); 216 217 private: 218 Prog* prog_; // Program being built. 219 bool failed_; // Did we give up compiling? 220 Encoding encoding_; // Input encoding 221 bool reversed_; // Should program run backward over text? 222 223 int max_inst_; // Maximum number of instructions. 224 225 Prog::Inst* inst_; // Pointer to first instruction. 226 int inst_len_; // Number of instructions used. 227 int inst_cap_; // Number of instructions allocated. 228 229 int64 max_mem_; // Total memory budget. 230 231 map<uint64, int> rune_cache_; 232 Frag rune_range_; 233 234 RE2::Anchor anchor_; // anchor mode for RE2::Set 235 236 DISALLOW_EVIL_CONSTRUCTORS(Compiler); 237 }; 238 239 Compiler::Compiler() { 240 prog_ = new Prog(); 241 failed_ = false; 242 encoding_ = kEncodingUTF8; 243 reversed_ = false; 244 inst_ = NULL; 245 inst_len_ = 0; 246 inst_cap_ = 0; 247 max_inst_ = 1; // make AllocInst for fail instruction okay 248 max_mem_ = 0; 249 int fail = AllocInst(1); 250 inst_[fail].InitFail(); 251 max_inst_ = 0; // Caller must change 252 } 253 254 Compiler::~Compiler() { 255 delete prog_; 256 delete[] inst_; 257 } 258 259 int Compiler::AllocInst(int n) { 260 if (failed_ || inst_len_ + n > max_inst_) { 261 failed_ = true; 262 return -1; 263 } 264 265 if (inst_len_ + n > inst_cap_) { 266 if (inst_cap_ == 0) 267 inst_cap_ = 8; 268 while (inst_len_ + n > inst_cap_) 269 inst_cap_ *= 2; 270 Prog::Inst* ip = new Prog::Inst[inst_cap_]; 271 memmove(ip, inst_, inst_len_ * sizeof ip[0]); 272 memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); 273 delete[] inst_; 274 inst_ = ip; 275 } 276 int id = inst_len_; 277 inst_len_ += n; 278 return id; 279 } 280 281 void Compiler::Trim() { 282 if (inst_len_ < inst_cap_) { 283 Prog::Inst* ip = new Prog::Inst[inst_len_]; 284 memmove(ip, inst_, inst_len_ * sizeof ip[0]); 285 delete[] inst_; 286 inst_ = ip; 287 inst_cap_ = inst_len_; 288 } 289 } 290 291 // These routines are somewhat hard to visualize in text -- 292 // see http://swtch.com/~rsc/regexp/regexp1.html for 293 // pictures explaining what is going on here. 294 295 // Returns an unmatchable fragment. 296 Frag Compiler::NoMatch() { 297 return Frag(0, nullPatchList); 298 } 299 300 // Is a an unmatchable fragment? 301 static bool IsNoMatch(Frag a) { 302 return a.begin == 0; 303 } 304 305 // Given fragments a and b, returns fragment for ab. 306 Frag Compiler::Cat(Frag a, Frag b) { 307 if (IsNoMatch(a) || IsNoMatch(b)) 308 return NoMatch(); 309 310 // Elide no-op. 311 Prog::Inst* begin = &inst_[a.begin]; 312 if (begin->opcode() == kInstNop && 313 a.end.p == (a.begin << 1) && 314 begin->out() == 0) { 315 PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere 316 return b; 317 } 318 319 // To run backward over string, reverse all concatenations. 320 if (reversed_) { 321 PatchList::Patch(inst_, b.end, a.begin); 322 return Frag(b.begin, a.end); 323 } 324 325 PatchList::Patch(inst_, a.end, b.begin); 326 return Frag(a.begin, b.end); 327 } 328 329 // Given fragments for a and b, returns fragment for a|b. 330 Frag Compiler::Alt(Frag a, Frag b) { 331 // Special case for convenience in loops. 332 if (IsNoMatch(a)) 333 return b; 334 if (IsNoMatch(b)) 335 return a; 336 337 int id = AllocInst(1); 338 if (id < 0) 339 return NoMatch(); 340 341 inst_[id].InitAlt(a.begin, b.begin); 342 return Frag(id, PatchList::Append(inst_, a.end, b.end)); 343 } 344 345 // When capturing submatches in like-Perl mode, a kOpAlt Inst 346 // treats out_ as the first choice, out1_ as the second. 347 // 348 // For *, +, and ?, if out_ causes another repetition, 349 // then the operator is greedy. If out1_ is the repetition 350 // (and out_ moves forward), then the operator is non-greedy. 351 352 // Given a fragment a, returns a fragment for a* or a*? (if nongreedy) 353 Frag Compiler::Star(Frag a, bool nongreedy) { 354 int id = AllocInst(1); 355 if (id < 0) 356 return NoMatch(); 357 inst_[id].InitAlt(0, 0); 358 PatchList::Patch(inst_, a.end, id); 359 if (nongreedy) { 360 inst_[id].out1_ = a.begin; 361 return Frag(id, PatchList::Mk(id << 1)); 362 } else { 363 inst_[id].set_out(a.begin); 364 return Frag(id, PatchList::Mk((id << 1) | 1)); 365 } 366 } 367 368 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) 369 Frag Compiler::Plus(Frag a, bool nongreedy) { 370 // a+ is just a* with a different entry point. 371 Frag f = Star(a, nongreedy); 372 return Frag(a.begin, f.end); 373 } 374 375 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) 376 Frag Compiler::Quest(Frag a, bool nongreedy) { 377 int id = AllocInst(1); 378 if (id < 0) 379 return NoMatch(); 380 PatchList pl; 381 if (nongreedy) { 382 inst_[id].InitAlt(0, a.begin); 383 pl = PatchList::Mk(id << 1); 384 } else { 385 inst_[id].InitAlt(a.begin, 0); 386 pl = PatchList::Mk((id << 1) | 1); 387 } 388 return Frag(id, PatchList::Append(inst_, pl, a.end)); 389 } 390 391 // Returns a fragment for the byte range lo-hi. 392 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { 393 int id = AllocInst(1); 394 if (id < 0) 395 return NoMatch(); 396 inst_[id].InitByteRange(lo, hi, foldcase, 0); 397 prog_->byte_inst_count_++; 398 prog_->MarkByteRange(lo, hi); 399 if (foldcase && lo <= 'z' && hi >= 'a') { 400 if (lo < 'a') 401 lo = 'a'; 402 if (hi > 'z') 403 hi = 'z'; 404 if (lo <= hi) 405 prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a'); 406 } 407 return Frag(id, PatchList::Mk(id << 1)); 408 } 409 410 // Returns a no-op fragment. Sometimes unavoidable. 411 Frag Compiler::Nop() { 412 int id = AllocInst(1); 413 if (id < 0) 414 return NoMatch(); 415 inst_[id].InitNop(0); 416 return Frag(id, PatchList::Mk(id << 1)); 417 } 418 419 // Returns a fragment that signals a match. 420 Frag Compiler::Match(int32 match_id) { 421 int id = AllocInst(1); 422 if (id < 0) 423 return NoMatch(); 424 inst_[id].InitMatch(match_id); 425 return Frag(id, nullPatchList); 426 } 427 428 // Returns a fragment matching a particular empty-width op (like ^ or $) 429 Frag Compiler::EmptyWidth(EmptyOp empty) { 430 int id = AllocInst(1); 431 if (id < 0) 432 return NoMatch(); 433 inst_[id].InitEmptyWidth(empty, 0); 434 if (empty & (kEmptyBeginLine|kEmptyEndLine)) 435 prog_->MarkByteRange('\n', '\n'); 436 if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { 437 int j; 438 for (int i = 0; i < 256; i = j) { 439 for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++) 440 ; 441 prog_->MarkByteRange(i, j-1); 442 } 443 } 444 return Frag(id, PatchList::Mk(id << 1)); 445 } 446 447 // Given a fragment a, returns a fragment with capturing parens around a. 448 Frag Compiler::Capture(Frag a, int n) { 449 int id = AllocInst(2); 450 if (id < 0) 451 return NoMatch(); 452 inst_[id].InitCapture(2*n, a.begin); 453 inst_[id+1].InitCapture(2*n+1, 0); 454 PatchList::Patch(inst_, a.end, id+1); 455 456 return Frag(id, PatchList::Mk((id+1) << 1)); 457 } 458 459 // A Rune is a name for a Unicode code point. 460 // Returns maximum rune encoded by UTF-8 sequence of length len. 461 static int MaxRune(int len) { 462 int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax) 463 if (len == 1) 464 b = 7; 465 else 466 b = 8-(len+1) + 6*(len-1); 467 return (1<<b) - 1; // maximum Rune for b bits. 468 } 469 470 // The rune range compiler caches common suffix fragments, 471 // which are very common in UTF-8 (e.g., [80-bf]). 472 // The fragment suffixes are identified by their start 473 // instructions. NULL denotes the eventual end match. 474 // The Frag accumulates in rune_range_. Caching common 475 // suffixes reduces the UTF-8 "." from 32 to 24 instructions, 476 // and it reduces the corresponding one-pass NFA from 16 nodes to 8. 477 478 void Compiler::BeginRange() { 479 rune_cache_.clear(); 480 rune_range_.begin = 0; 481 rune_range_.end = nullPatchList; 482 } 483 484 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, 485 int next) { 486 Frag f = ByteRange(lo, hi, foldcase); 487 if (next != 0) { 488 PatchList::Patch(inst_, f.end, next); 489 } else { 490 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); 491 } 492 return f.begin; 493 } 494 495 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { 496 // In Latin1 mode, there's no point in caching. 497 // In forward UTF-8 mode, only need to cache continuation bytes. 498 if (encoding_ == kEncodingLatin1 || 499 (encoding_ == kEncodingUTF8 && 500 !reversed_ && 501 !(0x80 <= lo && hi <= 0xbf))) { 502 return UncachedRuneByteSuffix(lo, hi, foldcase, next); 503 } 504 505 uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase; 506 map<uint64, int>::iterator it = rune_cache_.find(key); 507 if (it != rune_cache_.end()) 508 return it->second; 509 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); 510 rune_cache_[key] = id; 511 return id; 512 } 513 514 void Compiler::AddSuffix(int id) { 515 if (rune_range_.begin == 0) { 516 rune_range_.begin = id; 517 return; 518 } 519 520 int alt = AllocInst(1); 521 if (alt < 0) { 522 rune_range_.begin = 0; 523 return; 524 } 525 inst_[alt].InitAlt(rune_range_.begin, id); 526 rune_range_.begin = alt; 527 } 528 529 Frag Compiler::EndRange() { 530 return rune_range_; 531 } 532 533 // Converts rune range lo-hi into a fragment that recognizes 534 // the bytes that would make up those runes in the current 535 // encoding (Latin 1 or UTF-8). 536 // This lets the machine work byte-by-byte even when 537 // using multibyte encodings. 538 539 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { 540 switch (encoding_) { 541 default: 542 case kEncodingUTF8: 543 AddRuneRangeUTF8(lo, hi, foldcase); 544 break; 545 case kEncodingLatin1: 546 AddRuneRangeLatin1(lo, hi, foldcase); 547 break; 548 } 549 } 550 551 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { 552 // Latin1 is easy: runes *are* bytes. 553 if (lo > hi || lo > 0xFF) 554 return; 555 if (hi > 0xFF) 556 hi = 0xFF; 557 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 558 } 559 560 // Table describing how to make a UTF-8 matching machine 561 // for the rune range 80-10FFFF (Runeself-Runemax). 562 // This range happens frequently enough (for example /./ and /[^a-z]/) 563 // and the rune_cache_ map is slow enough that this is worth 564 // special handling. Makes compilation of a small expression 565 // with a dot in it about 10% faster. 566 // The * in the comments below mark whole sequences. 567 static struct ByteRangeProg { 568 int next; 569 int lo; 570 int hi; 571 } prog_80_10ffff[] = { 572 // Two-byte 573 { -1, 0x80, 0xBF, }, // 0: 80-BF 574 { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* 575 576 // Three-byte 577 { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF 578 { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* 579 { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF 580 { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* 581 582 // Four-byte 583 { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF 584 { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* 585 { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF 586 { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* 587 { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF 588 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* 589 }; 590 591 void Compiler::Add_80_10ffff() { 592 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning 593 for (int i = 0; i < arraysize(prog_80_10ffff); i++) { 594 const ByteRangeProg& p = prog_80_10ffff[i]; 595 int next = 0; 596 if (p.next >= 0) 597 next = inst[p.next]; 598 inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next); 599 if ((p.lo & 0xC0) != 0x80) 600 AddSuffix(inst[i]); 601 } 602 } 603 604 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { 605 if (lo > hi) 606 return; 607 608 // Pick off 80-10FFFF as a common special case 609 // that can bypass the slow rune_cache_. 610 if (lo == 0x80 && hi == 0x10ffff && !reversed_) { 611 Add_80_10ffff(); 612 return; 613 } 614 615 // Split range into same-length sized ranges. 616 for (int i = 1; i < UTFmax; i++) { 617 Rune max = MaxRune(i); 618 if (lo <= max && max < hi) { 619 AddRuneRangeUTF8(lo, max, foldcase); 620 AddRuneRangeUTF8(max+1, hi, foldcase); 621 return; 622 } 623 } 624 625 // ASCII range is always a special case. 626 if (hi < Runeself) { 627 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 628 return; 629 } 630 631 // Split range into sections that agree on leading bytes. 632 for (int i = 1; i < UTFmax; i++) { 633 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence 634 if ((lo & ~m) != (hi & ~m)) { 635 if ((lo & m) != 0) { 636 AddRuneRangeUTF8(lo, lo|m, foldcase); 637 AddRuneRangeUTF8((lo|m)+1, hi, foldcase); 638 return; 639 } 640 if ((hi & m) != m) { 641 AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); 642 AddRuneRangeUTF8(hi&~m, hi, foldcase); 643 return; 644 } 645 } 646 } 647 648 // Finally. Generate byte matching equivalent for lo-hi. 649 uint8 ulo[UTFmax], uhi[UTFmax]; 650 int n = runetochar(reinterpret_cast<char*>(ulo), &lo); 651 int m = runetochar(reinterpret_cast<char*>(uhi), &hi); 652 (void)m; // USED(m) 653 DCHECK_EQ(n, m); 654 655 int id = 0; 656 if (reversed_) { 657 for (int i = 0; i < n; i++) 658 id = RuneByteSuffix(ulo[i], uhi[i], false, id); 659 } else { 660 for (int i = n-1; i >= 0; i--) 661 id = RuneByteSuffix(ulo[i], uhi[i], false, id); 662 } 663 AddSuffix(id); 664 } 665 666 // Should not be called. 667 Frag Compiler::Copy(Frag arg) { 668 // We're using WalkExponential; there should be no copying. 669 LOG(DFATAL) << "Compiler::Copy called!"; 670 failed_ = true; 671 return NoMatch(); 672 } 673 674 // Visits a node quickly; called once WalkExponential has 675 // decided to cut this walk short. 676 Frag Compiler::ShortVisit(Regexp* re, Frag) { 677 failed_ = true; 678 return NoMatch(); 679 } 680 681 // Called before traversing a node's children during the walk. 682 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { 683 // Cut off walk if we've already failed. 684 if (failed_) 685 *stop = true; 686 687 return kNullFrag; // not used by caller 688 } 689 690 Frag Compiler::Literal(Rune r, bool foldcase) { 691 switch (encoding_) { 692 default: 693 return kNullFrag; 694 695 case kEncodingLatin1: 696 return ByteRange(r, r, foldcase); 697 698 case kEncodingUTF8: { 699 if (r < Runeself) // Make common case fast. 700 return ByteRange(r, r, foldcase); 701 uint8 buf[UTFmax]; 702 int n = runetochar(reinterpret_cast<char*>(buf), &r); 703 Frag f = ByteRange((uint8)buf[0], buf[0], false); 704 for (int i = 1; i < n; i++) 705 f = Cat(f, ByteRange((uint8)buf[i], buf[i], false)); 706 return f; 707 } 708 } 709 } 710 711 // Called after traversing the node's children during the walk. 712 // Given their frags, build and return the frag for this re. 713 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, 714 int nchild_frags) { 715 // If a child failed, don't bother going forward, especially 716 // since the child_frags might contain Frags with NULLs in them. 717 if (failed_) 718 return NoMatch(); 719 720 // Given the child fragments, return the fragment for this node. 721 switch (re->op()) { 722 case kRegexpRepeat: 723 // Should not see; code at bottom of function will print error 724 break; 725 726 case kRegexpNoMatch: 727 return NoMatch(); 728 729 case kRegexpEmptyMatch: 730 return Nop(); 731 732 case kRegexpHaveMatch: { 733 Frag f = Match(re->match_id()); 734 // Remember unanchored match to end of string. 735 if (anchor_ != RE2::ANCHOR_BOTH) 736 f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f)); 737 return f; 738 } 739 740 case kRegexpConcat: { 741 Frag f = child_frags[0]; 742 for (int i = 1; i < nchild_frags; i++) 743 f = Cat(f, child_frags[i]); 744 return f; 745 } 746 747 case kRegexpAlternate: { 748 Frag f = child_frags[0]; 749 for (int i = 1; i < nchild_frags; i++) 750 f = Alt(f, child_frags[i]); 751 return f; 752 } 753 754 case kRegexpStar: 755 return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 756 757 case kRegexpPlus: 758 return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 759 760 case kRegexpQuest: 761 return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 762 763 case kRegexpLiteral: 764 return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase); 765 766 case kRegexpLiteralString: { 767 // Concatenation of literals. 768 if (re->nrunes() == 0) 769 return Nop(); 770 Frag f; 771 for (int i = 0; i < re->nrunes(); i++) { 772 Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase); 773 if (i == 0) 774 f = f1; 775 else 776 f = Cat(f, f1); 777 } 778 return f; 779 } 780 781 case kRegexpAnyChar: 782 BeginRange(); 783 AddRuneRange(0, Runemax, false); 784 return EndRange(); 785 786 case kRegexpAnyByte: 787 return ByteRange(0x00, 0xFF, false); 788 789 case kRegexpCharClass: { 790 CharClass* cc = re->cc(); 791 if (cc->empty()) { 792 // This can't happen. 793 LOG(DFATAL) << "No ranges in char class"; 794 failed_ = true; 795 return NoMatch(); 796 } 797 798 // ASCII case-folding optimization: if the char class 799 // behaves the same on A-Z as it does on a-z, 800 // discard any ranges wholly contained in A-Z 801 // and mark the other ranges as foldascii. 802 // This reduces the size of a program for 803 // (?i)abc from 3 insts per letter to 1 per letter. 804 bool foldascii = cc->FoldsASCII(); 805 806 // Character class is just a big OR of the different 807 // character ranges in the class. 808 BeginRange(); 809 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { 810 // ASCII case-folding optimization (see above). 811 if (foldascii && 'A' <= i->lo && i->hi <= 'Z') 812 continue; 813 814 // If this range contains all of A-Za-z or none of it, 815 // the fold flag is unnecessary; don't bother. 816 bool fold = foldascii; 817 if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo) 818 fold = false; 819 820 AddRuneRange(i->lo, i->hi, fold); 821 } 822 return EndRange(); 823 } 824 825 case kRegexpCapture: 826 // If this is a non-capturing parenthesis -- (?:foo) -- 827 // just use the inner expression. 828 if (re->cap() < 0) 829 return child_frags[0]; 830 return Capture(child_frags[0], re->cap()); 831 832 case kRegexpBeginLine: 833 return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); 834 835 case kRegexpEndLine: 836 return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); 837 838 case kRegexpBeginText: 839 return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); 840 841 case kRegexpEndText: 842 return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); 843 844 case kRegexpWordBoundary: 845 return EmptyWidth(kEmptyWordBoundary); 846 847 case kRegexpNoWordBoundary: 848 return EmptyWidth(kEmptyNonWordBoundary); 849 } 850 LOG(DFATAL) << "Missing case in Compiler: " << re->op(); 851 failed_ = true; 852 return NoMatch(); 853 } 854 855 // Is this regexp required to start at the beginning of the text? 856 // Only approximate; can return false for complicated regexps like (\Aa|\Ab), 857 // but handles (\A(a|b)). Could use the Walker to write a more exact one. 858 static bool IsAnchorStart(Regexp** pre, int depth) { 859 Regexp* re = *pre; 860 Regexp* sub; 861 // The depth limit makes sure that we don't overflow 862 // the stack on a deeply nested regexp. As the comment 863 // above says, IsAnchorStart is conservative, so returning 864 // a false negative is okay. The exact limit is somewhat arbitrary. 865 if (re == NULL || depth >= 4) 866 return false; 867 switch (re->op()) { 868 default: 869 break; 870 case kRegexpConcat: 871 if (re->nsub() > 0) { 872 sub = re->sub()[0]->Incref(); 873 if (IsAnchorStart(&sub, depth+1)) { 874 Regexp** subcopy = new Regexp*[re->nsub()]; 875 subcopy[0] = sub; // already have reference 876 for (int i = 1; i < re->nsub(); i++) 877 subcopy[i] = re->sub()[i]->Incref(); 878 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); 879 delete[] subcopy; 880 re->Decref(); 881 return true; 882 } 883 sub->Decref(); 884 } 885 break; 886 case kRegexpCapture: 887 sub = re->sub()[0]->Incref(); 888 if (IsAnchorStart(&sub, depth+1)) { 889 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); 890 re->Decref(); 891 return true; 892 } 893 sub->Decref(); 894 break; 895 case kRegexpBeginText: 896 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); 897 re->Decref(); 898 return true; 899 } 900 return false; 901 } 902 903 // Is this regexp required to start at the end of the text? 904 // Only approximate; can return false for complicated regexps like (a\z|b\z), 905 // but handles ((a|b)\z). Could use the Walker to write a more exact one. 906 static bool IsAnchorEnd(Regexp** pre, int depth) { 907 Regexp* re = *pre; 908 Regexp* sub; 909 // The depth limit makes sure that we don't overflow 910 // the stack on a deeply nested regexp. As the comment 911 // above says, IsAnchorEnd is conservative, so returning 912 // a false negative is okay. The exact limit is somewhat arbitrary. 913 if (re == NULL || depth >= 4) 914 return false; 915 switch (re->op()) { 916 default: 917 break; 918 case kRegexpConcat: 919 if (re->nsub() > 0) { 920 sub = re->sub()[re->nsub() - 1]->Incref(); 921 if (IsAnchorEnd(&sub, depth+1)) { 922 Regexp** subcopy = new Regexp*[re->nsub()]; 923 subcopy[re->nsub() - 1] = sub; // already have reference 924 for (int i = 0; i < re->nsub() - 1; i++) 925 subcopy[i] = re->sub()[i]->Incref(); 926 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); 927 delete[] subcopy; 928 re->Decref(); 929 return true; 930 } 931 sub->Decref(); 932 } 933 break; 934 case kRegexpCapture: 935 sub = re->sub()[0]->Incref(); 936 if (IsAnchorEnd(&sub, depth+1)) { 937 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); 938 re->Decref(); 939 return true; 940 } 941 sub->Decref(); 942 break; 943 case kRegexpEndText: 944 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); 945 re->Decref(); 946 return true; 947 } 948 return false; 949 } 950 951 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, 952 RE2::Anchor anchor) { 953 prog_->set_flags(flags); 954 955 if (flags & Regexp::Latin1) 956 encoding_ = kEncodingLatin1; 957 max_mem_ = max_mem; 958 if (max_mem <= 0) { 959 max_inst_ = 100000; // more than enough 960 } else if (max_mem <= sizeof(Prog)) { 961 // No room for anything. 962 max_inst_ = 0; 963 } else { 964 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); 965 // Limit instruction count so that inst->id() fits nicely in an int. 966 // SparseArray also assumes that the indices (inst->id()) are ints. 967 // The call to WalkExponential uses 2*max_inst_ below, 968 // and other places in the code use 2 or 3 * prog->size(). 969 // Limiting to 2^24 should avoid overflow in those places. 970 // (The point of allowing more than 32 bits of memory is to 971 // have plenty of room for the DFA states, not to use it up 972 // on the program.) 973 if (m >= 1<<24) 974 m = 1<<24; 975 976 // Inst imposes its own limit (currently bigger than 2^24 but be safe). 977 if (m > Prog::Inst::kMaxInst) 978 m = Prog::Inst::kMaxInst; 979 980 max_inst_ = m; 981 } 982 983 anchor_ = anchor; 984 } 985 986 // Compiles re, returning program. 987 // Caller is responsible for deleting prog_. 988 // If reversed is true, compiles a program that expects 989 // to run over the input string backward (reverses all concatenations). 990 // The reversed flag is also recorded in the returned program. 991 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { 992 Compiler c; 993 994 c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); 995 c.reversed_ = reversed; 996 997 // Simplify to remove things like counted repetitions 998 // and character classes like \d. 999 Regexp* sre = re->Simplify(); 1000 if (sre == NULL) 1001 return NULL; 1002 1003 // Record whether prog is anchored, removing the anchors. 1004 // (They get in the way of other optimizations.) 1005 bool is_anchor_start = IsAnchorStart(&sre, 0); 1006 bool is_anchor_end = IsAnchorEnd(&sre, 0); 1007 1008 // Generate fragment for entire regexp. 1009 Frag f = c.WalkExponential(sre, kNullFrag, 2*c.max_inst_); 1010 sre->Decref(); 1011 if (c.failed_) 1012 return NULL; 1013 1014 // Success! Finish by putting Match node at end, and record start. 1015 // Turn off c.reversed_ (if it is set) to force the remaining concatenations 1016 // to behave normally. 1017 c.reversed_ = false; 1018 Frag all = c.Cat(f, c.Match(0)); 1019 c.prog_->set_start(all.begin); 1020 1021 if (reversed) { 1022 c.prog_->set_anchor_start(is_anchor_end); 1023 c.prog_->set_anchor_end(is_anchor_start); 1024 } else { 1025 c.prog_->set_anchor_start(is_anchor_start); 1026 c.prog_->set_anchor_end(is_anchor_end); 1027 } 1028 1029 // Also create unanchored version, which starts with a .*? loop. 1030 if (c.prog_->anchor_start()) { 1031 c.prog_->set_start_unanchored(c.prog_->start()); 1032 } else { 1033 Frag unanchored = c.Cat(c.DotStar(), all); 1034 c.prog_->set_start_unanchored(unanchored.begin); 1035 } 1036 1037 c.prog_->set_reversed(reversed); 1038 1039 // Hand ownership of prog_ to caller. 1040 return c.Finish(); 1041 } 1042 1043 Prog* Compiler::Finish() { 1044 if (failed_) 1045 return NULL; 1046 1047 if (prog_->start() == 0 && prog_->start_unanchored() == 0) { 1048 // No possible matches; keep Fail instruction only. 1049 inst_len_ = 1; 1050 } 1051 1052 // Trim instruction to minimum array and transfer to Prog. 1053 Trim(); 1054 prog_->inst_ = inst_; 1055 prog_->size_ = inst_len_; 1056 inst_ = NULL; 1057 1058 // Compute byte map. 1059 prog_->ComputeByteMap(); 1060 1061 prog_->Optimize(); 1062 1063 // Record remaining memory for DFA. 1064 if (max_mem_ <= 0) { 1065 prog_->set_dfa_mem(1<<20); 1066 } else { 1067 int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); 1068 if (m < 0) 1069 m = 0; 1070 prog_->set_dfa_mem(m); 1071 } 1072 1073 Prog* p = prog_; 1074 prog_ = NULL; 1075 return p; 1076 } 1077 1078 // Converts Regexp to Prog. 1079 Prog* Regexp::CompileToProg(int64 max_mem) { 1080 return Compiler::Compile(this, false, max_mem); 1081 } 1082 1083 Prog* Regexp::CompileToReverseProg(int64 max_mem) { 1084 return Compiler::Compile(this, true, max_mem); 1085 } 1086 1087 Frag Compiler::DotStar() { 1088 return Star(ByteRange(0x00, 0xff, false), true); 1089 } 1090 1091 // Compiles RE set to Prog. 1092 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 1093 Regexp* re) { 1094 Compiler c; 1095 1096 Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags()); 1097 c.Setup(pf, options.max_mem(), anchor); 1098 1099 // Compile alternation of fragments. 1100 Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_); 1101 re->Decref(); 1102 if (c.failed_) 1103 return NULL; 1104 1105 if (anchor == RE2::UNANCHORED) { 1106 // The trailing .* was added while handling kRegexpHaveMatch. 1107 // We just have to add the leading one. 1108 all = c.Cat(c.DotStar(), all); 1109 } 1110 1111 c.prog_->set_start(all.begin); 1112 c.prog_->set_start_unanchored(all.begin); 1113 c.prog_->set_anchor_start(true); 1114 c.prog_->set_anchor_end(true); 1115 1116 Prog* prog = c.Finish(); 1117 if (prog == NULL) 1118 return NULL; 1119 1120 // Make sure DFA has enough memory to operate, 1121 // since we're not going to fall back to the NFA. 1122 bool failed; 1123 StringPiece sp = "hello, world"; 1124 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, 1125 NULL, &failed, NULL); 1126 if (failed) { 1127 delete prog; 1128 return NULL; 1129 } 1130 1131 return prog; 1132 } 1133 1134 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 1135 Regexp* re) { 1136 return Compiler::CompileSet(options, anchor, re); 1137 } 1138 1139 } // namespace re2 1140