1 // Copyright 2006 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Regular expression representation. 6 // Tested by parse_test.cc 7 8 #include "util/util.h" 9 #include "re2/regexp.h" 10 #include "re2/stringpiece.h" 11 #include "re2/walker-inl.h" 12 13 namespace re2 { 14 15 // Constructor. Allocates vectors as appropriate for operator. 16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) 17 : op_(op), 18 simple_(false), 19 parse_flags_(static_cast<uint16>(parse_flags)), 20 ref_(1), 21 nsub_(0), 22 down_(NULL) { 23 subone_ = NULL; 24 memset(the_union_, 0, sizeof the_union_); 25 } 26 27 // Destructor. Assumes already cleaned up children. 28 // Private: use Decref() instead of delete to destroy Regexps. 29 // Can't call Decref on the sub-Regexps here because 30 // that could cause arbitrarily deep recursion, so 31 // required Decref() to have handled them for us. 32 Regexp::~Regexp() { 33 if (nsub_ > 0) 34 LOG(DFATAL) << "Regexp not destroyed."; 35 36 switch (op_) { 37 default: 38 break; 39 case kRegexpCapture: 40 delete name_; 41 break; 42 case kRegexpLiteralString: 43 delete[] runes_; 44 break; 45 case kRegexpCharClass: 46 cc_->Delete(); 47 delete ccb_; 48 break; 49 } 50 } 51 52 // If it's possible to destroy this regexp without recurring, 53 // do so and return true. Else return false. 54 bool Regexp::QuickDestroy() { 55 if (nsub_ == 0) { 56 delete this; 57 return true; 58 } 59 return false; 60 } 61 62 static map<Regexp*, int> ref_map; 63 static Mutex ref_mutex; 64 65 int Regexp::Ref() { 66 if (ref_ < kMaxRef) 67 return ref_; 68 69 MutexLock l(&ref_mutex); 70 return ref_map[this]; 71 } 72 73 // Increments reference count, returns object as convenience. 74 Regexp* Regexp::Incref() { 75 if (ref_ >= kMaxRef-1) { 76 // Store ref count in overflow map. 77 MutexLock l(&ref_mutex); 78 if (ref_ == kMaxRef) { // already overflowed 79 ref_map[this]++; 80 return this; 81 } 82 // overflowing now 83 ref_map[this] = kMaxRef; 84 ref_ = kMaxRef; 85 return this; 86 } 87 88 ref_++; 89 return this; 90 } 91 92 // Decrements reference count and deletes this object if count reaches 0. 93 void Regexp::Decref() { 94 if (ref_ == kMaxRef) { 95 // Ref count is stored in overflow map. 96 MutexLock l(&ref_mutex); 97 int r = ref_map[this] - 1; 98 if (r < kMaxRef) { 99 ref_ = r; 100 ref_map.erase(this); 101 } else { 102 ref_map[this] = r; 103 } 104 return; 105 } 106 ref_--; 107 if (ref_ == 0) 108 Destroy(); 109 } 110 111 // Deletes this object; ref count has count reached 0. 112 void Regexp::Destroy() { 113 if (QuickDestroy()) 114 return; 115 116 // Handle recursive Destroy with explicit stack 117 // to avoid arbitrarily deep recursion on process stack [sigh]. 118 down_ = NULL; 119 Regexp* stack = this; 120 while (stack != NULL) { 121 Regexp* re = stack; 122 stack = re->down_; 123 if (re->ref_ != 0) 124 LOG(DFATAL) << "Bad reference count " << re->ref_; 125 if (re->nsub_ > 0) { 126 Regexp** subs = re->sub(); 127 for (int i = 0; i < re->nsub_; i++) { 128 Regexp* sub = subs[i]; 129 if (sub == NULL) 130 continue; 131 if (sub->ref_ == kMaxRef) 132 sub->Decref(); 133 else 134 --sub->ref_; 135 if (sub->ref_ == 0 && !sub->QuickDestroy()) { 136 sub->down_ = stack; 137 stack = sub; 138 } 139 } 140 if (re->nsub_ > 1) 141 delete[] subs; 142 re->nsub_ = 0; 143 } 144 delete re; 145 } 146 } 147 148 void Regexp::AddRuneToString(Rune r) { 149 DCHECK(op_ == kRegexpLiteralString); 150 if (nrunes_ == 0) { 151 // start with 8 152 runes_ = new Rune[8]; 153 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { 154 // double on powers of two 155 Rune *old = runes_; 156 runes_ = new Rune[nrunes_ * 2]; 157 for (int i = 0; i < nrunes_; i++) 158 runes_[i] = old[i]; 159 delete[] old; 160 } 161 162 runes_[nrunes_++] = r; 163 } 164 165 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { 166 Regexp* re = new Regexp(kRegexpHaveMatch, flags); 167 re->match_id_ = match_id; 168 return re; 169 } 170 171 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { 172 if (sub->op() == kRegexpPlus && sub->parse_flags() == flags) 173 return sub; 174 Regexp* re = new Regexp(kRegexpPlus, flags); 175 re->AllocSub(1); 176 re->sub()[0] = sub; 177 return re; 178 } 179 180 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { 181 if (sub->op() == kRegexpStar && sub->parse_flags() == flags) 182 return sub; 183 Regexp* re = new Regexp(kRegexpStar, flags); 184 re->AllocSub(1); 185 re->sub()[0] = sub; 186 return re; 187 } 188 189 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { 190 if (sub->op() == kRegexpQuest && sub->parse_flags() == flags) 191 return sub; 192 Regexp* re = new Regexp(kRegexpQuest, flags); 193 re->AllocSub(1); 194 re->sub()[0] = sub; 195 return re; 196 } 197 198 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, 199 ParseFlags flags, bool can_factor) { 200 if (nsub == 1) 201 return sub[0]; 202 203 Regexp** subcopy = NULL; 204 if (op == kRegexpAlternate && can_factor) { 205 // Going to edit sub; make a copy so we don't step on caller. 206 subcopy = new Regexp*[nsub]; 207 memmove(subcopy, sub, nsub * sizeof sub[0]); 208 sub = subcopy; 209 nsub = FactorAlternation(sub, nsub, flags); 210 if (nsub == 1) { 211 Regexp* re = sub[0]; 212 delete[] subcopy; 213 return re; 214 } 215 } 216 217 if (nsub > kMaxNsub) { 218 // Too many subexpressions to fit in a single Regexp. 219 // Make a two-level tree. Two levels gets us to 65535^2. 220 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; 221 Regexp* re = new Regexp(op, flags); 222 re->AllocSub(nbigsub); 223 Regexp** subs = re->sub(); 224 for (int i = 0; i < nbigsub - 1; i++) 225 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); 226 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, 227 nsub - (nbigsub-1)*kMaxNsub, flags, 228 false); 229 delete[] subcopy; 230 return re; 231 } 232 233 Regexp* re = new Regexp(op, flags); 234 re->AllocSub(nsub); 235 Regexp** subs = re->sub(); 236 for (int i = 0; i < nsub; i++) 237 subs[i] = sub[i]; 238 239 delete[] subcopy; 240 return re; 241 } 242 243 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { 244 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); 245 } 246 247 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { 248 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); 249 } 250 251 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { 252 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); 253 } 254 255 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { 256 Regexp* re = new Regexp(kRegexpCapture, flags); 257 re->AllocSub(1); 258 re->sub()[0] = sub; 259 re->cap_ = cap; 260 return re; 261 } 262 263 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { 264 Regexp* re = new Regexp(kRegexpRepeat, flags); 265 re->AllocSub(1); 266 re->sub()[0] = sub; 267 re->min_ = min; 268 re->max_ = max; 269 return re; 270 } 271 272 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { 273 Regexp* re = new Regexp(kRegexpLiteral, flags); 274 re->rune_ = rune; 275 return re; 276 } 277 278 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { 279 if (nrunes <= 0) 280 return new Regexp(kRegexpEmptyMatch, flags); 281 if (nrunes == 1) 282 return NewLiteral(runes[0], flags); 283 Regexp* re = new Regexp(kRegexpLiteralString, flags); 284 for (int i = 0; i < nrunes; i++) 285 re->AddRuneToString(runes[i]); 286 return re; 287 } 288 289 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { 290 Regexp* re = new Regexp(kRegexpCharClass, flags); 291 re->cc_ = cc; 292 return re; 293 } 294 295 // Swaps this and that in place. 296 void Regexp::Swap(Regexp* that) { 297 // Can use memmove because Regexp is just a struct (no vtable). 298 char tmp[sizeof *this]; 299 memmove(tmp, this, sizeof tmp); 300 memmove(this, that, sizeof tmp); 301 memmove(that, tmp, sizeof tmp); 302 } 303 304 // Tests equality of all top-level structure but not subregexps. 305 static bool TopEqual(Regexp* a, Regexp* b) { 306 if (a->op() != b->op()) 307 return false; 308 309 switch (a->op()) { 310 case kRegexpNoMatch: 311 case kRegexpEmptyMatch: 312 case kRegexpAnyChar: 313 case kRegexpAnyByte: 314 case kRegexpBeginLine: 315 case kRegexpEndLine: 316 case kRegexpWordBoundary: 317 case kRegexpNoWordBoundary: 318 case kRegexpBeginText: 319 return true; 320 321 case kRegexpEndText: 322 // The parse flags remember whether it's \z or (?-m:$), 323 // which matters when testing against PCRE. 324 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; 325 326 case kRegexpLiteral: 327 return a->rune() == b->rune() && 328 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; 329 330 case kRegexpLiteralString: 331 return a->nrunes() == b->nrunes() && 332 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && 333 memcmp(a->runes(), b->runes(), 334 a->nrunes() * sizeof a->runes()[0]) == 0; 335 336 case kRegexpAlternate: 337 case kRegexpConcat: 338 return a->nsub() == b->nsub(); 339 340 case kRegexpStar: 341 case kRegexpPlus: 342 case kRegexpQuest: 343 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; 344 345 case kRegexpRepeat: 346 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && 347 a->min() == b->min() && 348 a->max() == b->max(); 349 350 case kRegexpCapture: 351 return a->cap() == b->cap() && a->name() == b->name(); 352 353 case kRegexpHaveMatch: 354 return a->match_id() == b->match_id(); 355 356 case kRegexpCharClass: { 357 CharClass* acc = a->cc(); 358 CharClass* bcc = b->cc(); 359 return acc->size() == bcc->size() && 360 acc->end() - acc->begin() == bcc->end() - bcc->begin() && 361 memcmp(acc->begin(), bcc->begin(), 362 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; 363 } 364 } 365 366 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); 367 return 0; 368 } 369 370 bool Regexp::Equal(Regexp* a, Regexp* b) { 371 if (a == NULL || b == NULL) 372 return a == b; 373 374 if (!TopEqual(a, b)) 375 return false; 376 377 // Fast path: 378 // return without allocating vector if there are no subregexps. 379 switch (a->op()) { 380 case kRegexpAlternate: 381 case kRegexpConcat: 382 case kRegexpStar: 383 case kRegexpPlus: 384 case kRegexpQuest: 385 case kRegexpRepeat: 386 case kRegexpCapture: 387 break; 388 389 default: 390 return true; 391 } 392 393 // Committed to doing real work. 394 // The stack (vector) has pairs of regexps waiting to 395 // be compared. The regexps are only equal if 396 // all the pairs end up being equal. 397 vector<Regexp*> stk; 398 399 for (;;) { 400 // Invariant: TopEqual(a, b) == true. 401 Regexp* a2; 402 Regexp* b2; 403 switch (a->op()) { 404 default: 405 break; 406 case kRegexpAlternate: 407 case kRegexpConcat: 408 for (int i = 0; i < a->nsub(); i++) { 409 a2 = a->sub()[i]; 410 b2 = b->sub()[i]; 411 if (!TopEqual(a2, b2)) 412 return false; 413 stk.push_back(a2); 414 stk.push_back(b2); 415 } 416 break; 417 418 case kRegexpStar: 419 case kRegexpPlus: 420 case kRegexpQuest: 421 case kRegexpRepeat: 422 case kRegexpCapture: 423 a2 = a->sub()[0]; 424 b2 = b->sub()[0]; 425 if (!TopEqual(a2, b2)) 426 return false; 427 // Really: 428 // stk.push_back(a2); 429 // stk.push_back(b2); 430 // break; 431 // but faster to assign directly and loop. 432 a = a2; 433 b = b2; 434 continue; 435 } 436 437 int n = stk.size(); 438 if (n == 0) 439 break; 440 441 a = stk[n-2]; 442 b = stk[n-1]; 443 stk.resize(n-2); 444 } 445 446 return true; 447 } 448 449 // Keep in sync with enum RegexpStatusCode in regexp.h 450 static const string kErrorStrings[] = { 451 "no error", 452 "unexpected error", 453 "invalid escape sequence", 454 "invalid character class", 455 "invalid character class range", 456 "missing ]", 457 "missing )", 458 "trailing \\", 459 "no argument for repetition operator", 460 "invalid repetition size", 461 "bad repetition operator", 462 "invalid perl operator", 463 "invalid UTF-8", 464 "invalid named capture group", 465 }; 466 467 const string& RegexpStatus::CodeText(enum RegexpStatusCode code) { 468 if (code < 0 || code >= arraysize(kErrorStrings)) 469 code = kRegexpInternalError; 470 return kErrorStrings[code]; 471 } 472 473 string RegexpStatus::Text() const { 474 if (error_arg_.empty()) 475 return CodeText(code_); 476 string s; 477 s.append(CodeText(code_)); 478 s.append(": "); 479 s.append(error_arg_.data(), error_arg_.size()); 480 return s; 481 } 482 483 void RegexpStatus::Copy(const RegexpStatus& status) { 484 code_ = status.code_; 485 error_arg_ = status.error_arg_; 486 } 487 488 typedef int Ignored; // Walker<void> doesn't exist 489 490 // Walker subclass to count capturing parens in regexp. 491 class NumCapturesWalker : public Regexp::Walker<Ignored> { 492 public: 493 NumCapturesWalker() : ncapture_(0) {} 494 int ncapture() { return ncapture_; } 495 496 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { 497 if (re->op() == kRegexpCapture) 498 ncapture_++; 499 return ignored; 500 } 501 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { 502 // Should never be called: we use Walk not WalkExponential. 503 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; 504 return ignored; 505 } 506 507 private: 508 int ncapture_; 509 DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker); 510 }; 511 512 int Regexp::NumCaptures() { 513 NumCapturesWalker w; 514 w.Walk(this, 0); 515 return w.ncapture(); 516 } 517 518 // Walker class to build map of named capture groups and their indices. 519 class NamedCapturesWalker : public Regexp::Walker<Ignored> { 520 public: 521 NamedCapturesWalker() : map_(NULL) {} 522 ~NamedCapturesWalker() { delete map_; } 523 524 map<string, int>* TakeMap() { 525 map<string, int>* m = map_; 526 map_ = NULL; 527 return m; 528 } 529 530 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { 531 if (re->op() == kRegexpCapture && re->name() != NULL) { 532 // Allocate map once we find a name. 533 if (map_ == NULL) 534 map_ = new map<string, int>; 535 536 // Record first occurrence of each name. 537 // (The rule is that if you have the same name 538 // multiple times, only the leftmost one counts.) 539 if (map_->find(*re->name()) == map_->end()) 540 (*map_)[*re->name()] = re->cap(); 541 } 542 return ignored; 543 } 544 545 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { 546 // Should never be called: we use Walk not WalkExponential. 547 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; 548 return ignored; 549 } 550 551 private: 552 map<string, int>* map_; 553 DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker); 554 }; 555 556 map<string, int>* Regexp::NamedCaptures() { 557 NamedCapturesWalker w; 558 w.Walk(this, 0); 559 return w.TakeMap(); 560 } 561 562 // Walker class to build map from capture group indices to their names. 563 class CaptureNamesWalker : public Regexp::Walker<Ignored> { 564 public: 565 CaptureNamesWalker() : map_(NULL) {} 566 ~CaptureNamesWalker() { delete map_; } 567 568 map<int, string>* TakeMap() { 569 map<int, string>* m = map_; 570 map_ = NULL; 571 return m; 572 } 573 574 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { 575 if (re->op() == kRegexpCapture && re->name() != NULL) { 576 // Allocate map once we find a name. 577 if (map_ == NULL) 578 map_ = new map<int, string>; 579 580 (*map_)[re->cap()] = *re->name(); 581 } 582 return ignored; 583 } 584 585 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { 586 // Should never be called: we use Walk not WalkExponential. 587 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; 588 return ignored; 589 } 590 591 private: 592 map<int, string>* map_; 593 DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker); 594 }; 595 596 map<int, string>* Regexp::CaptureNames() { 597 CaptureNamesWalker w; 598 w.Walk(this, 0); 599 return w.TakeMap(); 600 } 601 602 // Determines whether regexp matches must be anchored 603 // with a fixed string prefix. If so, returns the prefix and 604 // the regexp that remains after the prefix. The prefix might 605 // be ASCII case-insensitive. 606 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { 607 // No need for a walker: the regexp must be of the form 608 // 1. some number of ^ anchors 609 // 2. a literal char or string 610 // 3. the rest 611 prefix->clear(); 612 *foldcase = false; 613 *suffix = NULL; 614 if (op_ != kRegexpConcat) 615 return false; 616 617 // Some number of anchors, then a literal or concatenation. 618 int i = 0; 619 Regexp** sub = this->sub(); 620 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) 621 i++; 622 if (i == 0 || i >= nsub_) 623 return false; 624 625 Regexp* re = sub[i]; 626 switch (re->op_) { 627 default: 628 return false; 629 630 case kRegexpLiteralString: 631 // Convert to string in proper encoding. 632 if (re->parse_flags() & Latin1) { 633 prefix->resize(re->nrunes_); 634 for (int j = 0; j < re->nrunes_; j++) 635 (*prefix)[j] = re->runes_[j]; 636 } else { 637 // Convert to UTF-8 in place. 638 // Assume worst-case space and then trim. 639 prefix->resize(re->nrunes_ * UTFmax); 640 char *p = &(*prefix)[0]; 641 for (int j = 0; j < re->nrunes_; j++) { 642 Rune r = re->runes_[j]; 643 if (r < Runeself) 644 *p++ = r; 645 else 646 p += runetochar(p, &r); 647 } 648 prefix->resize(p - &(*prefix)[0]); 649 } 650 break; 651 652 case kRegexpLiteral: 653 if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { 654 prefix->append(1, re->rune_); 655 } else { 656 char buf[UTFmax]; 657 prefix->append(buf, runetochar(buf, &re->rune_)); 658 } 659 break; 660 } 661 *foldcase = (sub[i]->parse_flags() & FoldCase); 662 i++; 663 664 // The rest. 665 if (i < nsub_) { 666 for (int j = i; j < nsub_; j++) 667 sub[j]->Incref(); 668 re = Concat(sub + i, nsub_ - i, parse_flags()); 669 } else { 670 re = new Regexp(kRegexpEmptyMatch, parse_flags()); 671 } 672 *suffix = re; 673 return true; 674 } 675 676 // Character class builder is a balanced binary tree (STL set) 677 // containing non-overlapping, non-abutting RuneRanges. 678 // The less-than operator used in the tree treats two 679 // ranges as equal if they overlap at all, so that 680 // lookups for a particular Rune are possible. 681 682 CharClassBuilder::CharClassBuilder() { 683 nrunes_ = 0; 684 upper_ = 0; 685 lower_ = 0; 686 } 687 688 // Add lo-hi to the class; return whether class got bigger. 689 bool CharClassBuilder::AddRange(Rune lo, Rune hi) { 690 if (hi < lo) 691 return false; 692 693 if (lo <= 'z' && hi >= 'A') { 694 // Overlaps some alpha, maybe not all. 695 // Update bitmaps telling which ASCII letters are in the set. 696 Rune lo1 = max<Rune>(lo, 'A'); 697 Rune hi1 = min<Rune>(hi, 'Z'); 698 if (lo1 <= hi1) 699 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); 700 701 lo1 = max<Rune>(lo, 'a'); 702 hi1 = min<Rune>(hi, 'z'); 703 if (lo1 <= hi1) 704 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); 705 } 706 707 { // Check whether lo, hi is already in the class. 708 iterator it = ranges_.find(RuneRange(lo, lo)); 709 if (it != end() && it->lo <= lo && hi <= it->hi) 710 return false; 711 } 712 713 // Look for a range abutting lo on the left. 714 // If it exists, take it out and increase our range. 715 if (lo > 0) { 716 iterator it = ranges_.find(RuneRange(lo-1, lo-1)); 717 if (it != end()) { 718 lo = it->lo; 719 if (it->hi > hi) 720 hi = it->hi; 721 nrunes_ -= it->hi - it->lo + 1; 722 ranges_.erase(it); 723 } 724 } 725 726 // Look for a range abutting hi on the right. 727 // If it exists, take it out and increase our range. 728 if (hi < Runemax) { 729 iterator it = ranges_.find(RuneRange(hi+1, hi+1)); 730 if (it != end()) { 731 hi = it->hi; 732 nrunes_ -= it->hi - it->lo + 1; 733 ranges_.erase(it); 734 } 735 } 736 737 // Look for ranges between lo and hi. Take them out. 738 // This is only safe because the set has no overlapping ranges. 739 // We've already removed any ranges abutting lo and hi, so 740 // any that overlap [lo, hi] must be contained within it. 741 for (;;) { 742 iterator it = ranges_.find(RuneRange(lo, hi)); 743 if (it == end()) 744 break; 745 nrunes_ -= it->hi - it->lo + 1; 746 ranges_.erase(it); 747 } 748 749 // Finally, add [lo, hi]. 750 nrunes_ += hi - lo + 1; 751 ranges_.insert(RuneRange(lo, hi)); 752 return true; 753 } 754 755 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { 756 for (iterator it = cc->begin(); it != cc->end(); ++it) 757 AddRange(it->lo, it->hi); 758 } 759 760 bool CharClassBuilder::Contains(Rune r) { 761 return ranges_.find(RuneRange(r, r)) != end(); 762 } 763 764 // Does the character class behave the same on A-Z as on a-z? 765 bool CharClassBuilder::FoldsASCII() { 766 return ((upper_ ^ lower_) & AlphaMask) == 0; 767 } 768 769 CharClassBuilder* CharClassBuilder::Copy() { 770 CharClassBuilder* cc = new CharClassBuilder; 771 for (iterator it = begin(); it != end(); ++it) 772 cc->ranges_.insert(RuneRange(it->lo, it->hi)); 773 cc->upper_ = upper_; 774 cc->lower_ = lower_; 775 cc->nrunes_ = nrunes_; 776 return cc; 777 } 778 779 780 781 void CharClassBuilder::RemoveAbove(Rune r) { 782 if (r >= Runemax) 783 return; 784 785 if (r < 'z') { 786 if (r < 'a') 787 lower_ = 0; 788 else 789 lower_ &= AlphaMask >> ('z' - r); 790 } 791 792 if (r < 'Z') { 793 if (r < 'A') 794 upper_ = 0; 795 else 796 upper_ &= AlphaMask >> ('Z' - r); 797 } 798 799 for (;;) { 800 801 iterator it = ranges_.find(RuneRange(r + 1, Runemax)); 802 if (it == end()) 803 break; 804 RuneRange rr = *it; 805 ranges_.erase(it); 806 nrunes_ -= rr.hi - rr.lo + 1; 807 if (rr.lo <= r) { 808 rr.hi = r; 809 ranges_.insert(rr); 810 nrunes_ += rr.hi - rr.lo + 1; 811 } 812 } 813 } 814 815 void CharClassBuilder::Negate() { 816 // Build up negation and then copy in. 817 // Could edit ranges in place, but C++ won't let me. 818 vector<RuneRange> v; 819 v.reserve(ranges_.size() + 1); 820 821 // In negation, first range begins at 0, unless 822 // the current class begins at 0. 823 iterator it = begin(); 824 if (it == end()) { 825 v.push_back(RuneRange(0, Runemax)); 826 } else { 827 int nextlo = 0; 828 if (it->lo == 0) { 829 nextlo = it->hi + 1; 830 ++it; 831 } 832 for (; it != end(); ++it) { 833 v.push_back(RuneRange(nextlo, it->lo - 1)); 834 nextlo = it->hi + 1; 835 } 836 if (nextlo <= Runemax) 837 v.push_back(RuneRange(nextlo, Runemax)); 838 } 839 840 ranges_.clear(); 841 for (int i = 0; i < v.size(); i++) 842 ranges_.insert(v[i]); 843 844 upper_ = AlphaMask & ~upper_; 845 lower_ = AlphaMask & ~lower_; 846 nrunes_ = Runemax+1 - nrunes_; 847 } 848 849 // Character class is a sorted list of ranges. 850 // The ranges are allocated in the same block as the header, 851 // necessitating a special allocator and Delete method. 852 853 CharClass* CharClass::New(int maxranges) { 854 CharClass* cc; 855 uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; 856 cc = reinterpret_cast<CharClass*>(data); 857 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); 858 cc->nranges_ = 0; 859 cc->folds_ascii_ = false; 860 cc->nrunes_ = 0; 861 return cc; 862 } 863 864 void CharClass::Delete() { 865 if (this == NULL) 866 return; 867 uint8 *data = reinterpret_cast<uint8*>(this); 868 delete[] data; 869 } 870 871 CharClass* CharClass::Negate() { 872 CharClass* cc = CharClass::New(nranges_+1); 873 cc->folds_ascii_ = folds_ascii_; 874 cc->nrunes_ = Runemax + 1 - nrunes_; 875 int n = 0; 876 int nextlo = 0; 877 for (CharClass::iterator it = begin(); it != end(); ++it) { 878 if (it->lo == nextlo) { 879 nextlo = it->hi + 1; 880 } else { 881 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); 882 nextlo = it->hi + 1; 883 } 884 } 885 if (nextlo <= Runemax) 886 cc->ranges_[n++] = RuneRange(nextlo, Runemax); 887 cc->nranges_ = n; 888 return cc; 889 } 890 891 bool CharClass::Contains(Rune r) { 892 RuneRange* rr = ranges_; 893 int n = nranges_; 894 while (n > 0) { 895 int m = n/2; 896 if (rr[m].hi < r) { 897 rr += m+1; 898 n -= m+1; 899 } else if (r < rr[m].lo) { 900 n = m; 901 } else { // rr[m].lo <= r && r <= rr[m].hi 902 return true; 903 } 904 } 905 return false; 906 } 907 908 CharClass* CharClassBuilder::GetCharClass() { 909 CharClass* cc = CharClass::New(ranges_.size()); 910 int n = 0; 911 for (iterator it = begin(); it != end(); ++it) 912 cc->ranges_[n++] = *it; 913 cc->nranges_ = n; 914 DCHECK_LE(n, ranges_.size()); 915 cc->nrunes_ = nrunes_; 916 cc->folds_ascii_ = FoldsASCII(); 917 return cc; 918 } 919 920 } // namespace re2 921