1 // Copyright 2006 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Regular expression representation. 6 // Tested by parse_test.cc 7 8 #include "util/util.h" 9 #include "re2/regexp.h" 10 #include "re2/stringpiece.h" 11 #include "re2/walker-inl.h" 12 13 namespace re2 { 14 15 // Constructor. Allocates vectors as appropriate for operator. 16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) 17 : op_(op), 18 simple_(false), 19 parse_flags_(static_cast<uint16>(parse_flags)), 20 ref_(1), 21 nsub_(0), 22 down_(NULL) { 23 subone_ = NULL; 24 memset(the_union_, 0, sizeof the_union_); 25 } 26 27 // Destructor. Assumes already cleaned up children. 28 // Private: use Decref() instead of delete to destroy Regexps. 29 // Can't call Decref on the sub-Regexps here because 30 // that could cause arbitrarily deep recursion, so 31 // required Decref() to have handled them for us. 32 Regexp::~Regexp() { 33 if (nsub_ > 0) 34 LOG(DFATAL) << "Regexp not destroyed."; 35 36 switch (op_) { 37 default: 38 break; 39 case kRegexpCapture: 40 delete name_; 41 break; 42 case kRegexpLiteralString: 43 delete[] runes_; 44 break; 45 case kRegexpCharClass: 46 cc_->Delete(); 47 delete ccb_; 48 break; 49 } 50 } 51 52 // If it's possible to destroy this regexp without recurring, 53 // do so and return true. Else return false. 54 bool Regexp::QuickDestroy() { 55 if (nsub_ == 0) { 56 delete this; 57 return true; 58 } 59 return false; 60 } 61 62 static map<Regexp*, int> *ref_map; 63 GLOBAL_MUTEX(ref_mutex); 64 65 int Regexp::Ref() { 66 if (ref_ < kMaxRef) 67 return ref_; 68 69 GLOBAL_MUTEX_LOCK(ref_mutex); 70 int r = 0; 71 if (ref_map != NULL) { 72 r = (*ref_map)[this]; 73 } 74 GLOBAL_MUTEX_UNLOCK(ref_mutex); 75 return r; 76 } 77 78 // Increments reference count, returns object as convenience. 79 Regexp* Regexp::Incref() { 80 if (ref_ >= kMaxRef-1) { 81 // Store ref count in overflow map. 82 GLOBAL_MUTEX_LOCK(ref_mutex); 83 if (ref_map == NULL) { 84 ref_map = new map<Regexp*, int>; 85 } 86 if (ref_ == kMaxRef) { 87 // already overflowed 88 (*ref_map)[this]++; 89 } else { 90 // overflowing now 91 (*ref_map)[this] = kMaxRef; 92 ref_ = kMaxRef; 93 } 94 GLOBAL_MUTEX_UNLOCK(ref_mutex); 95 return this; 96 } 97 98 ref_++; 99 return this; 100 } 101 102 // Decrements reference count and deletes this object if count reaches 0. 103 void Regexp::Decref() { 104 if (ref_ == kMaxRef) { 105 // Ref count is stored in overflow map. 106 GLOBAL_MUTEX_LOCK(ref_mutex); 107 int r = (*ref_map)[this] - 1; 108 if (r < kMaxRef) { 109 ref_ = r; 110 ref_map->erase(this); 111 } else { 112 (*ref_map)[this] = r; 113 } 114 GLOBAL_MUTEX_UNLOCK(ref_mutex); 115 return; 116 } 117 ref_--; 118 if (ref_ == 0) 119 Destroy(); 120 } 121 122 // Deletes this object; ref count has count reached 0. 123 void Regexp::Destroy() { 124 if (QuickDestroy()) 125 return; 126 127 // Handle recursive Destroy with explicit stack 128 // to avoid arbitrarily deep recursion on process stack [sigh]. 129 down_ = NULL; 130 Regexp* stack = this; 131 while (stack != NULL) { 132 Regexp* re = stack; 133 stack = re->down_; 134 if (re->ref_ != 0) 135 LOG(DFATAL) << "Bad reference count " << re->ref_; 136 if (re->nsub_ > 0) { 137 Regexp** subs = re->sub(); 138 for (int i = 0; i < re->nsub_; i++) { 139 Regexp* sub = subs[i]; 140 if (sub == NULL) 141 continue; 142 if (sub->ref_ == kMaxRef) 143 sub->Decref(); 144 else 145 --sub->ref_; 146 if (sub->ref_ == 0 && !sub->QuickDestroy()) { 147 sub->down_ = stack; 148 stack = sub; 149 } 150 } 151 if (re->nsub_ > 1) 152 delete[] subs; 153 re->nsub_ = 0; 154 } 155 delete re; 156 } 157 } 158 159 void Regexp::AddRuneToString(Rune r) { 160 DCHECK(op_ == kRegexpLiteralString); 161 if (nrunes_ == 0) { 162 // start with 8 163 runes_ = new Rune[8]; 164 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { 165 // double on powers of two 166 Rune *old = runes_; 167 runes_ = new Rune[nrunes_ * 2]; 168 for (int i = 0; i < nrunes_; i++) 169 runes_[i] = old[i]; 170 delete[] old; 171 } 172 173 runes_[nrunes_++] = r; 174 } 175 176 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { 177 Regexp* re = new Regexp(kRegexpHaveMatch, flags); 178 re->match_id_ = match_id; 179 return re; 180 } 181 182 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { 183 if (sub->op() == kRegexpPlus && sub->parse_flags() == flags) 184 return sub; 185 Regexp* re = new Regexp(kRegexpPlus, flags); 186 re->AllocSub(1); 187 re->sub()[0] = sub; 188 return re; 189 } 190 191 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { 192 if (sub->op() == kRegexpStar && sub->parse_flags() == flags) 193 return sub; 194 Regexp* re = new Regexp(kRegexpStar, flags); 195 re->AllocSub(1); 196 re->sub()[0] = sub; 197 return re; 198 } 199 200 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { 201 if (sub->op() == kRegexpQuest && sub->parse_flags() == flags) 202 return sub; 203 Regexp* re = new Regexp(kRegexpQuest, flags); 204 re->AllocSub(1); 205 re->sub()[0] = sub; 206 return re; 207 } 208 209 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, 210 ParseFlags flags, bool can_factor) { 211 if (nsub == 1) 212 return sub[0]; 213 214 Regexp** subcopy = NULL; 215 if (op == kRegexpAlternate && can_factor) { 216 // Going to edit sub; make a copy so we don't step on caller. 217 subcopy = new Regexp*[nsub]; 218 memmove(subcopy, sub, nsub * sizeof sub[0]); 219 sub = subcopy; 220 nsub = FactorAlternation(sub, nsub, flags); 221 if (nsub == 1) { 222 Regexp* re = sub[0]; 223 delete[] subcopy; 224 return re; 225 } 226 } 227 228 if (nsub > kMaxNsub) { 229 // Too many subexpressions to fit in a single Regexp. 230 // Make a two-level tree. Two levels gets us to 65535^2. 231 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; 232 Regexp* re = new Regexp(op, flags); 233 re->AllocSub(nbigsub); 234 Regexp** subs = re->sub(); 235 for (int i = 0; i < nbigsub - 1; i++) 236 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); 237 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, 238 nsub - (nbigsub-1)*kMaxNsub, flags, 239 false); 240 delete[] subcopy; 241 return re; 242 } 243 244 Regexp* re = new Regexp(op, flags); 245 re->AllocSub(nsub); 246 Regexp** subs = re->sub(); 247 for (int i = 0; i < nsub; i++) 248 subs[i] = sub[i]; 249 250 delete[] subcopy; 251 return re; 252 } 253 254 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { 255 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); 256 } 257 258 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { 259 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); 260 } 261 262 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { 263 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); 264 } 265 266 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { 267 Regexp* re = new Regexp(kRegexpCapture, flags); 268 re->AllocSub(1); 269 re->sub()[0] = sub; 270 re->cap_ = cap; 271 return re; 272 } 273 274 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { 275 Regexp* re = new Regexp(kRegexpRepeat, flags); 276 re->AllocSub(1); 277 re->sub()[0] = sub; 278 re->min_ = min; 279 re->max_ = max; 280 return re; 281 } 282 283 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { 284 Regexp* re = new Regexp(kRegexpLiteral, flags); 285 re->rune_ = rune; 286 return re; 287 } 288 289 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { 290 if (nrunes <= 0) 291 return new Regexp(kRegexpEmptyMatch, flags); 292 if (nrunes == 1) 293 return NewLiteral(runes[0], flags); 294 Regexp* re = new Regexp(kRegexpLiteralString, flags); 295 for (int i = 0; i < nrunes; i++) 296 re->AddRuneToString(runes[i]); 297 return re; 298 } 299 300 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { 301 Regexp* re = new Regexp(kRegexpCharClass, flags); 302 re->cc_ = cc; 303 return re; 304 } 305 306 // Swaps this and that in place. 307 void Regexp::Swap(Regexp* that) { 308 // Can use memmove because Regexp is just a struct (no vtable). 309 char tmp[sizeof *this]; 310 memmove(tmp, this, sizeof tmp); 311 memmove(this, that, sizeof tmp); 312 memmove(that, tmp, sizeof tmp); 313 } 314 315 // Tests equality of all top-level structure but not subregexps. 316 static bool TopEqual(Regexp* a, Regexp* b) { 317 if (a->op() != b->op()) 318 return false; 319 320 switch (a->op()) { 321 case kRegexpNoMatch: 322 case kRegexpEmptyMatch: 323 case kRegexpAnyChar: 324 case kRegexpAnyByte: 325 case kRegexpBeginLine: 326 case kRegexpEndLine: 327 case kRegexpWordBoundary: 328 case kRegexpNoWordBoundary: 329 case kRegexpBeginText: 330 return true; 331 332 case kRegexpEndText: 333 // The parse flags remember whether it's \z or (?-m:$), 334 // which matters when testing against PCRE. 335 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; 336 337 case kRegexpLiteral: 338 return a->rune() == b->rune() && 339 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; 340 341 case kRegexpLiteralString: 342 return a->nrunes() == b->nrunes() && 343 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && 344 memcmp(a->runes(), b->runes(), 345 a->nrunes() * sizeof a->runes()[0]) == 0; 346 347 case kRegexpAlternate: 348 case kRegexpConcat: 349 return a->nsub() == b->nsub(); 350 351 case kRegexpStar: 352 case kRegexpPlus: 353 case kRegexpQuest: 354 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; 355 356 case kRegexpRepeat: 357 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && 358 a->min() == b->min() && 359 a->max() == b->max(); 360 361 case kRegexpCapture: 362 return a->cap() == b->cap() && a->name() == b->name(); 363 364 case kRegexpHaveMatch: 365 return a->match_id() == b->match_id(); 366 367 case kRegexpCharClass: { 368 CharClass* acc = a->cc(); 369 CharClass* bcc = b->cc(); 370 return acc->size() == bcc->size() && 371 acc->end() - acc->begin() == bcc->end() - bcc->begin() && 372 memcmp(acc->begin(), bcc->begin(), 373 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; 374 } 375 } 376 377 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); 378 return 0; 379 } 380 381 bool Regexp::Equal(Regexp* a, Regexp* b) { 382 if (a == NULL || b == NULL) 383 return a == b; 384 385 if (!TopEqual(a, b)) 386 return false; 387 388 // Fast path: 389 // return without allocating vector if there are no subregexps. 390 switch (a->op()) { 391 case kRegexpAlternate: 392 case kRegexpConcat: 393 case kRegexpStar: 394 case kRegexpPlus: 395 case kRegexpQuest: 396 case kRegexpRepeat: 397 case kRegexpCapture: 398 break; 399 400 default: 401 return true; 402 } 403 404 // Committed to doing real work. 405 // The stack (vector) has pairs of regexps waiting to 406 // be compared. The regexps are only equal if 407 // all the pairs end up being equal. 408 vector<Regexp*> stk; 409 410 for (;;) { 411 // Invariant: TopEqual(a, b) == true. 412 Regexp* a2; 413 Regexp* b2; 414 switch (a->op()) { 415 default: 416 break; 417 case kRegexpAlternate: 418 case kRegexpConcat: 419 for (int i = 0; i < a->nsub(); i++) { 420 a2 = a->sub()[i]; 421 b2 = b->sub()[i]; 422 if (!TopEqual(a2, b2)) 423 return false; 424 stk.push_back(a2); 425 stk.push_back(b2); 426 } 427 break; 428 429 case kRegexpStar: 430 case kRegexpPlus: 431 case kRegexpQuest: 432 case kRegexpRepeat: 433 case kRegexpCapture: 434 a2 = a->sub()[0]; 435 b2 = b->sub()[0]; 436 if (!TopEqual(a2, b2)) 437 return false; 438 // Really: 439 // stk.push_back(a2); 440 // stk.push_back(b2); 441 // break; 442 // but faster to assign directly and loop. 443 a = a2; 444 b = b2; 445 continue; 446 } 447 448 int n = stk.size(); 449 if (n == 0) 450 break; 451 452 a = stk[n-2]; 453 b = stk[n-1]; 454 stk.resize(n-2); 455 } 456 457 return true; 458 } 459 460 // Keep in sync with enum RegexpStatusCode in regexp.h 461 static const char *kErrorStrings[] = { 462 "no error", 463 "unexpected error", 464 "invalid escape sequence", 465 "invalid character class", 466 "invalid character class range", 467 "missing ]", 468 "missing )", 469 "trailing \\", 470 "no argument for repetition operator", 471 "invalid repetition size", 472 "bad repetition operator", 473 "invalid perl operator", 474 "invalid UTF-8", 475 "invalid named capture group", 476 }; 477 478 string RegexpStatus::CodeText(enum RegexpStatusCode code) { 479 if (code < 0 || code >= arraysize(kErrorStrings)) 480 code = kRegexpInternalError; 481 return kErrorStrings[code]; 482 } 483 484 string RegexpStatus::Text() const { 485 if (error_arg_.empty()) 486 return CodeText(code_); 487 string s; 488 s.append(CodeText(code_)); 489 s.append(": "); 490 s.append(error_arg_.data(), error_arg_.size()); 491 return s; 492 } 493 494 void RegexpStatus::Copy(const RegexpStatus& status) { 495 code_ = status.code_; 496 error_arg_ = status.error_arg_; 497 } 498 499 typedef int Ignored; // Walker<void> doesn't exist 500 501 // Walker subclass to count capturing parens in regexp. 502 class NumCapturesWalker : public Regexp::Walker<Ignored> { 503 public: 504 NumCapturesWalker() : ncapture_(0) {} 505 int ncapture() { return ncapture_; } 506 507 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { 508 if (re->op() == kRegexpCapture) 509 ncapture_++; 510 return ignored; 511 } 512 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { 513 // Should never be called: we use Walk not WalkExponential. 514 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; 515 return ignored; 516 } 517 518 private: 519 int ncapture_; 520 DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker); 521 }; 522 523 int Regexp::NumCaptures() { 524 NumCapturesWalker w; 525 w.Walk(this, 0); 526 return w.ncapture(); 527 } 528 529 // Walker class to build map of named capture groups and their indices. 530 class NamedCapturesWalker : public Regexp::Walker<Ignored> { 531 public: 532 NamedCapturesWalker() : map_(NULL) {} 533 ~NamedCapturesWalker() { delete map_; } 534 535 map<string, int>* TakeMap() { 536 map<string, int>* m = map_; 537 map_ = NULL; 538 return m; 539 } 540 541 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { 542 if (re->op() == kRegexpCapture && re->name() != NULL) { 543 // Allocate map once we find a name. 544 if (map_ == NULL) 545 map_ = new map<string, int>; 546 547 // Record first occurrence of each name. 548 // (The rule is that if you have the same name 549 // multiple times, only the leftmost one counts.) 550 if (map_->find(*re->name()) == map_->end()) 551 (*map_)[*re->name()] = re->cap(); 552 } 553 return ignored; 554 } 555 556 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { 557 // Should never be called: we use Walk not WalkExponential. 558 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; 559 return ignored; 560 } 561 562 private: 563 map<string, int>* map_; 564 DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker); 565 }; 566 567 map<string, int>* Regexp::NamedCaptures() { 568 NamedCapturesWalker w; 569 w.Walk(this, 0); 570 return w.TakeMap(); 571 } 572 573 // Walker class to build map from capture group indices to their names. 574 class CaptureNamesWalker : public Regexp::Walker<Ignored> { 575 public: 576 CaptureNamesWalker() : map_(NULL) {} 577 ~CaptureNamesWalker() { delete map_; } 578 579 map<int, string>* TakeMap() { 580 map<int, string>* m = map_; 581 map_ = NULL; 582 return m; 583 } 584 585 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { 586 if (re->op() == kRegexpCapture && re->name() != NULL) { 587 // Allocate map once we find a name. 588 if (map_ == NULL) 589 map_ = new map<int, string>; 590 591 (*map_)[re->cap()] = *re->name(); 592 } 593 return ignored; 594 } 595 596 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { 597 // Should never be called: we use Walk not WalkExponential. 598 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; 599 return ignored; 600 } 601 602 private: 603 map<int, string>* map_; 604 DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker); 605 }; 606 607 map<int, string>* Regexp::CaptureNames() { 608 CaptureNamesWalker w; 609 w.Walk(this, 0); 610 return w.TakeMap(); 611 } 612 613 // Determines whether regexp matches must be anchored 614 // with a fixed string prefix. If so, returns the prefix and 615 // the regexp that remains after the prefix. The prefix might 616 // be ASCII case-insensitive. 617 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { 618 // No need for a walker: the regexp must be of the form 619 // 1. some number of ^ anchors 620 // 2. a literal char or string 621 // 3. the rest 622 prefix->clear(); 623 *foldcase = false; 624 *suffix = NULL; 625 if (op_ != kRegexpConcat) 626 return false; 627 628 // Some number of anchors, then a literal or concatenation. 629 int i = 0; 630 Regexp** sub = this->sub(); 631 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) 632 i++; 633 if (i == 0 || i >= nsub_) 634 return false; 635 636 Regexp* re = sub[i]; 637 switch (re->op_) { 638 default: 639 return false; 640 641 case kRegexpLiteralString: 642 // Convert to string in proper encoding. 643 if (re->parse_flags() & Latin1) { 644 prefix->resize(re->nrunes_); 645 for (int j = 0; j < re->nrunes_; j++) 646 (*prefix)[j] = re->runes_[j]; 647 } else { 648 // Convert to UTF-8 in place. 649 // Assume worst-case space and then trim. 650 prefix->resize(re->nrunes_ * UTFmax); 651 char *p = &(*prefix)[0]; 652 for (int j = 0; j < re->nrunes_; j++) { 653 Rune r = re->runes_[j]; 654 if (r < Runeself) 655 *p++ = r; 656 else 657 p += runetochar(p, &r); 658 } 659 prefix->resize(p - &(*prefix)[0]); 660 } 661 break; 662 663 case kRegexpLiteral: 664 if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { 665 prefix->append(1, re->rune_); 666 } else { 667 char buf[UTFmax]; 668 prefix->append(buf, runetochar(buf, &re->rune_)); 669 } 670 break; 671 } 672 *foldcase = (sub[i]->parse_flags() & FoldCase); 673 i++; 674 675 // The rest. 676 if (i < nsub_) { 677 for (int j = i; j < nsub_; j++) 678 sub[j]->Incref(); 679 re = Concat(sub + i, nsub_ - i, parse_flags()); 680 } else { 681 re = new Regexp(kRegexpEmptyMatch, parse_flags()); 682 } 683 *suffix = re; 684 return true; 685 } 686 687 // Character class builder is a balanced binary tree (STL set) 688 // containing non-overlapping, non-abutting RuneRanges. 689 // The less-than operator used in the tree treats two 690 // ranges as equal if they overlap at all, so that 691 // lookups for a particular Rune are possible. 692 693 CharClassBuilder::CharClassBuilder() { 694 nrunes_ = 0; 695 upper_ = 0; 696 lower_ = 0; 697 } 698 699 // Add lo-hi to the class; return whether class got bigger. 700 bool CharClassBuilder::AddRange(Rune lo, Rune hi) { 701 if (hi < lo) 702 return false; 703 704 if (lo <= 'z' && hi >= 'A') { 705 // Overlaps some alpha, maybe not all. 706 // Update bitmaps telling which ASCII letters are in the set. 707 Rune lo1 = max<Rune>(lo, 'A'); 708 Rune hi1 = min<Rune>(hi, 'Z'); 709 if (lo1 <= hi1) 710 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); 711 712 lo1 = max<Rune>(lo, 'a'); 713 hi1 = min<Rune>(hi, 'z'); 714 if (lo1 <= hi1) 715 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); 716 } 717 718 { // Check whether lo, hi is already in the class. 719 iterator it = ranges_.find(RuneRange(lo, lo)); 720 if (it != end() && it->lo <= lo && hi <= it->hi) 721 return false; 722 } 723 724 // Look for a range abutting lo on the left. 725 // If it exists, take it out and increase our range. 726 if (lo > 0) { 727 iterator it = ranges_.find(RuneRange(lo-1, lo-1)); 728 if (it != end()) { 729 lo = it->lo; 730 if (it->hi > hi) 731 hi = it->hi; 732 nrunes_ -= it->hi - it->lo + 1; 733 ranges_.erase(it); 734 } 735 } 736 737 // Look for a range abutting hi on the right. 738 // If it exists, take it out and increase our range. 739 if (hi < Runemax) { 740 iterator it = ranges_.find(RuneRange(hi+1, hi+1)); 741 if (it != end()) { 742 hi = it->hi; 743 nrunes_ -= it->hi - it->lo + 1; 744 ranges_.erase(it); 745 } 746 } 747 748 // Look for ranges between lo and hi. Take them out. 749 // This is only safe because the set has no overlapping ranges. 750 // We've already removed any ranges abutting lo and hi, so 751 // any that overlap [lo, hi] must be contained within it. 752 for (;;) { 753 iterator it = ranges_.find(RuneRange(lo, hi)); 754 if (it == end()) 755 break; 756 nrunes_ -= it->hi - it->lo + 1; 757 ranges_.erase(it); 758 } 759 760 // Finally, add [lo, hi]. 761 nrunes_ += hi - lo + 1; 762 ranges_.insert(RuneRange(lo, hi)); 763 return true; 764 } 765 766 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { 767 for (iterator it = cc->begin(); it != cc->end(); ++it) 768 AddRange(it->lo, it->hi); 769 } 770 771 bool CharClassBuilder::Contains(Rune r) { 772 return ranges_.find(RuneRange(r, r)) != end(); 773 } 774 775 // Does the character class behave the same on A-Z as on a-z? 776 bool CharClassBuilder::FoldsASCII() { 777 return ((upper_ ^ lower_) & AlphaMask) == 0; 778 } 779 780 CharClassBuilder* CharClassBuilder::Copy() { 781 CharClassBuilder* cc = new CharClassBuilder; 782 for (iterator it = begin(); it != end(); ++it) 783 cc->ranges_.insert(RuneRange(it->lo, it->hi)); 784 cc->upper_ = upper_; 785 cc->lower_ = lower_; 786 cc->nrunes_ = nrunes_; 787 return cc; 788 } 789 790 791 792 void CharClassBuilder::RemoveAbove(Rune r) { 793 if (r >= Runemax) 794 return; 795 796 if (r < 'z') { 797 if (r < 'a') 798 lower_ = 0; 799 else 800 lower_ &= AlphaMask >> ('z' - r); 801 } 802 803 if (r < 'Z') { 804 if (r < 'A') 805 upper_ = 0; 806 else 807 upper_ &= AlphaMask >> ('Z' - r); 808 } 809 810 for (;;) { 811 812 iterator it = ranges_.find(RuneRange(r + 1, Runemax)); 813 if (it == end()) 814 break; 815 RuneRange rr = *it; 816 ranges_.erase(it); 817 nrunes_ -= rr.hi - rr.lo + 1; 818 if (rr.lo <= r) { 819 rr.hi = r; 820 ranges_.insert(rr); 821 nrunes_ += rr.hi - rr.lo + 1; 822 } 823 } 824 } 825 826 void CharClassBuilder::Negate() { 827 // Build up negation and then copy in. 828 // Could edit ranges in place, but C++ won't let me. 829 vector<RuneRange> v; 830 v.reserve(ranges_.size() + 1); 831 832 // In negation, first range begins at 0, unless 833 // the current class begins at 0. 834 iterator it = begin(); 835 if (it == end()) { 836 v.push_back(RuneRange(0, Runemax)); 837 } else { 838 int nextlo = 0; 839 if (it->lo == 0) { 840 nextlo = it->hi + 1; 841 ++it; 842 } 843 for (; it != end(); ++it) { 844 v.push_back(RuneRange(nextlo, it->lo - 1)); 845 nextlo = it->hi + 1; 846 } 847 if (nextlo <= Runemax) 848 v.push_back(RuneRange(nextlo, Runemax)); 849 } 850 851 ranges_.clear(); 852 for (int i = 0; i < v.size(); i++) 853 ranges_.insert(v[i]); 854 855 upper_ = AlphaMask & ~upper_; 856 lower_ = AlphaMask & ~lower_; 857 nrunes_ = Runemax+1 - nrunes_; 858 } 859 860 // Character class is a sorted list of ranges. 861 // The ranges are allocated in the same block as the header, 862 // necessitating a special allocator and Delete method. 863 864 CharClass* CharClass::New(int maxranges) { 865 CharClass* cc; 866 uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; 867 cc = reinterpret_cast<CharClass*>(data); 868 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); 869 cc->nranges_ = 0; 870 cc->folds_ascii_ = false; 871 cc->nrunes_ = 0; 872 return cc; 873 } 874 875 void CharClass::Delete() { 876 if (this == NULL) 877 return; 878 uint8 *data = reinterpret_cast<uint8*>(this); 879 delete[] data; 880 } 881 882 CharClass* CharClass::Negate() { 883 CharClass* cc = CharClass::New(nranges_+1); 884 cc->folds_ascii_ = folds_ascii_; 885 cc->nrunes_ = Runemax + 1 - nrunes_; 886 int n = 0; 887 int nextlo = 0; 888 for (CharClass::iterator it = begin(); it != end(); ++it) { 889 if (it->lo == nextlo) { 890 nextlo = it->hi + 1; 891 } else { 892 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); 893 nextlo = it->hi + 1; 894 } 895 } 896 if (nextlo <= Runemax) 897 cc->ranges_[n++] = RuneRange(nextlo, Runemax); 898 cc->nranges_ = n; 899 return cc; 900 } 901 902 bool CharClass::Contains(Rune r) { 903 RuneRange* rr = ranges_; 904 int n = nranges_; 905 while (n > 0) { 906 int m = n/2; 907 if (rr[m].hi < r) { 908 rr += m+1; 909 n -= m+1; 910 } else if (r < rr[m].lo) { 911 n = m; 912 } else { // rr[m].lo <= r && r <= rr[m].hi 913 return true; 914 } 915 } 916 return false; 917 } 918 919 CharClass* CharClassBuilder::GetCharClass() { 920 CharClass* cc = CharClass::New(ranges_.size()); 921 int n = 0; 922 for (iterator it = begin(); it != end(); ++it) 923 cc->ranges_[n++] = *it; 924 cc->nranges_ = n; 925 DCHECK_LE(n, ranges_.size()); 926 cc->nrunes_ = nrunes_; 927 cc->folds_ascii_ = FoldsASCII(); 928 return cc; 929 } 930 931 } // namespace re2 932