Home | History | Annotate | Download | only in re2
      1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Regular expression representation.
      6 // Tested by parse_test.cc
      7 
      8 #include "util/util.h"
      9 #include "re2/regexp.h"
     10 #include "re2/stringpiece.h"
     11 #include "re2/walker-inl.h"
     12 
     13 namespace re2 {
     14 
     15 // Constructor.  Allocates vectors as appropriate for operator.
     16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
     17   : op_(op),
     18     simple_(false),
     19     parse_flags_(static_cast<uint16>(parse_flags)),
     20     ref_(1),
     21     nsub_(0),
     22     down_(NULL) {
     23   subone_ = NULL;
     24   memset(the_union_, 0, sizeof the_union_);
     25 }
     26 
     27 // Destructor.  Assumes already cleaned up children.
     28 // Private: use Decref() instead of delete to destroy Regexps.
     29 // Can't call Decref on the sub-Regexps here because
     30 // that could cause arbitrarily deep recursion, so
     31 // required Decref() to have handled them for us.
     32 Regexp::~Regexp() {
     33   if (nsub_ > 0)
     34     LOG(DFATAL) << "Regexp not destroyed.";
     35 
     36   switch (op_) {
     37     default:
     38       break;
     39     case kRegexpCapture:
     40       delete name_;
     41       break;
     42     case kRegexpLiteralString:
     43       delete[] runes_;
     44       break;
     45     case kRegexpCharClass:
     46       cc_->Delete();
     47       delete ccb_;
     48       break;
     49   }
     50 }
     51 
     52 // If it's possible to destroy this regexp without recurring,
     53 // do so and return true.  Else return false.
     54 bool Regexp::QuickDestroy() {
     55   if (nsub_ == 0) {
     56     delete this;
     57     return true;
     58   }
     59   return false;
     60 }
     61 
     62 static map<Regexp*, int> *ref_map;
     63 GLOBAL_MUTEX(ref_mutex);
     64 
     65 int Regexp::Ref() {
     66   if (ref_ < kMaxRef)
     67     return ref_;
     68 
     69   GLOBAL_MUTEX_LOCK(ref_mutex);
     70   int r = 0;
     71   if (ref_map != NULL) {
     72     r = (*ref_map)[this];
     73   }
     74   GLOBAL_MUTEX_UNLOCK(ref_mutex);
     75   return r;
     76 }
     77 
     78 // Increments reference count, returns object as convenience.
     79 Regexp* Regexp::Incref() {
     80   if (ref_ >= kMaxRef-1) {
     81     // Store ref count in overflow map.
     82     GLOBAL_MUTEX_LOCK(ref_mutex);
     83     if (ref_map == NULL) {
     84       ref_map = new map<Regexp*, int>;
     85     }
     86     if (ref_ == kMaxRef) {
     87       // already overflowed
     88       (*ref_map)[this]++;
     89     } else {
     90       // overflowing now
     91       (*ref_map)[this] = kMaxRef;
     92       ref_ = kMaxRef;
     93     }
     94     GLOBAL_MUTEX_UNLOCK(ref_mutex);
     95     return this;
     96   }
     97 
     98   ref_++;
     99   return this;
    100 }
    101 
    102 // Decrements reference count and deletes this object if count reaches 0.
    103 void Regexp::Decref() {
    104   if (ref_ == kMaxRef) {
    105     // Ref count is stored in overflow map.
    106     GLOBAL_MUTEX_LOCK(ref_mutex);
    107     int r = (*ref_map)[this] - 1;
    108     if (r < kMaxRef) {
    109       ref_ = r;
    110       ref_map->erase(this);
    111     } else {
    112       (*ref_map)[this] = r;
    113     }
    114     GLOBAL_MUTEX_UNLOCK(ref_mutex);
    115     return;
    116   }
    117   ref_--;
    118   if (ref_ == 0)
    119     Destroy();
    120 }
    121 
    122 // Deletes this object; ref count has count reached 0.
    123 void Regexp::Destroy() {
    124   if (QuickDestroy())
    125     return;
    126 
    127   // Handle recursive Destroy with explicit stack
    128   // to avoid arbitrarily deep recursion on process stack [sigh].
    129   down_ = NULL;
    130   Regexp* stack = this;
    131   while (stack != NULL) {
    132     Regexp* re = stack;
    133     stack = re->down_;
    134     if (re->ref_ != 0)
    135       LOG(DFATAL) << "Bad reference count " << re->ref_;
    136     if (re->nsub_ > 0) {
    137       Regexp** subs = re->sub();
    138       for (int i = 0; i < re->nsub_; i++) {
    139         Regexp* sub = subs[i];
    140         if (sub == NULL)
    141           continue;
    142         if (sub->ref_ == kMaxRef)
    143           sub->Decref();
    144         else
    145           --sub->ref_;
    146         if (sub->ref_ == 0 && !sub->QuickDestroy()) {
    147           sub->down_ = stack;
    148           stack = sub;
    149         }
    150       }
    151       if (re->nsub_ > 1)
    152         delete[] subs;
    153       re->nsub_ = 0;
    154     }
    155     delete re;
    156   }
    157 }
    158 
    159 void Regexp::AddRuneToString(Rune r) {
    160   DCHECK(op_ == kRegexpLiteralString);
    161   if (nrunes_ == 0) {
    162     // start with 8
    163     runes_ = new Rune[8];
    164   } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
    165     // double on powers of two
    166     Rune *old = runes_;
    167     runes_ = new Rune[nrunes_ * 2];
    168     for (int i = 0; i < nrunes_; i++)
    169       runes_[i] = old[i];
    170     delete[] old;
    171   }
    172 
    173   runes_[nrunes_++] = r;
    174 }
    175 
    176 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
    177   Regexp* re = new Regexp(kRegexpHaveMatch, flags);
    178   re->match_id_ = match_id;
    179   return re;
    180 }
    181 
    182 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
    183   if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
    184     return sub;
    185   Regexp* re = new Regexp(kRegexpPlus, flags);
    186   re->AllocSub(1);
    187   re->sub()[0] = sub;
    188   return re;
    189 }
    190 
    191 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
    192   if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
    193     return sub;
    194   Regexp* re = new Regexp(kRegexpStar, flags);
    195   re->AllocSub(1);
    196   re->sub()[0] = sub;
    197   return re;
    198 }
    199 
    200 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
    201   if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
    202     return sub;
    203   Regexp* re = new Regexp(kRegexpQuest, flags);
    204   re->AllocSub(1);
    205   re->sub()[0] = sub;
    206   return re;
    207 }
    208 
    209 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
    210                                   ParseFlags flags, bool can_factor) {
    211   if (nsub == 1)
    212     return sub[0];
    213 
    214   Regexp** subcopy = NULL;
    215   if (op == kRegexpAlternate && can_factor) {
    216     // Going to edit sub; make a copy so we don't step on caller.
    217     subcopy = new Regexp*[nsub];
    218     memmove(subcopy, sub, nsub * sizeof sub[0]);
    219     sub = subcopy;
    220     nsub = FactorAlternation(sub, nsub, flags);
    221     if (nsub == 1) {
    222       Regexp* re = sub[0];
    223       delete[] subcopy;
    224       return re;
    225     }
    226   }
    227 
    228   if (nsub > kMaxNsub) {
    229     // Too many subexpressions to fit in a single Regexp.
    230     // Make a two-level tree.  Two levels gets us to 65535^2.
    231     int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
    232     Regexp* re = new Regexp(op, flags);
    233     re->AllocSub(nbigsub);
    234     Regexp** subs = re->sub();
    235     for (int i = 0; i < nbigsub - 1; i++)
    236       subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
    237     subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
    238                                           nsub - (nbigsub-1)*kMaxNsub, flags,
    239                                           false);
    240     delete[] subcopy;
    241     return re;
    242   }
    243 
    244   Regexp* re = new Regexp(op, flags);
    245   re->AllocSub(nsub);
    246   Regexp** subs = re->sub();
    247   for (int i = 0; i < nsub; i++)
    248     subs[i] = sub[i];
    249 
    250   delete[] subcopy;
    251   return re;
    252 }
    253 
    254 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
    255   return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
    256 }
    257 
    258 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
    259   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
    260 }
    261 
    262 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
    263   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
    264 }
    265 
    266 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
    267   Regexp* re = new Regexp(kRegexpCapture, flags);
    268   re->AllocSub(1);
    269   re->sub()[0] = sub;
    270   re->cap_ = cap;
    271   return re;
    272 }
    273 
    274 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
    275   Regexp* re = new Regexp(kRegexpRepeat, flags);
    276   re->AllocSub(1);
    277   re->sub()[0] = sub;
    278   re->min_ = min;
    279   re->max_ = max;
    280   return re;
    281 }
    282 
    283 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
    284   Regexp* re = new Regexp(kRegexpLiteral, flags);
    285   re->rune_ = rune;
    286   return re;
    287 }
    288 
    289 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
    290   if (nrunes <= 0)
    291     return new Regexp(kRegexpEmptyMatch, flags);
    292   if (nrunes == 1)
    293     return NewLiteral(runes[0], flags);
    294   Regexp* re = new Regexp(kRegexpLiteralString, flags);
    295   for (int i = 0; i < nrunes; i++)
    296     re->AddRuneToString(runes[i]);
    297   return re;
    298 }
    299 
    300 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
    301   Regexp* re = new Regexp(kRegexpCharClass, flags);
    302   re->cc_ = cc;
    303   return re;
    304 }
    305 
    306 // Swaps this and that in place.
    307 void Regexp::Swap(Regexp* that) {
    308   // Can use memmove because Regexp is just a struct (no vtable).
    309   char tmp[sizeof *this];
    310   memmove(tmp, this, sizeof tmp);
    311   memmove(this, that, sizeof tmp);
    312   memmove(that, tmp, sizeof tmp);
    313 }
    314 
    315 // Tests equality of all top-level structure but not subregexps.
    316 static bool TopEqual(Regexp* a, Regexp* b) {
    317   if (a->op() != b->op())
    318     return false;
    319 
    320   switch (a->op()) {
    321     case kRegexpNoMatch:
    322     case kRegexpEmptyMatch:
    323     case kRegexpAnyChar:
    324     case kRegexpAnyByte:
    325     case kRegexpBeginLine:
    326     case kRegexpEndLine:
    327     case kRegexpWordBoundary:
    328     case kRegexpNoWordBoundary:
    329     case kRegexpBeginText:
    330       return true;
    331 
    332     case kRegexpEndText:
    333       // The parse flags remember whether it's \z or (?-m:$),
    334       // which matters when testing against PCRE.
    335       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
    336 
    337     case kRegexpLiteral:
    338       return a->rune() == b->rune() &&
    339              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
    340 
    341     case kRegexpLiteralString:
    342       return a->nrunes() == b->nrunes() &&
    343              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
    344              memcmp(a->runes(), b->runes(),
    345                     a->nrunes() * sizeof a->runes()[0]) == 0;
    346 
    347     case kRegexpAlternate:
    348     case kRegexpConcat:
    349       return a->nsub() == b->nsub();
    350 
    351     case kRegexpStar:
    352     case kRegexpPlus:
    353     case kRegexpQuest:
    354       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
    355 
    356     case kRegexpRepeat:
    357       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
    358              a->min() == b->min() &&
    359              a->max() == b->max();
    360 
    361     case kRegexpCapture:
    362       return a->cap() == b->cap() && a->name() == b->name();
    363 
    364     case kRegexpHaveMatch:
    365       return a->match_id() == b->match_id();
    366 
    367     case kRegexpCharClass: {
    368       CharClass* acc = a->cc();
    369       CharClass* bcc = b->cc();
    370       return acc->size() == bcc->size() &&
    371              acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
    372              memcmp(acc->begin(), bcc->begin(),
    373                     (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
    374     }
    375   }
    376 
    377   LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
    378   return 0;
    379 }
    380 
    381 bool Regexp::Equal(Regexp* a, Regexp* b) {
    382   if (a == NULL || b == NULL)
    383     return a == b;
    384 
    385   if (!TopEqual(a, b))
    386     return false;
    387 
    388   // Fast path:
    389   // return without allocating vector if there are no subregexps.
    390   switch (a->op()) {
    391     case kRegexpAlternate:
    392     case kRegexpConcat:
    393     case kRegexpStar:
    394     case kRegexpPlus:
    395     case kRegexpQuest:
    396     case kRegexpRepeat:
    397     case kRegexpCapture:
    398       break;
    399 
    400     default:
    401       return true;
    402   }
    403 
    404   // Committed to doing real work.
    405   // The stack (vector) has pairs of regexps waiting to
    406   // be compared.  The regexps are only equal if
    407   // all the pairs end up being equal.
    408   vector<Regexp*> stk;
    409 
    410   for (;;) {
    411     // Invariant: TopEqual(a, b) == true.
    412     Regexp* a2;
    413     Regexp* b2;
    414     switch (a->op()) {
    415       default:
    416         break;
    417       case kRegexpAlternate:
    418       case kRegexpConcat:
    419         for (int i = 0; i < a->nsub(); i++) {
    420           a2 = a->sub()[i];
    421           b2 = b->sub()[i];
    422           if (!TopEqual(a2, b2))
    423             return false;
    424           stk.push_back(a2);
    425           stk.push_back(b2);
    426         }
    427         break;
    428 
    429       case kRegexpStar:
    430       case kRegexpPlus:
    431       case kRegexpQuest:
    432       case kRegexpRepeat:
    433       case kRegexpCapture:
    434         a2 = a->sub()[0];
    435         b2 = b->sub()[0];
    436         if (!TopEqual(a2, b2))
    437           return false;
    438         // Really:
    439         //   stk.push_back(a2);
    440         //   stk.push_back(b2);
    441         //   break;
    442         // but faster to assign directly and loop.
    443         a = a2;
    444         b = b2;
    445         continue;
    446     }
    447 
    448     int n = stk.size();
    449     if (n == 0)
    450       break;
    451 
    452     a = stk[n-2];
    453     b = stk[n-1];
    454     stk.resize(n-2);
    455   }
    456 
    457   return true;
    458 }
    459 
    460 // Keep in sync with enum RegexpStatusCode in regexp.h
    461 static const char *kErrorStrings[] = {
    462   "no error",
    463   "unexpected error",
    464   "invalid escape sequence",
    465   "invalid character class",
    466   "invalid character class range",
    467   "missing ]",
    468   "missing )",
    469   "trailing \\",
    470   "no argument for repetition operator",
    471   "invalid repetition size",
    472   "bad repetition operator",
    473   "invalid perl operator",
    474   "invalid UTF-8",
    475   "invalid named capture group",
    476 };
    477 
    478 string RegexpStatus::CodeText(enum RegexpStatusCode code) {
    479   if (code < 0 || code >= arraysize(kErrorStrings))
    480     code = kRegexpInternalError;
    481   return kErrorStrings[code];
    482 }
    483 
    484 string RegexpStatus::Text() const {
    485   if (error_arg_.empty())
    486     return CodeText(code_);
    487   string s;
    488   s.append(CodeText(code_));
    489   s.append(": ");
    490   s.append(error_arg_.data(), error_arg_.size());
    491   return s;
    492 }
    493 
    494 void RegexpStatus::Copy(const RegexpStatus& status) {
    495   code_ = status.code_;
    496   error_arg_ = status.error_arg_;
    497 }
    498 
    499 typedef int Ignored;  // Walker<void> doesn't exist
    500 
    501 // Walker subclass to count capturing parens in regexp.
    502 class NumCapturesWalker : public Regexp::Walker<Ignored> {
    503  public:
    504   NumCapturesWalker() : ncapture_(0) {}
    505   int ncapture() { return ncapture_; }
    506 
    507   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    508     if (re->op() == kRegexpCapture)
    509       ncapture_++;
    510     return ignored;
    511   }
    512   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    513     // Should never be called: we use Walk not WalkExponential.
    514     LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
    515     return ignored;
    516   }
    517 
    518  private:
    519   int ncapture_;
    520   DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
    521 };
    522 
    523 int Regexp::NumCaptures() {
    524   NumCapturesWalker w;
    525   w.Walk(this, 0);
    526   return w.ncapture();
    527 }
    528 
    529 // Walker class to build map of named capture groups and their indices.
    530 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
    531  public:
    532   NamedCapturesWalker() : map_(NULL) {}
    533   ~NamedCapturesWalker() { delete map_; }
    534 
    535   map<string, int>* TakeMap() {
    536     map<string, int>* m = map_;
    537     map_ = NULL;
    538     return m;
    539   }
    540 
    541   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    542     if (re->op() == kRegexpCapture && re->name() != NULL) {
    543       // Allocate map once we find a name.
    544       if (map_ == NULL)
    545         map_ = new map<string, int>;
    546 
    547       // Record first occurrence of each name.
    548       // (The rule is that if you have the same name
    549       // multiple times, only the leftmost one counts.)
    550       if (map_->find(*re->name()) == map_->end())
    551         (*map_)[*re->name()] = re->cap();
    552     }
    553     return ignored;
    554   }
    555 
    556   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    557     // Should never be called: we use Walk not WalkExponential.
    558     LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
    559     return ignored;
    560   }
    561 
    562  private:
    563   map<string, int>* map_;
    564   DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
    565 };
    566 
    567 map<string, int>* Regexp::NamedCaptures() {
    568   NamedCapturesWalker w;
    569   w.Walk(this, 0);
    570   return w.TakeMap();
    571 }
    572 
    573 // Walker class to build map from capture group indices to their names.
    574 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
    575  public:
    576   CaptureNamesWalker() : map_(NULL) {}
    577   ~CaptureNamesWalker() { delete map_; }
    578 
    579   map<int, string>* TakeMap() {
    580     map<int, string>* m = map_;
    581     map_ = NULL;
    582     return m;
    583   }
    584 
    585   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    586     if (re->op() == kRegexpCapture && re->name() != NULL) {
    587       // Allocate map once we find a name.
    588       if (map_ == NULL)
    589         map_ = new map<int, string>;
    590 
    591       (*map_)[re->cap()] = *re->name();
    592     }
    593     return ignored;
    594   }
    595 
    596   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    597     // Should never be called: we use Walk not WalkExponential.
    598     LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
    599     return ignored;
    600   }
    601 
    602  private:
    603   map<int, string>* map_;
    604   DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
    605 };
    606 
    607 map<int, string>* Regexp::CaptureNames() {
    608   CaptureNamesWalker w;
    609   w.Walk(this, 0);
    610   return w.TakeMap();
    611 }
    612 
    613 // Determines whether regexp matches must be anchored
    614 // with a fixed string prefix.  If so, returns the prefix and
    615 // the regexp that remains after the prefix.  The prefix might
    616 // be ASCII case-insensitive.
    617 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
    618   // No need for a walker: the regexp must be of the form
    619   // 1. some number of ^ anchors
    620   // 2. a literal char or string
    621   // 3. the rest
    622   prefix->clear();
    623   *foldcase = false;
    624   *suffix = NULL;
    625   if (op_ != kRegexpConcat)
    626     return false;
    627 
    628   // Some number of anchors, then a literal or concatenation.
    629   int i = 0;
    630   Regexp** sub = this->sub();
    631   while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
    632     i++;
    633   if (i == 0 || i >= nsub_)
    634     return false;
    635 
    636   Regexp* re = sub[i];
    637   switch (re->op_) {
    638     default:
    639       return false;
    640 
    641     case kRegexpLiteralString:
    642       // Convert to string in proper encoding.
    643       if (re->parse_flags() & Latin1) {
    644         prefix->resize(re->nrunes_);
    645         for (int j = 0; j < re->nrunes_; j++)
    646           (*prefix)[j] = re->runes_[j];
    647       } else {
    648         // Convert to UTF-8 in place.
    649         // Assume worst-case space and then trim.
    650         prefix->resize(re->nrunes_ * UTFmax);
    651         char *p = &(*prefix)[0];
    652         for (int j = 0; j < re->nrunes_; j++) {
    653           Rune r = re->runes_[j];
    654           if (r < Runeself)
    655             *p++ = r;
    656           else
    657             p += runetochar(p, &r);
    658         }
    659         prefix->resize(p - &(*prefix)[0]);
    660       }
    661       break;
    662 
    663     case kRegexpLiteral:
    664       if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
    665         prefix->append(1, re->rune_);
    666       } else {
    667         char buf[UTFmax];
    668         prefix->append(buf, runetochar(buf, &re->rune_));
    669       }
    670       break;
    671   }
    672   *foldcase = (sub[i]->parse_flags() & FoldCase);
    673   i++;
    674 
    675   // The rest.
    676   if (i < nsub_) {
    677     for (int j = i; j < nsub_; j++)
    678       sub[j]->Incref();
    679     re = Concat(sub + i, nsub_ - i, parse_flags());
    680   } else {
    681     re = new Regexp(kRegexpEmptyMatch, parse_flags());
    682   }
    683   *suffix = re;
    684   return true;
    685 }
    686 
    687 // Character class builder is a balanced binary tree (STL set)
    688 // containing non-overlapping, non-abutting RuneRanges.
    689 // The less-than operator used in the tree treats two
    690 // ranges as equal if they overlap at all, so that
    691 // lookups for a particular Rune are possible.
    692 
    693 CharClassBuilder::CharClassBuilder() {
    694   nrunes_ = 0;
    695   upper_ = 0;
    696   lower_ = 0;
    697 }
    698 
    699 // Add lo-hi to the class; return whether class got bigger.
    700 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
    701   if (hi < lo)
    702     return false;
    703 
    704   if (lo <= 'z' && hi >= 'A') {
    705     // Overlaps some alpha, maybe not all.
    706     // Update bitmaps telling which ASCII letters are in the set.
    707     Rune lo1 = max<Rune>(lo, 'A');
    708     Rune hi1 = min<Rune>(hi, 'Z');
    709     if (lo1 <= hi1)
    710       upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
    711 
    712     lo1 = max<Rune>(lo, 'a');
    713     hi1 = min<Rune>(hi, 'z');
    714     if (lo1 <= hi1)
    715       lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
    716   }
    717 
    718   {  // Check whether lo, hi is already in the class.
    719     iterator it = ranges_.find(RuneRange(lo, lo));
    720     if (it != end() && it->lo <= lo && hi <= it->hi)
    721       return false;
    722   }
    723 
    724   // Look for a range abutting lo on the left.
    725   // If it exists, take it out and increase our range.
    726   if (lo > 0) {
    727     iterator it = ranges_.find(RuneRange(lo-1, lo-1));
    728     if (it != end()) {
    729       lo = it->lo;
    730       if (it->hi > hi)
    731         hi = it->hi;
    732       nrunes_ -= it->hi - it->lo + 1;
    733       ranges_.erase(it);
    734     }
    735   }
    736 
    737   // Look for a range abutting hi on the right.
    738   // If it exists, take it out and increase our range.
    739   if (hi < Runemax) {
    740     iterator it = ranges_.find(RuneRange(hi+1, hi+1));
    741     if (it != end()) {
    742       hi = it->hi;
    743       nrunes_ -= it->hi - it->lo + 1;
    744       ranges_.erase(it);
    745     }
    746   }
    747 
    748   // Look for ranges between lo and hi.  Take them out.
    749   // This is only safe because the set has no overlapping ranges.
    750   // We've already removed any ranges abutting lo and hi, so
    751   // any that overlap [lo, hi] must be contained within it.
    752   for (;;) {
    753     iterator it = ranges_.find(RuneRange(lo, hi));
    754     if (it == end())
    755       break;
    756     nrunes_ -= it->hi - it->lo + 1;
    757     ranges_.erase(it);
    758   }
    759 
    760   // Finally, add [lo, hi].
    761   nrunes_ += hi - lo + 1;
    762   ranges_.insert(RuneRange(lo, hi));
    763   return true;
    764 }
    765 
    766 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
    767   for (iterator it = cc->begin(); it != cc->end(); ++it)
    768     AddRange(it->lo, it->hi);
    769 }
    770 
    771 bool CharClassBuilder::Contains(Rune r) {
    772   return ranges_.find(RuneRange(r, r)) != end();
    773 }
    774 
    775 // Does the character class behave the same on A-Z as on a-z?
    776 bool CharClassBuilder::FoldsASCII() {
    777   return ((upper_ ^ lower_) & AlphaMask) == 0;
    778 }
    779 
    780 CharClassBuilder* CharClassBuilder::Copy() {
    781   CharClassBuilder* cc = new CharClassBuilder;
    782   for (iterator it = begin(); it != end(); ++it)
    783     cc->ranges_.insert(RuneRange(it->lo, it->hi));
    784   cc->upper_ = upper_;
    785   cc->lower_ = lower_;
    786   cc->nrunes_ = nrunes_;
    787   return cc;
    788 }
    789 
    790 
    791 
    792 void CharClassBuilder::RemoveAbove(Rune r) {
    793   if (r >= Runemax)
    794     return;
    795 
    796   if (r < 'z') {
    797     if (r < 'a')
    798       lower_ = 0;
    799     else
    800       lower_ &= AlphaMask >> ('z' - r);
    801   }
    802 
    803   if (r < 'Z') {
    804     if (r < 'A')
    805       upper_ = 0;
    806     else
    807       upper_ &= AlphaMask >> ('Z' - r);
    808   }
    809 
    810   for (;;) {
    811 
    812     iterator it = ranges_.find(RuneRange(r + 1, Runemax));
    813     if (it == end())
    814       break;
    815     RuneRange rr = *it;
    816     ranges_.erase(it);
    817     nrunes_ -= rr.hi - rr.lo + 1;
    818     if (rr.lo <= r) {
    819       rr.hi = r;
    820       ranges_.insert(rr);
    821       nrunes_ += rr.hi - rr.lo + 1;
    822     }
    823   }
    824 }
    825 
    826 void CharClassBuilder::Negate() {
    827   // Build up negation and then copy in.
    828   // Could edit ranges in place, but C++ won't let me.
    829   vector<RuneRange> v;
    830   v.reserve(ranges_.size() + 1);
    831 
    832   // In negation, first range begins at 0, unless
    833   // the current class begins at 0.
    834   iterator it = begin();
    835   if (it == end()) {
    836     v.push_back(RuneRange(0, Runemax));
    837   } else {
    838     int nextlo = 0;
    839     if (it->lo == 0) {
    840       nextlo = it->hi + 1;
    841       ++it;
    842     }
    843     for (; it != end(); ++it) {
    844       v.push_back(RuneRange(nextlo, it->lo - 1));
    845       nextlo = it->hi + 1;
    846     }
    847     if (nextlo <= Runemax)
    848       v.push_back(RuneRange(nextlo, Runemax));
    849   }
    850 
    851   ranges_.clear();
    852   for (int i = 0; i < v.size(); i++)
    853     ranges_.insert(v[i]);
    854 
    855   upper_ = AlphaMask & ~upper_;
    856   lower_ = AlphaMask & ~lower_;
    857   nrunes_ = Runemax+1 - nrunes_;
    858 }
    859 
    860 // Character class is a sorted list of ranges.
    861 // The ranges are allocated in the same block as the header,
    862 // necessitating a special allocator and Delete method.
    863 
    864 CharClass* CharClass::New(int maxranges) {
    865   CharClass* cc;
    866   uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
    867   cc = reinterpret_cast<CharClass*>(data);
    868   cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
    869   cc->nranges_ = 0;
    870   cc->folds_ascii_ = false;
    871   cc->nrunes_ = 0;
    872   return cc;
    873 }
    874 
    875 void CharClass::Delete() {
    876   uint8 *data = reinterpret_cast<uint8*>(this);
    877   delete[] data;
    878 }
    879 
    880 CharClass* CharClass::Negate() {
    881   CharClass* cc = CharClass::New(nranges_+1);
    882   cc->folds_ascii_ = folds_ascii_;
    883   cc->nrunes_ = Runemax + 1 - nrunes_;
    884   int n = 0;
    885   int nextlo = 0;
    886   for (CharClass::iterator it = begin(); it != end(); ++it) {
    887     if (it->lo == nextlo) {
    888       nextlo = it->hi + 1;
    889     } else {
    890       cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
    891       nextlo = it->hi + 1;
    892     }
    893   }
    894   if (nextlo <= Runemax)
    895     cc->ranges_[n++] = RuneRange(nextlo, Runemax);
    896   cc->nranges_ = n;
    897   return cc;
    898 }
    899 
    900 bool CharClass::Contains(Rune r) {
    901   RuneRange* rr = ranges_;
    902   int n = nranges_;
    903   while (n > 0) {
    904     int m = n/2;
    905     if (rr[m].hi < r) {
    906       rr += m+1;
    907       n -= m+1;
    908     } else if (r < rr[m].lo) {
    909       n = m;
    910     } else {  // rr[m].lo <= r && r <= rr[m].hi
    911       return true;
    912     }
    913   }
    914   return false;
    915 }
    916 
    917 CharClass* CharClassBuilder::GetCharClass() {
    918   CharClass* cc = CharClass::New(ranges_.size());
    919   int n = 0;
    920   for (iterator it = begin(); it != end(); ++it)
    921     cc->ranges_[n++] = *it;
    922   cc->nranges_ = n;
    923   DCHECK_LE(n, ranges_.size());
    924   cc->nrunes_ = nrunes_;
    925   cc->folds_ascii_ = FoldsASCII();
    926   return cc;
    927 }
    928 
    929 }  // namespace re2
    930