Home | History | Annotate | Download | only in re2
      1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Regular expression representation.
      6 // Tested by parse_test.cc
      7 
      8 #include "util/util.h"
      9 #include "re2/regexp.h"
     10 #include "re2/stringpiece.h"
     11 #include "re2/walker-inl.h"
     12 
     13 namespace re2 {
     14 
     15 // Constructor.  Allocates vectors as appropriate for operator.
     16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
     17   : op_(op),
     18     simple_(false),
     19     parse_flags_(static_cast<uint16>(parse_flags)),
     20     ref_(1),
     21     nsub_(0),
     22     down_(NULL) {
     23   subone_ = NULL;
     24   memset(the_union_, 0, sizeof the_union_);
     25 }
     26 
     27 // Destructor.  Assumes already cleaned up children.
     28 // Private: use Decref() instead of delete to destroy Regexps.
     29 // Can't call Decref on the sub-Regexps here because
     30 // that could cause arbitrarily deep recursion, so
     31 // required Decref() to have handled them for us.
     32 Regexp::~Regexp() {
     33   if (nsub_ > 0)
     34     LOG(DFATAL) << "Regexp not destroyed.";
     35 
     36   switch (op_) {
     37     default:
     38       break;
     39     case kRegexpCapture:
     40       delete name_;
     41       break;
     42     case kRegexpLiteralString:
     43       delete[] runes_;
     44       break;
     45     case kRegexpCharClass:
     46       cc_->Delete();
     47       delete ccb_;
     48       break;
     49   }
     50 }
     51 
     52 // If it's possible to destroy this regexp without recurring,
     53 // do so and return true.  Else return false.
     54 bool Regexp::QuickDestroy() {
     55   if (nsub_ == 0) {
     56     delete this;
     57     return true;
     58   }
     59   return false;
     60 }
     61 
     62 static map<Regexp*, int> ref_map;
     63 static Mutex ref_mutex;
     64 
     65 int Regexp::Ref() {
     66   if (ref_ < kMaxRef)
     67     return ref_;
     68 
     69   MutexLock l(&ref_mutex);
     70   return ref_map[this];
     71 }
     72 
     73 // Increments reference count, returns object as convenience.
     74 Regexp* Regexp::Incref() {
     75   if (ref_ >= kMaxRef-1) {
     76     // Store ref count in overflow map.
     77     MutexLock l(&ref_mutex);
     78     if (ref_ == kMaxRef) {  // already overflowed
     79       ref_map[this]++;
     80       return this;
     81     }
     82     // overflowing now
     83     ref_map[this] = kMaxRef;
     84     ref_ = kMaxRef;
     85     return this;
     86   }
     87 
     88   ref_++;
     89   return this;
     90 }
     91 
     92 // Decrements reference count and deletes this object if count reaches 0.
     93 void Regexp::Decref() {
     94   if (ref_ == kMaxRef) {
     95     // Ref count is stored in overflow map.
     96     MutexLock l(&ref_mutex);
     97     int r = ref_map[this] - 1;
     98     if (r < kMaxRef) {
     99       ref_ = r;
    100       ref_map.erase(this);
    101     } else {
    102       ref_map[this] = r;
    103     }
    104     return;
    105   }
    106   ref_--;
    107   if (ref_ == 0)
    108     Destroy();
    109 }
    110 
    111 // Deletes this object; ref count has count reached 0.
    112 void Regexp::Destroy() {
    113   if (QuickDestroy())
    114     return;
    115 
    116   // Handle recursive Destroy with explicit stack
    117   // to avoid arbitrarily deep recursion on process stack [sigh].
    118   down_ = NULL;
    119   Regexp* stack = this;
    120   while (stack != NULL) {
    121     Regexp* re = stack;
    122     stack = re->down_;
    123     if (re->ref_ != 0)
    124       LOG(DFATAL) << "Bad reference count " << re->ref_;
    125     if (re->nsub_ > 0) {
    126       Regexp** subs = re->sub();
    127       for (int i = 0; i < re->nsub_; i++) {
    128         Regexp* sub = subs[i];
    129         if (sub == NULL)
    130           continue;
    131         if (sub->ref_ == kMaxRef)
    132           sub->Decref();
    133         else
    134           --sub->ref_;
    135         if (sub->ref_ == 0 && !sub->QuickDestroy()) {
    136           sub->down_ = stack;
    137           stack = sub;
    138         }
    139       }
    140       if (re->nsub_ > 1)
    141         delete[] subs;
    142       re->nsub_ = 0;
    143     }
    144     delete re;
    145   }
    146 }
    147 
    148 void Regexp::AddRuneToString(Rune r) {
    149   DCHECK(op_ == kRegexpLiteralString);
    150   if (nrunes_ == 0) {
    151     // start with 8
    152     runes_ = new Rune[8];
    153   } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
    154     // double on powers of two
    155     Rune *old = runes_;
    156     runes_ = new Rune[nrunes_ * 2];
    157     for (int i = 0; i < nrunes_; i++)
    158       runes_[i] = old[i];
    159     delete[] old;
    160   }
    161 
    162   runes_[nrunes_++] = r;
    163 }
    164 
    165 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
    166   Regexp* re = new Regexp(kRegexpHaveMatch, flags);
    167   re->match_id_ = match_id;
    168   return re;
    169 }
    170 
    171 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
    172   if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
    173     return sub;
    174   Regexp* re = new Regexp(kRegexpPlus, flags);
    175   re->AllocSub(1);
    176   re->sub()[0] = sub;
    177   return re;
    178 }
    179 
    180 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
    181   if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
    182     return sub;
    183   Regexp* re = new Regexp(kRegexpStar, flags);
    184   re->AllocSub(1);
    185   re->sub()[0] = sub;
    186   return re;
    187 }
    188 
    189 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
    190   if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
    191     return sub;
    192   Regexp* re = new Regexp(kRegexpQuest, flags);
    193   re->AllocSub(1);
    194   re->sub()[0] = sub;
    195   return re;
    196 }
    197 
    198 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
    199                                   ParseFlags flags, bool can_factor) {
    200   if (nsub == 1)
    201     return sub[0];
    202 
    203   Regexp** subcopy = NULL;
    204   if (op == kRegexpAlternate && can_factor) {
    205     // Going to edit sub; make a copy so we don't step on caller.
    206     subcopy = new Regexp*[nsub];
    207     memmove(subcopy, sub, nsub * sizeof sub[0]);
    208     sub = subcopy;
    209     nsub = FactorAlternation(sub, nsub, flags);
    210     if (nsub == 1) {
    211       Regexp* re = sub[0];
    212       delete[] subcopy;
    213       return re;
    214     }
    215   }
    216 
    217   if (nsub > kMaxNsub) {
    218     // Too many subexpressions to fit in a single Regexp.
    219     // Make a two-level tree.  Two levels gets us to 65535^2.
    220     int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
    221     Regexp* re = new Regexp(op, flags);
    222     re->AllocSub(nbigsub);
    223     Regexp** subs = re->sub();
    224     for (int i = 0; i < nbigsub - 1; i++)
    225       subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
    226     subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
    227                                           nsub - (nbigsub-1)*kMaxNsub, flags,
    228                                           false);
    229     delete[] subcopy;
    230     return re;
    231   }
    232 
    233   Regexp* re = new Regexp(op, flags);
    234   re->AllocSub(nsub);
    235   Regexp** subs = re->sub();
    236   for (int i = 0; i < nsub; i++)
    237     subs[i] = sub[i];
    238 
    239   delete[] subcopy;
    240   return re;
    241 }
    242 
    243 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
    244   return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
    245 }
    246 
    247 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
    248   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
    249 }
    250 
    251 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
    252   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
    253 }
    254 
    255 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
    256   Regexp* re = new Regexp(kRegexpCapture, flags);
    257   re->AllocSub(1);
    258   re->sub()[0] = sub;
    259   re->cap_ = cap;
    260   return re;
    261 }
    262 
    263 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
    264   Regexp* re = new Regexp(kRegexpRepeat, flags);
    265   re->AllocSub(1);
    266   re->sub()[0] = sub;
    267   re->min_ = min;
    268   re->max_ = max;
    269   return re;
    270 }
    271 
    272 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
    273   Regexp* re = new Regexp(kRegexpLiteral, flags);
    274   re->rune_ = rune;
    275   return re;
    276 }
    277 
    278 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
    279   if (nrunes <= 0)
    280     return new Regexp(kRegexpEmptyMatch, flags);
    281   if (nrunes == 1)
    282     return NewLiteral(runes[0], flags);
    283   Regexp* re = new Regexp(kRegexpLiteralString, flags);
    284   for (int i = 0; i < nrunes; i++)
    285     re->AddRuneToString(runes[i]);
    286   return re;
    287 }
    288 
    289 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
    290   Regexp* re = new Regexp(kRegexpCharClass, flags);
    291   re->cc_ = cc;
    292   return re;
    293 }
    294 
    295 // Swaps this and that in place.
    296 void Regexp::Swap(Regexp* that) {
    297   // Can use memmove because Regexp is just a struct (no vtable).
    298   char tmp[sizeof *this];
    299   memmove(tmp, this, sizeof tmp);
    300   memmove(this, that, sizeof tmp);
    301   memmove(that, tmp, sizeof tmp);
    302 }
    303 
    304 // Tests equality of all top-level structure but not subregexps.
    305 static bool TopEqual(Regexp* a, Regexp* b) {
    306   if (a->op() != b->op())
    307     return false;
    308 
    309   switch (a->op()) {
    310     case kRegexpNoMatch:
    311     case kRegexpEmptyMatch:
    312     case kRegexpAnyChar:
    313     case kRegexpAnyByte:
    314     case kRegexpBeginLine:
    315     case kRegexpEndLine:
    316     case kRegexpWordBoundary:
    317     case kRegexpNoWordBoundary:
    318     case kRegexpBeginText:
    319       return true;
    320 
    321     case kRegexpEndText:
    322       // The parse flags remember whether it's \z or (?-m:$),
    323       // which matters when testing against PCRE.
    324       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
    325 
    326     case kRegexpLiteral:
    327       return a->rune() == b->rune() &&
    328              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
    329 
    330     case kRegexpLiteralString:
    331       return a->nrunes() == b->nrunes() &&
    332              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
    333              memcmp(a->runes(), b->runes(),
    334                     a->nrunes() * sizeof a->runes()[0]) == 0;
    335 
    336     case kRegexpAlternate:
    337     case kRegexpConcat:
    338       return a->nsub() == b->nsub();
    339 
    340     case kRegexpStar:
    341     case kRegexpPlus:
    342     case kRegexpQuest:
    343       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
    344 
    345     case kRegexpRepeat:
    346       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
    347              a->min() == b->min() &&
    348              a->max() == b->max();
    349 
    350     case kRegexpCapture:
    351       return a->cap() == b->cap() && a->name() == b->name();
    352 
    353     case kRegexpHaveMatch:
    354       return a->match_id() == b->match_id();
    355 
    356     case kRegexpCharClass: {
    357       CharClass* acc = a->cc();
    358       CharClass* bcc = b->cc();
    359       return acc->size() == bcc->size() &&
    360              acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
    361              memcmp(acc->begin(), bcc->begin(),
    362                     (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
    363     }
    364   }
    365 
    366   LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
    367   return 0;
    368 }
    369 
    370 bool Regexp::Equal(Regexp* a, Regexp* b) {
    371   if (a == NULL || b == NULL)
    372     return a == b;
    373 
    374   if (!TopEqual(a, b))
    375     return false;
    376 
    377   // Fast path:
    378   // return without allocating vector if there are no subregexps.
    379   switch (a->op()) {
    380     case kRegexpAlternate:
    381     case kRegexpConcat:
    382     case kRegexpStar:
    383     case kRegexpPlus:
    384     case kRegexpQuest:
    385     case kRegexpRepeat:
    386     case kRegexpCapture:
    387       break;
    388 
    389     default:
    390       return true;
    391   }
    392 
    393   // Committed to doing real work.
    394   // The stack (vector) has pairs of regexps waiting to
    395   // be compared.  The regexps are only equal if
    396   // all the pairs end up being equal.
    397   vector<Regexp*> stk;
    398 
    399   for (;;) {
    400     // Invariant: TopEqual(a, b) == true.
    401     Regexp* a2;
    402     Regexp* b2;
    403     switch (a->op()) {
    404       default:
    405         break;
    406       case kRegexpAlternate:
    407       case kRegexpConcat:
    408         for (int i = 0; i < a->nsub(); i++) {
    409           a2 = a->sub()[i];
    410           b2 = b->sub()[i];
    411           if (!TopEqual(a2, b2))
    412             return false;
    413           stk.push_back(a2);
    414           stk.push_back(b2);
    415         }
    416         break;
    417 
    418       case kRegexpStar:
    419       case kRegexpPlus:
    420       case kRegexpQuest:
    421       case kRegexpRepeat:
    422       case kRegexpCapture:
    423         a2 = a->sub()[0];
    424         b2 = b->sub()[0];
    425         if (!TopEqual(a2, b2))
    426           return false;
    427         // Really:
    428         //   stk.push_back(a2);
    429         //   stk.push_back(b2);
    430         //   break;
    431         // but faster to assign directly and loop.
    432         a = a2;
    433         b = b2;
    434         continue;
    435     }
    436 
    437     int n = stk.size();
    438     if (n == 0)
    439       break;
    440 
    441     a = stk[n-2];
    442     b = stk[n-1];
    443     stk.resize(n-2);
    444   }
    445 
    446   return true;
    447 }
    448 
    449 // Keep in sync with enum RegexpStatusCode in regexp.h
    450 static const string kErrorStrings[] = {
    451   "no error",
    452   "unexpected error",
    453   "invalid escape sequence",
    454   "invalid character class",
    455   "invalid character class range",
    456   "missing ]",
    457   "missing )",
    458   "trailing \\",
    459   "no argument for repetition operator",
    460   "invalid repetition size",
    461   "bad repetition operator",
    462   "invalid perl operator",
    463   "invalid UTF-8",
    464   "invalid named capture group",
    465 };
    466 
    467 const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
    468   if (code < 0 || code >= arraysize(kErrorStrings))
    469     code = kRegexpInternalError;
    470   return kErrorStrings[code];
    471 }
    472 
    473 string RegexpStatus::Text() const {
    474   if (error_arg_.empty())
    475     return CodeText(code_);
    476   string s;
    477   s.append(CodeText(code_));
    478   s.append(": ");
    479   s.append(error_arg_.data(), error_arg_.size());
    480   return s;
    481 }
    482 
    483 void RegexpStatus::Copy(const RegexpStatus& status) {
    484   code_ = status.code_;
    485   error_arg_ = status.error_arg_;
    486 }
    487 
    488 typedef int Ignored;  // Walker<void> doesn't exist
    489 
    490 // Walker subclass to count capturing parens in regexp.
    491 class NumCapturesWalker : public Regexp::Walker<Ignored> {
    492  public:
    493   NumCapturesWalker() : ncapture_(0) {}
    494   int ncapture() { return ncapture_; }
    495 
    496   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    497     if (re->op() == kRegexpCapture)
    498       ncapture_++;
    499     return ignored;
    500   }
    501   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    502     // Should never be called: we use Walk not WalkExponential.
    503     LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
    504     return ignored;
    505   }
    506 
    507  private:
    508   int ncapture_;
    509   DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
    510 };
    511 
    512 int Regexp::NumCaptures() {
    513   NumCapturesWalker w;
    514   w.Walk(this, 0);
    515   return w.ncapture();
    516 }
    517 
    518 // Walker class to build map of named capture groups and their indices.
    519 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
    520  public:
    521   NamedCapturesWalker() : map_(NULL) {}
    522   ~NamedCapturesWalker() { delete map_; }
    523 
    524   map<string, int>* TakeMap() {
    525     map<string, int>* m = map_;
    526     map_ = NULL;
    527     return m;
    528   }
    529 
    530   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    531     if (re->op() == kRegexpCapture && re->name() != NULL) {
    532       // Allocate map once we find a name.
    533       if (map_ == NULL)
    534         map_ = new map<string, int>;
    535 
    536       // Record first occurrence of each name.
    537       // (The rule is that if you have the same name
    538       // multiple times, only the leftmost one counts.)
    539       if (map_->find(*re->name()) == map_->end())
    540         (*map_)[*re->name()] = re->cap();
    541     }
    542     return ignored;
    543   }
    544 
    545   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    546     // Should never be called: we use Walk not WalkExponential.
    547     LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
    548     return ignored;
    549   }
    550 
    551  private:
    552   map<string, int>* map_;
    553   DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
    554 };
    555 
    556 map<string, int>* Regexp::NamedCaptures() {
    557   NamedCapturesWalker w;
    558   w.Walk(this, 0);
    559   return w.TakeMap();
    560 }
    561 
    562 // Walker class to build map from capture group indices to their names.
    563 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
    564  public:
    565   CaptureNamesWalker() : map_(NULL) {}
    566   ~CaptureNamesWalker() { delete map_; }
    567 
    568   map<int, string>* TakeMap() {
    569     map<int, string>* m = map_;
    570     map_ = NULL;
    571     return m;
    572   }
    573 
    574   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    575     if (re->op() == kRegexpCapture && re->name() != NULL) {
    576       // Allocate map once we find a name.
    577       if (map_ == NULL)
    578         map_ = new map<int, string>;
    579 
    580       (*map_)[re->cap()] = *re->name();
    581     }
    582     return ignored;
    583   }
    584 
    585   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    586     // Should never be called: we use Walk not WalkExponential.
    587     LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
    588     return ignored;
    589   }
    590 
    591  private:
    592   map<int, string>* map_;
    593   DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
    594 };
    595 
    596 map<int, string>* Regexp::CaptureNames() {
    597   CaptureNamesWalker w;
    598   w.Walk(this, 0);
    599   return w.TakeMap();
    600 }
    601 
    602 // Determines whether regexp matches must be anchored
    603 // with a fixed string prefix.  If so, returns the prefix and
    604 // the regexp that remains after the prefix.  The prefix might
    605 // be ASCII case-insensitive.
    606 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
    607   // No need for a walker: the regexp must be of the form
    608   // 1. some number of ^ anchors
    609   // 2. a literal char or string
    610   // 3. the rest
    611   prefix->clear();
    612   *foldcase = false;
    613   *suffix = NULL;
    614   if (op_ != kRegexpConcat)
    615     return false;
    616 
    617   // Some number of anchors, then a literal or concatenation.
    618   int i = 0;
    619   Regexp** sub = this->sub();
    620   while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
    621     i++;
    622   if (i == 0 || i >= nsub_)
    623     return false;
    624 
    625   Regexp* re = sub[i];
    626   switch (re->op_) {
    627     default:
    628       return false;
    629 
    630     case kRegexpLiteralString:
    631       // Convert to string in proper encoding.
    632       if (re->parse_flags() & Latin1) {
    633         prefix->resize(re->nrunes_);
    634         for (int j = 0; j < re->nrunes_; j++)
    635           (*prefix)[j] = re->runes_[j];
    636       } else {
    637         // Convert to UTF-8 in place.
    638         // Assume worst-case space and then trim.
    639         prefix->resize(re->nrunes_ * UTFmax);
    640         char *p = &(*prefix)[0];
    641         for (int j = 0; j < re->nrunes_; j++) {
    642           Rune r = re->runes_[j];
    643           if (r < Runeself)
    644             *p++ = r;
    645           else
    646             p += runetochar(p, &r);
    647         }
    648         prefix->resize(p - &(*prefix)[0]);
    649       }
    650       break;
    651 
    652     case kRegexpLiteral:
    653       if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
    654         prefix->append(1, re->rune_);
    655       } else {
    656         char buf[UTFmax];
    657         prefix->append(buf, runetochar(buf, &re->rune_));
    658       }
    659       break;
    660   }
    661   *foldcase = (sub[i]->parse_flags() & FoldCase);
    662   i++;
    663 
    664   // The rest.
    665   if (i < nsub_) {
    666     for (int j = i; j < nsub_; j++)
    667       sub[j]->Incref();
    668     re = Concat(sub + i, nsub_ - i, parse_flags());
    669   } else {
    670     re = new Regexp(kRegexpEmptyMatch, parse_flags());
    671   }
    672   *suffix = re;
    673   return true;
    674 }
    675 
    676 // Character class builder is a balanced binary tree (STL set)
    677 // containing non-overlapping, non-abutting RuneRanges.
    678 // The less-than operator used in the tree treats two
    679 // ranges as equal if they overlap at all, so that
    680 // lookups for a particular Rune are possible.
    681 
    682 CharClassBuilder::CharClassBuilder() {
    683   nrunes_ = 0;
    684   upper_ = 0;
    685   lower_ = 0;
    686 }
    687 
    688 // Add lo-hi to the class; return whether class got bigger.
    689 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
    690   if (hi < lo)
    691     return false;
    692 
    693   if (lo <= 'z' && hi >= 'A') {
    694     // Overlaps some alpha, maybe not all.
    695     // Update bitmaps telling which ASCII letters are in the set.
    696     Rune lo1 = max<Rune>(lo, 'A');
    697     Rune hi1 = min<Rune>(hi, 'Z');
    698     if (lo1 <= hi1)
    699       upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
    700 
    701     lo1 = max<Rune>(lo, 'a');
    702     hi1 = min<Rune>(hi, 'z');
    703     if (lo1 <= hi1)
    704       lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
    705   }
    706 
    707   {  // Check whether lo, hi is already in the class.
    708     iterator it = ranges_.find(RuneRange(lo, lo));
    709     if (it != end() && it->lo <= lo && hi <= it->hi)
    710       return false;
    711   }
    712 
    713   // Look for a range abutting lo on the left.
    714   // If it exists, take it out and increase our range.
    715   if (lo > 0) {
    716     iterator it = ranges_.find(RuneRange(lo-1, lo-1));
    717     if (it != end()) {
    718       lo = it->lo;
    719       if (it->hi > hi)
    720         hi = it->hi;
    721       nrunes_ -= it->hi - it->lo + 1;
    722       ranges_.erase(it);
    723     }
    724   }
    725 
    726   // Look for a range abutting hi on the right.
    727   // If it exists, take it out and increase our range.
    728   if (hi < Runemax) {
    729     iterator it = ranges_.find(RuneRange(hi+1, hi+1));
    730     if (it != end()) {
    731       hi = it->hi;
    732       nrunes_ -= it->hi - it->lo + 1;
    733       ranges_.erase(it);
    734     }
    735   }
    736 
    737   // Look for ranges between lo and hi.  Take them out.
    738   // This is only safe because the set has no overlapping ranges.
    739   // We've already removed any ranges abutting lo and hi, so
    740   // any that overlap [lo, hi] must be contained within it.
    741   for (;;) {
    742     iterator it = ranges_.find(RuneRange(lo, hi));
    743     if (it == end())
    744       break;
    745     nrunes_ -= it->hi - it->lo + 1;
    746     ranges_.erase(it);
    747   }
    748 
    749   // Finally, add [lo, hi].
    750   nrunes_ += hi - lo + 1;
    751   ranges_.insert(RuneRange(lo, hi));
    752   return true;
    753 }
    754 
    755 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
    756   for (iterator it = cc->begin(); it != cc->end(); ++it)
    757     AddRange(it->lo, it->hi);
    758 }
    759 
    760 bool CharClassBuilder::Contains(Rune r) {
    761   return ranges_.find(RuneRange(r, r)) != end();
    762 }
    763 
    764 // Does the character class behave the same on A-Z as on a-z?
    765 bool CharClassBuilder::FoldsASCII() {
    766   return ((upper_ ^ lower_) & AlphaMask) == 0;
    767 }
    768 
    769 CharClassBuilder* CharClassBuilder::Copy() {
    770   CharClassBuilder* cc = new CharClassBuilder;
    771   for (iterator it = begin(); it != end(); ++it)
    772     cc->ranges_.insert(RuneRange(it->lo, it->hi));
    773   cc->upper_ = upper_;
    774   cc->lower_ = lower_;
    775   cc->nrunes_ = nrunes_;
    776   return cc;
    777 }
    778 
    779 
    780 
    781 void CharClassBuilder::RemoveAbove(Rune r) {
    782   if (r >= Runemax)
    783     return;
    784 
    785   if (r < 'z') {
    786     if (r < 'a')
    787       lower_ = 0;
    788     else
    789       lower_ &= AlphaMask >> ('z' - r);
    790   }
    791 
    792   if (r < 'Z') {
    793     if (r < 'A')
    794       upper_ = 0;
    795     else
    796       upper_ &= AlphaMask >> ('Z' - r);
    797   }
    798 
    799   for (;;) {
    800 
    801     iterator it = ranges_.find(RuneRange(r + 1, Runemax));
    802     if (it == end())
    803       break;
    804     RuneRange rr = *it;
    805     ranges_.erase(it);
    806     nrunes_ -= rr.hi - rr.lo + 1;
    807     if (rr.lo <= r) {
    808       rr.hi = r;
    809       ranges_.insert(rr);
    810       nrunes_ += rr.hi - rr.lo + 1;
    811     }
    812   }
    813 }
    814 
    815 void CharClassBuilder::Negate() {
    816   // Build up negation and then copy in.
    817   // Could edit ranges in place, but C++ won't let me.
    818   vector<RuneRange> v;
    819   v.reserve(ranges_.size() + 1);
    820 
    821   // In negation, first range begins at 0, unless
    822   // the current class begins at 0.
    823   iterator it = begin();
    824   if (it == end()) {
    825     v.push_back(RuneRange(0, Runemax));
    826   } else {
    827     int nextlo = 0;
    828     if (it->lo == 0) {
    829       nextlo = it->hi + 1;
    830       ++it;
    831     }
    832     for (; it != end(); ++it) {
    833       v.push_back(RuneRange(nextlo, it->lo - 1));
    834       nextlo = it->hi + 1;
    835     }
    836     if (nextlo <= Runemax)
    837       v.push_back(RuneRange(nextlo, Runemax));
    838   }
    839 
    840   ranges_.clear();
    841   for (int i = 0; i < v.size(); i++)
    842     ranges_.insert(v[i]);
    843 
    844   upper_ = AlphaMask & ~upper_;
    845   lower_ = AlphaMask & ~lower_;
    846   nrunes_ = Runemax+1 - nrunes_;
    847 }
    848 
    849 // Character class is a sorted list of ranges.
    850 // The ranges are allocated in the same block as the header,
    851 // necessitating a special allocator and Delete method.
    852 
    853 CharClass* CharClass::New(int maxranges) {
    854   CharClass* cc;
    855   uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
    856   cc = reinterpret_cast<CharClass*>(data);
    857   cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
    858   cc->nranges_ = 0;
    859   cc->folds_ascii_ = false;
    860   cc->nrunes_ = 0;
    861   return cc;
    862 }
    863 
    864 void CharClass::Delete() {
    865   if (this == NULL)
    866     return;
    867   uint8 *data = reinterpret_cast<uint8*>(this);
    868   delete[] data;
    869 }
    870 
    871 CharClass* CharClass::Negate() {
    872   CharClass* cc = CharClass::New(nranges_+1);
    873   cc->folds_ascii_ = folds_ascii_;
    874   cc->nrunes_ = Runemax + 1 - nrunes_;
    875   int n = 0;
    876   int nextlo = 0;
    877   for (CharClass::iterator it = begin(); it != end(); ++it) {
    878     if (it->lo == nextlo) {
    879       nextlo = it->hi + 1;
    880     } else {
    881       cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
    882       nextlo = it->hi + 1;
    883     }
    884   }
    885   if (nextlo <= Runemax)
    886     cc->ranges_[n++] = RuneRange(nextlo, Runemax);
    887   cc->nranges_ = n;
    888   return cc;
    889 }
    890 
    891 bool CharClass::Contains(Rune r) {
    892   RuneRange* rr = ranges_;
    893   int n = nranges_;
    894   while (n > 0) {
    895     int m = n/2;
    896     if (rr[m].hi < r) {
    897       rr += m+1;
    898       n -= m+1;
    899     } else if (r < rr[m].lo) {
    900       n = m;
    901     } else {  // rr[m].lo <= r && r <= rr[m].hi
    902       return true;
    903     }
    904   }
    905   return false;
    906 }
    907 
    908 CharClass* CharClassBuilder::GetCharClass() {
    909   CharClass* cc = CharClass::New(ranges_.size());
    910   int n = 0;
    911   for (iterator it = begin(); it != end(); ++it)
    912     cc->ranges_[n++] = *it;
    913   cc->nranges_ = n;
    914   DCHECK_LE(n, ranges_.size());
    915   cc->nrunes_ = nrunes_;
    916   cc->folds_ascii_ = FoldsASCII();
    917   return cc;
    918 }
    919 
    920 }  // namespace re2
    921