Home | History | Annotate | Download | only in re2
      1 // Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Regular expression interface RE2.
      6 //
      7 // Originally the PCRE C++ wrapper, but adapted to use
      8 // the new automata-based regular expression engines.
      9 
     10 #include "re2/re2.h"
     11 
     12 #include <stdio.h>
     13 #include <string>
     14 #ifdef WIN32
     15 #define strtoll _strtoi64
     16 #define strtoull _strtoui64
     17 #define strtof strtod
     18 #else
     19 #include <pthread.h>
     20 #endif
     21 #include <errno.h>
     22 #include "util/util.h"
     23 #include "util/flags.h"
     24 #include "re2/prog.h"
     25 #include "re2/regexp.h"
     26 
     27 DEFINE_bool(trace_re2, false, "trace RE2 execution");
     28 
     29 namespace re2 {
     30 
     31 // Maximum number of args we can set
     32 static const int kMaxArgs = 16;
     33 static const int kVecSize = 1+kMaxArgs;
     34 
     35 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch = {};
     36 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch = {};
     37 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume = {};
     38 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume = {};
     39 
     40 #define kDefaultMaxMem (8<<20)
     41 
     42 RE2::Options::Options()
     43   :  encoding_(EncodingUTF8),
     44      posix_syntax_(false),
     45      longest_match_(false),
     46      log_errors_(true),
     47      max_mem_(kDefaultMaxMem),
     48      literal_(false),
     49      never_nl_(false),
     50      never_capture_(false),
     51      case_sensitive_(true),
     52      perl_classes_(false),
     53      word_boundary_(false),
     54      one_line_(false) {
     55 }
     56 
     57 RE2::Options::Options(RE2::CannedOptions opt)
     58   : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
     59     posix_syntax_(opt == RE2::POSIX_SYNTAX),
     60     longest_match_(opt == RE2::POSIX_SYNTAX),
     61     log_errors_(opt != RE2::Quiet),
     62     max_mem_(kDefaultMaxMem),
     63     literal_(false),
     64     never_nl_(false),
     65     never_capture_(false),
     66     case_sensitive_(true),
     67     perl_classes_(false),
     68     word_boundary_(false),
     69     one_line_(false) {
     70 }
     71 
     72 // static empty things for use as const references.
     73 // To avoid global constructors, initialized on demand.
     74 GLOBAL_MUTEX(empty_mutex);
     75 static const string *empty_string;
     76 static const map<string, int> *empty_named_groups;
     77 static const map<int, string> *empty_group_names;
     78 
     79 static void InitEmpty() {
     80   GLOBAL_MUTEX_LOCK(empty_mutex);
     81   if (empty_string == NULL) {
     82     empty_string = new string;
     83     empty_named_groups = new map<string, int>;
     84     empty_group_names = new map<int, string>;
     85   }
     86   GLOBAL_MUTEX_UNLOCK(empty_mutex);
     87 }
     88 
     89 // Converts from Regexp error code to RE2 error code.
     90 // Maybe some day they will diverge.  In any event, this
     91 // hides the existence of Regexp from RE2 users.
     92 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
     93   switch (code) {
     94     case re2::kRegexpSuccess:
     95       return RE2::NoError;
     96     case re2::kRegexpInternalError:
     97       return RE2::ErrorInternal;
     98     case re2::kRegexpBadEscape:
     99       return RE2::ErrorBadEscape;
    100     case re2::kRegexpBadCharClass:
    101       return RE2::ErrorBadCharClass;
    102     case re2::kRegexpBadCharRange:
    103       return RE2::ErrorBadCharRange;
    104     case re2::kRegexpMissingBracket:
    105       return RE2::ErrorMissingBracket;
    106     case re2::kRegexpMissingParen:
    107       return RE2::ErrorMissingParen;
    108     case re2::kRegexpTrailingBackslash:
    109       return RE2::ErrorTrailingBackslash;
    110     case re2::kRegexpRepeatArgument:
    111       return RE2::ErrorRepeatArgument;
    112     case re2::kRegexpRepeatSize:
    113       return RE2::ErrorRepeatSize;
    114     case re2::kRegexpRepeatOp:
    115       return RE2::ErrorRepeatOp;
    116     case re2::kRegexpBadPerlOp:
    117       return RE2::ErrorBadPerlOp;
    118     case re2::kRegexpBadUTF8:
    119       return RE2::ErrorBadUTF8;
    120     case re2::kRegexpBadNamedCapture:
    121       return RE2::ErrorBadNamedCapture;
    122   }
    123   return RE2::ErrorInternal;
    124 }
    125 
    126 static string trunc(const StringPiece& pattern) {
    127   if (pattern.size() < 100)
    128     return pattern.as_string();
    129   return pattern.substr(0, 100).as_string() + "...";
    130 }
    131 
    132 
    133 RE2::RE2(const char* pattern) {
    134   Init(pattern, DefaultOptions);
    135 }
    136 
    137 RE2::RE2(const string& pattern) {
    138   Init(pattern, DefaultOptions);
    139 }
    140 
    141 RE2::RE2(const StringPiece& pattern) {
    142   Init(pattern, DefaultOptions);
    143 }
    144 
    145 RE2::RE2(const StringPiece& pattern, const Options& options) {
    146   Init(pattern, options);
    147 }
    148 
    149 int RE2::Options::ParseFlags() const {
    150   int flags = Regexp::ClassNL;
    151   switch (encoding()) {
    152     default:
    153       if (log_errors())
    154         LOG(ERROR) << "Unknown encoding " << encoding();
    155       break;
    156     case RE2::Options::EncodingUTF8:
    157       break;
    158     case RE2::Options::EncodingLatin1:
    159       flags |= Regexp::Latin1;
    160       break;
    161   }
    162 
    163   if (!posix_syntax())
    164     flags |= Regexp::LikePerl;
    165 
    166   if (literal())
    167     flags |= Regexp::Literal;
    168 
    169   if (never_nl())
    170     flags |= Regexp::NeverNL;
    171 
    172   if (never_capture())
    173     flags |= Regexp::NeverCapture;
    174 
    175   if (!case_sensitive())
    176     flags |= Regexp::FoldCase;
    177 
    178   if (perl_classes())
    179     flags |= Regexp::PerlClasses;
    180 
    181   if (word_boundary())
    182     flags |= Regexp::PerlB;
    183 
    184   if (one_line())
    185     flags |= Regexp::OneLine;
    186 
    187   return flags;
    188 }
    189 
    190 void RE2::Init(const StringPiece& pattern, const Options& options) {
    191   mutex_ = new Mutex;
    192   pattern_ = pattern.as_string();
    193   options_.Copy(options);
    194   InitEmpty();
    195   error_ = empty_string;
    196   error_code_ = NoError;
    197   suffix_regexp_ = NULL;
    198   entire_regexp_ = NULL;
    199   prog_ = NULL;
    200   rprog_ = NULL;
    201   named_groups_ = NULL;
    202   group_names_ = NULL;
    203   num_captures_ = -1;
    204 
    205   RegexpStatus status;
    206   entire_regexp_ = Regexp::Parse(
    207     pattern_,
    208     static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
    209     &status);
    210   if (entire_regexp_ == NULL) {
    211     if (error_ == empty_string)
    212       error_ = new string(status.Text());
    213     if (options_.log_errors()) {
    214       LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
    215                  << status.Text();
    216     }
    217     error_arg_ = status.error_arg().as_string();
    218     error_code_ = RegexpErrorToRE2(status.code());
    219     return;
    220   }
    221 
    222   prefix_.clear();
    223   prefix_foldcase_ = false;
    224   re2::Regexp* suffix;
    225   if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
    226     suffix_regexp_ = suffix;
    227   else
    228     suffix_regexp_ = entire_regexp_->Incref();
    229 
    230   // Two thirds of the memory goes to the forward Prog,
    231   // one third to the reverse prog, because the forward
    232   // Prog has two DFAs but the reverse prog has one.
    233   prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
    234   if (prog_ == NULL) {
    235     if (options_.log_errors())
    236       LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
    237     error_ = new string("pattern too large - compile failed");
    238     error_code_ = RE2::ErrorPatternTooLarge;
    239     return;
    240   }
    241 
    242   // Could delay this until the first match call that
    243   // cares about submatch information, but the one-pass
    244   // machine's memory gets cut from the DFA memory budget,
    245   // and that is harder to do if the DFA has already
    246   // been built.
    247   is_one_pass_ = prog_->IsOnePass();
    248 }
    249 
    250 // Returns rprog_, computing it if needed.
    251 re2::Prog* RE2::ReverseProg() const {
    252   MutexLock l(mutex_);
    253   if (rprog_ == NULL && error_ == empty_string) {
    254     rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
    255     if (rprog_ == NULL) {
    256       if (options_.log_errors())
    257         LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
    258       error_ = new string("pattern too large - reverse compile failed");
    259       error_code_ = RE2::ErrorPatternTooLarge;
    260       return NULL;
    261     }
    262   }
    263   return rprog_;
    264 }
    265 
    266 RE2::~RE2() {
    267   if (suffix_regexp_)
    268     suffix_regexp_->Decref();
    269   if (entire_regexp_)
    270     entire_regexp_->Decref();
    271   delete mutex_;
    272   delete prog_;
    273   delete rprog_;
    274   if (error_ != empty_string)
    275     delete error_;
    276   if (named_groups_ != NULL && named_groups_ != empty_named_groups)
    277     delete named_groups_;
    278   if (group_names_ != NULL &&  group_names_ != empty_group_names)
    279     delete group_names_;
    280 }
    281 
    282 int RE2::ProgramSize() const {
    283   if (prog_ == NULL)
    284     return -1;
    285   return prog_->size();
    286 }
    287 
    288 // Returns named_groups_, computing it if needed.
    289 const map<string, int>&  RE2::NamedCapturingGroups() const {
    290   MutexLock l(mutex_);
    291   if (!ok())
    292     return *empty_named_groups;
    293   if (named_groups_ == NULL) {
    294     named_groups_ = suffix_regexp_->NamedCaptures();
    295     if (named_groups_ == NULL)
    296       named_groups_ = empty_named_groups;
    297   }
    298   return *named_groups_;
    299 }
    300 
    301 // Returns group_names_, computing it if needed.
    302 const map<int, string>&  RE2::CapturingGroupNames() const {
    303   MutexLock l(mutex_);
    304   if (!ok())
    305     return *empty_group_names;
    306   if (group_names_ == NULL) {
    307     group_names_ = suffix_regexp_->CaptureNames();
    308     if (group_names_ == NULL)
    309       group_names_ = empty_group_names;
    310   }
    311   return *group_names_;
    312 }
    313 
    314 /***** Convenience interfaces *****/
    315 
    316 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
    317                      const Arg* const args[], int n) {
    318   return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
    319 }
    320 
    321 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
    322                         const Arg* const args[], int n) {
    323   return re.DoMatch(text, UNANCHORED, NULL, args, n);
    324 }
    325 
    326 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
    327                    const Arg* const args[], int n) {
    328   int consumed;
    329   if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
    330     input->remove_prefix(consumed);
    331     return true;
    332   } else {
    333     return false;
    334   }
    335 }
    336 
    337 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
    338                           const Arg* const args[], int n) {
    339   int consumed;
    340   if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
    341     input->remove_prefix(consumed);
    342     return true;
    343   } else {
    344     return false;
    345   }
    346 }
    347 
    348 // Returns the maximum submatch needed for the rewrite to be done by Replace().
    349 // E.g. if rewrite == "foo \\2,\\1", returns 2.
    350 int RE2::MaxSubmatch(const StringPiece& rewrite) {
    351   int max = 0;
    352   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    353        s < end; s++) {
    354     if (*s == '\\') {
    355       s++;
    356       int c = (s < end) ? *s : -1;
    357       if (isdigit(c)) {
    358         int n = (c - '0');
    359         if (n > max)
    360           max = n;
    361       }
    362     }
    363   }
    364   return max;
    365 }
    366 
    367 bool RE2::Replace(string *str,
    368                  const RE2& re,
    369                  const StringPiece& rewrite) {
    370   StringPiece vec[kVecSize];
    371   int nvec = 1 + MaxSubmatch(rewrite);
    372   if (nvec > arraysize(vec))
    373     return false;
    374   if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
    375     return false;
    376 
    377   string s;
    378   if (!re.Rewrite(&s, rewrite, vec, nvec))
    379     return false;
    380 
    381   assert(vec[0].begin() >= str->data());
    382   assert(vec[0].end() <= str->data()+str->size());
    383   str->replace(vec[0].data() - str->data(), vec[0].size(), s);
    384   return true;
    385 }
    386 
    387 int RE2::GlobalReplace(string *str,
    388                       const RE2& re,
    389                       const StringPiece& rewrite) {
    390   StringPiece vec[kVecSize];
    391   int nvec = 1 + MaxSubmatch(rewrite);
    392   if (nvec > arraysize(vec))
    393     return false;
    394 
    395   const char* p = str->data();
    396   const char* ep = p + str->size();
    397   const char* lastend = NULL;
    398   string out;
    399   int count = 0;
    400   while (p <= ep) {
    401     if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
    402       break;
    403     if (p < vec[0].begin())
    404       out.append(p, vec[0].begin() - p);
    405     if (vec[0].begin() == lastend && vec[0].size() == 0) {
    406       // Disallow empty match at end of last match: skip ahead.
    407       if (p < ep)
    408         out.append(p, 1);
    409       p++;
    410       continue;
    411     }
    412     re.Rewrite(&out, rewrite, vec, nvec);
    413     p = vec[0].end();
    414     lastend = p;
    415     count++;
    416   }
    417 
    418   if (count == 0)
    419     return 0;
    420 
    421   if (p < ep)
    422     out.append(p, ep - p);
    423   swap(out, *str);
    424   return count;
    425 }
    426 
    427 bool RE2::Extract(const StringPiece &text,
    428                  const RE2& re,
    429                  const StringPiece &rewrite,
    430                  string *out) {
    431   StringPiece vec[kVecSize];
    432   int nvec = 1 + MaxSubmatch(rewrite);
    433   if (nvec > arraysize(vec))
    434     return false;
    435 
    436   if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
    437     return false;
    438 
    439   out->clear();
    440   return re.Rewrite(out, rewrite, vec, nvec);
    441 }
    442 
    443 string RE2::QuoteMeta(const StringPiece& unquoted) {
    444   string result;
    445   result.reserve(unquoted.size() << 1);
    446 
    447   // Escape any ascii character not in [A-Za-z_0-9].
    448   //
    449   // Note that it's legal to escape a character even if it has no
    450   // special meaning in a regular expression -- so this function does
    451   // that.  (This also makes it identical to the perl function of the
    452   // same name except for the null-character special case;
    453   // see `perldoc -f quotemeta`.)
    454   for (int ii = 0; ii < unquoted.length(); ++ii) {
    455     // Note that using 'isalnum' here raises the benchmark time from
    456     // 32ns to 58ns:
    457     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
    458         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
    459         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
    460         unquoted[ii] != '_' &&
    461         // If this is the part of a UTF8 or Latin1 character, we need
    462         // to copy this byte without escaping.  Experimentally this is
    463         // what works correctly with the regexp library.
    464         !(unquoted[ii] & 128)) {
    465       if (unquoted[ii] == '\0') {  // Special handling for null chars.
    466         // Note that this special handling is not strictly required for RE2,
    467         // but this quoting is required for other regexp libraries such as
    468         // PCRE.
    469         // Can't use "\\0" since the next character might be a digit.
    470         result += "\\x00";
    471         continue;
    472       }
    473       result += '\\';
    474     }
    475     result += unquoted[ii];
    476   }
    477 
    478   return result;
    479 }
    480 
    481 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
    482   if (prog_ == NULL)
    483     return false;
    484 
    485   int n = prefix_.size();
    486   if (n > maxlen)
    487     n = maxlen;
    488 
    489   // Determine initial min max from prefix_ literal.
    490   string pmin, pmax;
    491   pmin = prefix_.substr(0, n);
    492   pmax = prefix_.substr(0, n);
    493   if (prefix_foldcase_) {
    494     // prefix is ASCII lowercase; change pmin to uppercase.
    495     for (int i = 0; i < n; i++) {
    496       if ('a' <= pmin[i] && pmin[i] <= 'z')
    497         pmin[i] += 'A' - 'a';
    498     }
    499   }
    500 
    501   // Add to prefix min max using PossibleMatchRange on regexp.
    502   string dmin, dmax;
    503   maxlen -= n;
    504   if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
    505     pmin += dmin;
    506     pmax += dmax;
    507   } else if (pmax.size() > 0) {
    508     // prog_->PossibleMatchRange has failed us,
    509     // but we still have useful information from prefix_.
    510     // Round up pmax to allow any possible suffix.
    511     pmax = PrefixSuccessor(pmax);
    512   } else {
    513     // Nothing useful.
    514     *min = "";
    515     *max = "";
    516     return false;
    517   }
    518 
    519   *min = pmin;
    520   *max = pmax;
    521   return true;
    522 }
    523 
    524 // Avoid possible locale nonsense in standard strcasecmp.
    525 // The string a is known to be all lowercase.
    526 static int ascii_strcasecmp(const char* a, const char* b, int len) {
    527   const char *ae = a + len;
    528 
    529   for (; a < ae; a++, b++) {
    530     uint8 x = *a;
    531     uint8 y = *b;
    532     if ('A' <= y && y <= 'Z')
    533       y += 'a' - 'A';
    534     if (x != y)
    535       return x - y;
    536   }
    537   return 0;
    538 }
    539 
    540 
    541 /***** Actual matching and rewriting code *****/
    542 
    543 bool RE2::Match(const StringPiece& text,
    544                 int startpos,
    545                 int endpos,
    546                 Anchor re_anchor,
    547                 StringPiece* submatch,
    548                 int nsubmatch) const {
    549   if (!ok() || suffix_regexp_ == NULL) {
    550     if (options_.log_errors())
    551       LOG(ERROR) << "Invalid RE2: " << *error_;
    552     return false;
    553   }
    554 
    555   if (startpos < 0 || startpos > endpos || endpos > text.size()) {
    556     if (options_.log_errors())
    557       LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
    558     return false;
    559   }
    560 
    561   StringPiece subtext = text;
    562   subtext.remove_prefix(startpos);
    563   subtext.remove_suffix(text.size() - endpos);
    564 
    565   // Use DFAs to find exact location of match, filter out non-matches.
    566 
    567   // Don't ask for the location if we won't use it.
    568   // SearchDFA can do extra optimizations in that case.
    569   StringPiece match;
    570   StringPiece* matchp = &match;
    571   if (nsubmatch == 0)
    572     matchp = NULL;
    573 
    574   int ncap = 1 + NumberOfCapturingGroups();
    575   if (ncap > nsubmatch)
    576     ncap = nsubmatch;
    577 
    578   // If the regexp is anchored explicitly, must not be in middle of text.
    579   if (prog_->anchor_start() && startpos != 0)
    580     return false;
    581 
    582   // If the regexp is anchored explicitly, update re_anchor
    583   // so that we can potentially fall into a faster case below.
    584   if (prog_->anchor_start() && prog_->anchor_end())
    585     re_anchor = ANCHOR_BOTH;
    586   else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
    587     re_anchor = ANCHOR_START;
    588 
    589   // Check for the required prefix, if any.
    590   int prefixlen = 0;
    591   if (!prefix_.empty()) {
    592     if (startpos != 0)
    593       return false;
    594     prefixlen = prefix_.size();
    595     if (prefixlen > subtext.size())
    596       return false;
    597     if (prefix_foldcase_) {
    598       if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
    599         return false;
    600     } else {
    601       if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
    602         return false;
    603     }
    604     subtext.remove_prefix(prefixlen);
    605     // If there is a required prefix, the anchor must be at least ANCHOR_START.
    606     if (re_anchor != ANCHOR_BOTH)
    607       re_anchor = ANCHOR_START;
    608   }
    609 
    610   Prog::Anchor anchor = Prog::kUnanchored;
    611   Prog::MatchKind kind = Prog::kFirstMatch;
    612   if (options_.longest_match())
    613     kind = Prog::kLongestMatch;
    614   bool skipped_test = false;
    615 
    616   bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
    617 
    618   // SearchBitState allocates a bit vector of size prog_->size() * text.size().
    619   // It also allocates a stack of 3-word structures which could potentially
    620   // grow as large as prog_->size() * text.size() but in practice is much
    621   // smaller.
    622   // Conditions for using SearchBitState:
    623   const int MaxBitStateProg = 500;   // prog_->size() <= Max.
    624   const int MaxBitStateVector = 256*1024;  // bit vector size <= Max (bits)
    625   bool can_bit_state = prog_->size() <= MaxBitStateProg;
    626   int bit_state_text_max = MaxBitStateVector / prog_->size();
    627 
    628   bool dfa_failed = false;
    629   switch (re_anchor) {
    630     default:
    631     case UNANCHORED: {
    632       if (!prog_->SearchDFA(subtext, text, anchor, kind,
    633                             matchp, &dfa_failed, NULL)) {
    634         if (dfa_failed) {
    635           // Fall back to NFA below.
    636           skipped_test = true;
    637           if (FLAGS_trace_re2)
    638             LOG(INFO) << "Match " << trunc(pattern_)
    639                       << " [" << CEscape(subtext) << "]"
    640                       << " DFA failed.";
    641           break;
    642         }
    643         if (FLAGS_trace_re2)
    644           LOG(INFO) << "Match " << trunc(pattern_)
    645                     << " [" << CEscape(subtext) << "]"
    646                     << " used DFA - no match.";
    647         return false;
    648       }
    649       if (FLAGS_trace_re2)
    650         LOG(INFO) << "Match " << trunc(pattern_)
    651                   << " [" << CEscape(subtext) << "]"
    652                   << " used DFA - match";
    653       if (matchp == NULL)  // Matched.  Don't care where
    654         return true;
    655       // SearchDFA set match[0].end() but didn't know where the
    656       // match started.  Run the regexp backward from match[0].end()
    657       // to find the longest possible match -- that's where it started.
    658       Prog* prog = ReverseProg();
    659       if (prog == NULL)
    660         return false;
    661       if (!prog->SearchDFA(match, text, Prog::kAnchored,
    662                            Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
    663         if (dfa_failed) {
    664           // Fall back to NFA below.
    665           skipped_test = true;
    666           if (FLAGS_trace_re2)
    667             LOG(INFO) << "Match " << trunc(pattern_)
    668                       << " [" << CEscape(subtext) << "]"
    669                       << " reverse DFA failed.";
    670           break;
    671         }
    672         if (FLAGS_trace_re2)
    673           LOG(INFO) << "Match " << trunc(pattern_)
    674                     << " [" << CEscape(subtext) << "]"
    675                     << " DFA inconsistency.";
    676         if (options_.log_errors())
    677           LOG(ERROR) << "DFA inconsistency";
    678         return false;
    679       }
    680       if (FLAGS_trace_re2)
    681         LOG(INFO) << "Match " << trunc(pattern_)
    682                   << " [" << CEscape(subtext) << "]"
    683                   << " used reverse DFA.";
    684       break;
    685     }
    686 
    687     case ANCHOR_BOTH:
    688     case ANCHOR_START:
    689       if (re_anchor == ANCHOR_BOTH)
    690         kind = Prog::kFullMatch;
    691       anchor = Prog::kAnchored;
    692 
    693       // If only a small amount of text and need submatch
    694       // information anyway and we're going to use OnePass or BitState
    695       // to get it, we might as well not even bother with the DFA:
    696       // OnePass or BitState will be fast enough.
    697       // On tiny texts, OnePass outruns even the DFA, and
    698       // it doesn't have the shared state and occasional mutex that
    699       // the DFA does.
    700       if (can_one_pass && text.size() <= 4096 &&
    701           (ncap > 1 || text.size() <= 8)) {
    702         if (FLAGS_trace_re2)
    703           LOG(INFO) << "Match " << trunc(pattern_)
    704                     << " [" << CEscape(subtext) << "]"
    705                     << " skipping DFA for OnePass.";
    706         skipped_test = true;
    707         break;
    708       }
    709       if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
    710         if (FLAGS_trace_re2)
    711           LOG(INFO) << "Match " << trunc(pattern_)
    712                     << " [" << CEscape(subtext) << "]"
    713                     << " skipping DFA for BitState.";
    714         skipped_test = true;
    715         break;
    716       }
    717       if (!prog_->SearchDFA(subtext, text, anchor, kind,
    718                             &match, &dfa_failed, NULL)) {
    719         if (dfa_failed) {
    720           if (FLAGS_trace_re2)
    721             LOG(INFO) << "Match " << trunc(pattern_)
    722                       << " [" << CEscape(subtext) << "]"
    723                       << " DFA failed.";
    724           skipped_test = true;
    725           break;
    726         }
    727         if (FLAGS_trace_re2)
    728           LOG(INFO) << "Match " << trunc(pattern_)
    729                     << " [" << CEscape(subtext) << "]"
    730                     << " used DFA - no match.";
    731         return false;
    732       }
    733       break;
    734   }
    735 
    736   if (!skipped_test && ncap <= 1) {
    737     // We know exactly where it matches.  That's enough.
    738     if (ncap == 1)
    739       submatch[0] = match;
    740   } else {
    741     StringPiece subtext1;
    742     if (skipped_test) {
    743       // DFA ran out of memory or was skipped:
    744       // need to search in entire original text.
    745       subtext1 = subtext;
    746     } else {
    747       // DFA found the exact match location:
    748       // let NFA run an anchored, full match search
    749       // to find submatch locations.
    750       subtext1 = match;
    751       anchor = Prog::kAnchored;
    752       kind = Prog::kFullMatch;
    753     }
    754 
    755     if (can_one_pass && anchor != Prog::kUnanchored) {
    756       if (FLAGS_trace_re2)
    757         LOG(INFO) << "Match " << trunc(pattern_)
    758                   << " [" << CEscape(subtext) << "]"
    759                   << " using OnePass.";
    760       if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
    761         if (!skipped_test && options_.log_errors())
    762           LOG(ERROR) << "SearchOnePass inconsistency";
    763         return false;
    764       }
    765     } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
    766       if (FLAGS_trace_re2)
    767         LOG(INFO) << "Match " << trunc(pattern_)
    768                   << " [" << CEscape(subtext) << "]"
    769                   << " using BitState.";
    770       if (!prog_->SearchBitState(subtext1, text, anchor,
    771                                  kind, submatch, ncap)) {
    772         if (!skipped_test && options_.log_errors())
    773           LOG(ERROR) << "SearchBitState inconsistency";
    774         return false;
    775       }
    776     } else {
    777       if (FLAGS_trace_re2)
    778         LOG(INFO) << "Match " << trunc(pattern_)
    779                   << " [" << CEscape(subtext) << "]"
    780                   << " using NFA.";
    781       if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
    782         if (!skipped_test && options_.log_errors())
    783           LOG(ERROR) << "SearchNFA inconsistency";
    784         return false;
    785       }
    786     }
    787   }
    788 
    789   // Adjust overall match for required prefix that we stripped off.
    790   if (prefixlen > 0 && nsubmatch > 0)
    791     submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
    792                               submatch[0].size() + prefixlen);
    793 
    794   // Zero submatches that don't exist in the regexp.
    795   for (int i = ncap; i < nsubmatch; i++)
    796     submatch[i] = NULL;
    797   return true;
    798 }
    799 
    800 // Internal matcher - like Match() but takes Args not StringPieces.
    801 bool RE2::DoMatch(const StringPiece& text,
    802                   Anchor anchor,
    803                   int* consumed,
    804                   const Arg* const* args,
    805                   int n) const {
    806   if (!ok()) {
    807     if (options_.log_errors())
    808       LOG(ERROR) << "Invalid RE2: " << *error_;
    809     return false;
    810   }
    811 
    812   // Count number of capture groups needed.
    813   int nvec;
    814   if (n == 0 && consumed == NULL)
    815     nvec = 0;
    816   else
    817     nvec = n+1;
    818 
    819   StringPiece* vec;
    820   StringPiece stkvec[kVecSize];
    821   StringPiece* heapvec = NULL;
    822 
    823   if (nvec <= arraysize(stkvec)) {
    824     vec = stkvec;
    825   } else {
    826     vec = new StringPiece[nvec];
    827     heapvec = vec;
    828   }
    829 
    830   if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
    831     delete[] heapvec;
    832     return false;
    833   }
    834 
    835   if(consumed != NULL)
    836     *consumed = vec[0].end() - text.begin();
    837 
    838   if (n == 0 || args == NULL) {
    839     // We are not interested in results
    840     delete[] heapvec;
    841     return true;
    842   }
    843 
    844   int ncap = NumberOfCapturingGroups();
    845   if (ncap < n) {
    846     // RE has fewer capturing groups than number of arg pointers passed in
    847     VLOG(1) << "Asked for " << n << " but only have " << ncap;
    848     delete[] heapvec;
    849     return false;
    850   }
    851 
    852   // If we got here, we must have matched the whole pattern.
    853   for (int i = 0; i < n; i++) {
    854     const StringPiece& s = vec[i+1];
    855     if (!args[i]->Parse(s.data(), s.size())) {
    856       // TODO: Should we indicate what the error was?
    857       VLOG(1) << "Parse error on #" << i << " " << s << " "
    858 	      << (void*)s.data() << "/" << s.size();
    859       delete[] heapvec;
    860       return false;
    861     }
    862   }
    863 
    864   delete[] heapvec;
    865   return true;
    866 }
    867 
    868 // Append the "rewrite" string, with backslash subsitutions from "vec",
    869 // to string "out".
    870 bool RE2::Rewrite(string *out, const StringPiece &rewrite,
    871                  const StringPiece *vec, int veclen) const {
    872   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    873        s < end; s++) {
    874     int c = *s;
    875     if (c == '\\') {
    876       s++;
    877       c = (s < end) ? *s : -1;
    878       if (isdigit(c)) {
    879         int n = (c - '0');
    880         if (n >= veclen) {
    881           if (options_.log_errors()) {
    882             LOG(ERROR) << "requested group " << n
    883                        << " in regexp " << rewrite.data();
    884           }
    885           return false;
    886         }
    887         StringPiece snip = vec[n];
    888         if (snip.size() > 0)
    889           out->append(snip.data(), snip.size());
    890       } else if (c == '\\') {
    891         out->push_back('\\');
    892       } else {
    893         if (options_.log_errors())
    894           LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
    895         return false;
    896       }
    897     } else {
    898       out->push_back(c);
    899     }
    900   }
    901   return true;
    902 }
    903 
    904 // Return the number of capturing subpatterns, or -1 if the
    905 // regexp wasn't valid on construction.
    906 int RE2::NumberOfCapturingGroups() const {
    907   if (suffix_regexp_ == NULL)
    908     return -1;
    909   ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
    910     " multiple threads end up doing the same work in parallel.");
    911   if (num_captures_ == -1)
    912     num_captures_ = suffix_regexp_->NumCaptures();
    913   return num_captures_;
    914 }
    915 
    916 // Checks that the rewrite string is well-formed with respect to this
    917 // regular expression.
    918 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
    919   int max_token = -1;
    920   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    921        s < end; s++) {
    922     int c = *s;
    923     if (c != '\\') {
    924       continue;
    925     }
    926     if (++s == end) {
    927       *error = "Rewrite schema error: '\\' not allowed at end.";
    928       return false;
    929     }
    930     c = *s;
    931     if (c == '\\') {
    932       continue;
    933     }
    934     if (!isdigit(c)) {
    935       *error = "Rewrite schema error: "
    936                "'\\' must be followed by a digit or '\\'.";
    937       return false;
    938     }
    939     int n = (c - '0');
    940     if (max_token < n) {
    941       max_token = n;
    942     }
    943   }
    944 
    945   if (max_token > NumberOfCapturingGroups()) {
    946     SStringPrintf(error, "Rewrite schema requests %d matches, "
    947                   "but the regexp only has %d parenthesized subexpressions.",
    948                   max_token, NumberOfCapturingGroups());
    949     return false;
    950   }
    951   return true;
    952 }
    953 
    954 /***** Parsers for various types *****/
    955 
    956 bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
    957   // We fail if somebody asked us to store into a non-NULL void* pointer
    958   return (dest == NULL);
    959 }
    960 
    961 bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
    962   if (dest == NULL) return true;
    963   reinterpret_cast<string*>(dest)->assign(str, n);
    964   return true;
    965 }
    966 
    967 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
    968   if (dest == NULL) return true;
    969   reinterpret_cast<StringPiece*>(dest)->set(str, n);
    970   return true;
    971 }
    972 
    973 bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
    974   if (n != 1) return false;
    975   if (dest == NULL) return true;
    976   *(reinterpret_cast<char*>(dest)) = str[0];
    977   return true;
    978 }
    979 
    980 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
    981   if (n != 1) return false;
    982   if (dest == NULL) return true;
    983   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
    984   return true;
    985 }
    986 
    987 // Largest number spec that we are willing to parse
    988 static const int kMaxNumberLength = 32;
    989 
    990 // REQUIRES "buf" must have length at least kMaxNumberLength+1
    991 // Copies "str" into "buf" and null-terminates.
    992 // Overwrites *np with the new length.
    993 static const char* TerminateNumber(char* buf, const char* str, int* np) {
    994   int n = *np;
    995   if (n <= 0) return "";
    996   if (n > 0 && isspace(*str)) {
    997     // We are less forgiving than the strtoxxx() routines and do not
    998     // allow leading spaces.
    999     return "";
   1000   }
   1001 
   1002   // Although buf has a fixed maximum size, we can still handle
   1003   // arbitrarily large integers correctly by omitting leading zeros.
   1004   // (Numbers that are still too long will be out of range.)
   1005   // Before deciding whether str is too long,
   1006   // remove leading zeros with s/000+/00/.
   1007   // Leaving the leading two zeros in place means that
   1008   // we don't change 0000x123 (invalid) into 0x123 (valid).
   1009   // Skip over leading - before replacing.
   1010   bool neg = false;
   1011   if (n >= 1 && str[0] == '-') {
   1012     neg = true;
   1013     n--;
   1014     str++;
   1015   }
   1016 
   1017   if (n >= 3 && str[0] == '0' && str[1] == '0') {
   1018     while (n >= 3 && str[2] == '0') {
   1019       n--;
   1020       str++;
   1021     }
   1022   }
   1023 
   1024   if (neg) {  // make room in buf for -
   1025     n++;
   1026     str--;
   1027   }
   1028 
   1029   if (n > kMaxNumberLength) return "";
   1030 
   1031   memmove(buf, str, n);
   1032   if (neg) {
   1033     buf[0] = '-';
   1034   }
   1035   buf[n] = '\0';
   1036   *np = n;
   1037   return buf;
   1038 }
   1039 
   1040 bool RE2::Arg::parse_long_radix(const char* str,
   1041                                int n,
   1042                                void* dest,
   1043                                int radix) {
   1044   if (n == 0) return false;
   1045   char buf[kMaxNumberLength+1];
   1046   str = TerminateNumber(buf, str, &n);
   1047   char* end;
   1048   errno = 0;
   1049   long r = strtol(str, &end, radix);
   1050   if (end != str + n) return false;   // Leftover junk
   1051   if (errno) return false;
   1052   if (dest == NULL) return true;
   1053   *(reinterpret_cast<long*>(dest)) = r;
   1054   return true;
   1055 }
   1056 
   1057 bool RE2::Arg::parse_ulong_radix(const char* str,
   1058                                 int n,
   1059                                 void* dest,
   1060                                 int radix) {
   1061   if (n == 0) return false;
   1062   char buf[kMaxNumberLength+1];
   1063   str = TerminateNumber(buf, str, &n);
   1064   if (str[0] == '-') {
   1065    // strtoul() will silently accept negative numbers and parse
   1066    // them.  This module is more strict and treats them as errors.
   1067    return false;
   1068   }
   1069 
   1070   char* end;
   1071   errno = 0;
   1072   unsigned long r = strtoul(str, &end, radix);
   1073   if (end != str + n) return false;   // Leftover junk
   1074   if (errno) return false;
   1075   if (dest == NULL) return true;
   1076   *(reinterpret_cast<unsigned long*>(dest)) = r;
   1077   return true;
   1078 }
   1079 
   1080 bool RE2::Arg::parse_short_radix(const char* str,
   1081                                 int n,
   1082                                 void* dest,
   1083                                 int radix) {
   1084   long r;
   1085   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
   1086   if ((short)r != r) return false;       // Out of range
   1087   if (dest == NULL) return true;
   1088   *(reinterpret_cast<short*>(dest)) = r;
   1089   return true;
   1090 }
   1091 
   1092 bool RE2::Arg::parse_ushort_radix(const char* str,
   1093                                  int n,
   1094                                  void* dest,
   1095                                  int radix) {
   1096   unsigned long r;
   1097   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
   1098   if ((ushort)r != r) return false;                      // Out of range
   1099   if (dest == NULL) return true;
   1100   *(reinterpret_cast<unsigned short*>(dest)) = r;
   1101   return true;
   1102 }
   1103 
   1104 bool RE2::Arg::parse_int_radix(const char* str,
   1105                               int n,
   1106                               void* dest,
   1107                               int radix) {
   1108   long r;
   1109   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
   1110   if ((int)r != r) return false;         // Out of range
   1111   if (dest == NULL) return true;
   1112   *(reinterpret_cast<int*>(dest)) = r;
   1113   return true;
   1114 }
   1115 
   1116 bool RE2::Arg::parse_uint_radix(const char* str,
   1117                                int n,
   1118                                void* dest,
   1119                                int radix) {
   1120   unsigned long r;
   1121   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
   1122   if ((uint)r != r) return false;                       // Out of range
   1123   if (dest == NULL) return true;
   1124   *(reinterpret_cast<unsigned int*>(dest)) = r;
   1125   return true;
   1126 }
   1127 
   1128 bool RE2::Arg::parse_longlong_radix(const char* str,
   1129                                    int n,
   1130                                    void* dest,
   1131                                    int radix) {
   1132   if (n == 0) return false;
   1133   char buf[kMaxNumberLength+1];
   1134   str = TerminateNumber(buf, str, &n);
   1135   char* end;
   1136   errno = 0;
   1137   int64 r = strtoll(str, &end, radix);
   1138   if (end != str + n) return false;   // Leftover junk
   1139   if (errno) return false;
   1140   if (dest == NULL) return true;
   1141   *(reinterpret_cast<int64*>(dest)) = r;
   1142   return true;
   1143 }
   1144 
   1145 bool RE2::Arg::parse_ulonglong_radix(const char* str,
   1146                                     int n,
   1147                                     void* dest,
   1148                                     int radix) {
   1149   if (n == 0) return false;
   1150   char buf[kMaxNumberLength+1];
   1151   str = TerminateNumber(buf, str, &n);
   1152   if (str[0] == '-') {
   1153     // strtoull() will silently accept negative numbers and parse
   1154     // them.  This module is more strict and treats them as errors.
   1155     return false;
   1156   }
   1157   char* end;
   1158   errno = 0;
   1159   uint64 r = strtoull(str, &end, radix);
   1160   if (end != str + n) return false;   // Leftover junk
   1161   if (errno) return false;
   1162   if (dest == NULL) return true;
   1163   *(reinterpret_cast<uint64*>(dest)) = r;
   1164   return true;
   1165 }
   1166 
   1167 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
   1168   if (n == 0) return false;
   1169   static const int kMaxLength = 200;
   1170   char buf[kMaxLength];
   1171   if (n >= kMaxLength) return false;
   1172   memcpy(buf, str, n);
   1173   buf[n] = '\0';
   1174   errno = 0;
   1175   char* end;
   1176   double r;
   1177   if (isfloat) {
   1178     r = strtof(buf, &end);
   1179   } else {
   1180     r = strtod(buf, &end);
   1181   }
   1182   if (end != buf + n) return false;   // Leftover junk
   1183   if (errno) return false;
   1184   if (dest == NULL) return true;
   1185   if (isfloat) {
   1186     *(reinterpret_cast<float*>(dest)) = r;
   1187   } else {
   1188     *(reinterpret_cast<double*>(dest)) = r;
   1189   }
   1190   return true;
   1191 }
   1192 
   1193 bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
   1194   return parse_double_float(str, n, false, dest);
   1195 }
   1196 
   1197 bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
   1198   return parse_double_float(str, n, true, dest);
   1199 }
   1200 
   1201 
   1202 #define DEFINE_INTEGER_PARSERS(name)                                        \
   1203   bool RE2::Arg::parse_##name(const char* str, int n, void* dest) {          \
   1204     return parse_##name##_radix(str, n, dest, 10);                          \
   1205   }                                                                         \
   1206   bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
   1207     return parse_##name##_radix(str, n, dest, 16);                          \
   1208   }                                                                         \
   1209   bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
   1210     return parse_##name##_radix(str, n, dest, 8);                           \
   1211   }                                                                         \
   1212   bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
   1213     return parse_##name##_radix(str, n, dest, 0);                           \
   1214   }
   1215 
   1216 DEFINE_INTEGER_PARSERS(short);
   1217 DEFINE_INTEGER_PARSERS(ushort);
   1218 DEFINE_INTEGER_PARSERS(int);
   1219 DEFINE_INTEGER_PARSERS(uint);
   1220 DEFINE_INTEGER_PARSERS(long);
   1221 DEFINE_INTEGER_PARSERS(ulong);
   1222 DEFINE_INTEGER_PARSERS(longlong);
   1223 DEFINE_INTEGER_PARSERS(ulonglong);
   1224 
   1225 #undef DEFINE_INTEGER_PARSERS
   1226 
   1227 }  // namespace re2
   1228