Home | History | Annotate | Download | only in re2
      1 // Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Regular expression interface RE2.
      6 //
      7 // Originally the PCRE C++ wrapper, but adapted to use
      8 // the new automata-based regular expression engines.
      9 
     10 #include "re2/re2.h"
     11 
     12 #include <stdio.h>
     13 #include <ctype.h>
     14 #include <string>
     15 #include <pthread.h>
     16 #include <errno.h>
     17 #include "util/util.h"
     18 #include "util/flags.h"
     19 #include "re2/prog.h"
     20 #include "re2/regexp.h"
     21 
     22 DEFINE_bool(trace_re2, false, "trace RE2 execution");
     23 
     24 namespace re2 {
     25 
     26 // Maximum number of args we can set
     27 static const int kMaxArgs = 16;
     28 static const int kVecSize = 1+kMaxArgs;
     29 
     30 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch;
     31 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch;
     32 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;
     33 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume;
     34 
     35 const int RE2::Options::kDefaultMaxMem;  // initialized in re2.h
     36 
     37 // Commonly-used option sets; arguments to constructor are:
     38 //   utf8 input
     39 //   posix syntax
     40 //   longest match
     41 //   log errors
     42 const RE2::Options RE2::DefaultOptions;  // EncodingUTF8, false, false, true
     43 const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true);
     44 const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true);
     45 const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false);
     46 
     47 // If a regular expression has no error, its error_ field points here
     48 static const string empty_string;
     49 
     50 // Converts from Regexp error code to RE2 error code.
     51 // Maybe some day they will diverge.  In any event, this
     52 // hides the existence of Regexp from RE2 users.
     53 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
     54   switch (code) {
     55     case re2::kRegexpSuccess:
     56       return RE2::NoError;
     57     case re2::kRegexpInternalError:
     58       return RE2::ErrorInternal;
     59     case re2::kRegexpBadEscape:
     60       return RE2::ErrorBadEscape;
     61     case re2::kRegexpBadCharClass:
     62       return RE2::ErrorBadCharClass;
     63     case re2::kRegexpBadCharRange:
     64       return RE2::ErrorBadCharRange;
     65     case re2::kRegexpMissingBracket:
     66       return RE2::ErrorMissingBracket;
     67     case re2::kRegexpMissingParen:
     68       return RE2::ErrorMissingParen;
     69     case re2::kRegexpTrailingBackslash:
     70       return RE2::ErrorTrailingBackslash;
     71     case re2::kRegexpRepeatArgument:
     72       return RE2::ErrorRepeatArgument;
     73     case re2::kRegexpRepeatSize:
     74       return RE2::ErrorRepeatSize;
     75     case re2::kRegexpRepeatOp:
     76       return RE2::ErrorRepeatOp;
     77     case re2::kRegexpBadPerlOp:
     78       return RE2::ErrorBadPerlOp;
     79     case re2::kRegexpBadUTF8:
     80       return RE2::ErrorBadUTF8;
     81     case re2::kRegexpBadNamedCapture:
     82       return RE2::ErrorBadNamedCapture;
     83   }
     84   return RE2::ErrorInternal;
     85 }
     86 
     87 static string trunc(const StringPiece& pattern) {
     88   if (pattern.size() < 100)
     89     return pattern.as_string();
     90   return pattern.substr(0, 100).as_string() + "...";
     91 }
     92 
     93 
     94 RE2::RE2(const char* pattern) {
     95   Init(pattern, DefaultOptions);
     96 }
     97 
     98 RE2::RE2(const string& pattern) {
     99   Init(pattern, DefaultOptions);
    100 }
    101 
    102 RE2::RE2(const StringPiece& pattern) {
    103   Init(pattern, DefaultOptions);
    104 }
    105 
    106 RE2::RE2(const StringPiece& pattern, const Options& options) {
    107   Init(pattern, options);
    108 }
    109 
    110 int RE2::Options::ParseFlags() const {
    111   int flags = Regexp::ClassNL;
    112   switch (encoding()) {
    113     default:
    114       LOG(ERROR) << "Unknown encoding " << encoding();
    115       break;
    116     case RE2::Options::EncodingUTF8:
    117       break;
    118     case RE2::Options::EncodingLatin1:
    119       flags |= Regexp::Latin1;
    120       break;
    121   }
    122 
    123   if (!posix_syntax())
    124     flags |= Regexp::LikePerl;
    125 
    126   if (literal())
    127     flags |= Regexp::Literal;
    128 
    129   if (never_nl())
    130     flags |= Regexp::NeverNL;
    131 
    132   if (!case_sensitive())
    133     flags |= Regexp::FoldCase;
    134 
    135   if (perl_classes())
    136     flags |= Regexp::PerlClasses;
    137 
    138   if (word_boundary())
    139     flags |= Regexp::PerlB;
    140 
    141   if (one_line())
    142     flags |= Regexp::OneLine;
    143 
    144   return flags;
    145 }
    146 
    147 void RE2::Init(const StringPiece& pattern, const Options& options) {
    148   mutex_ = new Mutex;
    149   pattern_ = pattern.as_string();
    150   options_.Copy(options);
    151   error_ = &empty_string;
    152   error_code_ = NoError;
    153   suffix_regexp_ = NULL;
    154   entire_regexp_ = NULL;
    155   prog_ = NULL;
    156   rprog_ = NULL;
    157   named_groups_ = NULL;
    158   group_names_ = NULL;
    159   num_captures_ = -1;
    160 
    161   RegexpStatus status;
    162   entire_regexp_ = Regexp::Parse(
    163     pattern_,
    164     static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
    165     &status);
    166   if (entire_regexp_ == NULL) {
    167     if (error_ == &empty_string)
    168       error_ = new string(status.Text());
    169     if (options_.log_errors()) {
    170       LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
    171                  << status.Text();
    172     }
    173     error_arg_ = status.error_arg().as_string();
    174     error_code_ = RegexpErrorToRE2(status.code());
    175     return;
    176   }
    177 
    178   prefix_.clear();
    179   prefix_foldcase_ = false;
    180   re2::Regexp* suffix;
    181   if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
    182     suffix_regexp_ = suffix;
    183   else
    184     suffix_regexp_ = entire_regexp_->Incref();
    185 
    186   // Two thirds of the memory goes to the forward Prog,
    187   // one third to the reverse prog, because the forward
    188   // Prog has two DFAs but the reverse prog has one.
    189   prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
    190   if (prog_ == NULL) {
    191     if (options_.log_errors())
    192       LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
    193     error_ = new string("pattern too large - compile failed");
    194     error_code_ = RE2::ErrorPatternTooLarge;
    195     return;
    196   }
    197 
    198   // Could delay this until the first match call that
    199   // cares about submatch information, but the one-pass
    200   // machine's memory gets cut from the DFA memory budget,
    201   // and that is harder to do if the DFA has already
    202   // been built.
    203   is_one_pass_ = prog_->IsOnePass();
    204 }
    205 
    206 // Returns rprog_, computing it if needed.
    207 re2::Prog* RE2::ReverseProg() const {
    208   MutexLock l(mutex_);
    209   if (rprog_ == NULL && error_ == &empty_string) {
    210     rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
    211     if (rprog_ == NULL) {
    212       if (options_.log_errors())
    213         LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
    214       error_ = new string("pattern too large - reverse compile failed");
    215       error_code_ = RE2::ErrorPatternTooLarge;
    216       return NULL;
    217     }
    218   }
    219   return rprog_;
    220 }
    221 
    222 static const map<string, int> empty_named_groups;
    223 static const map<int, string> empty_group_names;
    224 
    225 RE2::~RE2() {
    226   if (suffix_regexp_)
    227     suffix_regexp_->Decref();
    228   if (entire_regexp_)
    229     entire_regexp_->Decref();
    230   delete mutex_;
    231   delete prog_;
    232   delete rprog_;
    233   if (error_ != &empty_string)
    234     delete error_;
    235   if (named_groups_ != NULL && named_groups_ != &empty_named_groups)
    236     delete named_groups_;
    237   if (group_names_ != NULL &&  group_names_ != &empty_group_names)
    238     delete group_names_;
    239 }
    240 
    241 int RE2::ProgramSize() const {
    242   if (prog_ == NULL)
    243     return -1;
    244   return prog_->size();
    245 }
    246 
    247 // Returns named_groups_, computing it if needed.
    248 const map<string, int>&  RE2::NamedCapturingGroups() const {
    249   MutexLock l(mutex_);
    250   if (!ok())
    251     return empty_named_groups;
    252   if (named_groups_ == NULL) {
    253     named_groups_ = suffix_regexp_->NamedCaptures();
    254     if (named_groups_ == NULL)
    255       named_groups_ = &empty_named_groups;
    256   }
    257   return *named_groups_;
    258 }
    259 
    260 // Returns group_names_, computing it if needed.
    261 const map<int, string>&  RE2::CapturingGroupNames() const {
    262   MutexLock l(mutex_);
    263   if (!ok())
    264     return empty_group_names;
    265   if (group_names_ == NULL) {
    266     group_names_ = suffix_regexp_->CaptureNames();
    267     if (group_names_ == NULL)
    268       group_names_ = &empty_group_names;
    269   }
    270   return *group_names_;
    271 }
    272 
    273 /***** Convenience interfaces *****/
    274 
    275 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
    276                      const Arg* const args[], int n) {
    277   return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
    278 }
    279 
    280 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
    281                         const Arg* const args[], int n) {
    282   return re.DoMatch(text, UNANCHORED, NULL, args, n);
    283 }
    284 
    285 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
    286                    const Arg* const args[], int n) {
    287   int consumed;
    288   if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
    289     input->remove_prefix(consumed);
    290     return true;
    291   } else {
    292     return false;
    293   }
    294 }
    295 
    296 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
    297                           const Arg* const args[], int n) {
    298   int consumed;
    299   if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
    300     input->remove_prefix(consumed);
    301     return true;
    302   } else {
    303     return false;
    304   }
    305 }
    306 
    307 // Returns the maximum submatch needed for the rewrite to be done by Replace().
    308 // E.g. if rewrite == "foo \\2,\\1", returns 2.
    309 static int MaxSubmatch(const StringPiece& rewrite) {
    310   int max = 0;
    311   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    312        s < end; s++) {
    313     if (*s == '\\') {
    314       s++;
    315       int c = (s < end) ? *s : -1;
    316       if (isdigit(c)) {
    317         int n = (c - '0');
    318         if (n > max)
    319           max = n;
    320       }
    321     }
    322   }
    323   return max;
    324 }
    325 
    326 bool RE2::Replace(string *str,
    327                  const RE2& re,
    328                  const StringPiece& rewrite) {
    329   StringPiece vec[kVecSize];
    330   int nvec = 1 + MaxSubmatch(rewrite);
    331   if (nvec > arraysize(vec))
    332     return false;
    333   if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
    334     return false;
    335 
    336   string s;
    337   if (!re.Rewrite(&s, rewrite, vec, nvec))
    338     return false;
    339 
    340   assert(vec[0].begin() >= str->data());
    341   assert(vec[0].end() <= str->data()+str->size());
    342   str->replace(vec[0].data() - str->data(), vec[0].size(), s);
    343   return true;
    344 }
    345 
    346 int RE2::GlobalReplace(string *str,
    347                       const RE2& re,
    348                       const StringPiece& rewrite) {
    349   StringPiece vec[kVecSize];
    350   int nvec = 1 + MaxSubmatch(rewrite);
    351   if (nvec > arraysize(vec))
    352     return false;
    353 
    354   const char* p = str->data();
    355   const char* ep = p + str->size();
    356   const char* lastend = NULL;
    357   string out;
    358   int count = 0;
    359   while (p <= ep) {
    360     if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
    361       break;
    362     if (p < vec[0].begin())
    363       out.append(p, vec[0].begin() - p);
    364     if (vec[0].begin() == lastend && vec[0].size() == 0) {
    365       // Disallow empty match at end of last match: skip ahead.
    366       if (p < ep)
    367         out.append(p, 1);
    368       p++;
    369       continue;
    370     }
    371     re.Rewrite(&out, rewrite, vec, nvec);
    372     p = vec[0].end();
    373     lastend = p;
    374     count++;
    375   }
    376 
    377   if (count == 0)
    378     return 0;
    379 
    380   if (p < ep)
    381     out.append(p, ep - p);
    382   swap(out, *str);
    383   return count;
    384 }
    385 
    386 bool RE2::Extract(const StringPiece &text,
    387                  const RE2& re,
    388                  const StringPiece &rewrite,
    389                  string *out) {
    390   StringPiece vec[kVecSize];
    391   int nvec = 1 + MaxSubmatch(rewrite);
    392   if (nvec > arraysize(vec))
    393     return false;
    394 
    395   if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
    396     return false;
    397 
    398   out->clear();
    399   return re.Rewrite(out, rewrite, vec, nvec);
    400 }
    401 
    402 string RE2::QuoteMeta(const StringPiece& unquoted) {
    403   string result;
    404   result.reserve(unquoted.size() << 1);
    405 
    406   // Escape any ascii character not in [A-Za-z_0-9].
    407   //
    408   // Note that it's legal to escape a character even if it has no
    409   // special meaning in a regular expression -- so this function does
    410   // that.  (This also makes it identical to the perl function of the
    411   // same name except for the null-character special case;
    412   // see `perldoc -f quotemeta`.)
    413   for (int ii = 0; ii < unquoted.length(); ++ii) {
    414     // Note that using 'isalnum' here raises the benchmark time from
    415     // 32ns to 58ns:
    416     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
    417         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
    418         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
    419         unquoted[ii] != '_' &&
    420         // If this is the part of a UTF8 or Latin1 character, we need
    421         // to copy this byte without escaping.  Experimentally this is
    422         // what works correctly with the regexp library.
    423         !(unquoted[ii] & 128)) {
    424       if (unquoted[ii] == '\0') {  // Special handling for null chars.
    425         // Note that this special handling is not strictly required for RE2,
    426         // but this quoting is required for other regexp libraries such as
    427         // PCRE.
    428         // Can't use "\\0" since the next character might be a digit.
    429         result += "\\x00";
    430         continue;
    431       }
    432       result += '\\';
    433     }
    434     result += unquoted[ii];
    435   }
    436 
    437   return result;
    438 }
    439 
    440 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
    441   if (prog_ == NULL)
    442     return false;
    443 
    444   int n = prefix_.size();
    445   if (n > maxlen)
    446     n = maxlen;
    447 
    448   // Determine initial min max from prefix_ literal.
    449   string pmin, pmax;
    450   pmin = prefix_.substr(0, n);
    451   pmax = prefix_.substr(0, n);
    452   if (prefix_foldcase_) {
    453     // prefix is ASCII lowercase; change pmin to uppercase.
    454     for (int i = 0; i < n; i++) {
    455       if ('a' <= pmin[i] && pmin[i] <= 'z')
    456         pmin[i] += 'A' - 'a';
    457     }
    458   }
    459 
    460   // Add to prefix min max using PossibleMatchRange on regexp.
    461   string dmin, dmax;
    462   maxlen -= n;
    463   if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
    464     pmin += dmin;
    465     pmax += dmax;
    466   } else if (pmax.size() > 0) {
    467     // prog_->PossibleMatchRange has failed us,
    468     // but we still have useful information from prefix_.
    469     // Round up pmax to allow any possible suffix.
    470     pmax = PrefixSuccessor(pmax);
    471   } else {
    472     // Nothing useful.
    473     *min = "";
    474     *max = "";
    475     return false;
    476   }
    477 
    478   *min = pmin;
    479   *max = pmax;
    480   return true;
    481 }
    482 
    483 // Avoid possible locale nonsense in standard strcasecmp.
    484 // The string a is known to be all lowercase.
    485 static int ascii_strcasecmp(const char* a, const char* b, int len) {
    486   const char *ae = a + len;
    487 
    488   for (; a < ae; a++, b++) {
    489     uint8 x = *a;
    490     uint8 y = *b;
    491     if ('A' <= y && y <= 'Z')
    492       y += 'a' - 'A';
    493     if (x != y)
    494       return x - y;
    495   }
    496   return 0;
    497 }
    498 
    499 
    500 /***** Actual matching and rewriting code *****/
    501 
    502 bool RE2::Match(const StringPiece& text,
    503                 int startpos,
    504                 int endpos,
    505                 Anchor re_anchor,
    506                 StringPiece* submatch,
    507                 int nsubmatch) const {
    508   if (!ok() || suffix_regexp_ == NULL) {
    509     if (options_.log_errors())
    510       LOG(ERROR) << "Invalid RE2: " << *error_;
    511     return false;
    512   }
    513 
    514   if (startpos < 0 || startpos > endpos || endpos > text.size()) {
    515     LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
    516     return false;
    517   }
    518 
    519   StringPiece subtext = text;
    520   subtext.remove_prefix(startpos);
    521   subtext.remove_suffix(text.size() - endpos);
    522 
    523   // Use DFAs to find exact location of match, filter out non-matches.
    524 
    525   // Don't ask for the location if we won't use it.
    526   // SearchDFA can do extra optimizations in that case.
    527   StringPiece match;
    528   StringPiece* matchp = &match;
    529   if (nsubmatch == 0)
    530     matchp = NULL;
    531 
    532   int ncap = 1 + NumberOfCapturingGroups();
    533   if (ncap > nsubmatch)
    534     ncap = nsubmatch;
    535 
    536   // If the regexp is anchored explicitly, must not be in middle of text.
    537   if (prog_->anchor_start() && startpos != 0)
    538     return false;
    539 
    540   // If the regexp is anchored explicitly, update re_anchor
    541   // so that we can potentially fall into a faster case below.
    542   if (prog_->anchor_start() && prog_->anchor_end())
    543     re_anchor = ANCHOR_BOTH;
    544   else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
    545     re_anchor = ANCHOR_START;
    546 
    547   // Check for the required prefix, if any.
    548   int prefixlen = 0;
    549   if (!prefix_.empty()) {
    550     if (startpos != 0)
    551       return false;
    552     prefixlen = prefix_.size();
    553     if (prefixlen > subtext.size())
    554       return false;
    555     if (prefix_foldcase_) {
    556       if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
    557         return false;
    558     } else {
    559       if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
    560         return false;
    561     }
    562     subtext.remove_prefix(prefixlen);
    563     // If there is a required prefix, the anchor must be at least ANCHOR_START.
    564     if (re_anchor != ANCHOR_BOTH)
    565       re_anchor = ANCHOR_START;
    566   }
    567 
    568   Prog::Anchor anchor = Prog::kUnanchored;
    569   Prog::MatchKind kind = Prog::kFirstMatch;
    570   if (options_.longest_match())
    571     kind = Prog::kLongestMatch;
    572   bool skipped_test = false;
    573 
    574   bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
    575 
    576   // SearchBitState allocates a bit vector of size prog_->size() * text.size().
    577   // It also allocates a stack of 3-word structures which could potentially
    578   // grow as large as prog_->size() * text.size() but in practice is much
    579   // smaller.
    580   // Conditions for using SearchBitState:
    581   const int MaxBitStateProg = 500;   // prog_->size() <= Max.
    582   const int MaxBitStateVector = 256*1024;  // bit vector size <= Max (bits)
    583   bool can_bit_state = prog_->size() <= MaxBitStateProg;
    584   int bit_state_text_max = MaxBitStateVector / prog_->size();
    585 
    586   bool dfa_failed = false;
    587   switch (re_anchor) {
    588     default:
    589     case UNANCHORED: {
    590       if (!prog_->SearchDFA(subtext, text, anchor, kind,
    591                             matchp, &dfa_failed, NULL)) {
    592         if (dfa_failed) {
    593           // Fall back to NFA below.
    594           skipped_test = true;
    595           if (FLAGS_trace_re2)
    596             LOG(INFO) << "Match " << trunc(pattern_)
    597                       << " [" << CEscape(subtext) << "]"
    598                       << " DFA failed.";
    599           break;
    600         }
    601         if (FLAGS_trace_re2)
    602           LOG(INFO) << "Match " << trunc(pattern_)
    603                     << " [" << CEscape(subtext) << "]"
    604                     << " used DFA - no match.";
    605         return false;
    606       }
    607       if (FLAGS_trace_re2)
    608         LOG(INFO) << "Match " << trunc(pattern_)
    609                   << " [" << CEscape(subtext) << "]"
    610                   << " used DFA - match";
    611       if (matchp == NULL)  // Matched.  Don't care where
    612         return true;
    613       // SearchDFA set match[0].end() but didn't know where the
    614       // match started.  Run the regexp backward from match[0].end()
    615       // to find the longest possible match -- that's where it started.
    616       Prog* prog = ReverseProg();
    617       if (prog == NULL)
    618         return false;
    619       if (!prog->SearchDFA(match, text, Prog::kAnchored,
    620                            Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
    621         if (dfa_failed) {
    622           // Fall back to NFA below.
    623           skipped_test = true;
    624           if (FLAGS_trace_re2)
    625             LOG(INFO) << "Match " << trunc(pattern_)
    626                       << " [" << CEscape(subtext) << "]"
    627                       << " reverse DFA failed.";
    628           break;
    629         }
    630         if (FLAGS_trace_re2)
    631           LOG(INFO) << "Match " << trunc(pattern_)
    632                     << " [" << CEscape(subtext) << "]"
    633                     << " DFA inconsistency.";
    634         LOG(ERROR) << "DFA inconsistency";
    635         return false;
    636       }
    637       if (FLAGS_trace_re2)
    638         LOG(INFO) << "Match " << trunc(pattern_)
    639                   << " [" << CEscape(subtext) << "]"
    640                   << " used reverse DFA.";
    641       break;
    642     }
    643 
    644     case ANCHOR_BOTH:
    645     case ANCHOR_START:
    646       if (re_anchor == ANCHOR_BOTH)
    647         kind = Prog::kFullMatch;
    648       anchor = Prog::kAnchored;
    649 
    650       // If only a small amount of text and need submatch
    651       // information anyway and we're going to use OnePass or BitState
    652       // to get it, we might as well not even bother with the DFA:
    653       // OnePass or BitState will be fast enough.
    654       // On tiny texts, OnePass outruns even the DFA, and
    655       // it doesn't have the shared state and occasional mutex that
    656       // the DFA does.
    657       if (can_one_pass && text.size() <= 4096 &&
    658           (ncap > 1 || text.size() <= 8)) {
    659         if (FLAGS_trace_re2)
    660           LOG(INFO) << "Match " << trunc(pattern_)
    661                     << " [" << CEscape(subtext) << "]"
    662                     << " skipping DFA for OnePass.";
    663         skipped_test = true;
    664         break;
    665       }
    666       if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
    667         if (FLAGS_trace_re2)
    668           LOG(INFO) << "Match " << trunc(pattern_)
    669                     << " [" << CEscape(subtext) << "]"
    670                     << " skipping DFA for BitState.";
    671         skipped_test = true;
    672         break;
    673       }
    674       if (!prog_->SearchDFA(subtext, text, anchor, kind,
    675                             &match, &dfa_failed, NULL)) {
    676         if (dfa_failed) {
    677           if (FLAGS_trace_re2)
    678             LOG(INFO) << "Match " << trunc(pattern_)
    679                       << " [" << CEscape(subtext) << "]"
    680                       << " DFA failed.";
    681           skipped_test = true;
    682           break;
    683         }
    684         if (FLAGS_trace_re2)
    685           LOG(INFO) << "Match " << trunc(pattern_)
    686                     << " [" << CEscape(subtext) << "]"
    687                     << " used DFA - no match.";
    688         return false;
    689       }
    690       break;
    691   }
    692 
    693   if (!skipped_test && ncap <= 1) {
    694     // We know exactly where it matches.  That's enough.
    695     if (ncap == 1)
    696       submatch[0] = match;
    697   } else {
    698     StringPiece subtext1;
    699     if (skipped_test) {
    700       // DFA ran out of memory or was skipped:
    701       // need to search in entire original text.
    702       subtext1 = subtext;
    703     } else {
    704       // DFA found the exact match location:
    705       // let NFA run an anchored, full match search
    706       // to find submatch locations.
    707       subtext1 = match;
    708       anchor = Prog::kAnchored;
    709       kind = Prog::kFullMatch;
    710     }
    711 
    712     if (can_one_pass && anchor != Prog::kUnanchored) {
    713       if (FLAGS_trace_re2)
    714         LOG(INFO) << "Match " << trunc(pattern_)
    715                   << " [" << CEscape(subtext) << "]"
    716                   << " using OnePass.";
    717       if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
    718         if (!skipped_test)
    719           LOG(ERROR) << "SearchOnePass inconsistency";
    720         return false;
    721       }
    722     } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
    723       if (FLAGS_trace_re2)
    724         LOG(INFO) << "Match " << trunc(pattern_)
    725                   << " [" << CEscape(subtext) << "]"
    726                   << " using BitState.";
    727       if (!prog_->SearchBitState(subtext1, text, anchor,
    728                                  kind, submatch, ncap)) {
    729         if (!skipped_test)
    730           LOG(ERROR) << "SearchBitState inconsistency";
    731         return false;
    732       }
    733     } else {
    734       if (FLAGS_trace_re2)
    735         LOG(INFO) << "Match " << trunc(pattern_)
    736                   << " [" << CEscape(subtext) << "]"
    737                   << " using NFA.";
    738       if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
    739         if (!skipped_test)
    740           LOG(ERROR) << "SearchNFA inconsistency";
    741         return false;
    742       }
    743     }
    744   }
    745 
    746   // Adjust overall match for required prefix that we stripped off.
    747   if (prefixlen > 0 && nsubmatch > 0)
    748     submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
    749                               submatch[0].size() + prefixlen);
    750 
    751   // Zero submatches that don't exist in the regexp.
    752   for (int i = ncap; i < nsubmatch; i++)
    753     submatch[i] = NULL;
    754   return true;
    755 }
    756 
    757 // Internal matcher - like Match() but takes Args not StringPieces.
    758 bool RE2::DoMatch(const StringPiece& text,
    759                   Anchor anchor,
    760                   int* consumed,
    761                   const Arg* const* args,
    762                   int n) const {
    763   if (!ok()) {
    764     if (options_.log_errors())
    765       LOG(ERROR) << "Invalid RE2: " << *error_;
    766     return false;
    767   }
    768 
    769   // Count number of capture groups needed.
    770   int nvec;
    771   if (n == 0 && consumed == NULL)
    772     nvec = 0;
    773   else
    774     nvec = n+1;
    775 
    776   StringPiece* vec;
    777   StringPiece stkvec[kVecSize];
    778   StringPiece* heapvec = NULL;
    779 
    780   if (nvec <= arraysize(stkvec)) {
    781     vec = stkvec;
    782   } else {
    783     vec = new StringPiece[nvec];
    784     heapvec = vec;
    785   }
    786 
    787   if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
    788     delete[] heapvec;
    789     return false;
    790   }
    791 
    792   if(consumed != NULL)
    793     *consumed = vec[0].end() - text.begin();
    794 
    795   if (n == 0 || args == NULL) {
    796     // We are not interested in results
    797     delete[] heapvec;
    798     return true;
    799   }
    800 
    801   int ncap = NumberOfCapturingGroups();
    802   if (ncap < n) {
    803     // RE has fewer capturing groups than number of arg pointers passed in
    804     VLOG(1) << "Asked for " << n << " but only have " << ncap;
    805     delete[] heapvec;
    806     return false;
    807   }
    808 
    809   // If we got here, we must have matched the whole pattern.
    810   for (int i = 0; i < n; i++) {
    811     const StringPiece& s = vec[i+1];
    812     if (!args[i]->Parse(s.data(), s.size())) {
    813       // TODO: Should we indicate what the error was?
    814       VLOG(1) << "Parse error on #" << i << " " << s << " "
    815 	      << (void*)s.data() << "/" << s.size();
    816       delete[] heapvec;
    817       return false;
    818     }
    819   }
    820 
    821   delete[] heapvec;
    822   return true;
    823 }
    824 
    825 // Append the "rewrite" string, with backslash subsitutions from "vec",
    826 // to string "out".
    827 bool RE2::Rewrite(string *out, const StringPiece &rewrite,
    828                  const StringPiece *vec, int veclen) const {
    829   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    830        s < end; s++) {
    831     int c = *s;
    832     if (c == '\\') {
    833       s++;
    834       c = (s < end) ? *s : -1;
    835       if (isdigit(c)) {
    836         int n = (c - '0');
    837         if (n >= veclen) {
    838           LOG(ERROR) << "requested group " << n
    839                      << " in regexp " << rewrite.data();
    840           return false;
    841         }
    842         StringPiece snip = vec[n];
    843         if (snip.size() > 0)
    844           out->append(snip.data(), snip.size());
    845       } else if (c == '\\') {
    846         out->push_back('\\');
    847       } else {
    848         LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
    849         return false;
    850       }
    851     } else {
    852       out->push_back(c);
    853     }
    854   }
    855   return true;
    856 }
    857 
    858 // Return the number of capturing subpatterns, or -1 if the
    859 // regexp wasn't valid on construction.
    860 int RE2::NumberOfCapturingGroups() const {
    861   if (suffix_regexp_ == NULL)
    862     return -1;
    863   ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
    864     " multiple threads end up doing the same work in parallel.");
    865   if (num_captures_ == -1)
    866     num_captures_ = suffix_regexp_->NumCaptures();
    867   return num_captures_;
    868 }
    869 
    870 // Checks that the rewrite string is well-formed with respect to this
    871 // regular expression.
    872 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
    873   int max_token = -1;
    874   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    875        s < end; s++) {
    876     int c = *s;
    877     if (c != '\\') {
    878       continue;
    879     }
    880     if (++s == end) {
    881       *error = "Rewrite schema error: '\\' not allowed at end.";
    882       return false;
    883     }
    884     c = *s;
    885     if (c == '\\') {
    886       continue;
    887     }
    888     if (!isdigit(c)) {
    889       *error = "Rewrite schema error: "
    890                "'\\' must be followed by a digit or '\\'.";
    891       return false;
    892     }
    893     int n = (c - '0');
    894     if (max_token < n) {
    895       max_token = n;
    896     }
    897   }
    898 
    899   if (max_token > NumberOfCapturingGroups()) {
    900     SStringPrintf(error, "Rewrite schema requests %d matches, "
    901                   "but the regexp only has %d parenthesized subexpressions.",
    902                   max_token, NumberOfCapturingGroups());
    903     return false;
    904   }
    905   return true;
    906 }
    907 
    908 /***** Parsers for various types *****/
    909 
    910 bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
    911   // We fail if somebody asked us to store into a non-NULL void* pointer
    912   return (dest == NULL);
    913 }
    914 
    915 bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
    916   if (dest == NULL) return true;
    917   reinterpret_cast<string*>(dest)->assign(str, n);
    918   return true;
    919 }
    920 
    921 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
    922   if (dest == NULL) return true;
    923   reinterpret_cast<StringPiece*>(dest)->set(str, n);
    924   return true;
    925 }
    926 
    927 bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
    928   if (n != 1) return false;
    929   if (dest == NULL) return true;
    930   *(reinterpret_cast<char*>(dest)) = str[0];
    931   return true;
    932 }
    933 
    934 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
    935   if (n != 1) return false;
    936   if (dest == NULL) return true;
    937   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
    938   return true;
    939 }
    940 
    941 // Largest number spec that we are willing to parse
    942 static const int kMaxNumberLength = 32;
    943 
    944 // REQUIRES "buf" must have length at least kMaxNumberLength+1
    945 // Copies "str" into "buf" and null-terminates.
    946 // Overwrites *np with the new length.
    947 static const char* TerminateNumber(char* buf, const char* str, int* np) {
    948   int n = *np;
    949   if (n <= 0) return "";
    950   if (n > 0 && isspace(*str)) {
    951     // We are less forgiving than the strtoxxx() routines and do not
    952     // allow leading spaces.
    953     return "";
    954   }
    955 
    956   // Although buf has a fixed maximum size, we can still handle
    957   // arbitrarily large integers correctly by omitting leading zeros.
    958   // (Numbers that are still too long will be out of range.)
    959   // Before deciding whether str is too long,
    960   // remove leading zeros with s/000+/00/.
    961   // Leaving the leading two zeros in place means that
    962   // we don't change 0000x123 (invalid) into 0x123 (valid).
    963   // Skip over leading - before replacing.
    964   bool neg = false;
    965   if (n >= 1 && str[0] == '-') {
    966     neg = true;
    967     n--;
    968     str++;
    969   }
    970 
    971   if (n >= 3 && str[0] == '0' && str[1] == '0') {
    972     while (n >= 3 && str[2] == '0') {
    973       n--;
    974       str++;
    975     }
    976   }
    977 
    978   if (neg) {  // make room in buf for -
    979     n++;
    980     str--;
    981   }
    982 
    983   if (n > kMaxNumberLength) return "";
    984 
    985   memmove(buf, str, n);
    986   if (neg) {
    987     buf[0] = '-';
    988   }
    989   buf[n] = '\0';
    990   *np = n;
    991   return buf;
    992 }
    993 
    994 bool RE2::Arg::parse_long_radix(const char* str,
    995                                int n,
    996                                void* dest,
    997                                int radix) {
    998   if (n == 0) return false;
    999   char buf[kMaxNumberLength+1];
   1000   str = TerminateNumber(buf, str, &n);
   1001   char* end;
   1002   errno = 0;
   1003   long r = strtol(str, &end, radix);
   1004   if (end != str + n) return false;   // Leftover junk
   1005   if (errno) return false;
   1006   if (dest == NULL) return true;
   1007   *(reinterpret_cast<long*>(dest)) = r;
   1008   return true;
   1009 }
   1010 
   1011 bool RE2::Arg::parse_ulong_radix(const char* str,
   1012                                 int n,
   1013                                 void* dest,
   1014                                 int radix) {
   1015   if (n == 0) return false;
   1016   char buf[kMaxNumberLength+1];
   1017   str = TerminateNumber(buf, str, &n);
   1018   if (str[0] == '-') {
   1019    // strtoul() will silently accept negative numbers and parse
   1020    // them.  This module is more strict and treats them as errors.
   1021    return false;
   1022   }
   1023 
   1024   char* end;
   1025   errno = 0;
   1026   unsigned long r = strtoul(str, &end, radix);
   1027   if (end != str + n) return false;   // Leftover junk
   1028   if (errno) return false;
   1029   if (dest == NULL) return true;
   1030   *(reinterpret_cast<unsigned long*>(dest)) = r;
   1031   return true;
   1032 }
   1033 
   1034 bool RE2::Arg::parse_short_radix(const char* str,
   1035                                 int n,
   1036                                 void* dest,
   1037                                 int radix) {
   1038   long r;
   1039   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
   1040   if ((short)r != r) return false;       // Out of range
   1041   if (dest == NULL) return true;
   1042   *(reinterpret_cast<short*>(dest)) = r;
   1043   return true;
   1044 }
   1045 
   1046 bool RE2::Arg::parse_ushort_radix(const char* str,
   1047                                  int n,
   1048                                  void* dest,
   1049                                  int radix) {
   1050   unsigned long r;
   1051   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
   1052   if ((ushort)r != r) return false;                      // Out of range
   1053   if (dest == NULL) return true;
   1054   *(reinterpret_cast<unsigned short*>(dest)) = r;
   1055   return true;
   1056 }
   1057 
   1058 bool RE2::Arg::parse_int_radix(const char* str,
   1059                               int n,
   1060                               void* dest,
   1061                               int radix) {
   1062   long r;
   1063   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
   1064   if ((int)r != r) return false;         // Out of range
   1065   if (dest == NULL) return true;
   1066   *(reinterpret_cast<int*>(dest)) = r;
   1067   return true;
   1068 }
   1069 
   1070 bool RE2::Arg::parse_uint_radix(const char* str,
   1071                                int n,
   1072                                void* dest,
   1073                                int radix) {
   1074   unsigned long r;
   1075   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
   1076   if ((uint)r != r) return false;                       // Out of range
   1077   if (dest == NULL) return true;
   1078   *(reinterpret_cast<unsigned int*>(dest)) = r;
   1079   return true;
   1080 }
   1081 
   1082 bool RE2::Arg::parse_longlong_radix(const char* str,
   1083                                    int n,
   1084                                    void* dest,
   1085                                    int radix) {
   1086   if (n == 0) return false;
   1087   char buf[kMaxNumberLength+1];
   1088   str = TerminateNumber(buf, str, &n);
   1089   char* end;
   1090   errno = 0;
   1091   int64 r = strtoll(str, &end, radix);
   1092   if (end != str + n) return false;   // Leftover junk
   1093   if (errno) return false;
   1094   if (dest == NULL) return true;
   1095   *(reinterpret_cast<int64*>(dest)) = r;
   1096   return true;
   1097 }
   1098 
   1099 bool RE2::Arg::parse_ulonglong_radix(const char* str,
   1100                                     int n,
   1101                                     void* dest,
   1102                                     int radix) {
   1103   if (n == 0) return false;
   1104   char buf[kMaxNumberLength+1];
   1105   str = TerminateNumber(buf, str, &n);
   1106   if (str[0] == '-') {
   1107     // strtoull() will silently accept negative numbers and parse
   1108     // them.  This module is more strict and treats them as errors.
   1109     return false;
   1110   }
   1111   char* end;
   1112   errno = 0;
   1113   uint64 r = strtoull(str, &end, radix);
   1114   if (end != str + n) return false;   // Leftover junk
   1115   if (errno) return false;
   1116   if (dest == NULL) return true;
   1117   *(reinterpret_cast<uint64*>(dest)) = r;
   1118   return true;
   1119 }
   1120 
   1121 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
   1122   if (n == 0) return false;
   1123   static const int kMaxLength = 200;
   1124   char buf[kMaxLength];
   1125   if (n >= kMaxLength) return false;
   1126   memcpy(buf, str, n);
   1127   buf[n] = '\0';
   1128   errno = 0;
   1129   char* end;
   1130   double r;
   1131   if (isfloat) {
   1132     r = strtof(buf, &end);
   1133   } else {
   1134     r = strtod(buf, &end);
   1135   }
   1136   if (end != buf + n) return false;   // Leftover junk
   1137   if (errno) return false;
   1138   if (dest == NULL) return true;
   1139   if (isfloat) {
   1140     *(reinterpret_cast<float*>(dest)) = r;
   1141   } else {
   1142     *(reinterpret_cast<double*>(dest)) = r;
   1143   }
   1144   return true;
   1145 }
   1146 
   1147 bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
   1148   return parse_double_float(str, n, false, dest);
   1149 }
   1150 
   1151 bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
   1152   return parse_double_float(str, n, true, dest);
   1153 }
   1154 
   1155 
   1156 #define DEFINE_INTEGER_PARSERS(name)                                        \
   1157   bool RE2::Arg::parse_##name(const char* str, int n, void* dest) {          \
   1158     return parse_##name##_radix(str, n, dest, 10);                          \
   1159   }                                                                         \
   1160   bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
   1161     return parse_##name##_radix(str, n, dest, 16);                          \
   1162   }                                                                         \
   1163   bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
   1164     return parse_##name##_radix(str, n, dest, 8);                           \
   1165   }                                                                         \
   1166   bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
   1167     return parse_##name##_radix(str, n, dest, 0);                           \
   1168   }
   1169 
   1170 DEFINE_INTEGER_PARSERS(short);
   1171 DEFINE_INTEGER_PARSERS(ushort);
   1172 DEFINE_INTEGER_PARSERS(int);
   1173 DEFINE_INTEGER_PARSERS(uint);
   1174 DEFINE_INTEGER_PARSERS(long);
   1175 DEFINE_INTEGER_PARSERS(ulong);
   1176 DEFINE_INTEGER_PARSERS(longlong);
   1177 DEFINE_INTEGER_PARSERS(ulonglong);
   1178 
   1179 #undef DEFINE_INTEGER_PARSERS
   1180 
   1181 }  // namespace re2
   1182