Home | History | Annotate | Download | only in re2
      1 // Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Regular expression interface RE2.
      6 //
      7 // Originally the PCRE C++ wrapper, but adapted to use
      8 // the new automata-based regular expression engines.
      9 
     10 #include "re2/re2.h"
     11 
     12 #include <stdio.h>
     13 #include <string>
     14 #include <pthread.h>
     15 #include <errno.h>
     16 #include "util/util.h"
     17 #include "util/flags.h"
     18 #include "re2/prog.h"
     19 #include "re2/regexp.h"
     20 
     21 DEFINE_bool(trace_re2, false, "trace RE2 execution");
     22 
     23 namespace re2 {
     24 
     25 // Maximum number of args we can set
     26 static const int kMaxArgs = 16;
     27 static const int kVecSize = 1+kMaxArgs;
     28 
     29 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch;
     30 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch;
     31 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;
     32 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume;
     33 
     34 // This will trigger LNK2005 error in MSVC.
     35 #ifndef COMPILER_MSVC
     36 const int RE2::Options::kDefaultMaxMem;  // initialized in re2.h
     37 #endif  // COMPILER_MSVC
     38 
     39 RE2::Options::Options(RE2::CannedOptions opt)
     40   : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
     41     posix_syntax_(opt == RE2::POSIX),
     42     longest_match_(opt == RE2::POSIX),
     43     log_errors_(opt != RE2::Quiet),
     44     max_mem_(kDefaultMaxMem),
     45     literal_(false),
     46     never_nl_(false),
     47     never_capture_(false),
     48     case_sensitive_(true),
     49     perl_classes_(false),
     50     word_boundary_(false),
     51     one_line_(false) {
     52 }
     53 
     54 // static empty things for use as const references.
     55 // To avoid global constructors, initialized on demand.
     56 GLOBAL_MUTEX(empty_mutex);
     57 static const string *empty_string;
     58 static const map<string, int> *empty_named_groups;
     59 static const map<int, string> *empty_group_names;
     60 
     61 static void InitEmpty() {
     62   GLOBAL_MUTEX_LOCK(empty_mutex);
     63   if (empty_string == NULL) {
     64     empty_string = new string;
     65     empty_named_groups = new map<string, int>;
     66     empty_group_names = new map<int, string>;
     67   }
     68   GLOBAL_MUTEX_UNLOCK(empty_mutex);
     69 }
     70 
     71 // Converts from Regexp error code to RE2 error code.
     72 // Maybe some day they will diverge.  In any event, this
     73 // hides the existence of Regexp from RE2 users.
     74 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
     75   switch (code) {
     76     case re2::kRegexpSuccess:
     77       return RE2::NoError;
     78     case re2::kRegexpInternalError:
     79       return RE2::ErrorInternal;
     80     case re2::kRegexpBadEscape:
     81       return RE2::ErrorBadEscape;
     82     case re2::kRegexpBadCharClass:
     83       return RE2::ErrorBadCharClass;
     84     case re2::kRegexpBadCharRange:
     85       return RE2::ErrorBadCharRange;
     86     case re2::kRegexpMissingBracket:
     87       return RE2::ErrorMissingBracket;
     88     case re2::kRegexpMissingParen:
     89       return RE2::ErrorMissingParen;
     90     case re2::kRegexpTrailingBackslash:
     91       return RE2::ErrorTrailingBackslash;
     92     case re2::kRegexpRepeatArgument:
     93       return RE2::ErrorRepeatArgument;
     94     case re2::kRegexpRepeatSize:
     95       return RE2::ErrorRepeatSize;
     96     case re2::kRegexpRepeatOp:
     97       return RE2::ErrorRepeatOp;
     98     case re2::kRegexpBadPerlOp:
     99       return RE2::ErrorBadPerlOp;
    100     case re2::kRegexpBadUTF8:
    101       return RE2::ErrorBadUTF8;
    102     case re2::kRegexpBadNamedCapture:
    103       return RE2::ErrorBadNamedCapture;
    104   }
    105   return RE2::ErrorInternal;
    106 }
    107 
    108 static string trunc(const StringPiece& pattern) {
    109   if (pattern.size() < 100)
    110     return pattern.as_string();
    111   return pattern.substr(0, 100).as_string() + "...";
    112 }
    113 
    114 
    115 RE2::RE2(const char* pattern) {
    116   Init(pattern, DefaultOptions);
    117 }
    118 
    119 RE2::RE2(const string& pattern) {
    120   Init(pattern, DefaultOptions);
    121 }
    122 
    123 RE2::RE2(const StringPiece& pattern) {
    124   Init(pattern, DefaultOptions);
    125 }
    126 
    127 RE2::RE2(const StringPiece& pattern, const Options& options) {
    128   Init(pattern, options);
    129 }
    130 
    131 int RE2::Options::ParseFlags() const {
    132   int flags = Regexp::ClassNL;
    133   switch (encoding()) {
    134     default:
    135       if (log_errors())
    136         LOG(ERROR) << "Unknown encoding " << encoding();
    137       break;
    138     case RE2::Options::EncodingUTF8:
    139       break;
    140     case RE2::Options::EncodingLatin1:
    141       flags |= Regexp::Latin1;
    142       break;
    143   }
    144 
    145   if (!posix_syntax())
    146     flags |= Regexp::LikePerl;
    147 
    148   if (literal())
    149     flags |= Regexp::Literal;
    150 
    151   if (never_nl())
    152     flags |= Regexp::NeverNL;
    153 
    154   if (never_capture())
    155     flags |= Regexp::NeverCapture;
    156 
    157   if (!case_sensitive())
    158     flags |= Regexp::FoldCase;
    159 
    160   if (perl_classes())
    161     flags |= Regexp::PerlClasses;
    162 
    163   if (word_boundary())
    164     flags |= Regexp::PerlB;
    165 
    166   if (one_line())
    167     flags |= Regexp::OneLine;
    168 
    169   return flags;
    170 }
    171 
    172 void RE2::Init(const StringPiece& pattern, const Options& options) {
    173   mutex_ = new Mutex;
    174   pattern_ = pattern.as_string();
    175   options_.Copy(options);
    176   InitEmpty();
    177   error_ = empty_string;
    178   error_code_ = NoError;
    179   suffix_regexp_ = NULL;
    180   entire_regexp_ = NULL;
    181   prog_ = NULL;
    182   rprog_ = NULL;
    183   named_groups_ = NULL;
    184   group_names_ = NULL;
    185   num_captures_ = -1;
    186 
    187   RegexpStatus status;
    188   entire_regexp_ = Regexp::Parse(
    189     pattern_,
    190     static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
    191     &status);
    192   if (entire_regexp_ == NULL) {
    193     if (error_ == empty_string)
    194       error_ = new string(status.Text());
    195     if (options_.log_errors()) {
    196       LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
    197                  << status.Text();
    198     }
    199     error_arg_ = status.error_arg().as_string();
    200     error_code_ = RegexpErrorToRE2(status.code());
    201     return;
    202   }
    203 
    204   prefix_.clear();
    205   prefix_foldcase_ = false;
    206   re2::Regexp* suffix;
    207   if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
    208     suffix_regexp_ = suffix;
    209   else
    210     suffix_regexp_ = entire_regexp_->Incref();
    211 
    212   // Two thirds of the memory goes to the forward Prog,
    213   // one third to the reverse prog, because the forward
    214   // Prog has two DFAs but the reverse prog has one.
    215   prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
    216   if (prog_ == NULL) {
    217     if (options_.log_errors())
    218       LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
    219     error_ = new string("pattern too large - compile failed");
    220     error_code_ = RE2::ErrorPatternTooLarge;
    221     return;
    222   }
    223 
    224   // Could delay this until the first match call that
    225   // cares about submatch information, but the one-pass
    226   // machine's memory gets cut from the DFA memory budget,
    227   // and that is harder to do if the DFA has already
    228   // been built.
    229   is_one_pass_ = prog_->IsOnePass();
    230 }
    231 
    232 // Returns rprog_, computing it if needed.
    233 re2::Prog* RE2::ReverseProg() const {
    234   MutexLock l(mutex_);
    235   if (rprog_ == NULL && error_ == empty_string) {
    236     rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
    237     if (rprog_ == NULL) {
    238       if (options_.log_errors())
    239         LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
    240       error_ = new string("pattern too large - reverse compile failed");
    241       error_code_ = RE2::ErrorPatternTooLarge;
    242       return NULL;
    243     }
    244   }
    245   return rprog_;
    246 }
    247 
    248 RE2::~RE2() {
    249   if (suffix_regexp_)
    250     suffix_regexp_->Decref();
    251   if (entire_regexp_)
    252     entire_regexp_->Decref();
    253   delete mutex_;
    254   delete prog_;
    255   delete rprog_;
    256   if (error_ != empty_string)
    257     delete error_;
    258   if (named_groups_ != NULL && named_groups_ != empty_named_groups)
    259     delete named_groups_;
    260   if (group_names_ != NULL &&  group_names_ != empty_group_names)
    261     delete group_names_;
    262 }
    263 
    264 int RE2::ProgramSize() const {
    265   if (prog_ == NULL)
    266     return -1;
    267   return prog_->size();
    268 }
    269 
    270 // Returns named_groups_, computing it if needed.
    271 const map<string, int>&  RE2::NamedCapturingGroups() const {
    272   MutexLock l(mutex_);
    273   if (!ok())
    274     return *empty_named_groups;
    275   if (named_groups_ == NULL) {
    276     named_groups_ = suffix_regexp_->NamedCaptures();
    277     if (named_groups_ == NULL)
    278       named_groups_ = empty_named_groups;
    279   }
    280   return *named_groups_;
    281 }
    282 
    283 // Returns group_names_, computing it if needed.
    284 const map<int, string>&  RE2::CapturingGroupNames() const {
    285   MutexLock l(mutex_);
    286   if (!ok())
    287     return *empty_group_names;
    288   if (group_names_ == NULL) {
    289     group_names_ = suffix_regexp_->CaptureNames();
    290     if (group_names_ == NULL)
    291       group_names_ = empty_group_names;
    292   }
    293   return *group_names_;
    294 }
    295 
    296 /***** Convenience interfaces *****/
    297 
    298 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
    299                      const Arg* const args[], int n) {
    300   return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
    301 }
    302 
    303 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
    304                         const Arg* const args[], int n) {
    305   return re.DoMatch(text, UNANCHORED, NULL, args, n);
    306 }
    307 
    308 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
    309                    const Arg* const args[], int n) {
    310   int consumed;
    311   if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
    312     input->remove_prefix(consumed);
    313     return true;
    314   } else {
    315     return false;
    316   }
    317 }
    318 
    319 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
    320                           const Arg* const args[], int n) {
    321   int consumed;
    322   if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
    323     input->remove_prefix(consumed);
    324     return true;
    325   } else {
    326     return false;
    327   }
    328 }
    329 
    330 // Returns the maximum submatch needed for the rewrite to be done by Replace().
    331 // E.g. if rewrite == "foo \\2,\\1", returns 2.
    332 int RE2::MaxSubmatch(const StringPiece& rewrite) {
    333   int max = 0;
    334   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    335        s < end; s++) {
    336     if (*s == '\\') {
    337       s++;
    338       int c = (s < end) ? *s : -1;
    339       if (isdigit(c)) {
    340         int n = (c - '0');
    341         if (n > max)
    342           max = n;
    343       }
    344     }
    345   }
    346   return max;
    347 }
    348 
    349 bool RE2::Replace(string *str,
    350                  const RE2& re,
    351                  const StringPiece& rewrite) {
    352   StringPiece vec[kVecSize];
    353   int nvec = 1 + MaxSubmatch(rewrite);
    354   if (nvec > arraysize(vec))
    355     return false;
    356   if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
    357     return false;
    358 
    359   string s;
    360   if (!re.Rewrite(&s, rewrite, vec, nvec))
    361     return false;
    362 
    363   assert(vec[0].begin() >= str->data());
    364   assert(vec[0].end() <= str->data()+str->size());
    365   str->replace(vec[0].data() - str->data(), vec[0].size(), s);
    366   return true;
    367 }
    368 
    369 int RE2::GlobalReplace(string *str,
    370                       const RE2& re,
    371                       const StringPiece& rewrite) {
    372   StringPiece vec[kVecSize];
    373   int nvec = 1 + MaxSubmatch(rewrite);
    374   if (nvec > arraysize(vec))
    375     return false;
    376 
    377   const char* p = str->data();
    378   const char* ep = p + str->size();
    379   const char* lastend = NULL;
    380   string out;
    381   int count = 0;
    382   while (p <= ep) {
    383     if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
    384       break;
    385     if (p < vec[0].begin())
    386       out.append(p, vec[0].begin() - p);
    387     if (vec[0].begin() == lastend && vec[0].size() == 0) {
    388       // Disallow empty match at end of last match: skip ahead.
    389       if (p < ep)
    390         out.append(p, 1);
    391       p++;
    392       continue;
    393     }
    394     re.Rewrite(&out, rewrite, vec, nvec);
    395     p = vec[0].end();
    396     lastend = p;
    397     count++;
    398   }
    399 
    400   if (count == 0)
    401     return 0;
    402 
    403   if (p < ep)
    404     out.append(p, ep - p);
    405   swap(out, *str);
    406   return count;
    407 }
    408 
    409 bool RE2::Extract(const StringPiece &text,
    410                  const RE2& re,
    411                  const StringPiece &rewrite,
    412                  string *out) {
    413   StringPiece vec[kVecSize];
    414   int nvec = 1 + MaxSubmatch(rewrite);
    415   if (nvec > arraysize(vec))
    416     return false;
    417 
    418   if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
    419     return false;
    420 
    421   out->clear();
    422   return re.Rewrite(out, rewrite, vec, nvec);
    423 }
    424 
    425 string RE2::QuoteMeta(const StringPiece& unquoted) {
    426   string result;
    427   result.reserve(unquoted.size() << 1);
    428 
    429   // Escape any ascii character not in [A-Za-z_0-9].
    430   //
    431   // Note that it's legal to escape a character even if it has no
    432   // special meaning in a regular expression -- so this function does
    433   // that.  (This also makes it identical to the perl function of the
    434   // same name except for the null-character special case;
    435   // see `perldoc -f quotemeta`.)
    436   for (int ii = 0; ii < unquoted.length(); ++ii) {
    437     // Note that using 'isalnum' here raises the benchmark time from
    438     // 32ns to 58ns:
    439     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
    440         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
    441         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
    442         unquoted[ii] != '_' &&
    443         // If this is the part of a UTF8 or Latin1 character, we need
    444         // to copy this byte without escaping.  Experimentally this is
    445         // what works correctly with the regexp library.
    446         !(unquoted[ii] & 128)) {
    447       if (unquoted[ii] == '\0') {  // Special handling for null chars.
    448         // Note that this special handling is not strictly required for RE2,
    449         // but this quoting is required for other regexp libraries such as
    450         // PCRE.
    451         // Can't use "\\0" since the next character might be a digit.
    452         result += "\\x00";
    453         continue;
    454       }
    455       result += '\\';
    456     }
    457     result += unquoted[ii];
    458   }
    459 
    460   return result;
    461 }
    462 
    463 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
    464   if (prog_ == NULL)
    465     return false;
    466 
    467   int n = prefix_.size();
    468   if (n > maxlen)
    469     n = maxlen;
    470 
    471   // Determine initial min max from prefix_ literal.
    472   string pmin, pmax;
    473   pmin = prefix_.substr(0, n);
    474   pmax = prefix_.substr(0, n);
    475   if (prefix_foldcase_) {
    476     // prefix is ASCII lowercase; change pmin to uppercase.
    477     for (int i = 0; i < n; i++) {
    478       if ('a' <= pmin[i] && pmin[i] <= 'z')
    479         pmin[i] += 'A' - 'a';
    480     }
    481   }
    482 
    483   // Add to prefix min max using PossibleMatchRange on regexp.
    484   string dmin, dmax;
    485   maxlen -= n;
    486   if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
    487     pmin += dmin;
    488     pmax += dmax;
    489   } else if (pmax.size() > 0) {
    490     // prog_->PossibleMatchRange has failed us,
    491     // but we still have useful information from prefix_.
    492     // Round up pmax to allow any possible suffix.
    493     pmax = PrefixSuccessor(pmax);
    494   } else {
    495     // Nothing useful.
    496     *min = "";
    497     *max = "";
    498     return false;
    499   }
    500 
    501   *min = pmin;
    502   *max = pmax;
    503   return true;
    504 }
    505 
    506 // Avoid possible locale nonsense in standard strcasecmp.
    507 // The string a is known to be all lowercase.
    508 static int ascii_strcasecmp(const char* a, const char* b, int len) {
    509   const char *ae = a + len;
    510 
    511   for (; a < ae; a++, b++) {
    512     uint8 x = *a;
    513     uint8 y = *b;
    514     if ('A' <= y && y <= 'Z')
    515       y += 'a' - 'A';
    516     if (x != y)
    517       return x - y;
    518   }
    519   return 0;
    520 }
    521 
    522 
    523 /***** Actual matching and rewriting code *****/
    524 
    525 bool RE2::Match(const StringPiece& text,
    526                 int startpos,
    527                 int endpos,
    528                 Anchor re_anchor,
    529                 StringPiece* submatch,
    530                 int nsubmatch) const {
    531   if (!ok() || suffix_regexp_ == NULL) {
    532     if (options_.log_errors())
    533       LOG(ERROR) << "Invalid RE2: " << *error_;
    534     return false;
    535   }
    536 
    537   if (startpos < 0 || startpos > endpos || endpos > text.size()) {
    538     if (options_.log_errors())
    539       LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
    540     return false;
    541   }
    542 
    543   StringPiece subtext = text;
    544   subtext.remove_prefix(startpos);
    545   subtext.remove_suffix(text.size() - endpos);
    546 
    547   // Use DFAs to find exact location of match, filter out non-matches.
    548 
    549   // Don't ask for the location if we won't use it.
    550   // SearchDFA can do extra optimizations in that case.
    551   StringPiece match;
    552   StringPiece* matchp = &match;
    553   if (nsubmatch == 0)
    554     matchp = NULL;
    555 
    556   int ncap = 1 + NumberOfCapturingGroups();
    557   if (ncap > nsubmatch)
    558     ncap = nsubmatch;
    559 
    560   // If the regexp is anchored explicitly, must not be in middle of text.
    561   if (prog_->anchor_start() && startpos != 0)
    562     return false;
    563 
    564   // If the regexp is anchored explicitly, update re_anchor
    565   // so that we can potentially fall into a faster case below.
    566   if (prog_->anchor_start() && prog_->anchor_end())
    567     re_anchor = ANCHOR_BOTH;
    568   else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
    569     re_anchor = ANCHOR_START;
    570 
    571   // Check for the required prefix, if any.
    572   int prefixlen = 0;
    573   if (!prefix_.empty()) {
    574     if (startpos != 0)
    575       return false;
    576     prefixlen = prefix_.size();
    577     if (prefixlen > subtext.size())
    578       return false;
    579     if (prefix_foldcase_) {
    580       if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
    581         return false;
    582     } else {
    583       if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
    584         return false;
    585     }
    586     subtext.remove_prefix(prefixlen);
    587     // If there is a required prefix, the anchor must be at least ANCHOR_START.
    588     if (re_anchor != ANCHOR_BOTH)
    589       re_anchor = ANCHOR_START;
    590   }
    591 
    592   Prog::Anchor anchor = Prog::kUnanchored;
    593   Prog::MatchKind kind = Prog::kFirstMatch;
    594   if (options_.longest_match())
    595     kind = Prog::kLongestMatch;
    596   bool skipped_test = false;
    597 
    598   bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
    599 
    600   // SearchBitState allocates a bit vector of size prog_->size() * text.size().
    601   // It also allocates a stack of 3-word structures which could potentially
    602   // grow as large as prog_->size() * text.size() but in practice is much
    603   // smaller.
    604   // Conditions for using SearchBitState:
    605   const int MaxBitStateProg = 500;   // prog_->size() <= Max.
    606   const int MaxBitStateVector = 256*1024;  // bit vector size <= Max (bits)
    607   bool can_bit_state = prog_->size() <= MaxBitStateProg;
    608   int bit_state_text_max = MaxBitStateVector / prog_->size();
    609 
    610   bool dfa_failed = false;
    611   switch (re_anchor) {
    612     default:
    613     case UNANCHORED: {
    614       if (!prog_->SearchDFA(subtext, text, anchor, kind,
    615                             matchp, &dfa_failed, NULL)) {
    616         if (dfa_failed) {
    617           // Fall back to NFA below.
    618           skipped_test = true;
    619           if (FLAGS_trace_re2)
    620             LOG(INFO) << "Match " << trunc(pattern_)
    621                       << " [" << CEscape(subtext) << "]"
    622                       << " DFA failed.";
    623           break;
    624         }
    625         if (FLAGS_trace_re2)
    626           LOG(INFO) << "Match " << trunc(pattern_)
    627                     << " [" << CEscape(subtext) << "]"
    628                     << " used DFA - no match.";
    629         return false;
    630       }
    631       if (FLAGS_trace_re2)
    632         LOG(INFO) << "Match " << trunc(pattern_)
    633                   << " [" << CEscape(subtext) << "]"
    634                   << " used DFA - match";
    635       if (matchp == NULL)  // Matched.  Don't care where
    636         return true;
    637       // SearchDFA set match[0].end() but didn't know where the
    638       // match started.  Run the regexp backward from match[0].end()
    639       // to find the longest possible match -- that's where it started.
    640       Prog* prog = ReverseProg();
    641       if (prog == NULL)
    642         return false;
    643       if (!prog->SearchDFA(match, text, Prog::kAnchored,
    644                            Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
    645         if (dfa_failed) {
    646           // Fall back to NFA below.
    647           skipped_test = true;
    648           if (FLAGS_trace_re2)
    649             LOG(INFO) << "Match " << trunc(pattern_)
    650                       << " [" << CEscape(subtext) << "]"
    651                       << " reverse DFA failed.";
    652           break;
    653         }
    654         if (FLAGS_trace_re2)
    655           LOG(INFO) << "Match " << trunc(pattern_)
    656                     << " [" << CEscape(subtext) << "]"
    657                     << " DFA inconsistency.";
    658         if (options_.log_errors())
    659           LOG(ERROR) << "DFA inconsistency";
    660         return false;
    661       }
    662       if (FLAGS_trace_re2)
    663         LOG(INFO) << "Match " << trunc(pattern_)
    664                   << " [" << CEscape(subtext) << "]"
    665                   << " used reverse DFA.";
    666       break;
    667     }
    668 
    669     case ANCHOR_BOTH:
    670     case ANCHOR_START:
    671       if (re_anchor == ANCHOR_BOTH)
    672         kind = Prog::kFullMatch;
    673       anchor = Prog::kAnchored;
    674 
    675       // If only a small amount of text and need submatch
    676       // information anyway and we're going to use OnePass or BitState
    677       // to get it, we might as well not even bother with the DFA:
    678       // OnePass or BitState will be fast enough.
    679       // On tiny texts, OnePass outruns even the DFA, and
    680       // it doesn't have the shared state and occasional mutex that
    681       // the DFA does.
    682       if (can_one_pass && text.size() <= 4096 &&
    683           (ncap > 1 || text.size() <= 8)) {
    684         if (FLAGS_trace_re2)
    685           LOG(INFO) << "Match " << trunc(pattern_)
    686                     << " [" << CEscape(subtext) << "]"
    687                     << " skipping DFA for OnePass.";
    688         skipped_test = true;
    689         break;
    690       }
    691       if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
    692         if (FLAGS_trace_re2)
    693           LOG(INFO) << "Match " << trunc(pattern_)
    694                     << " [" << CEscape(subtext) << "]"
    695                     << " skipping DFA for BitState.";
    696         skipped_test = true;
    697         break;
    698       }
    699       if (!prog_->SearchDFA(subtext, text, anchor, kind,
    700                             &match, &dfa_failed, NULL)) {
    701         if (dfa_failed) {
    702           if (FLAGS_trace_re2)
    703             LOG(INFO) << "Match " << trunc(pattern_)
    704                       << " [" << CEscape(subtext) << "]"
    705                       << " DFA failed.";
    706           skipped_test = true;
    707           break;
    708         }
    709         if (FLAGS_trace_re2)
    710           LOG(INFO) << "Match " << trunc(pattern_)
    711                     << " [" << CEscape(subtext) << "]"
    712                     << " used DFA - no match.";
    713         return false;
    714       }
    715       break;
    716   }
    717 
    718   if (!skipped_test && ncap <= 1) {
    719     // We know exactly where it matches.  That's enough.
    720     if (ncap == 1)
    721       submatch[0] = match;
    722   } else {
    723     StringPiece subtext1;
    724     if (skipped_test) {
    725       // DFA ran out of memory or was skipped:
    726       // need to search in entire original text.
    727       subtext1 = subtext;
    728     } else {
    729       // DFA found the exact match location:
    730       // let NFA run an anchored, full match search
    731       // to find submatch locations.
    732       subtext1 = match;
    733       anchor = Prog::kAnchored;
    734       kind = Prog::kFullMatch;
    735     }
    736 
    737     if (can_one_pass && anchor != Prog::kUnanchored) {
    738       if (FLAGS_trace_re2)
    739         LOG(INFO) << "Match " << trunc(pattern_)
    740                   << " [" << CEscape(subtext) << "]"
    741                   << " using OnePass.";
    742       if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
    743         if (!skipped_test && options_.log_errors())
    744           LOG(ERROR) << "SearchOnePass inconsistency";
    745         return false;
    746       }
    747     } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
    748       if (FLAGS_trace_re2)
    749         LOG(INFO) << "Match " << trunc(pattern_)
    750                   << " [" << CEscape(subtext) << "]"
    751                   << " using BitState.";
    752       if (!prog_->SearchBitState(subtext1, text, anchor,
    753                                  kind, submatch, ncap)) {
    754         if (!skipped_test && options_.log_errors())
    755           LOG(ERROR) << "SearchBitState inconsistency";
    756         return false;
    757       }
    758     } else {
    759       if (FLAGS_trace_re2)
    760         LOG(INFO) << "Match " << trunc(pattern_)
    761                   << " [" << CEscape(subtext) << "]"
    762                   << " using NFA.";
    763       if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
    764         if (!skipped_test && options_.log_errors())
    765           LOG(ERROR) << "SearchNFA inconsistency";
    766         return false;
    767       }
    768     }
    769   }
    770 
    771   // Adjust overall match for required prefix that we stripped off.
    772   if (prefixlen > 0 && nsubmatch > 0)
    773     submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
    774                               submatch[0].size() + prefixlen);
    775 
    776   // Zero submatches that don't exist in the regexp.
    777   for (int i = ncap; i < nsubmatch; i++)
    778     submatch[i] = NULL;
    779   return true;
    780 }
    781 
    782 // Internal matcher - like Match() but takes Args not StringPieces.
    783 bool RE2::DoMatch(const StringPiece& text,
    784                   Anchor anchor,
    785                   int* consumed,
    786                   const Arg* const* args,
    787                   int n) const {
    788   if (!ok()) {
    789     if (options_.log_errors())
    790       LOG(ERROR) << "Invalid RE2: " << *error_;
    791     return false;
    792   }
    793 
    794   // Count number of capture groups needed.
    795   int nvec;
    796   if (n == 0 && consumed == NULL)
    797     nvec = 0;
    798   else
    799     nvec = n+1;
    800 
    801   StringPiece* vec;
    802   StringPiece stkvec[kVecSize];
    803   StringPiece* heapvec = NULL;
    804 
    805   if (nvec <= arraysize(stkvec)) {
    806     vec = stkvec;
    807   } else {
    808     vec = new StringPiece[nvec];
    809     heapvec = vec;
    810   }
    811 
    812   if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
    813     delete[] heapvec;
    814     return false;
    815   }
    816 
    817   if(consumed != NULL)
    818     *consumed = vec[0].end() - text.begin();
    819 
    820   if (n == 0 || args == NULL) {
    821     // We are not interested in results
    822     delete[] heapvec;
    823     return true;
    824   }
    825 
    826   int ncap = NumberOfCapturingGroups();
    827   if (ncap < n) {
    828     // RE has fewer capturing groups than number of arg pointers passed in
    829     VLOG(1) << "Asked for " << n << " but only have " << ncap;
    830     delete[] heapvec;
    831     return false;
    832   }
    833 
    834   // If we got here, we must have matched the whole pattern.
    835   for (int i = 0; i < n; i++) {
    836     const StringPiece& s = vec[i+1];
    837     if (!args[i]->Parse(s.data(), s.size())) {
    838       // TODO: Should we indicate what the error was?
    839       VLOG(1) << "Parse error on #" << i << " " << s << " "
    840 	      << (void*)s.data() << "/" << s.size();
    841       delete[] heapvec;
    842       return false;
    843     }
    844   }
    845 
    846   delete[] heapvec;
    847   return true;
    848 }
    849 
    850 // Append the "rewrite" string, with backslash subsitutions from "vec",
    851 // to string "out".
    852 bool RE2::Rewrite(string *out, const StringPiece &rewrite,
    853                  const StringPiece *vec, int veclen) const {
    854   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    855        s < end; s++) {
    856     int c = *s;
    857     if (c == '\\') {
    858       s++;
    859       c = (s < end) ? *s : -1;
    860       if (isdigit(c)) {
    861         int n = (c - '0');
    862         if (n >= veclen) {
    863           if (options_.log_errors()) {
    864             LOG(ERROR) << "requested group " << n
    865                        << " in regexp " << rewrite.data();
    866           }
    867           return false;
    868         }
    869         StringPiece snip = vec[n];
    870         if (snip.size() > 0)
    871           out->append(snip.data(), snip.size());
    872       } else if (c == '\\') {
    873         out->push_back('\\');
    874       } else {
    875         if (options_.log_errors())
    876           LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
    877         return false;
    878       }
    879     } else {
    880       out->push_back(c);
    881     }
    882   }
    883   return true;
    884 }
    885 
    886 // Return the number of capturing subpatterns, or -1 if the
    887 // regexp wasn't valid on construction.
    888 int RE2::NumberOfCapturingGroups() const {
    889   if (suffix_regexp_ == NULL)
    890     return -1;
    891   ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
    892     " multiple threads end up doing the same work in parallel.");
    893   if (num_captures_ == -1)
    894     num_captures_ = suffix_regexp_->NumCaptures();
    895   return num_captures_;
    896 }
    897 
    898 // Checks that the rewrite string is well-formed with respect to this
    899 // regular expression.
    900 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
    901   int max_token = -1;
    902   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    903        s < end; s++) {
    904     int c = *s;
    905     if (c != '\\') {
    906       continue;
    907     }
    908     if (++s == end) {
    909       *error = "Rewrite schema error: '\\' not allowed at end.";
    910       return false;
    911     }
    912     c = *s;
    913     if (c == '\\') {
    914       continue;
    915     }
    916     if (!isdigit(c)) {
    917       *error = "Rewrite schema error: "
    918                "'\\' must be followed by a digit or '\\'.";
    919       return false;
    920     }
    921     int n = (c - '0');
    922     if (max_token < n) {
    923       max_token = n;
    924     }
    925   }
    926 
    927   if (max_token > NumberOfCapturingGroups()) {
    928     SStringPrintf(error, "Rewrite schema requests %d matches, "
    929                   "but the regexp only has %d parenthesized subexpressions.",
    930                   max_token, NumberOfCapturingGroups());
    931     return false;
    932   }
    933   return true;
    934 }
    935 
    936 /***** Parsers for various types *****/
    937 
    938 bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
    939   // We fail if somebody asked us to store into a non-NULL void* pointer
    940   return (dest == NULL);
    941 }
    942 
    943 bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
    944   if (dest == NULL) return true;
    945   reinterpret_cast<string*>(dest)->assign(str, n);
    946   return true;
    947 }
    948 
    949 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
    950   if (dest == NULL) return true;
    951   reinterpret_cast<StringPiece*>(dest)->set(str, n);
    952   return true;
    953 }
    954 
    955 bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
    956   if (n != 1) return false;
    957   if (dest == NULL) return true;
    958   *(reinterpret_cast<char*>(dest)) = str[0];
    959   return true;
    960 }
    961 
    962 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
    963   if (n != 1) return false;
    964   if (dest == NULL) return true;
    965   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
    966   return true;
    967 }
    968 
    969 // Largest number spec that we are willing to parse
    970 static const int kMaxNumberLength = 32;
    971 
    972 // REQUIRES "buf" must have length at least kMaxNumberLength+1
    973 // Copies "str" into "buf" and null-terminates.
    974 // Overwrites *np with the new length.
    975 static const char* TerminateNumber(char* buf, const char* str, int* np) {
    976   int n = *np;
    977   if (n <= 0) return "";
    978   if (n > 0 && isspace(*str)) {
    979     // We are less forgiving than the strtoxxx() routines and do not
    980     // allow leading spaces.
    981     return "";
    982   }
    983 
    984   // Although buf has a fixed maximum size, we can still handle
    985   // arbitrarily large integers correctly by omitting leading zeros.
    986   // (Numbers that are still too long will be out of range.)
    987   // Before deciding whether str is too long,
    988   // remove leading zeros with s/000+/00/.
    989   // Leaving the leading two zeros in place means that
    990   // we don't change 0000x123 (invalid) into 0x123 (valid).
    991   // Skip over leading - before replacing.
    992   bool neg = false;
    993   if (n >= 1 && str[0] == '-') {
    994     neg = true;
    995     n--;
    996     str++;
    997   }
    998 
    999   if (n >= 3 && str[0] == '0' && str[1] == '0') {
   1000     while (n >= 3 && str[2] == '0') {
   1001       n--;
   1002       str++;
   1003     }
   1004   }
   1005 
   1006   if (neg) {  // make room in buf for -
   1007     n++;
   1008     str--;
   1009   }
   1010 
   1011   if (n > kMaxNumberLength) return "";
   1012 
   1013   memmove(buf, str, n);
   1014   if (neg) {
   1015     buf[0] = '-';
   1016   }
   1017   buf[n] = '\0';
   1018   *np = n;
   1019   return buf;
   1020 }
   1021 
   1022 bool RE2::Arg::parse_long_radix(const char* str,
   1023                                int n,
   1024                                void* dest,
   1025                                int radix) {
   1026   if (n == 0) return false;
   1027   char buf[kMaxNumberLength+1];
   1028   str = TerminateNumber(buf, str, &n);
   1029   char* end;
   1030   errno = 0;
   1031   long r = strtol(str, &end, radix);
   1032   if (end != str + n) return false;   // Leftover junk
   1033   if (errno) return false;
   1034   if (dest == NULL) return true;
   1035   *(reinterpret_cast<long*>(dest)) = r;
   1036   return true;
   1037 }
   1038 
   1039 bool RE2::Arg::parse_ulong_radix(const char* str,
   1040                                 int n,
   1041                                 void* dest,
   1042                                 int radix) {
   1043   if (n == 0) return false;
   1044   char buf[kMaxNumberLength+1];
   1045   str = TerminateNumber(buf, str, &n);
   1046   if (str[0] == '-') {
   1047    // strtoul() will silently accept negative numbers and parse
   1048    // them.  This module is more strict and treats them as errors.
   1049    return false;
   1050   }
   1051 
   1052   char* end;
   1053   errno = 0;
   1054   unsigned long r = strtoul(str, &end, radix);
   1055   if (end != str + n) return false;   // Leftover junk
   1056   if (errno) return false;
   1057   if (dest == NULL) return true;
   1058   *(reinterpret_cast<unsigned long*>(dest)) = r;
   1059   return true;
   1060 }
   1061 
   1062 bool RE2::Arg::parse_short_radix(const char* str,
   1063                                 int n,
   1064                                 void* dest,
   1065                                 int radix) {
   1066   long r;
   1067   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
   1068   if ((short)r != r) return false;       // Out of range
   1069   if (dest == NULL) return true;
   1070   *(reinterpret_cast<short*>(dest)) = r;
   1071   return true;
   1072 }
   1073 
   1074 bool RE2::Arg::parse_ushort_radix(const char* str,
   1075                                  int n,
   1076                                  void* dest,
   1077                                  int radix) {
   1078   unsigned long r;
   1079   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
   1080   if ((ushort)r != r) return false;                      // Out of range
   1081   if (dest == NULL) return true;
   1082   *(reinterpret_cast<unsigned short*>(dest)) = r;
   1083   return true;
   1084 }
   1085 
   1086 bool RE2::Arg::parse_int_radix(const char* str,
   1087                               int n,
   1088                               void* dest,
   1089                               int radix) {
   1090   long r;
   1091   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
   1092   if ((int)r != r) return false;         // Out of range
   1093   if (dest == NULL) return true;
   1094   *(reinterpret_cast<int*>(dest)) = r;
   1095   return true;
   1096 }
   1097 
   1098 bool RE2::Arg::parse_uint_radix(const char* str,
   1099                                int n,
   1100                                void* dest,
   1101                                int radix) {
   1102   unsigned long r;
   1103   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
   1104   if ((uint)r != r) return false;                       // Out of range
   1105   if (dest == NULL) return true;
   1106   *(reinterpret_cast<unsigned int*>(dest)) = r;
   1107   return true;
   1108 }
   1109 
   1110 bool RE2::Arg::parse_longlong_radix(const char* str,
   1111                                    int n,
   1112                                    void* dest,
   1113                                    int radix) {
   1114   if (n == 0) return false;
   1115   char buf[kMaxNumberLength+1];
   1116   str = TerminateNumber(buf, str, &n);
   1117   char* end;
   1118   errno = 0;
   1119   int64 r = strtoll(str, &end, radix);
   1120   if (end != str + n) return false;   // Leftover junk
   1121   if (errno) return false;
   1122   if (dest == NULL) return true;
   1123   *(reinterpret_cast<int64*>(dest)) = r;
   1124   return true;
   1125 }
   1126 
   1127 bool RE2::Arg::parse_ulonglong_radix(const char* str,
   1128                                     int n,
   1129                                     void* dest,
   1130                                     int radix) {
   1131   if (n == 0) return false;
   1132   char buf[kMaxNumberLength+1];
   1133   str = TerminateNumber(buf, str, &n);
   1134   if (str[0] == '-') {
   1135     // strtoull() will silently accept negative numbers and parse
   1136     // them.  This module is more strict and treats them as errors.
   1137     return false;
   1138   }
   1139   char* end;
   1140   errno = 0;
   1141   uint64 r = strtoull(str, &end, radix);
   1142   if (end != str + n) return false;   // Leftover junk
   1143   if (errno) return false;
   1144   if (dest == NULL) return true;
   1145   *(reinterpret_cast<uint64*>(dest)) = r;
   1146   return true;
   1147 }
   1148 
   1149 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
   1150   if (n == 0) return false;
   1151   static const int kMaxLength = 200;
   1152   char buf[kMaxLength];
   1153   if (n >= kMaxLength) return false;
   1154   memcpy(buf, str, n);
   1155   buf[n] = '\0';
   1156   errno = 0;
   1157   char* end;
   1158   double r;
   1159   if (isfloat) {
   1160     r = strtof(buf, &end);
   1161   } else {
   1162     r = strtod(buf, &end);
   1163   }
   1164   if (end != buf + n) return false;   // Leftover junk
   1165   if (errno) return false;
   1166   if (dest == NULL) return true;
   1167   if (isfloat) {
   1168     *(reinterpret_cast<float*>(dest)) = r;
   1169   } else {
   1170     *(reinterpret_cast<double*>(dest)) = r;
   1171   }
   1172   return true;
   1173 }
   1174 
   1175 bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
   1176   return parse_double_float(str, n, false, dest);
   1177 }
   1178 
   1179 bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
   1180   return parse_double_float(str, n, true, dest);
   1181 }
   1182 
   1183 
   1184 #define DEFINE_INTEGER_PARSERS(name)                                        \
   1185   bool RE2::Arg::parse_##name(const char* str, int n, void* dest) {          \
   1186     return parse_##name##_radix(str, n, dest, 10);                          \
   1187   }                                                                         \
   1188   bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
   1189     return parse_##name##_radix(str, n, dest, 16);                          \
   1190   }                                                                         \
   1191   bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
   1192     return parse_##name##_radix(str, n, dest, 8);                           \
   1193   }                                                                         \
   1194   bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
   1195     return parse_##name##_radix(str, n, dest, 0);                           \
   1196   }
   1197 
   1198 DEFINE_INTEGER_PARSERS(short);
   1199 DEFINE_INTEGER_PARSERS(ushort);
   1200 DEFINE_INTEGER_PARSERS(int);
   1201 DEFINE_INTEGER_PARSERS(uint);
   1202 DEFINE_INTEGER_PARSERS(long);
   1203 DEFINE_INTEGER_PARSERS(ulong);
   1204 DEFINE_INTEGER_PARSERS(longlong);
   1205 DEFINE_INTEGER_PARSERS(ulonglong);
   1206 
   1207 #undef DEFINE_INTEGER_PARSERS
   1208 
   1209 }  // namespace re2
   1210