Home | History | Annotate | Download | only in util
      1 // Copyright 2003-2009 Google Inc.  All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // This is a variant of PCRE's pcrecpp.cc, originally written at Google.
      6 // The main changes are the addition of the HitLimit method and
      7 // compilation as PCRE in namespace re2.
      8 
      9 #include <errno.h>
     10 #include "util/util.h"
     11 #include "util/flags.h"
     12 #include "util/pcre.h"
     13 
     14 #ifdef WIN32
     15 #define strtoll _strtoi64
     16 #define strtoull _strtoui64
     17 #endif
     18 
     19 #define PCREPORT(level) LOG(level)
     20 
     21 // Default PCRE limits.
     22 // Defaults chosen to allow a plausible amount of CPU and
     23 // not exceed main thread stacks.  Note that other threads
     24 // often have smaller stacks, and therefore tightening
     25 // regexp_stack_limit may frequently be necessary.
     26 DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)");
     27 DEFINE_int32(regexp_match_limit, 1000000,
     28              "default PCRE match limit (function calls)");
     29 
     30 namespace re2 {
     31 
     32 // Maximum number of args we can set
     33 static const int kMaxArgs = 16;
     34 static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
     35 
     36 // Approximate size of a recursive invocation of PCRE's
     37 // internal "match()" frame.  This varies depending on the
     38 // compiler and architecture, of course, so the constant is
     39 // just a conservative estimate.  To find the exact number,
     40 // run regexp_unittest with --regexp_stack_limit=0 under
     41 // a debugger and look at the frames when it crashes.
     42 // The exact frame size was 656 in production on 2008/02/03.
     43 static const int kPCREFrameSize = 700;
     44 
     45 // Special name for missing C++ arguments.
     46 PCRE::Arg PCRE::no_more_args((void*)NULL);
     47 
     48 const PCRE::PartialMatchFunctor PCRE::PartialMatch = { };
     49 const PCRE::FullMatchFunctor PCRE::FullMatch = { } ;
     50 const PCRE::ConsumeFunctor PCRE::Consume = { };
     51 const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { };
     52 
     53 // If a regular expression has no error, its error_ field points here
     54 static const string empty_string;
     55 
     56 void PCRE::Init(const char* pattern, Option options, int match_limit,
     57               int stack_limit, bool report_errors) {
     58   pattern_ = pattern;
     59   options_ = options;
     60   match_limit_ = match_limit;
     61   stack_limit_ = stack_limit;
     62   hit_limit_ = false;
     63   error_ = &empty_string;
     64   report_errors_ = report_errors;
     65   re_full_ = NULL;
     66   re_partial_ = NULL;
     67 
     68   if (options & ~(EnabledCompileOptions | EnabledExecOptions)) {
     69     error_ = new string("illegal regexp option");
     70     PCREPORT(ERROR)
     71         << "Error compiling '" << pattern << "': illegal regexp option";
     72   } else {
     73     re_partial_ = Compile(UNANCHORED);
     74     if (re_partial_ != NULL) {
     75       re_full_ = Compile(ANCHOR_BOTH);
     76     }
     77   }
     78 }
     79 
     80 PCRE::PCRE(const char* pattern) {
     81   Init(pattern, None, 0, 0, true);
     82 }
     83 PCRE::PCRE(const char* pattern, Option option) {
     84   Init(pattern, option, 0, 0, true);
     85 }
     86 PCRE::PCRE(const string& pattern) {
     87   Init(pattern.c_str(), None, 0, 0, true);
     88 }
     89 PCRE::PCRE(const string& pattern, Option option) {
     90   Init(pattern.c_str(), option, 0, 0, true);
     91 }
     92 PCRE::PCRE(const string& pattern, const PCRE_Options& re_option) {
     93   Init(pattern.c_str(), re_option.option(), re_option.match_limit(),
     94        re_option.stack_limit(), re_option.report_errors());
     95 }
     96 
     97 PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) {
     98   Init(pattern, re_option.option(), re_option.match_limit(),
     99        re_option.stack_limit(), re_option.report_errors());
    100 }
    101 
    102 PCRE::~PCRE() {
    103   if (re_full_ != NULL)         pcre_free(re_full_);
    104   if (re_partial_ != NULL)      pcre_free(re_partial_);
    105   if (error_ != &empty_string)  delete error_;
    106 }
    107 
    108 pcre* PCRE::Compile(Anchor anchor) {
    109   // Special treatment for anchoring.  This is needed because at
    110   // runtime pcre only provides an option for anchoring at the
    111   // beginning of a string.
    112   //
    113   // There are three types of anchoring we want:
    114   //    UNANCHORED      Compile the original pattern, and use
    115   //                    a pcre unanchored match.
    116   //    ANCHOR_START    Compile the original pattern, and use
    117   //                    a pcre anchored match.
    118   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
    119   //                    and use a pcre anchored match.
    120 
    121   const char* error;
    122   int eoffset;
    123   pcre* re;
    124   if (anchor != ANCHOR_BOTH) {
    125     re = pcre_compile(pattern_.c_str(),
    126                       (options_ & EnabledCompileOptions),
    127                       &error, &eoffset, NULL);
    128   } else {
    129     // Tack a '\z' at the end of PCRE.  Parenthesize it first so that
    130     // the '\z' applies to all top-level alternatives in the regexp.
    131     string wrapped = "(?:";  // A non-counting grouping operator
    132     wrapped += pattern_;
    133     wrapped += ")\\z";
    134     re = pcre_compile(wrapped.c_str(),
    135                       (options_ & EnabledCompileOptions),
    136                       &error, &eoffset, NULL);
    137   }
    138   if (re == NULL) {
    139     if (error_ == &empty_string) error_ = new string(error);
    140     PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error;
    141   }
    142   return re;
    143 }
    144 
    145 /***** Convenience interfaces *****/
    146 
    147 bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text,
    148                                        const PCRE& re,
    149                                        const Arg& a0,
    150                                        const Arg& a1,
    151                                        const Arg& a2,
    152                                        const Arg& a3,
    153                                        const Arg& a4,
    154                                        const Arg& a5,
    155                                        const Arg& a6,
    156                                        const Arg& a7,
    157                                        const Arg& a8,
    158                                        const Arg& a9,
    159                                        const Arg& a10,
    160                                        const Arg& a11,
    161                                        const Arg& a12,
    162                                        const Arg& a13,
    163                                        const Arg& a14,
    164                                        const Arg& a15) const {
    165   const Arg* args[kMaxArgs];
    166   int n = 0;
    167   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
    168   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
    169   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
    170   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
    171   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
    172   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
    173   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
    174   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
    175   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
    176   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
    177   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
    178   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
    179   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
    180   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
    181   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
    182   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
    183 done:
    184 
    185   int consumed;
    186   int vec[kVecSize];
    187   return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
    188 }
    189 
    190 bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text,
    191                                           const PCRE& re,
    192                                           const Arg& a0,
    193                                           const Arg& a1,
    194                                           const Arg& a2,
    195                                           const Arg& a3,
    196                                           const Arg& a4,
    197                                           const Arg& a5,
    198                                           const Arg& a6,
    199                                           const Arg& a7,
    200                                           const Arg& a8,
    201                                           const Arg& a9,
    202                                           const Arg& a10,
    203                                           const Arg& a11,
    204                                           const Arg& a12,
    205                                           const Arg& a13,
    206                                           const Arg& a14,
    207                                           const Arg& a15) const {
    208   const Arg* args[kMaxArgs];
    209   int n = 0;
    210   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
    211   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
    212   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
    213   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
    214   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
    215   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
    216   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
    217   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
    218   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
    219   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
    220   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
    221   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
    222   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
    223   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
    224   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
    225   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
    226 done:
    227 
    228   int consumed;
    229   int vec[kVecSize];
    230   return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
    231 }
    232 
    233 bool PCRE::ConsumeFunctor::operator ()(StringPiece* input,
    234                                      const PCRE& pattern,
    235                                      const Arg& a0,
    236                                      const Arg& a1,
    237                                      const Arg& a2,
    238                                      const Arg& a3,
    239                                      const Arg& a4,
    240                                      const Arg& a5,
    241                                      const Arg& a6,
    242                                      const Arg& a7,
    243                                      const Arg& a8,
    244                                      const Arg& a9,
    245                                      const Arg& a10,
    246                                      const Arg& a11,
    247                                      const Arg& a12,
    248                                      const Arg& a13,
    249                                      const Arg& a14,
    250                                      const Arg& a15) const {
    251   const Arg* args[kMaxArgs];
    252   int n = 0;
    253   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
    254   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
    255   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
    256   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
    257   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
    258   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
    259   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
    260   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
    261   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
    262   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
    263   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
    264   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
    265   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
    266   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
    267   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
    268   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
    269 done:
    270 
    271   int consumed;
    272   int vec[kVecSize];
    273   if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
    274                           args, n, vec, kVecSize)) {
    275     input->remove_prefix(consumed);
    276     return true;
    277   } else {
    278     return false;
    279   }
    280 }
    281 
    282 bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input,
    283                                             const PCRE& pattern,
    284                                             const Arg& a0,
    285                                             const Arg& a1,
    286                                             const Arg& a2,
    287                                             const Arg& a3,
    288                                             const Arg& a4,
    289                                             const Arg& a5,
    290                                             const Arg& a6,
    291                                             const Arg& a7,
    292                                             const Arg& a8,
    293                                             const Arg& a9,
    294                                             const Arg& a10,
    295                                             const Arg& a11,
    296                                             const Arg& a12,
    297                                             const Arg& a13,
    298                                             const Arg& a14,
    299                                             const Arg& a15) const {
    300   const Arg* args[kMaxArgs];
    301   int n = 0;
    302   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
    303   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
    304   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
    305   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
    306   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
    307   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
    308   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
    309   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
    310   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
    311   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
    312   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
    313   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
    314   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
    315   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
    316   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
    317   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
    318 done:
    319 
    320   int consumed;
    321   int vec[kVecSize];
    322   if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
    323                           args, n, vec, kVecSize)) {
    324     input->remove_prefix(consumed);
    325     return true;
    326   } else {
    327     return false;
    328   }
    329 }
    330 
    331 bool PCRE::Replace(string *str,
    332                  const PCRE& pattern,
    333                  const StringPiece& rewrite) {
    334   int vec[kVecSize];
    335   int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
    336   if (matches == 0)
    337     return false;
    338 
    339   string s;
    340   if (!pattern.Rewrite(&s, rewrite, *str, vec, matches))
    341     return false;
    342 
    343   assert(vec[0] >= 0);
    344   assert(vec[1] >= 0);
    345   str->replace(vec[0], vec[1] - vec[0], s);
    346   return true;
    347 }
    348 
    349 int PCRE::GlobalReplace(string *str,
    350                       const PCRE& pattern,
    351                       const StringPiece& rewrite) {
    352   int count = 0;
    353   int vec[kVecSize];
    354   string out;
    355   int start = 0;
    356   bool last_match_was_empty_string = false;
    357 
    358   for (; start <= str->length();) {
    359     // If the previous match was for the empty string, we shouldn't
    360     // just match again: we'll match in the same way and get an
    361     // infinite loop.  Instead, we do the match in a special way:
    362     // anchored -- to force another try at the same position --
    363     // and with a flag saying that this time, ignore empty matches.
    364     // If this special match returns, that means there's a non-empty
    365     // match at this position as well, and we can continue.  If not,
    366     // we do what perl does, and just advance by one.
    367     // Notice that perl prints '@@@' for this;
    368     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
    369     int matches;
    370     if (last_match_was_empty_string) {
    371       matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
    372                                  vec, kVecSize);
    373       if (matches <= 0) {
    374         if (start < str->length())
    375           out.push_back((*str)[start]);
    376         start++;
    377         last_match_was_empty_string = false;
    378         continue;
    379       }
    380     } else {
    381       matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
    382       if (matches <= 0)
    383         break;
    384     }
    385     int matchstart = vec[0], matchend = vec[1];
    386     assert(matchstart >= start);
    387     assert(matchend >= matchstart);
    388 
    389     out.append(*str, start, matchstart - start);
    390     pattern.Rewrite(&out, rewrite, *str, vec, matches);
    391     start = matchend;
    392     count++;
    393     last_match_was_empty_string = (matchstart == matchend);
    394   }
    395 
    396   if (count == 0)
    397     return 0;
    398 
    399   if (start < str->length())
    400     out.append(*str, start, str->length() - start);
    401   swap(out, *str);
    402   return count;
    403 }
    404 
    405 bool PCRE::Extract(const StringPiece &text,
    406                  const PCRE& pattern,
    407                  const StringPiece &rewrite,
    408                  string *out) {
    409   int vec[kVecSize];
    410   int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
    411   if (matches == 0)
    412     return false;
    413   out->clear();
    414   return pattern.Rewrite(out, rewrite, text, vec, matches);
    415 }
    416 
    417 string PCRE::QuoteMeta(const StringPiece& unquoted) {
    418   string result;
    419   result.reserve(unquoted.size() << 1);
    420 
    421   // Escape any ascii character not in [A-Za-z_0-9].
    422   //
    423   // Note that it's legal to escape a character even if it has no
    424   // special meaning in a regular expression -- so this function does
    425   // that.  (This also makes it identical to the perl function of the
    426   // same name except for the null-character special case;
    427   // see `perldoc -f quotemeta`.)
    428   for (int ii = 0; ii < unquoted.length(); ++ii) {
    429     // Note that using 'isalnum' here raises the benchmark time from
    430     // 32ns to 58ns:
    431     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
    432         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
    433         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
    434         unquoted[ii] != '_' &&
    435         // If this is the part of a UTF8 or Latin1 character, we need
    436         // to copy this byte without escaping.  Experimentally this is
    437         // what works correctly with the regexp library.
    438         !(unquoted[ii] & 128)) {
    439       if (unquoted[ii] == '\0') {  // Special handling for null chars.
    440         // Can't use "\\0" since the next character might be a digit.
    441         result += "\\x00";
    442         continue;
    443       }
    444       result += '\\';
    445     }
    446     result += unquoted[ii];
    447   }
    448 
    449   return result;
    450 }
    451 
    452 /***** Actual matching and rewriting code *****/
    453 
    454 bool PCRE::HitLimit() {
    455   return hit_limit_;
    456 }
    457 
    458 void PCRE::ClearHitLimit() {
    459   hit_limit_ = 0;
    460 }
    461 
    462 int PCRE::TryMatch(const StringPiece& text,
    463                  int startpos,
    464                  Anchor anchor,
    465                  bool empty_ok,
    466                  int *vec,
    467                  int vecsize) const {
    468   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
    469   if (re == NULL) {
    470     PCREPORT(ERROR) << "Matching against invalid re: " << *error_;
    471     return 0;
    472   }
    473 
    474   int match_limit = match_limit_;
    475   if (match_limit <= 0) {
    476     match_limit = FLAGS_regexp_match_limit;
    477   }
    478 
    479   int stack_limit = stack_limit_;
    480   if (stack_limit <= 0) {
    481     stack_limit = FLAGS_regexp_stack_limit;
    482   }
    483 
    484   pcre_extra extra = { 0 };
    485   if (match_limit > 0) {
    486     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
    487     extra.match_limit = match_limit;
    488   }
    489   if (stack_limit > 0) {
    490     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
    491     extra.match_limit_recursion = stack_limit / kPCREFrameSize;
    492   }
    493 
    494   int options = 0;
    495   if (anchor != UNANCHORED)
    496     options |= PCRE_ANCHORED;
    497   if (!empty_ok)
    498     options |= PCRE_NOTEMPTY;
    499 
    500   int rc = pcre_exec(re,              // The regular expression object
    501                      &extra,
    502                      (text.data() == NULL) ? "" : text.data(),
    503                      text.size(),
    504                      startpos,
    505                      options,
    506                      vec,
    507                      vecsize);
    508 
    509   // Handle errors
    510   if (rc == 0) {
    511     // pcre_exec() returns 0 as a special case when the number of
    512     // capturing subpatterns exceeds the size of the vector.
    513     // When this happens, there is a match and the output vector
    514     // is filled, but we miss out on the positions of the extra subpatterns.
    515     rc = vecsize / 2;
    516   } else if (rc < 0) {
    517     switch (rc) {
    518       case PCRE_ERROR_NOMATCH:
    519         return 0;
    520       case PCRE_ERROR_MATCHLIMIT:
    521         // Writing to hit_limit is not safe if multiple threads
    522         // are using the PCRE, but the flag is only intended
    523         // for use by unit tests anyway, so we let it go.
    524         hit_limit_ = true;
    525         PCREPORT(WARNING) << "Exceeded match limit of " << match_limit
    526                         << " when matching '" << pattern_ << "'"
    527                         << " against text that is " << text.size() << " bytes.";
    528         return 0;
    529       case PCRE_ERROR_RECURSIONLIMIT:
    530         // See comment about hit_limit above.
    531         hit_limit_ = true;
    532         PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit
    533                         << " when matching '" << pattern_ << "'"
    534                         << " against text that is " << text.size() << " bytes.";
    535         return 0;
    536       default:
    537         // There are other return codes from pcre.h :
    538         // PCRE_ERROR_NULL           (-2)
    539         // PCRE_ERROR_BADOPTION      (-3)
    540         // PCRE_ERROR_BADMAGIC       (-4)
    541         // PCRE_ERROR_UNKNOWN_NODE   (-5)
    542         // PCRE_ERROR_NOMEMORY       (-6)
    543         // PCRE_ERROR_NOSUBSTRING    (-7)
    544         // ...
    545         PCREPORT(ERROR) << "Unexpected return code: " << rc
    546                       << " when matching '" << pattern_ << "'"
    547                       << ", re=" << re
    548                       << ", text=" << text
    549                       << ", vec=" << vec
    550                       << ", vecsize=" << vecsize;
    551         return 0;
    552     }
    553   }
    554 
    555   return rc;
    556 }
    557 
    558 bool PCRE::DoMatchImpl(const StringPiece& text,
    559                      Anchor anchor,
    560                      int* consumed,
    561                      const Arg* const* args,
    562                      int n,
    563                      int* vec,
    564                      int vecsize) const {
    565   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
    566   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
    567   assert(matches >= 0);  // TryMatch never returns negatives
    568   if (matches == 0)
    569     return false;
    570 
    571   *consumed = vec[1];
    572 
    573   if (n == 0 || args == NULL) {
    574     // We are not interested in results
    575     return true;
    576   }
    577   if (NumberOfCapturingGroups() < n) {
    578     // PCRE has fewer capturing groups than number of arg pointers passed in
    579     return false;
    580   }
    581 
    582   // If we got here, we must have matched the whole pattern.
    583   // We do not need (can not do) any more checks on the value of 'matches' here
    584   // -- see the comment for TryMatch.
    585   for (int i = 0; i < n; i++) {
    586     const int start = vec[2*(i+1)];
    587     const int limit = vec[2*(i+1)+1];
    588     if (!args[i]->Parse(text.data() + start, limit-start)) {
    589       // TODO: Should we indicate what the error was?
    590       return false;
    591     }
    592   }
    593 
    594   return true;
    595 }
    596 
    597 bool PCRE::DoMatch(const StringPiece& text,
    598                  Anchor anchor,
    599                  int* consumed,
    600                  const Arg* const args[],
    601                  int n) const {
    602   assert(n >= 0);
    603   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
    604                                        // (as for kVecSize)
    605   int *vec = new int[vecsize];
    606   bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
    607   delete[] vec;
    608   return b;
    609 }
    610 
    611 bool PCRE::Rewrite(string *out, const StringPiece &rewrite,
    612                  const StringPiece &text, int *vec, int veclen) const {
    613   int number_of_capturing_groups = NumberOfCapturingGroups();
    614   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    615        s < end; s++) {
    616     int c = *s;
    617     if (c == '\\') {
    618       c = *++s;
    619       if (isdigit(c)) {
    620         int n = (c - '0');
    621         if (n >= veclen) {
    622           if (n <= number_of_capturing_groups) {
    623             // unmatched optional capturing group. treat
    624             // its value as empty string; i.e., nothing to append.
    625           } else {
    626             PCREPORT(ERROR) << "requested group " << n
    627                           << " in regexp " << rewrite.data();
    628             return false;
    629           }
    630         }
    631         int start = vec[2 * n];
    632         if (start >= 0)
    633           out->append(text.data() + start, vec[2 * n + 1] - start);
    634       } else if (c == '\\') {
    635         out->push_back('\\');
    636       } else {
    637         PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data();
    638         return false;
    639       }
    640     } else {
    641       out->push_back(c);
    642     }
    643   }
    644   return true;
    645 }
    646 
    647 bool PCRE::CheckRewriteString(const StringPiece& rewrite, string* error) const {
    648   int max_token = -1;
    649   for (const char *s = rewrite.data(), *end = s + rewrite.size();
    650        s < end; s++) {
    651     int c = *s;
    652     if (c != '\\') {
    653       continue;
    654     }
    655     if (++s == end) {
    656       *error = "Rewrite schema error: '\\' not allowed at end.";
    657       return false;
    658     }
    659     c = *s;
    660     if (c == '\\') {
    661       continue;
    662     }
    663     if (!isdigit(c)) {
    664       *error = "Rewrite schema error: "
    665                "'\\' must be followed by a digit or '\\'.";
    666       return false;
    667     }
    668     int n = (c - '0');
    669     if (max_token < n) {
    670       max_token = n;
    671     }
    672   }
    673 
    674   if (max_token > NumberOfCapturingGroups()) {
    675     SStringPrintf(error, "Rewrite schema requests %d matches, "
    676                   "but the regexp only has %d parenthesized subexpressions.",
    677                   max_token, NumberOfCapturingGroups());
    678     return false;
    679   }
    680   return true;
    681 }
    682 
    683 
    684 // Return the number of capturing subpatterns, or -1 if the
    685 // regexp wasn't valid on construction.
    686 int PCRE::NumberOfCapturingGroups() const {
    687   if (re_partial_ == NULL) return -1;
    688 
    689   int result;
    690   CHECK(pcre_fullinfo(re_partial_,       // The regular expression object
    691                       NULL,              // We did not study the pattern
    692                       PCRE_INFO_CAPTURECOUNT,
    693                       &result) == 0);
    694   return result;
    695 }
    696 
    697 
    698 /***** Parsers for various types *****/
    699 
    700 bool PCRE::Arg::parse_null(const char* str, int n, void* dest) {
    701   // We fail if somebody asked us to store into a non-NULL void* pointer
    702   return (dest == NULL);
    703 }
    704 
    705 bool PCRE::Arg::parse_string(const char* str, int n, void* dest) {
    706   if (dest == NULL) return true;
    707   reinterpret_cast<string*>(dest)->assign(str, n);
    708   return true;
    709 }
    710 
    711 bool PCRE::Arg::parse_stringpiece(const char* str, int n, void* dest) {
    712   if (dest == NULL) return true;
    713   reinterpret_cast<StringPiece*>(dest)->set(str, n);
    714   return true;
    715 }
    716 
    717 bool PCRE::Arg::parse_char(const char* str, int n, void* dest) {
    718   if (n != 1) return false;
    719   if (dest == NULL) return true;
    720   *(reinterpret_cast<char*>(dest)) = str[0];
    721   return true;
    722 }
    723 
    724 bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) {
    725   if (n != 1) return false;
    726   if (dest == NULL) return true;
    727   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
    728   return true;
    729 }
    730 
    731 // Largest number spec that we are willing to parse
    732 static const int kMaxNumberLength = 32;
    733 
    734 // PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1
    735 // PCREQUIPCRES "n > 0"
    736 // Copies "str" into "buf" and null-terminates if necessary.
    737 // Returns one of:
    738 //      a. "str" if no termination is needed
    739 //      b. "buf" if the string was copied and null-terminated
    740 //      c. "" if the input was invalid and has no hope of being parsed
    741 static const char* TerminateNumber(char* buf, const char* str, int n) {
    742   if ((n > 0) && isspace(*str)) {
    743     // We are less forgiving than the strtoxxx() routines and do not
    744     // allow leading spaces.
    745     return "";
    746   }
    747 
    748   // See if the character right after the input text may potentially
    749   // look like a digit.
    750   if (isdigit(str[n]) ||
    751       ((str[n] >= 'a') && (str[n] <= 'f')) ||
    752       ((str[n] >= 'A') && (str[n] <= 'F'))) {
    753     if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
    754     memcpy(buf, str, n);
    755     buf[n] = '\0';
    756     return buf;
    757   } else {
    758     // We can parse right out of the supplied string, so return it.
    759     return str;
    760   }
    761 }
    762 
    763 bool PCRE::Arg::parse_long_radix(const char* str,
    764                                int n,
    765                                void* dest,
    766                                int radix) {
    767   if (n == 0) return false;
    768   char buf[kMaxNumberLength+1];
    769   str = TerminateNumber(buf, str, n);
    770   char* end;
    771   errno = 0;
    772   long r = strtol(str, &end, radix);
    773   if (end != str + n) return false;   // Leftover junk
    774   if (errno) return false;
    775   if (dest == NULL) return true;
    776   *(reinterpret_cast<long*>(dest)) = r;
    777   return true;
    778 }
    779 
    780 bool PCRE::Arg::parse_ulong_radix(const char* str,
    781                                 int n,
    782                                 void* dest,
    783                                 int radix) {
    784   if (n == 0) return false;
    785   char buf[kMaxNumberLength+1];
    786   str = TerminateNumber(buf, str, n);
    787   if (str[0] == '-') {
    788    // strtoul() will silently accept negative numbers and parse
    789    // them.  This module is more strict and treats them as errors.
    790    return false;
    791   }
    792 
    793   char* end;
    794   errno = 0;
    795   unsigned long r = strtoul(str, &end, radix);
    796   if (end != str + n) return false;   // Leftover junk
    797   if (errno) return false;
    798   if (dest == NULL) return true;
    799   *(reinterpret_cast<unsigned long*>(dest)) = r;
    800   return true;
    801 }
    802 
    803 bool PCRE::Arg::parse_short_radix(const char* str,
    804                                 int n,
    805                                 void* dest,
    806                                 int radix) {
    807   long r;
    808   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
    809   if ((short)r != r) return false;       // Out of range
    810   if (dest == NULL) return true;
    811   *(reinterpret_cast<short*>(dest)) = r;
    812   return true;
    813 }
    814 
    815 bool PCRE::Arg::parse_ushort_radix(const char* str,
    816                                  int n,
    817                                  void* dest,
    818                                  int radix) {
    819   unsigned long r;
    820   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
    821   if ((ushort)r != r) return false;                      // Out of range
    822   if (dest == NULL) return true;
    823   *(reinterpret_cast<unsigned short*>(dest)) = r;
    824   return true;
    825 }
    826 
    827 bool PCRE::Arg::parse_int_radix(const char* str,
    828                               int n,
    829                               void* dest,
    830                               int radix) {
    831   long r;
    832   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
    833   if ((int)r != r) return false;         // Out of range
    834   if (dest == NULL) return true;
    835   *(reinterpret_cast<int*>(dest)) = r;
    836   return true;
    837 }
    838 
    839 bool PCRE::Arg::parse_uint_radix(const char* str,
    840                                int n,
    841                                void* dest,
    842                                int radix) {
    843   unsigned long r;
    844   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
    845   if ((uint)r != r) return false;                       // Out of range
    846   if (dest == NULL) return true;
    847   *(reinterpret_cast<unsigned int*>(dest)) = r;
    848   return true;
    849 }
    850 
    851 bool PCRE::Arg::parse_longlong_radix(const char* str,
    852                                    int n,
    853                                    void* dest,
    854                                    int radix) {
    855   if (n == 0) return false;
    856   char buf[kMaxNumberLength+1];
    857   str = TerminateNumber(buf, str, n);
    858   char* end;
    859   errno = 0;
    860   int64 r = strtoll(str, &end, radix);
    861   if (end != str + n) return false;   // Leftover junk
    862   if (errno) return false;
    863   if (dest == NULL) return true;
    864   *(reinterpret_cast<int64*>(dest)) = r;
    865   return true;
    866 }
    867 
    868 bool PCRE::Arg::parse_ulonglong_radix(const char* str,
    869                                     int n,
    870                                     void* dest,
    871                                     int radix) {
    872   if (n == 0) return false;
    873   char buf[kMaxNumberLength+1];
    874   str = TerminateNumber(buf, str, n);
    875   if (str[0] == '-') {
    876     // strtoull() will silently accept negative numbers and parse
    877     // them.  This module is more strict and treats them as errors.
    878     return false;
    879   }
    880   char* end;
    881   errno = 0;
    882   uint64 r = strtoull(str, &end, radix);
    883   if (end != str + n) return false;   // Leftover junk
    884   if (errno) return false;
    885   if (dest == NULL) return true;
    886   *(reinterpret_cast<uint64*>(dest)) = r;
    887   return true;
    888 }
    889 
    890 bool PCRE::Arg::parse_double(const char* str, int n, void* dest) {
    891   if (n == 0) return false;
    892   static const int kMaxLength = 200;
    893   char buf[kMaxLength];
    894   if (n >= kMaxLength) return false;
    895   memcpy(buf, str, n);
    896   buf[n] = '\0';
    897   errno = 0;
    898   char* end;
    899   double r = strtod(buf, &end);
    900   if (end != buf + n) {
    901 #ifdef COMPILER_MSVC
    902     // Microsoft's strtod() doesn't handle inf and nan, so we have to
    903     // handle it explicitly.  Speed is not important here because this
    904     // code is only called in unit tests.
    905     bool pos = true;
    906     const char* i = buf;
    907     if ('-' == *i) {
    908       pos = false;
    909       ++i;
    910     } else if ('+' == *i) {
    911       ++i;
    912     }
    913     if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) {
    914       r = numeric_limits<double>::infinity();
    915       if (!pos)
    916         r = -r;
    917     } else if (0 == stricmp(i, "nan")) {
    918       r = numeric_limits<double>::quiet_NaN();
    919     } else {
    920       return false;
    921     }
    922 #else
    923     return false;   // Leftover junk
    924 #endif
    925   }
    926   if (errno) return false;
    927   if (dest == NULL) return true;
    928   *(reinterpret_cast<double*>(dest)) = r;
    929   return true;
    930 }
    931 
    932 bool PCRE::Arg::parse_float(const char* str, int n, void* dest) {
    933   double r;
    934   if (!parse_double(str, n, &r)) return false;
    935   if (dest == NULL) return true;
    936   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
    937   return true;
    938 }
    939 
    940 
    941 #define DEFINE_INTEGER_PARSERS(name)                                        \
    942   bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) {          \
    943     return parse_##name##_radix(str, n, dest, 10);                          \
    944   }                                                                         \
    945   bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
    946     return parse_##name##_radix(str, n, dest, 16);                          \
    947   }                                                                         \
    948   bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
    949     return parse_##name##_radix(str, n, dest, 8);                           \
    950   }                                                                         \
    951   bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
    952     return parse_##name##_radix(str, n, dest, 0);                           \
    953   }
    954 
    955 DEFINE_INTEGER_PARSERS(short);
    956 DEFINE_INTEGER_PARSERS(ushort);
    957 DEFINE_INTEGER_PARSERS(int);
    958 DEFINE_INTEGER_PARSERS(uint);
    959 DEFINE_INTEGER_PARSERS(long);
    960 DEFINE_INTEGER_PARSERS(ulong);
    961 DEFINE_INTEGER_PARSERS(longlong);
    962 DEFINE_INTEGER_PARSERS(ulonglong);
    963 
    964 #undef DEFINE_INTEGER_PARSERS
    965 
    966 }  // namespace re2
    967